├── .gitignore
├── LICENSE
├── README.md
├── build.gradle
├── gradle.properties
├── gradle
│   └── wrapper
│       ├── gradle-wrapper.jar
│       └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── settings.gradle
└── src
    ├── main
    │   ├── kotlin
    │   │   └── lab
    │   │       └── mars
    │   │           └── rl
    │   │               ├── algo
    │   │               │   ├── dp
    │   │               │   │   ├── PolicyIteration.kt
    │   │               │   │   └── ValueIteration.kt
    │   │               │   ├── dyna
    │   │               │   │   ├── Dyna-Q+.kt
    │   │               │   │   ├── Dyna-Q-OnPolicy.kt
    │   │               │   │   ├── Dyna-Q.kt
    │   │               │   │   ├── PrioritizedSweeping.kt
    │   │               │   │   ├── PrioritizedSweepingStochasticEnv.kt
    │   │               │   │   └── RandomSampleOneStepTabularQLearning.kt
    │   │               │   ├── eligibility_trace
    │   │               │   │   ├── control
    │   │               │   │   │   ├── Sarsa(λ).kt
    │   │               │   │   │   └── True Online Sarsa(λ).kt
    │   │               │   │   └── prediction
    │   │               │   │       ├── Off-line λ-return.kt
    │   │               │   │       ├── Semi-gradient TD(λ) prediction.kt
    │   │               │   │       └── True Online TD(λ) prediction.kt
    │   │               │   ├── func_approx
    │   │               │   │   ├── Play.kt
    │   │               │   │   ├── off_policy
    │   │               │   │   │   ├── Semi-gradient Expected Sarsa.kt
    │   │               │   │   │   ├── Semi-gradient off-policy TD(0).kt
    │   │               │   │   │   ├── n-step semi-gradient off-policy Q(σ).kt
    │   │               │   │   │   └── n-step semi-gradient off-policy sarsa.kt
    │   │               │   │   ├── on_policy
    │   │               │   │   │   ├── Differential semi-gradient Sarsa.kt
    │   │               │   │   │   ├── Differential semi-gradient n-step Sarsa.kt
    │   │               │   │   │   ├── Episodic Semi-gradient QLearning control.kt
    │   │               │   │   │   ├── Episodic Semi-gradient Sarsa control.kt
    │   │               │   │   │   └── Episodic semi-gradient n-step Sarsa.kt
    │   │               │   │   └── prediction
    │   │               │   │       ├── Gradient Monte Carlo algorithm.kt
    │   │               │   │       ├── LSTD.kt
    │   │               │   │       ├── Semi-gradient TD(0).kt
    │   │               │   │       └── n-step semi-gradient TD.kt
    │   │               │   ├── mc
    │   │               │   │   ├── ExploringStarts.kt
    │   │               │   │   ├── First visit Monte Carlo Prediction.kt
    │   │               │   │   ├── Off-Policy Prediction.kt
    │   │               │   │   ├── Off-policy Optimal.kt
    │   │               │   │   └── On-Policy Optimal.kt
    │   │               │   ├── ntd
    │   │               │   │   ├── N-step Off-policy Sarsa.kt
    │   │               │   │   ├── N-step Off-policy n-step Q(σ).kt
    │   │               │   │   ├── N-step Sarsa.kt
    │   │               │   │   ├── N-step TD prediction.kt
    │   │               │   │   └── N-step Treebackup.kt
    │   │               │   ├── package.kt
    │   │               │   ├── policy_gradient
    │   │               │   │   ├── Actor-Critic with Eligibility Traces (continuing).kt
    │   │               │   │   ├── Actor-Critic with Eligibility Traces (episodic).kt
    │   │               │   │   ├── One-step Actor-Critic (episodic).kt
    │   │               │   │   ├── REINFORCE with Baseline (episodic).kt
    │   │               │   │   └── REINFORCE.kt
    │   │               │   └── td
    │   │               │       ├── DoubleQLearning.kt
    │   │               │       ├── ExpectedSarsa.kt
    │   │               │       ├── QLearning.kt
    │   │               │       ├── Sarsa.kt
    │   │               │       └── Tabular TD(0).kt
    │   │               ├── model
    │   │               │   ├── ApproximateFunction.kt
    │   │               │   ├── MDP.kt
    │   │               │   └── impl
    │   │               │       ├── func
    │   │               │       │   ├── LinearFunc.kt
    │   │               │       │   ├── SimpleCoarseCoding.kt
    │   │               │       │   ├── SimpleTileCoding.kt
    │   │               │       │   ├── StateAggregation.kt
    │   │               │       │   └── SuttonTileCoding.kt
    │   │               │       └── mdp
    │   │               │           ├── DefaultAction.kt
    │   │               │           ├── DefaultMDP.kt
    │   │               │           ├── EpsilonGreedyFunctionPolicy.kt
    │   │               │           ├── IndexedAction.kt
    │   │               │           ├── IndexedMDP.kt
    │   │               │           ├── IndexedPolicy.kt
    │   │               │           ├── IndexedPossible.kt
    │   │               │           ├── IndexedState.kt
    │   │               │           ├── NSetMDP.kt
    │   │               │           ├── SoftmaxpPolicy.kt
    │   │               │           └── package.kt
    │   │               ├── problem
    │   │               │   ├── 1000-state RandomWalk.kt
    │   │               │   ├── 19-state RandomWalk.kt
    │   │               │   ├── AccessControl.kt
    │   │               │   ├── Blackjack.kt
    │   │               │   ├── CarRental.kt
    │   │               │   ├── CliffWalking.kt
    │   │               │   ├── DynaMaze.kt
    │   │               │   ├── FlyPlane.kt
    │   │               │   ├── Gambler.kt
    │   │               │   ├── GridWorld.kt
    │   │               │   ├── MaximizationBias.kt
    │   │               │   ├── MountainCar.kt
    │   │               │   ├── RandomWalk.kt
    │   │               │   ├── RodManeuvering.kt
    │   │               │   ├── SquareWave.kt
    │   │               │   ├── WindyGridworld.kt
    │   │               │   └── package.kt
    │   │               └── util
    │   │                   ├── buf
    │   │                   │   ├── Buf.kt
    │   │                   │   ├── DefaultBuf.kt
    │   │                   │   ├── DefaultIntBuf.kt
    │   │                   │   ├── Index.kt
    │   │                   │   ├── IntBuf.kt
    │   │                   │   ├── MutableBuf.kt
    │   │                   │   └── MutableIntBuf.kt
    │   │                   ├── collection
    │   │                   │   ├── CompactNSet.kt
    │   │                   │   ├── Gettable.kt
    │   │                   │   ├── HashMapRAC.kt
    │   │                   │   ├── IndexedCollection.kt
    │   │                   │   ├── NSet.kt
    │   │                   │   └── extensions.kt
    │   │                   ├── dimension
    │   │                   │   ├── Dimension.kt
    │   │                   │   └── DimensionBuilder.kt
    │   │                   ├── exception
    │   │                   │   ├── IndexOutOfDimensionException.kt
    │   │                   │   └── NoMoreElementsException.kt
    │   │                   ├── log
    │   │                   │   └── LoggerHelpers.kt
    │   │                   ├── math
    │   │                   │   ├── Binomial.kt
    │   │                   │   ├── MathHelpers.kt
    │   │                   │   ├── Poisson.kt
    │   │                   │   └── Vector.kt
    │   │                   ├── matrix
    │   │                   │   └── Matrix.kt
    │   │                   ├── resource
    │   │                   │   ├── ClasspathLocation.kt
    │   │                   │   ├── FileSystemLocation.kt
    │   │                   │   ├── ResourceLoader.kt
    │   │                   │   └── ResourceLocation.kt
    │   │                   ├── tuples
    │   │                   │   ├── tuple2.kt
    │   │                   │   ├── tuple3.kt
    │   │                   │   ├── tuple4.kt
    │   │                   │   ├── tuple5.kt
    │   │                   │   └── tuple6.kt
    │   │                   └── ui
    │   │                       ├── ChartApp.kt
    │   │                       ├── D2DGameUI.kt
    │   │                       ├── D3DChartUI.kt
    │   │                       ├── GridWorldUI.kt
    │   │                       ├── MountainCarUI.kt
    │   │                       ├── RawD3DChartUI.kt
    │   │                       └── RodManeuveringUI.kt
    │   └── resources
    │       └── StockLineChart.css
    └── test
        ├── kotlin
        │   └── lab
        │       └── mars
        │           └── rl
        │               ├── algo
        │               │   ├── dp
        │               │   │   ├── Test Policy Iteration.kt
        │               │   │   └── Test Value Iteration.kt
        │               │   ├── dyna
        │               │   │   ├── Test Optimal Dyna-Q on-policy.kt
        │               │   │   ├── Test Optimal Dyna-Q+.kt
        │               │   │   ├── Test Optimal Dyna-Q.kt
        │               │   │   ├── Test Optimal Prioritized Sweeping Stochastic.kt
        │               │   │   ├── Test Optimal Prioritized Sweeping.kt
        │               │   │   └── Test Optimal RandomSampleOneStepTabularQLearning.kt
        │               │   ├── eligibility_trace
        │               │   │   ├── control
        │               │   │   │   └── Test Optimal Sarsa(λ).kt
        │               │   │   └── prediction
        │               │   │       ├── Test Prediction Off-line λ-return.kt
        │               │   │       ├── Test Prediction Semi-gradient TD(λ).kt
        │               │   │       └── Test Prediction True Online TD(λ).kt
        │               │   ├── func_approx
        │               │   │   ├── on_policy
        │               │   │   │   ├── Test Optimal Differential semi-gradient Sarsa.kt
        │               │   │   │   ├── Test Optimal Episodic Semi-gradient QLearning control.kt
        │               │   │   │   ├── Test Optimal Episodic Semi-gradient Sarsa control.kt
        │               │   │   │   └── Test Optimal n-step semi-gradient Sarsa.kt
        │               │   │   └── prediction
        │               │   │       ├── Test Function Approximator Coarse Coding.kt
        │               │   │       ├── Test Function Approximator Fourier vs Poly.kt
        │               │   │       ├── Test Function Approximator Tile coding.kt
        │               │   │       ├── Test Prediction Gradient MC.kt
        │               │   │       ├── Test Prediction LSTD.kt
        │               │   │       ├── Test Prediction Semi-gradient TD(0).kt
        │               │   │       └── Test Prediction n-step Semi-gradient TD.kt
        │               │   ├── mc
        │               │   │   ├── Test Optimal MC Exploring Starts.kt
        │               │   │   ├── Test Optimal MC Off-policy.kt
        │               │   │   ├── Test Optimal MC On-policy first-visit.kt
        │               │   │   ├── Test Prediction MC Off-policy.kt
        │               │   │   └── Test Prediction Monte Carlo Prediction.kt
        │               │   ├── ntd
        │               │   │   ├── Test Optimal n-TD Off-policy Sarsa.kt
        │               │   │   ├── Test Optimal n-TD Q(σ).kt
        │               │   │   ├── Test Optimal n-TD Sarsa.kt
        │               │   │   ├── Test Optimal n-TD Treebackup.kt
        │               │   │   └── Test Prediction n-TD.kt
        │               │   └── td
        │               │       ├── Test Optimal TD Doubel Q-Learning.kt
        │               │       ├── Test Optimal TD Expected sarsa.kt
        │               │       ├── Test Optimal TD Q-Learning.kt
        │               │       ├── Test Optimal TD Sarsa.kt
        │               │       └── Test Prediction Tabular TD(0).kt
        │               ├── problem
        │               │   ├── Test Mountain Car with Actor-Critic.kt
        │               │   ├── `Test FlyPlane Problem with Actor-Critic`.kt
        │               │   ├── `Test FlyPlane Problem with REINFORCE`.kt
        │               │   └── `Test FlyPlane Problem with TD(λ)`.kt
        │               └── util
        │                   ├── TestBase.kt
        │                   ├── TestCNSet.kt
        │                   ├── TestIndex.kt
        │                   ├── TestNSet.kt
        │                   ├── extensions.kt
        │                   └── range
        │                       └── DoubleProgression.kt
        └── resources
            ├── Figure 10.1.PNG
            ├── Figure 10.4.PNG
            ├── Figure 12.10.PNG
            ├── Figure 12.11.PNG
            ├── Figure 12.3.PNG
            ├── Figure 12.6.PNG
            ├── Figure 12.8.PNG
            ├── Figure 7.2.PNG
            └── logback-test.xml
/.gitignore:
--------------------------------------------------------------------------------
.idea/**
.gradle/**
logs/**
**/build
**/out
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 wumo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
plugins {
  id 'org.jetbrains.kotlin.jvm' version '1.3.61'
}

group 'wumo'
version '2.0'

repositories {
  mavenCentral()
}

dependencies {
  implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk8"
  compile "org.jetbrains.kotlinx:kotlinx-coroutines-core:1.3.3"
  compile "com.object-refinery:orson-charts-fx:1.0"
  compile "no.tornado:tornadofx:1.7.20"
  compile "org.apache.commons:commons-math3:3.6.1"
  compile "org.slf4j:slf4j-api:1.7.30"
  compile "ch.qos.logback:logback-classic:1.2.3"
  testCompile "junit:junit:4.13.1"
}

compileKotlin {
  kotlinOptions.jvmTarget = "1.8"
}
compileTestKotlin {
  kotlinOptions.jvmTarget = "1.8"
}
--------------------------------------------------------------------------------
/gradle.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/gradle.properties
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m"

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto init

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:init
@rem Get command-line arguments, handling Windows variants

if not "%OS%" == "Windows_NT" goto win9xME_args

:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2

:win9xME_args_slurp
if "x%~1" == "x" goto execute

set CMD_LINE_ARGS=%*

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
rootProject.name = 'reinforcement-learning-model'
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dp/PolicyIteration.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.dp

import lab.mars.rl.algo.Q_from_V
import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.model.impl.mdp.IndexedMDP
import lab.mars.rl.model.impl.mdp.IndexedPolicy
import lab.mars.rl.model.impl.mdp.OptimalSolution
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.collection.filter
import lab.mars.rl.util.collection.fork
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.argmax
import lab.mars.rl.util.math.Σ
import lab.mars.rl.util.tuples.tuple3
import org.apache.commons.math3.util.FastMath.abs
import org.apache.commons.math3.util.FastMath.max

/**
 * Created on 2017-09-05.
 *
 * @author wumo
 */

// threshold for convergence of iterative policy evaluation
val θ = 1e-6

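/**
 * Policy Iteration using state values V (Sutton & Barto, 2nd ed., Section 4.3).
 * Alternates iterative policy evaluation — sweeping
 * V(s) ← Σ_{s',r} p(s',r|s,π(s)) [r + γ V(s')] until the largest change Δ < θ —
 * with greedy policy improvement, and stops once the policy no longer changes.
 * Q is derived from the final V before returning.
 */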
fun IndexedMDP.`Policy Iteration V`(): OptimalSolution {
  val V = VFunc { 0.0 }
  val π = IndexedPolicy(QFunc { 1.0 })
  val Q = QFunc { 0.0 }

  do {
    // Policy Evaluation
    do {
      var Δ = 0.0
      for (s in states.filter { it.isNotTerminal }) {
        val v = V[s]
        V[s] = Σ(π(s).possibles) { probability * (reward + γ * V[next]) }
        Δ = max(Δ, abs(v - V[s]))
      }
      log.debug { "Δ=$Δ" }
    } while (Δ >= θ)

    // Policy Improvement
    var `policy-stable` = true
    for (s in states.filter { it.isNotTerminal }) {
      val `old-action` = π(s)
      val `new-action` = argmax(s.actions) { Σ(possibles) { probability * (reward + γ * V[next]) } }
      π[s] = `new-action`
      if (`old-action` !== `new-action`) `policy-stable` = false
    }
  } while (!`policy-stable`)
  val result = tuple3(π, V, Q)
  Q_from_V(γ, states, result)
  return result
}

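/**
 * Policy Iteration using action values Q: evaluation sweeps back up
 * Q(s,a) ← Σ_{s',r} p(s',r|s,a) [r + γ Q(s',π(s'))], and improvement makes π
 * greedy with respect to Q. V is derived from the final Q before returning.
 */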
fun IndexedMDP.`Policy Iteration Q`(): OptimalSolution {
  val V = VFunc { 0.0 }
  val π = IndexedPolicy(QFunc { 1.0 })
  val Q = QFunc { 0.0 }
  do {
    // Policy Evaluation
    do {
      var Δ = 0.0
      for ((s, a) in states.fork { it.actions }) {
        val q = Q[s, a]
        Q[s, a] = Σ(a.possibles) { probability * (reward + γ * if (next.actions.any()) Q[next, π(next)] else 0.0) }
        Δ = max(Δ, abs(q - Q[s, a]))
      }
      log.debug { "Δ=$Δ" }
    } while (Δ >= θ)

    // Policy Improvement
    var `policy-stable` = true
    for (s in states.filter { it.isNotTerminal }) {
      val `old-action` = π(s)
      val `new-action` = argmax(s.actions) { Q[s, it] }
      π[s] = `new-action`
      if (`old-action` !== `new-action`) `policy-stable` = false
    }
  } while (!`policy-stable`)
  val result = tuple3(π, V, Q)
  V_from_Q(states, result)
  return result
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dp/ValueIteration.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.dp

import lab.mars.rl.algo.Q_from_V
import lab.mars.rl.model.impl.mdp.IndexedMDP
import lab.mars.rl.model.impl.mdp.IndexedPolicy
import lab.mars.rl.model.impl.mdp.OptimalSolution
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.collection.filter
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.argmax
import lab.mars.rl.util.math.max
import lab.mars.rl.util.math.Σ
import lab.mars.rl.util.tuples.tuple3
import org.apache.commons.math3.util.FastMath.abs
import org.apache.commons.math3.util.FastMath.max

/**
 * Created on 2017-09-06.
 *
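 * Value Iteration (Sutton & Barto, 2nd ed., Section 4.4): sweep
 * V(s) ← max_a Σ_{s',r} p(s',r|s,a) [r + γ V(s')] until the largest change Δ < θ,
 * then extract the greedy policy and derive Q from the converged V.
 *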
 * @author wumo
 */
fun IndexedMDP.ValueIteration(): OptimalSolution {
  val V = VFunc { 0.0 }
  val π = IndexedPolicy(QFunc { 1.0 })
  val Q = QFunc { 0.0 }
  // value iteration
  do {
    var Δ = 0.0
    for (s in states.filter { it.isNotTerminal }) {
      val v = V[s]
      V[s] = max(s.actions) { Σ(possibles) { probability * (reward + γ * V[next]) } }
      Δ = max(Δ, abs(v - V[s]))
    }
    log.debug { "Δ=$Δ" }
  } while (Δ >= θ)
  // policy generation
  for (s in states.filter { it.isNotTerminal })
    π[s] = argmax(s.actions) { Σ(possibles) { probability * (reward + γ * V[next]) } }
  val result = tuple3(π, V, Q)
  Q_from_V(γ, states, result)
  return result
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/Dyna-Q+.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.dyna

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.algo.`ε-greedy`
import lab.mars.rl.model.impl.mdp.*
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.model.null_state
import lab.mars.rl.util.buf.DefaultBuf
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.max
import lab.mars.rl.util.tuples.tuple2
import lab.mars.rl.util.tuples.tuple3
import org.apache.commons.math3.util.FastMath.sqrt

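/**
 * Dyna-Q+ (Sutton & Barto, 2nd ed., Section 8.3): Dyna-Q with an exploration bonus.
 * The model records the last time step each (s,a) was tried, and planning backups use
 * r + κ√τ, where τ is the elapsed time since that pair was last tried, encouraging
 * re-exploration of a possibly changed environment. Actions never tried from a visited
 * state are seeded into the model as leading back to that state with reward 0.
 */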
@Suppress("NAME_SHADOWING")
fun IndexedMDP.`Dyna-Q+`(
  α: (IndexedState, IndexedAction) -> Double,
  ε: Double,
  κ: Double,
  n: Int,
  episodes: Int,
  stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
  episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
  val null_tuple3 = tuple3(null_state, Double.NaN, 0)
  val π = IndexedPolicy(QFunc { 0.0 })
  val Q = QFunc { 0.0 }
  val cachedSA = DefaultBuf.new<tuple2<IndexedState, IndexedAction>>(Q.size)
  val Model = QFunc { null_tuple3 }
  val V = VFunc { 0.0 }
  val result = tuple3(π, V, Q)
  var time = 0
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var s = started()
    while (s.isNotTerminal) {
      V_from_Q(states, result)
      stepListener(V, s)
      time++
      `ε-greedy`(s, Q, π, ε)
      val a = π(s)
      val (s_next, reward) = a.sample()
      Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
      for (_a in s.actions) {
        if (_a !== a && Model[s, _a] === null_tuple3) {
          cachedSA.append(tuple2(s, _a))
          Model[s, _a] = tuple3(s, 0.0, 1)
        }
      }
      if (Model[s, a] === null_tuple3)
        cachedSA.append(tuple2(s, a))
      Model[s, a] = tuple3(s_next, reward, time)
      repeat(n) {
        val (s, a) = cachedSA.rand()
        var (s_next, reward, t) = Model[s, a]
        reward += κ * sqrt((time - t).toDouble())
        Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
      }
      s = s_next
    }
    episodeListener(V)
    log.debug { "steps=$time" }
  }
  return result
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/Dyna-Q-OnPolicy.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.dyna

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.algo.`ε-greedy (tie broken randomly)`
import lab.mars.rl.model.impl.mdp.*
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.Rand
import lab.mars.rl.util.math.max
import lab.mars.rl.util.math.repeat
import lab.mars.rl.util.tuples.tuple2
import lab.mars.rl.util.tuples.tuple3
import java.util.*

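/**
 * An on-policy variant of Dyna-Q: the model stores empirical transition counts per (s,a),
 * and each planning phase starts from a randomly drawn start state and simulates forward
 * with the current ε-greedy policy, so planning effort roughly follows the on-policy
 * distribution (compare the discussion of trajectory sampling in Sutton & Barto, Chapter 8).
 */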
@Suppress("NAME_SHADOWING")
fun IndexedMDP.`Dyna-Q-OnPolicy`(
  n: Int,
  ε: Double,
  α: (IndexedState, IndexedAction) -> Double,
  episodes: Int,
  stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
  episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
  val π = IndexedPolicy(QFunc { 0.0 })
  val Q = QFunc { 0.0 }
  val V = VFunc { 0.0 }
  val result = tuple3(π, V, Q)

  val startedStates = hashMapOf<IndexedState, Int>()
  val Model = QFunc { hashMapOf<tuple2<IndexedState, Double>, Int>() }
  val N = QFunc { 0 }

  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var stat = 0
    var s = started()
    startedStates.compute(s) { _, v -> (v ?: 0) + 1 } // record how many times each start state has been visited
    while (s.isNotTerminal) {
      V_from_Q(states, result)
      stepListener(V, s)
      step++
      `ε-greedy (tie broken randomly)`(s, Q, π, ε)
      val a = π(s)
      val (s_next, reward) = a.sample()
      Model[s, a].compute(tuple2(s_next, reward)) { _, v -> (v ?: 0) + 1 }
      N[s, a]++
      Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])

      var _s = startedStates.rand(episode)
      repeat(n, { _s.isNotTerminal }) {
        `ε-greedy (tie broken randomly)`(_s, Q, π, ε) // use the on-policy distribution to distribute planning computation
        val a = π(_s)
        if (Model[_s, a].isEmpty()) return@repeat
        stat++
        val (s_next, reward) = Model[_s, a].rand(N[_s, a])
        Q[_s, a] += α(_s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[_s, a])
        _s = s_next
      }
      s = s_next
    }
    episodeListener(V)
    log.debug { "steps=$step, stat=$stat" }
  }
  return result
}

// Draws a key from a count map with probability proportional to its count; N is the total count.
private fun <K> HashMap<K, Int>.rand(N: Int): K {
  val p = Rand().nextDouble()
  var acc = 0.0
  for ((k, v) in this) {
    acc += v.toDouble() / N
    if (p <= acc)
      return k
  }
  throw IllegalArgumentException("random=$p, but accumulation=$acc")
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/Dyna-Q.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.dyna

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.algo.`ε-greedy (tie broken randomly)`
import lab.mars.rl.model.emptyPossibleSet
import lab.mars.rl.model.impl.mdp.*
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.buf.DefaultBuf
import lab.mars.rl.util.collection.cnsetOf
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.max
import lab.mars.rl.util.tuples.tuple2
import lab.mars.rl.util.tuples.tuple3

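/**
 * Tabular Dyna-Q (Sutton & Barto, 2nd ed., Section 8.2): each real step does (a) direct
 * ε-greedy Q-learning, (b) model learning — assuming a deterministic environment, the model
 * stores the last observed next state and reward — and (c) n planning updates replayed from
 * randomly chosen previously visited (s,a) pairs.
 */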
@Suppress("NAME_SHADOWING")
fun IndexedMDP.DynaQ(
  α: (IndexedState, IndexedAction) -> Double,
  ε: Double,
  n: Int,
  episodes: Int,
  stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
  episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
  val π = IndexedPolicy(QFunc { 0.0 })
  val Q = QFunc { 0.0 }
  val cachedSA = DefaultBuf.new<tuple2<IndexedState, IndexedAction>>(Q.size)
  val Model = QFunc { emptyPossibleSet }
  val V = VFunc { 0.0 }
  val result = tuple3(π, V, Q)
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    while (s.isNotTerminal) {
      V_from_Q(states, result)
      stepListener(V, s)
      step++
      `ε-greedy (tie broken randomly)`(s, Q, π, ε)
      val a = π(s)
      val (s_next, reward) = a.sample()
      Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
      if (Model[s, a].isEmpty())
        cachedSA.append(tuple2(s, a))
      Model[s, a] = cnsetOf(IndexedPossible(s_next, reward, 1.0))
      repeat(n) {
        val (s, a) = cachedSA.rand()
        val (s_next, reward) = Model[s, a].rand()
        Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
      }
      s = s_next
    }
    episodeListener(V)
    log.debug { "steps=$step" }
  }
  return result
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/PrioritizedSweeping.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.dyna

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.algo.`ε-greedy (tie broken randomly)`
import lab.mars.rl.model.emptyPossibleSet
import lab.mars.rl.model.impl.mdp.*
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.collection.cnsetOf
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.max
import lab.mars.rl.util.math.repeat
import lab.mars.rl.util.tuples.tuple2
import lab.mars.rl.util.tuples.tuple3
import org.apache.commons.math3.util.FastMath.abs
import java.util.*

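/**
 * Prioritized Sweeping for a deterministic environment (Sutton & Barto, 2nd ed., Section 8.4):
 * state-action pairs enter a priority queue keyed by their absolute TD error P; planning pops
 * up to n of the most urgent pairs per real step, updates them, and pushes each predecessor
 * whose own priority exceeds the threshold θ.
 */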
@Suppress("NAME_SHADOWING")
fun IndexedMDP.PrioritizedSweeping(
  n: Int,
  θ: Double,
  ε: Double,
  α: (IndexedState, IndexedAction) -> Double,
  episodes: Int,
  stepListener: (StateValueFunction, IndexedState) -> Unit = { _, _ -> },
  episodeListener: (StateValueFunction) -> Unit = {}): OptimalSolution {
  val π = IndexedPolicy(QFunc { 0.0 })
  val Q = QFunc { 0.0 }
  val PQueue = PriorityQueue(Q.size, Comparator<tuple3<Double, IndexedState, IndexedAction>> { o1, o2 ->
    o2._1.compareTo(o1._1)
  })
  val Model = QFunc { emptyPossibleSet }
  val predecessor = VFunc { hashSetOf<tuple2<IndexedState, IndexedAction>>() }
  val V = VFunc { 0.0 }
  val result = tuple3(π, V, Q)
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    while (s.isNotTerminal) {
      V_from_Q(states, result)
      stepListener(V, s)
      step++
      `ε-greedy (tie broken randomly)`(s, Q, π, ε)
      val a = π(s)
      val (s_next, reward) = a.sample()
      Model[s, a] = cnsetOf(IndexedPossible(s_next, reward, 1.0))
      predecessor[s_next] += tuple2(s, a)
      val P = abs(reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
      if (P > θ) PQueue.add(tuple3(P, s, a))
      repeat(n, { PQueue.isNotEmpty() }) {
        val (_, s, a) = PQueue.poll()
        val (s_next, reward) = Model[s, a].rand()
        Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
        for ((s_pre, a_pre) in predecessor[s]) {
          val (s_next, reward) = Model[s_pre, a_pre].rand()
          assert(s_next === s)
          val P = abs(reward + γ * max(s.actions, 0.0) { Q[s, it] } - Q[s_pre, a_pre])
          if (P > θ) PQueue.add(tuple3(P, s_pre, a_pre))
        }
      }
      s = s_next
    }
    episodeListener(V)
    log.debug { "steps=$step" }
  }
  return result
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/dyna/RandomSampleOneStepTabularQLearning.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.dyna

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.algo.`ε-greedy`
import lab.mars.rl.model.impl.mdp.*
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.collection.filter
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.max
import lab.mars.rl.util.tuples.tuple3

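/**
 * Random-sample one-step tabular Q-planning (Sutton & Barto, 2nd ed., Section 8.1): each
 * iteration samples a start state and a random action, performs one sample backup
 * Q(s,a) ← Q(s,a) + α [r + γ max_a' Q(s',a') − Q(s,a)], and an ε-greedy policy is
 * extracted once at the end.
 */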
fun IndexedMDP.RandomSampleOneStepTabularQLearning(
  ε: Double,
  α: (IndexedState, IndexedAction) -> Double,
  episodes: Int): OptimalSolution {

  val Q = QFunc { 0.0 }
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    val s = started()
    val a = s.actions.rand() // exploring starts
    val (s_next, reward) = a.sample()
    Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
  }
  val π = IndexedPolicy(QFunc { 0.0 })
  for (s in states.filter { it.isNotTerminal })
    `ε-greedy`(s, Q, π, ε)
  val V = VFunc { 0.0 }
  val result = tuple3(π, V, Q)
  V_from_Q(states, result)
  return result
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/control/True Online Sarsa(λ).kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.eligibility_trace.control

import lab.mars.rl.algo.EpisodeListener
import lab.mars.rl.algo.StepListener
import lab.mars.rl.model.MDP
import lab.mars.rl.model.Policy
import lab.mars.rl.model.impl.func.LinearFunc
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.Matrix
import lab.mars.rl.util.matrix.MatrixSpec
import lab.mars.rl.util.matrix.minus
import lab.mars.rl.util.matrix.times

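/**
 * True Online Sarsa(λ) with dutch traces for linear q̂(s,a,w) = wᵀx(s,a)
 * (Sutton & Barto, 2nd ed., Chapter 12). Per step:
 *   z ← γλz + (1 − αγλ zᵀx) x
 *   w ← w + α (δ + Q − Q_old) z − α (Q − Q_old) x
 * which reproduces the online λ-return algorithm exactly.
 */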
fun <E> MDP.`True Online Sarsa(λ)`(
  Qfunc: LinearFunc<E>,
  π: Policy,
  λ: Double,
  α: Double,
  episodes: Int,
  z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
  maxStep: Int = Int.MAX_VALUE,
  episodeListener: EpisodeListener = { _, _, _, _ -> },
  stepListener: StepListener = { _, _, _, _ -> }) {
  val X = Qfunc.x
  val w = Qfunc.w
  val d = w.size
  val z = z_maker(d, 1)
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    var a = π(s)
    var x = X(s, a)
    z.zero()
    var Q_old = 0.0
    var G = 0.0
    var γn = 1.0
    while (true) {
      z `=` (γ * λ * z + (1.0 - α * γ * λ * (z `T*` x)) * x)
      val (s_next, reward) = a.sample()
      γn *= γ
      G += γn * reward
      s = s_next
      val Q = (w `T*` x).toScalar
      var δ = reward - Q
      if (s_next.isNotTerminal) {
        val a_next = π(s_next)
        val `x'` = X(s_next, a_next)
        val `Q'` = (w `T*` `x'`).toScalar
        δ += γ * `Q'`
        w += α * (δ + Q - Q_old) * z - α * (Q - Q_old) * x
        Q_old = `Q'`
        x = `x'`
        a = a_next
      } else {
        w += α * (δ + Q - Q_old) * z - α * (Q - Q_old) * x
        break
      }
      step++
      stepListener(episode, step, s_next, a)
      if (step >= maxStep) break
    }
    episodeListener(episode, step, s, G)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Off-line λ-return.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.eligibility_trace.prediction

import lab.mars.rl.model.*
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.Σ
import lab.mars.rl.util.matrix.times
import org.apache.commons.math3.util.FastMath.pow

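/**
 * Off-line λ-return algorithm (forward view, Sutton & Barto, 2nd ed., Section 12.1):
 * after each episode finishes, every visited state is updated toward its λ-return
 *   G_t^λ = (1−λ) Σ_{n=1}^{T−t−1} λ^{n−1} G_{t:t+n} + λ^{T−t−1} G_t.
 */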
fun <E> MDP.`Off-line λ-return`(
  V: ApproximateFunction<E>,
  π: Policy,
  λ: Double,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  val R = newBuf<Double>()
  val S = newBuf<State>()
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var s = started()
    S.clear(); S.append(s)
    R.clear(); R.append(0.0)
    var T = 0
    while (s.isNotTerminal) {
      val a = π(s)
      val (s_next, reward) = a.sample()
      S.append(s_next)
      R.append(reward)
      s = s_next
      T++
    }

    // n-step return G_{t:t+n}, bootstrapping with V only before the episode end
    fun Gt(t: Int, n: Int) =
        Σ(1..n) { pow(γ, it - 1) * R[t + it] } +
        (if (t + n < T) pow(γ, n) * V(S[t + n]) else 0.0)

    for (t in 0 until T) {
      val Gtλ = (1 - λ) * Σ(1..T - t - 1) { pow(λ, it - 1) * Gt(t, it) } +
                pow(λ, T - t - 1) * Gt(t, T - t)
      V.w += α * (Gtλ - V(S[t])) * V.`∇`(S[t])
    }
    episodeListener(episode, T)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Semi-gradient TD(λ) prediction.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.eligibility_trace.prediction

import lab.mars.rl.model.*
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.Matrix
import lab.mars.rl.util.matrix.MatrixSpec
import lab.mars.rl.util.matrix.times

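/**
 * Semi-gradient TD(λ) with accumulating traces (Sutton & Barto, 2nd ed., Section 12.2):
 *   z ← γλz + ∇v̂(S,w),  δ = R + γ v̂(S',w) − v̂(S,w),  w ← w + αδz.
 */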
fun <E> MDP.`Semi-gradient TD(λ) prediction`(
  V: ApproximateFunction<E>,
  π: Policy,
  λ: Double,
  α: Double,
  episodes: Int,
  z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  val w = V.w
  val d = w.size
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    val z = z_maker(d, 1)
    while (s.isNotTerminal) {
      val a = π(s)
      val (s_next, reward) = a.sample()
      z `=` γ * λ * z + V.`∇`(s)
      val δ = reward + γ * (if (s_next.isTerminal) 0.0 else V(s_next)) - V(s)
      V.w += α * δ * z
      s = s_next
      step++
    }
    episodeListener(episode, step)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/True Online TD(λ) prediction.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.eligibility_trace.prediction

import lab.mars.rl.model.*
import lab.mars.rl.model.impl.func.LinearFunc
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.Matrix
import lab.mars.rl.util.matrix.MatrixSpec
import lab.mars.rl.util.matrix.minus
import lab.mars.rl.util.matrix.times

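/**
 * True Online TD(λ) (Sutton & Barto, 2nd ed., Section 12.5): the dutch-trace backward view
 * that exactly matches the online λ-return algorithm for linear v̂(s,w) = wᵀx(s).
 */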
fun <E> MDP.`True Online TD(λ) prediction`(
  Vfunc: LinearFunc<E>,
  π: Policy,
  λ: Double,
  α: Double,
  episodes: Int,
  z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  val X = Vfunc.x
  val w = Vfunc.w
  val d = w.size
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    var x = X(s)
    val z = z_maker(d, 1)
    var V_old = 0.0
    while (s.isNotTerminal) {
      val a = π(s)
      val (s_next, reward) = a.sample()
      val `x'` = X(s_next)
      val V = (w `T*` x).toScalar
      val `V'` = if (s_next.isTerminal) 0.0 else (w `T*` `x'`).toScalar
      val δ = reward + γ * `V'` - V
      z `=` γ * λ * z + (1.0 - α * γ * λ * (z `T*` x)) * x
      w += α * (δ + V - V_old) * z - α * (V - V_old) * x
      V_old = `V'`
      x = `x'`
      s = s_next
      step++
    }
    episodeListener(episode, step)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/Play.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx

import lab.mars.rl.model.*

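/** Rolls out the policy π for the given number of episodes without learning, reporting each step and the discounted return G per episode. */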
fun MDP.play(
  π: Policy,
  episodes: Int,
  maxStep: Int = Int.MAX_VALUE,
  episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
  stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
  for (episode in 1..episodes) {
    var s = started()
    var step = 0
    var G = 0.0
    var γn = 1.0
    while (s.isNotTerminal) {
      val a = π(s)
      stepListener(episode, step, s, a)
      val (s_next, reward) = a.sample()
      γn *= γ
      G += γn * reward
      s = s_next
      step++
      if (step >= maxStep)
        break
    }
    episodeListener(episode, step, s, G)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/off_policy/Semi-gradient Expected Sarsa.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.off_policy

import lab.mars.rl.model.*
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.Σ
import lab.mars.rl.util.matrix.times

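/**
 * Episodic semi-gradient Expected Sarsa: the bootstrap target averages over the policy,
 *   δ = R + γ Σ_a π(a|S') q̂(S',a) − q̂(S,A),  w ← w + αδ ∇q̂(S,A).
 */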
fun <E> MDP.`Semi-gradient Expected Sarsa`(
  q: ApproximateFunction<E>, π: Policy,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    while (s.isNotTerminal) {
      step++
      val a = π(s)
      val (s_next, reward) = a.sample()
      val δ = reward + γ * Σ(s_next.actions) { π[s_next, it] * q(s_next, it) } - q(s, a)
      q.w += α * δ * q.`∇`(s, a)
      s = s_next
    }
    episodeListener(episode, step)
  }
}

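/**
 * Continuing (average-reward) variant: the differential form replaces γ-discounting with the
 * estimated average reward R̄, which is itself updated by R̄ ← R̄ + βδ.
 */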
fun <E> MDP.`Semi-gradient Expected Sarsa`(q: ApproximateFunction<E>, π: Policy,
                                           α: Double, β: Double) {
  var average_reward = 0.0
  var s = started()
  while (true) {
    val a = π(s)
    val (s_next, reward) = a.sample()
    val δ = reward - average_reward + Σ(s_next.actions) { π[s_next, it] * q(s_next, it) } - q(s, a)
    q.w += α * δ * q.`∇`(s, a)
    average_reward += β * δ
    s = s_next
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/off_policy/Semi-gradient off-policy TD(0).kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.off_policy

import lab.mars.rl.model.*
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.times

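/**
 * Episodic semi-gradient off-policy TD(0) (Sutton & Barto, 2nd ed., Chapter 11): the behavior
 * policy b generates the data, and each update is weighted by the per-step importance
 * sampling ratio ρ = π(A|S)/b(A|S):  w ← w + αρδ ∇v̂(S,w).
 */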
fun <E> MDP.`Semi-gradient off-policy TD(0) episodic`(
  v: ApproximateFunction<E>, π: Policy, b: Policy,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    while (s.isNotTerminal) {
      step++
      val a = b(s)
      val (s_next, reward) = a.sample()
      val ρ = π[s, a] / b[s, a]
      val δ = reward + γ * v(s_next) - v(s)
      v.w += α * ρ * δ * v.`∇`(s)
      s = s_next
    }
    episodeListener(episode, step)
  }
}

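/** Continuing (average-reward) variant of the same update, with differential TD error and R̄ ← R̄ + βδ. */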
fun <E> MDP.`Semi-gradient off-policy TD(0) continuing`(v: ApproximateFunction<E>, π: Policy, b: Policy,
                                                        α: Double, β: Double) {
  var average_reward = 0.0
  var s = started()
  while (true) {
    val a = b(s)
    val (s_next, reward) = a.sample()
    val ρ = π[s, a] / b[s, a]
    val δ = reward - average_reward + v(s_next) - v(s)
    v.w += α * ρ * δ * v.`∇`(s)
    average_reward += β * δ
    s = s_next
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Differential semi-gradient Sarsa.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.on_policy

import lab.mars.rl.model.ApproximateFunction
import lab.mars.rl.model.MDP
import lab.mars.rl.model.Policy
import lab.mars.rl.util.matrix.times

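/**
 * Differential semi-gradient Sarsa for continuing tasks (Sutton & Barto, 2nd ed., Section 10.3):
 *   δ = R − R̄ + q̂(S',A') − q̂(S,A),  R̄ ← R̄ + βδ,  w ← w + αδ ∇q̂(S,A).
 */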
fun <E> MDP.`Differential semi-gradient Sarsa`(
  q: ApproximateFunction<E>, π: Policy,
  α: Double, β: Double, maxStep: Int) {
  var average_reward = 0.0
  var s = started()
  var a = π(s)
  var step = 0
  while (true) {
    val (s_next, reward) = a.sample()
    val a_next = π(s_next)
    val δ = reward - average_reward + q(s_next, a_next) - q(s, a)
    average_reward += β * δ
    q.w += α * δ * q.`∇`(s, a)
    s = s_next
    a = a_next
    step++
    if (step >= maxStep) break
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Differential semi-gradient n-step Sarsa.kt:
--------------------------------------------------------------------------------
@file:Suppress("NAME_SHADOWING")

package lab.mars.rl.algo.func_approx.on_policy

import lab.mars.rl.algo.ntd.MAX_N
import lab.mars.rl.model.*
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.math.Σ
import lab.mars.rl.util.matrix.times
import org.apache.commons.math3.util.FastMath.min

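/**
 * Differential semi-gradient n-step Sarsa (Sutton & Barto, 2nd ed., Section 10.3): a sliding
 * window of the last n rewards, states and actions forms the n-step differential return
 *   δ = Σ_{i=1..n} (R_i − R̄) + q̂(S_n,A_n) − q̂(S_0,A_0).
 */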
fun <E> MDP.`Differential semi-gradient n-step Sarsa`(
  q: ApproximateFunction<E>, π: Policy,
  n: Int,
  α: Double, β: Double) {
  var average_reward = 0.0
  val _R = newBuf<Double>(min(n, MAX_N))
  val _S = newBuf<State>(min(n, MAX_N))
  val _A = newBuf<Action<State>>(min(n, MAX_N))

  var t = 0
  val s = started()
  var a = π(s)
  _R.clear(); _R.append(0.0)
  _S.clear(); _S.append(s)
  _A.clear(); _A.append(a)
  while (true) {
    if (t >= n) {
      _R.removeFirst()
      _S.removeFirst()
      _A.removeFirst()
    }
    val (s_next, reward) = a.sample()
    _R.append(reward)
    _S.append(s_next)
    a = π(s_next) // follow the current policy from the newly reached state
    _A.append(a)
    val τ = t - n + 1
    if (τ >= 0) {
      val δ = Σ(1..n) { _R[it] - average_reward } + q(_S[n], _A[n]) - q(_S[0], _A[0])
      average_reward += β * δ
      q.w += α * δ * q.`∇`(_S[0], _A[0])
    }
    t++
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Episodic Semi-gradient QLearning control.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.on_policy

import lab.mars.rl.model.*
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.max
import lab.mars.rl.util.matrix.times

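/**
 * Episodic semi-gradient Q-learning: like semi-gradient Sarsa but with the off-policy target
 * R + γ max_a q̂(S',a), falling back to R alone at terminal transitions.
 */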
fun <E> MDP.`Episodic semi-gradient QLearning control`(
  Q: ApproximateFunction<E>,
  π: Policy,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int) -> Unit = { _, _ -> },
  stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    var a = π(s)
    while (true) {
      step++
      stepListener(episode, step, s, a)
      val (s_next, reward) = a.sample()
      if (s_next.isNotTerminal) {
        val a_next = π(s_next)
        Q.w += α * (reward + γ * max(s_next.actions) { Q(s_next, it) } - Q(s, a)) * Q.`∇`(s, a)
        s = s_next
        a = a_next
      } else {
        Q.w += α * (reward - Q(s, a)) * Q.`∇`(s, a)
        break
      }
    }
    episodeListener(episode, step)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Episodic Semi-gradient Sarsa control.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.on_policy

import lab.mars.rl.model.*
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.times

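/**
 * Episodic semi-gradient Sarsa control (Sutton & Barto, 2nd ed., Section 10.1):
 *   w ← w + α [R + γ q̂(S',A') − q̂(S,A)] ∇q̂(S,A), with the target truncated to R at episode end.
 */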
fun <E> MDP.`Episodic semi-gradient Sarsa control`(
  Qfunc: ApproximateFunction<E>,
  π: Policy,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
  stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    var a = π(s)
    var G = 0.0
    var γn = 1.0
    while (true) {
      step++
      stepListener(episode, step, s, a)
      val (s_next, reward) = a.sample()
      γn *= γ
      G += γn * reward
      if (s_next.isNotTerminal) {
        val a_next = π(s_next)
        Qfunc.w += α * (reward + γ * Qfunc(s_next, a_next) - Qfunc(s, a)) * Qfunc.`∇`(s, a)
        s = s_next
        a = a_next
      } else {
        Qfunc.w += α * (reward - Qfunc(s, a)) * Qfunc.`∇`(s, a)
        s = s_next
        break
      }
    }
    episodeListener(episode, step, s, G)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/on_policy/Episodic semi-gradient n-step Sarsa.kt:
--------------------------------------------------------------------------------
@file:Suppress("NAME_SHADOWING")

package lab.mars.rl.algo.func_approx.on_policy

import lab.mars.rl.algo.ntd.MAX_N
import lab.mars.rl.model.*
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.Σ
import lab.mars.rl.util.matrix.times
import org.apache.commons.math3.util.FastMath.min
import org.apache.commons.math3.util.FastMath.pow

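/**
 * Episodic semi-gradient n-step Sarsa (Sutton & Barto, 2nd ed., Section 10.2), using bounded
 * buffers of the last n+1 rewards, states and actions; n is shrunk to the episode length T
 * when an episode terminates in fewer than n steps.
 */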
fun <E> MDP.`Episodic semi-gradient n-step Sarsa control`(
  q: ApproximateFunction<E>, π: Policy,
  n: Int,
  α: Double,
  episodes: Int,
  maxStep: Int = Int.MAX_VALUE,
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  val _R = newBuf<Double>(min(n + 1, MAX_N))
  val _S = newBuf<State>(min(n + 1, MAX_N))
  val _A = newBuf<Action<State>>(min(n + 1, MAX_N))

  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var n = n
    var T = Int.MAX_VALUE
    var t = 0
    var s = started()
    var a = π(s)
    _R.clear(); _R.append(0.0)
    _S.clear(); _S.append(s)
    _A.clear(); _A.append(a)
    do {
      step++
      if (t >= n) {
        _R.removeFirst()
        _S.removeFirst()
        _A.removeFirst()
      }
      if (t < T) {
        val (s_next, reward) = a.sample()
        _R.append(reward)
        _S.append(s_next)
        s = s_next
        if (s.isTerminal || step >= maxStep) {
          T = t + 1
          val τ = t - n + 1
          if (τ < 0) n = T // n is too large: shrink the window to the episode length
        } else {
          a = π(s)
          _A.append(a)
        }
      }
      val τ = t - n + 1
      if (τ >= 0) {
        var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
        if (τ + n < T) G += pow(γ, n) * q(_S[n], _A[n])
        q.w += α * (G - q(_S[0], _A[0])) * q.`∇`(_S[0], _A[0])
      }
      t++
    } while (τ < T - 1)
    log.debug { "n=$n,T=$T" }
    episodeListener(episode, step)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/Gradient Monte Carlo algorithm.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.prediction

import lab.mars.rl.model.*
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.times

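/**
 * Gradient Monte Carlo prediction (Sutton & Barto, 2nd ed., Section 9.3): after each episode,
 *   w ← w + α [G_t − v̂(S_t,w)] ∇v̂(S_t,w) for every visited state. Returns are accumulated
 * without discounting, i.e. this implementation assumes γ = 1.
 */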
fun <E> MDP.`Gradient Monte Carlo algorithm`(
  v: ApproximateFunction<E>, π: Policy,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  val _S = newBuf<State>()
  val _R = newBuf<Double>()

  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    _S.clear(); _R.clear()
    var s = started()
    _S.append(s); _R.append(0.0)
    var T = 0
    var accum = 0.0
    while (s.isNotTerminal) {
      step++
      val a = π(s)
      val (s_next, reward) = a.sample()
      accum += reward
      _S.append(s_next)
      _R.append(reward)
      s = s_next
      T++
    }
    var pre = 0.0
    for (t in 0 until T) {
      pre += _R[t]
      val Gt = accum - pre
      v.w += α * (Gt - v(_S[t])) * v.`∇`(_S[t])
    }
    episodeListener(episode, step)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/LSTD.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.prediction

import lab.mars.rl.model.*
import lab.mars.rl.model.impl.func.LinearFunc
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.Matrix
import lab.mars.rl.util.matrix.plus
import lab.mars.rl.util.matrix.times

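/**
 * Least-Squares TD (Sutton & Barto, 2nd ed., Section 9.8): maintains Â⁻¹ incrementally via
 * the Sherman-Morrison formula instead of inverting Â = Σ x_t (x_t − γx_{t+1})ᵀ. ε sets the
 * initial Â⁻¹ = ε⁻¹ I, and the weights are read out as w = Â⁻¹ b̂ with b̂ = Σ R_{t+1} x_t.
 */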
fun <E> MDP.LSTD(vFunc: LinearFunc<E>, π: Policy, ε: Double,
                 episodes: Int,
                 episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  val xFeature = vFunc.x
  val d = xFeature.numOfComponents
  val A_ = 1 / ε * Matrix.identity(d) // running estimate of Â⁻¹
  val b = Matrix.column(d)
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    var x = xFeature(s)
    while (s.isNotTerminal) {
      step++
      val a = π(s)
      val (s_next, reward) = a.sample()
      val _x = if (s_next.isTerminal) Matrix.column(d) else xFeature(s_next)

      // Sherman-Morrison rank-1 update of Â⁻¹
      val v = A_ `T*` (x - γ * _x)
      A_ -= (A_ * x) * v.T / (1.0 + (v `T*` x))
      b += reward * x
      s = s_next
      x = _x
    }
    episodeListener(episode, step)
  }
  vFunc.w `=` A_ * b
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/Semi-gradient TD(0).kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.func_approx.prediction

import lab.mars.rl.model.*
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.matrix.times

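/**
 * Semi-gradient TD(0) prediction (Sutton & Barto, 2nd ed., Section 9.3):
 *   w ← w + α [R + γ v̂(S',w) − v̂(S,w)] ∇v̂(S,w), with v̂(terminal) = 0.
 */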
fun <E> MDP.`Semi-gradient TD(0)`(
  v: ApproximateFunction<E>, π: Policy,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var s = started()
    while (s.isNotTerminal) {
      step++
      val a = π(s)
      val (s_next, reward) = a.sample()
      v.w += α * (reward + γ * (if (s_next.isTerminal) 0.0 else v(s_next)) - v(s)) * v.`∇`(s)
      s = s_next
    }
    episodeListener(episode, step)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/func_approx/prediction/n-step semi-gradient TD.kt:
--------------------------------------------------------------------------------
@file:Suppress("NAME_SHADOWING")

package lab.mars.rl.algo.func_approx.prediction

import lab.mars.rl.algo.ntd.MAX_N
import lab.mars.rl.model.*
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.Σ
import lab.mars.rl.util.matrix.times
import org.apache.commons.math3.util.FastMath.min
import org.apache.commons.math3.util.FastMath.pow

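/**
 * n-step semi-gradient TD prediction: each state is updated toward the n-step return
 *   G_{t:t+n} = Σ_{i=1..n} γ^{i−1} R_{t+i} + γ^n v̂(S_{t+n},w), using the same sliding-window
 * bookkeeping as the tabular n-step methods in lab.mars.rl.algo.ntd.
 */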
fun <E> MDP.`n-step semi-gradient TD`(
  v: ApproximateFunction<E>, π: Policy,
  n: Int,
  α: Double,
  episodes: Int,
  episodeListener: (Int, Int) -> Unit = { _, _ -> }) {
  val _R = newBuf<Double>(min(n, MAX_N))
  val _S = newBuf<State>(min(n, MAX_N))
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var step = 0
    var n = n
    var T = Int.MAX_VALUE
    var t = 0
    var s = started()
    var a = π(s)
    _R.clear(); _R.append(0.0)
    _S.clear(); _S.append(s)
    do {
      step++
      if (t >= n) {
        _R.removeFirst()
        _S.removeFirst()
      }
      if (t < T) {
        val (s_next, reward) = a.sample()

        _R.append(reward)
        _S.append(s_next)
        s = s_next
        if (s.isTerminal) {
          T = t + 1
          val τ = t - n + 1
          if (τ < 0) n = T // n is too large: shrink the window to the episode length
        } else
          a = π(s)
      }
      val τ = t - n + 1
      if (τ >= 0) {
        var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
        if (τ + n < T) G += pow(γ, n) * v(_S[n])
        v.w += α * (G - v(_S[0])) * v.`∇`(_S[0])
      }
      t++
    } while (τ < T - 1)
    log.debug { "n=$n,T=$T" }
    episodeListener(episode, step)
  }
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/ExploringStarts.kt:
--------------------------------------------------------------------------------
@file:Suppress("NAME_SHADOWING")

package lab.mars.rl.algo.mc

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.model.impl.mdp.*
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.collection.fork
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.argmax
import lab.mars.rl.util.tuples.tuple3

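/**
 * Monte Carlo Exploring Starts (Sutton & Barto, 2nd ed., Section 5.3): every episode begins
 * with a randomly chosen first action; first-visit returns are accumulated per (s,a) and the
 * policy is made greedy with respect to the mean return. The accumulated Q values are only
 * normalized by their visit counts once, after the final episode.
 */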
fun IndexedMDP.`Monte Carlo Exploring Starts`(π: IndexedPolicy = null_policy, episodes: Int): OptimalSolution {
  val π = if (π == null_policy) IndexedPolicy(QFunc { 1.0 }) else π
  val Q = QFunc { 0.0 }
  val tmpQ = QFunc { Double.NaN }
  val count = QFunc { 0 }
  val tmpS = newBuf<IndexedState>(states.size)

  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var s = started()
    var a = s.actions.rand() // exploring starts

    var accumulate = 0.0
    do {
      val (s_next, reward) = a.sample()
      if (tmpQ[s, a].isNaN())
        tmpQ[s, a] = accumulate
      accumulate += reward
      s = s_next
    } while (s.isNotTerminal.apply { if (this) a = π(s) })

    tmpS.clear()
    for ((s, a) in states.fork { it.actions }) {
      val value = tmpQ[s, a]
      if (!value.isNaN()) {
        Q[s, a] += accumulate - value
        count[s, a] += 1
        tmpS.append(s)
        tmpQ[s, a] = Double.NaN
      }
    }
    for (s in tmpS) {
      val a_greedy = argmax(s.actions) {
        val n = count[s, it]
        if (n > 0)
          Q[s, it] / n
        else
          Q[s, it]
      }
      for (a in s.actions)
        π[s, a] = if (a === a_greedy) 1.0 else 0.0
    }
  }

  Q.set { idx, value ->
    val n = count[idx]
    if (n > 0)
      value / n
    else
      value
  }
  val V = VFunc { 0.0 }
  val result = tuple3(π, V, Q)
  V_from_Q(states, result)
  return result
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/First visit Monte Carlo Prediction.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.mc

import lab.mars.rl.model.impl.mdp.IndexedMDP
import lab.mars.rl.model.impl.mdp.IndexedPolicy
import lab.mars.rl.model.impl.mdp.StateValueFunction
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.log.debug

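/**
 * First-visit Monte Carlo prediction (Sutton & Barto, 2nd ed., Section 5.1): V(s) is estimated
 * as the average of the (undiscounted) returns following the first visit to s in each episode.
 */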
fun IndexedMDP.`First Visit Monte Carlo Prediction`(π: IndexedPolicy, episodes: Int): StateValueFunction {
  val V = VFunc { 0.0 }
  val preReturn = VFunc { Double.NaN }
  val count = VFunc { 0 }

  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var s = started()
    var accumulate = 0.0
    while (s.isNotTerminal) {
      val a = π(s)
      val (s_next, reward) = a.sample()
      if (preReturn[s].isNaN())
        preReturn[s] = accumulate
      accumulate += reward
      s = s_next
    }
    preReturn.set { idx, value ->
      if (!value.isNaN()) {
        V[idx] += accumulate - value
        count[idx] += 1
      }
      Double.NaN
    }
  }
  for (s in states) {
    val n = count[s]
    if (n > 0)
      V[s] = V[s] / n
  }
  return V
}
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/Off-Policy Prediction.kt:
--------------------------------------------------------------------------------
package lab.mars.rl.algo.mc

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.model.impl.mdp.*
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.collection.filter
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.tuples.tuple3

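/**
 * Off-policy Monte Carlo prediction with weighted importance sampling (Sutton & Barto, 2nd ed.,
 * Section 5.6): episodes are generated by an equiprobable behavior policy b, and Q is updated
 * incrementally backwards through each episode with weight W = Π π(A_t|S_t)/b(A_t|S_t).
 */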
12 | fun IndexedMDP.`Off-policy MC prediction`(π: IndexedPolicy, episodes: Int): StateValueFunction {
13 | val Q = QFunc { 0.0 }
14 | val C = QFunc { 0.0 }
15 | val b = IndexedPolicy(QFunc { 1.0 })
16 | for (s in states.filter { it.isNotTerminal }) {
17 | val prob = 1.0 / s.actions.size
18 | for (a in s.actions)
19 | b[s, a] = prob
20 | }
21 |
22 | val R = newBuf()
23 | val S = newBuf()
24 | val A = newBuf()
25 |
26 | for (episode in 1..episodes) {
27 | log.debug { "$episode/$episodes" }
28 | var s = started()
29 | S.clear(); S.append(s)
30 | R.clear();R.append(0.0)
31 | A.clear()
32 | var T = 0
33 | while (s.isNotTerminal) {
34 | val a = b(s)
35 | A.append(a)
36 | val (s_next, reward) = a.sample()
37 | S.append(s_next)
38 | R.append(reward)
39 | s = s_next
40 | T++
41 | }
42 | var G = 0.0
43 | var W = 1.0
44 | for (t in T - 1 downTo 0) {
45 | val s_t = S[t]
46 | val a_t = A[t]
47 | G = γ * G + R[t + 1]
48 | C[s_t, a_t] += W
49 | Q[s_t, a_t] += W / C[s_t, a_t] * (G - Q[s_t, a_t])
50 | W = W * π[s_t, a_t] / b[s_t, a_t]
51 | if (W == 0.0) break
52 | }
53 | }
54 | val V = VFunc { 0.0 }
55 | val result = tuple3(π, V, Q)
56 | V_from_Q(states, result)
57 | return V
58 | }
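59 |
60 | /* The backward loop implements incremental weighted importance sampling (Sutton & Barto, 5.6):
61 |  * C(s,a) accumulates the weights W = Π_k π(A_k|S_k)/b(A_k|S_k), and
62 |  *   Q(s,a) += W/C(s,a) * (G - Q(s,a))
63 |  * keeps Q(s,a) equal to the weighted average of all returns observed for (s,a) so far.
64 |  * Once W reaches 0, every remaining (earlier-in-episode) update carries zero weight, hence the break. */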
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/Off-policy Optimal.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.mc
2 |
3 | import lab.mars.rl.algo.V_from_Q
4 | import lab.mars.rl.model.impl.mdp.*
5 | import lab.mars.rl.model.isNotTerminal
6 | import lab.mars.rl.model.isTerminal
7 | import lab.mars.rl.model.log
8 | import lab.mars.rl.util.buf.newBuf
9 | import lab.mars.rl.util.log.debug
10 | import lab.mars.rl.util.math.argmax
11 | import lab.mars.rl.util.tuples.tuple3
12 |
13 | fun IndexedMDP.`Off-policy MC Optimal`(episodes: Int): OptimalSolution {
14 | val Q = QFunc { 0.0 }
15 | val C = QFunc { 0.0 }
16 | val b = IndexedPolicy(QFunc { 1.0 })
17 | for (s in states) {
18 | if (s.isTerminal) continue
19 | val prob = 1.0 / s.actions.size
20 | for (a in s.actions)
21 | b[s, a] = prob
22 | }
23 | val π = QFunc { 1.0 }
24 |
25 | val R = newBuf<Double>()
26 | val S = newBuf<IndexedState>()
27 | val A = newBuf<IndexedAction>()
28 |
29 | for (episode in 1..episodes) {
30 | log.debug { "$episode/$episodes" }
31 | var s = started()
32 | S.clear(); S.append(s)
33 | R.clear();R.append(0.0)
34 | A.clear()
35 | var T = 0
36 | while (s.isNotTerminal) {
37 | val a = b(s)
38 | A.append(a)
39 | val (s_next, reward) = a.sample()
40 | S.append(s_next)
41 | R.append(reward)
42 | s = s_next
43 | T++
44 | }
45 | var G = 0.0
46 | var W = 1.0
47 | for (t in T - 1 downTo 0) {
48 | val s_t = S[t]
49 | val a_t = A[t]
50 | G = γ * G + R[t + 1]
51 | C[s_t, a_t] += W
52 | Q[s_t, a_t] += W / C[s_t, a_t] * (G - Q[s_t, a_t])
53 |
54 | val a_opt = argmax(s_t.actions) { Q[s_t, it] }
55 | for (a in s_t.actions) {
56 | π[s_t, a] = when {
57 | a === a_opt -> 1.0
58 | else -> 0.0
59 | }
60 | }
61 | if (a_t !== a_opt) break
62 | W = W * 1 / b[s_t, a_t]
63 | }
64 | }
65 | val V = VFunc { 0.0 }
66 | val result = tuple3(IndexedPolicy(π), V, Q)
67 | V_from_Q(states, result)
68 | return result
69 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/mc/On-Policy Optimal.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NAME_SHADOWING")
2 |
3 | package lab.mars.rl.algo.mc
4 |
5 | import lab.mars.rl.algo.V_from_Q
6 | import lab.mars.rl.model.impl.mdp.IndexedMDP
7 | import lab.mars.rl.model.impl.mdp.IndexedState
8 | import lab.mars.rl.model.impl.mdp.OptimalSolution
9 | import lab.mars.rl.model.isNotTerminal
10 | import lab.mars.rl.model.log
11 | import lab.mars.rl.util.buf.newBuf
12 | import lab.mars.rl.util.collection.fork
13 | import lab.mars.rl.util.log.debug
14 | import lab.mars.rl.util.math.argmax
15 | import lab.mars.rl.util.tuples.tuple3
16 |
17 | fun IndexedMDP.`On-policy first-visit MC control`(episodes: Int): OptimalSolution {
18 | val ε = 0.1
19 | val π = equiprobablePolicy()
20 | val Q = QFunc { 0.0 }
21 | val tmpQ = QFunc { Double.NaN }
22 | val count = QFunc { 0 }
23 | val tmpS = newBuf<IndexedState>(states.size)
24 |
25 | for (episode in 1..episodes) {
26 | log.debug { "$episode/$episodes" }
27 | var s = started()
28 | var accumulate = 0.0
29 | while (s.isNotTerminal) {
30 | val a = π(s)
31 | val (s_next, reward) = a.sample()
32 | if (tmpQ[s, a].isNaN())
33 | tmpQ[s, a] = accumulate
34 | accumulate += reward
35 | s = s_next
36 | }
37 | tmpS.clear()
38 | for ((s, a) in states.fork { it.actions }) {
39 | val value = tmpQ[s, a]
40 | if (!value.isNaN()) {
41 | Q[s, a] += accumulate - value
42 | count[s, a] += 1
43 | tmpS.append(s)
44 | tmpQ[s, a] = Double.NaN
45 | }
46 | }
47 | for (s in tmpS) {
48 | val a_opt = argmax(s.actions) {
49 | val n = count[s, it]
50 | if (n > 0)
51 | Q[s, it] / n
52 | else
53 | Q[s, it]
54 | }
55 | val size = s.actions.size
56 | for (a in s.actions) {
57 | π[s, a] = when {
58 | a === a_opt -> 1 - ε + ε / size
59 | else -> ε / size
60 | }
61 | }
62 | }
63 | }
64 |
65 | Q.set { idx, value ->
66 | val n = count[idx]
67 | if (n > 0)
68 | value / n
69 | else
70 | value
71 | }
72 | val V = VFunc { 0.0 }
73 | val result = tuple3(π, V, Q)
74 | V_from_Q(states, result)
75 | return result
76 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step Off-policy Sarsa.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NAME_SHADOWING")
2 |
3 | package lab.mars.rl.algo.ntd
4 |
5 | import lab.mars.rl.algo.V_from_Q
6 | import lab.mars.rl.algo.`ε-greedy`
7 | import lab.mars.rl.model.impl.mdp.IndexedAction
8 | import lab.mars.rl.model.impl.mdp.IndexedMDP
9 | import lab.mars.rl.model.impl.mdp.IndexedState
10 | import lab.mars.rl.model.impl.mdp.OptimalSolution
11 | import lab.mars.rl.model.isTerminal
12 | import lab.mars.rl.model.log
13 | import lab.mars.rl.util.buf.newBuf
14 | import lab.mars.rl.util.log.debug
15 | import lab.mars.rl.util.math.Π
16 | import lab.mars.rl.util.math.Σ
17 | import lab.mars.rl.util.tuples.tuple3
18 | import org.apache.commons.math3.util.FastMath.min
19 | import org.apache.commons.math3.util.FastMath.pow
20 |
21 | fun IndexedMDP.`N-step off-policy sarsa`(
22 | n: Int,
23 | ε: Double,
24 | α: (IndexedState, IndexedAction) -> Double,
25 | episodes: Int): OptimalSolution {
26 | val b = equiprobablePolicy()
27 | val π = equiprobablePolicy()
28 |
29 | val Q = QFunc { 0.0 }
30 | val _R = newBuf<Double>(min(n, MAX_N))
31 | val _S = newBuf<IndexedState>(min(n, MAX_N))
32 | val _A = newBuf<IndexedAction>(min(n, MAX_N))
33 |
34 | for (episode in 1..episodes) {
35 | log.debug { "$episode/$episodes" }
36 | var n = n
37 | var T = Int.MAX_VALUE
38 | var t = 0
39 | var s = started()
40 | var a = b(s)
41 | _R.clear();_R.append(0.0)
42 | _S.clear();_S.append(s)
43 | _A.clear();_A.append(a)
44 | do {
45 | if (t >= n) {
46 | _R.removeFirst()
47 | _S.removeFirst()
48 | _A.removeFirst()
49 | }
50 | if (t < T) {
51 | val (s_next, reward) = a.sample()
52 | _R.append(reward)
53 | _S.append(s_next)
54 | s = s_next
55 | if (s.isTerminal) {
56 | T = t + 1
57 | val τ = t - n + 1
58 | if (τ < 0) n = T
59 | } else {
60 | a = b(s)
61 | _A.append(a)
62 | }
63 | }
64 | val τ = t - n + 1
65 | if (τ >= 0) {
66 | val ρ = Π(1..min(n - 1, T - 1 - τ)) { π[_S[it], _A[it]] / b[_S[it], _A[it]] }
67 | var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
68 | if (τ + n < T) G += pow(γ, n) * Q[_S[n], _A[n]]
69 | Q[_S[0], _A[0]] += α(_S[0], _A[0]) * ρ * (G - Q[_S[0], _A[0]])
70 | `ε-greedy`(_S[0], Q, π, ε)
71 | }
72 | t++
73 | } while (τ < T - 1)
74 | log.debug { "n=$n,T=$T" }
75 | }
76 | val V = VFunc { 0.0 }
77 | val result = tuple3(π, V, Q)
78 | V_from_Q(states, result)
79 | return result
80 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step Sarsa.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NAME_SHADOWING")
2 |
3 | package lab.mars.rl.algo.ntd
4 |
5 | import lab.mars.rl.algo.V_from_Q
6 | import lab.mars.rl.algo.`ε-greedy`
7 | import lab.mars.rl.model.impl.mdp.*
8 | import lab.mars.rl.model.isTerminal
9 | import lab.mars.rl.model.log
10 | import lab.mars.rl.util.buf.newBuf
11 | import lab.mars.rl.util.log.debug
12 | import lab.mars.rl.util.math.Σ
13 | import lab.mars.rl.util.tuples.tuple3
14 | import org.apache.commons.math3.util.FastMath.min
15 | import org.apache.commons.math3.util.FastMath.pow
16 |
17 | fun IndexedMDP.`N-step Sarsa`(
18 | n: Int,
19 | ε: Double,
20 | α: (IndexedState, IndexedAction) -> Double,
21 | episodes: Int): OptimalSolution {
22 | val π = IndexedPolicy(QFunc { 0.0 })
23 | val Q = QFunc { 0.0 }
24 | val _R = newBuf<Double>(min(n, MAX_N))
25 | val _S = newBuf<IndexedState>(min(n, MAX_N))
26 | val _A = newBuf<IndexedAction>(min(n, MAX_N))
27 |
28 | for (episode in 1..episodes) {
29 | log.debug { "$episode/$episodes" }
30 | var n = n
31 | var T = Int.MAX_VALUE
32 | var t = 0
33 | var s = started()
34 |
35 | `ε-greedy`(s, Q, π, ε)
36 | var a = π(s)
37 | _R.clear();_R.append(0.0)
38 | _S.clear();_S.append(s)
39 | _A.clear();_A.append(a)
40 | do {
41 | if (t >= n) {
42 | _R.removeFirst()
43 | _S.removeFirst()
44 | _A.removeFirst()
45 | }
46 | if (t < T) {
47 | val (s_next, reward) = a.sample()
48 | _R.append(reward)
49 | _S.append(s_next)
50 | s = s_next
51 | if (s.isTerminal) {
52 | T = t + 1
53 | val τ = t - n + 1
54 | if (τ < 0) n = T //n is too large
55 | } else {
56 | `ε-greedy`(s, Q, π, ε)
57 | a = π(s)
58 | _A.append(a)
59 | }
60 | }
61 | val τ = t - n + 1
62 | if (τ >= 0) {
63 | var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * _R[it] }
64 | if (τ + n < T) G += pow(γ, n) * Q[_S[n], _A[n]]
65 | Q[_S[0], _A[0]] += α(_S[0], _A[0]) * (G - Q[_S[0], _A[0]])
66 | `ε-greedy`(_S[0], Q, π, ε)
67 | }
68 | t++
69 | } while (τ < T - 1)
70 | log.debug { "n=$n,T=$T" }
71 | }
72 | val V = VFunc { 0.0 }
73 | val result = tuple3(π, V, Q)
74 | V_from_Q(states, result)
75 | return result
76 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step TD prediction.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NAME_SHADOWING")
2 |
3 | package lab.mars.rl.algo.ntd
4 |
5 | import lab.mars.rl.model.impl.mdp.IndexedMDP
6 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
7 | import lab.mars.rl.model.impl.mdp.IndexedState
8 | import lab.mars.rl.model.impl.mdp.StateValueFunction
9 | import lab.mars.rl.model.isTerminal
10 | import lab.mars.rl.model.log
11 | import lab.mars.rl.util.buf.newBuf
12 | import lab.mars.rl.util.collection.IndexedCollection
13 | import lab.mars.rl.util.log.debug
14 | import lab.mars.rl.util.math.Σ
15 | import org.apache.commons.math3.util.FastMath.min
16 | import org.apache.commons.math3.util.FastMath.pow
17 |
18 | val MAX_N = 1024
19 |
20 | fun IndexedMDP.`N-step TD prediction`(
21 | n: Int, π: IndexedPolicy,
22 | α: Double,
23 | episodes: Int,
24 | episodeListener: (Int, IndexedCollection<Double>) -> Unit = { _, _ -> }): StateValueFunction {
25 | val V = VFunc { 0.0 }
26 | val R = newBuf<Double>(min(n, MAX_N))
27 | val S = newBuf<IndexedState>(min(n, MAX_N))
28 |
29 | for (episode in 1..episodes) {
30 | log.debug { "$episode/$episodes" }
31 | var T = Int.MAX_VALUE
32 | var n = n
33 | var t = 0
34 | var s = started()
35 | R.clear();R.append(0.0)
36 | S.clear();S.append(s)
37 |
38 | do {
39 | if (t >= n) {
40 | R.removeFirst(1)
41 | S.removeFirst(1)
42 | }
43 | if (t < T) {
44 | val a = π(s)
45 | val (s_next, reward) = a.sample()
46 | S.append(s_next)
47 | R.append(reward)
48 | s = s_next
49 | if (s.isTerminal) {
50 | T = t + 1
51 | val τ = t - n + 1
52 | if (τ < 0) n = T //n is too large
53 | }
54 | }
55 | val τ = t - n + 1
56 | if (τ >= 0) {
57 | var G = Σ(1..min(n, T - τ)) { pow(γ, it - 1) * R[it] }
58 | if (τ + n < T) G += pow(γ, n) * V[S[n]]
59 | V[S[0]] += α * (G - V[S[0]])
60 | }
61 | t++
62 | } while (τ < T - 1)
63 | log.debug { "n=$n,T=$T" }
64 | episodeListener(episode, V)
65 | }
66 | return V
67 | }
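68 |
69 | /* The τ-update above implements the n-step return
70 |  *   G_{τ:τ+n} = Σ_{i=1..min(n, T-τ)} γ^(i-1) R_{τ+i}  +  γ^n V(S_{τ+n})   (bootstrap term only if τ+n < T).
71 |  * Because the buffers drop their first element once t >= n, index 0 always holds S_τ and index i
72 |  * holds S_{τ+i}/R_{τ+i}, so R[it] and V[S[n]] read exactly the terms above. */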
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/ntd/N-step Treebackup.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NAME_SHADOWING")
2 |
3 | package lab.mars.rl.algo.ntd
4 |
5 | import lab.mars.rl.algo.V_from_Q
6 | import lab.mars.rl.algo.`ε-greedy`
7 | import lab.mars.rl.model.impl.mdp.IndexedAction
8 | import lab.mars.rl.model.impl.mdp.IndexedMDP
9 | import lab.mars.rl.model.impl.mdp.IndexedState
10 | import lab.mars.rl.model.impl.mdp.OptimalSolution
11 | import lab.mars.rl.model.isTerminal
12 | import lab.mars.rl.model.log
13 | import lab.mars.rl.util.buf.newBuf
14 | import lab.mars.rl.util.log.debug
15 | import lab.mars.rl.util.math.Σ
16 | import lab.mars.rl.util.tuples.tuple3
17 | import org.apache.commons.math3.util.FastMath.min
18 |
19 | fun IndexedMDP.`N-step Treebackup`(
20 | n: Int, ε: Double,
21 | α: (IndexedState, IndexedAction) -> Double,
22 | episodes: Int): OptimalSolution {
23 | val π = equiprobablePolicy()
24 | val Q = QFunc { 0.0 }
25 |
26 | val _Q = newBuf<Double>(min(n, MAX_N))
27 | val _π = newBuf<Double>(min(n, MAX_N))
28 | val δ = newBuf<Double>(min(n, MAX_N))
29 | val _S = newBuf<IndexedState>(min(n, MAX_N))
30 | val _A = newBuf<IndexedAction>(min(n, MAX_N))
31 |
32 | for (episode in 1..episodes) {
33 | var n = n
34 | log.debug { "$episode/$episodes" }
35 | var T = Int.MAX_VALUE
36 | var t = 0
37 | var s = started()
38 | var a = π(s)
39 |
40 | _Q.clear(); _Q.append(0.0)
41 | _π.clear();_π.append(π[s, a])
42 | δ.clear()
43 | _S.clear();_S.append(s)
44 | _A.clear(); _A.append(a)
45 |
46 | do {
47 | if (t >= n) {
48 | _Q.removeFirst()
49 | _π.removeFirst()
50 | δ.removeFirst()
51 | _S.removeFirst()
52 | _A.removeFirst()
53 | }
54 | if (t < T) {
55 | val (s_next, reward) = a.sample()
56 | _S.append(s_next)
57 | s = s_next
58 | if (s.isTerminal) {
59 | δ.append(reward - _Q.last)
60 | T = t + 1
61 | val τ = t - n + 1
62 | if (τ < 0) n = T //n is too large
63 | } else {
64 | δ.append(reward + γ * Σ(s.actions) { π[s, it] * Q[s, it] } - _Q.last)
65 | a = s.actions.rand()
66 | _A.append(a)
67 | _Q.append(Q[s, a])
68 | _π.append(π[s, a])
69 | }
70 | }
71 | val τ = t - n + 1
72 | if (τ >= 0) {
73 | var Z = 1.0
74 | var G = _Q[0]
75 | val end = min(n - 1, T - 1 - τ)
76 | for (k in 0..end) {
77 | G += Z * δ[k]
78 | if (k < end) Z *= γ * _π[k + 1]
79 | }
80 | Q[_S[0], _A[0]] += α(_S[0], _A[0]) * (G - Q[_S[0], _A[0]])
81 | `ε-greedy`(_S[0], Q, π, ε)
82 | }
83 | t++
84 | } while (τ < T - 1)
85 | log.debug { "n=$n,T=$T" }
86 | }
87 | val V = VFunc { 0.0 }
88 | val result = tuple3(π, V, Q)
89 | V_from_Q(states, result)
90 | return result
91 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/package.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo
2 |
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.ApproximateFunction
5 | import lab.mars.rl.model.State
6 | import lab.mars.rl.model.impl.mdp.*
7 | import lab.mars.rl.model.isNotTerminal
8 | import lab.mars.rl.util.collection.Gettable
9 | import lab.mars.rl.util.collection.filter
10 | import lab.mars.rl.util.collection.fork
11 | import lab.mars.rl.util.math.argmax
12 | import lab.mars.rl.util.math.argmax_tie_random
13 | import lab.mars.rl.util.math.Σ
14 |
15 | /**
16 | *
17 | * Created on 2017-09-06.
18 | *
19 | *
20 | * @author wumo
21 | */
22 |
23 | typealias EpisodeListener = (Int, Int, State, Double) -> Unit
24 |
25 | typealias StepListener = (Int, Int, State, Action<State>) -> Unit
26 |
27 | fun V_from_Q(states: StateSet, pvq: OptimalSolution) {
28 | val (π, V, Q) = pvq
29 | for (s in states.filter { it.isNotTerminal }) {
30 | V[s] = Σ(s.actions) {
31 | π[s, it] * Q[s, it]
32 | }
33 | }
34 | }
35 |
36 | fun Q_from_V(gamma: Double, states: StateSet, pvq: OptimalSolution) {
37 | val (_, V, Q) = pvq
38 | for ((s, a) in states.fork { it.actions })
39 | Q[s, a] = Σ(a.possibles) { probability * (reward + gamma * V[next]) }
40 | }
41 |
42 | fun average_α(indexedMdp: IndexedMDP): (IndexedState, IndexedAction) -> Double {
43 | val N = indexedMdp.QFunc { 0 }
44 | return { s, a ->
45 | N[s, a]++
46 | 1.0 / N[s, a]
47 | }
48 | }
49 |
50 | fun `ε-greedy`(s: IndexedState, Q: ActionValueFunction, π: IndexedPolicy, ε: Double) {
51 | val a_opt = argmax(s.actions) { Q[s, it] }
52 | val size = s.actions.size
53 | for (a in s.actions) {
54 | π[s, a] = when {
55 | a === a_opt -> 1 - ε + ε / size
56 | else -> ε / size
57 | }
58 | }
59 | }
60 |
61 | fun `ε-greedy`(s: IndexedState, evaluate: Gettable<IndexedAction, Double>, π: IndexedPolicy, ε: Double) {
62 | val a_opt = argmax(s.actions) { evaluate[it] }
63 | val size = s.actions.size
64 | for (a in s.actions) {
65 | π[s, a] = when {
66 | a === a_opt -> 1 - ε + ε / size
67 | else -> ε / size
68 | }
69 | }
70 | }
71 |
72 | fun `ε-greedy`(s: IndexedState, Q: ApproximateFunction<*>, π: IndexedPolicy, ε: Double) {
73 | val a_opt = argmax(s.actions) { Q(s, it) }
74 | val size = s.actions.size
75 | for (a in s.actions) {
76 | π[s, a] = when {
77 | a === a_opt -> 1 - ε + ε / size
78 | else -> ε / size
79 | }
80 | }
81 | }
82 |
83 | fun `ε-greedy (tie broken randomly)`(s: IndexedState, Q: ActionValueFunction, π: IndexedPolicy, ε: Double) {
84 | val a_opt = argmax_tie_random(s.actions) { Q[s, it] }
85 | val size = s.actions.size
86 | for (a in s.actions) {
87 | π[s, a] = when {
88 | a === a_opt -> 1 - ε + ε / size
89 | else -> ε / size
90 | }
91 | }
92 | }
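93 |
94 | /* A minimal usage sketch of the `ε-greedy` helper (hypothetical sizes, not one of this repo's demos):
95 |  *   val mdp = mdpOf(gamma = 1.0, state_dim = 16, action_dim = 4)
96 |  *   val Q = mdp.QFunc { 0.0 }
97 |  *   val π = mdp.equiprobablePolicy()
98 |  *   for (s in mdp.states.filter { it.isNotTerminal })
99 |  *     `ε-greedy`(s, Q, π, 0.1)
100 |  * Afterwards π samples the argmax action with probability 1-ε+ε/|A(s)| and every other action
101 |  * with probability ε/|A(s)|. */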
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/Actor-Critic with Eligibility Traces (continuing).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.policy_gradient
2 |
3 | import lab.mars.rl.model.*
4 | import lab.mars.rl.model.impl.func.LinearFunc
5 | import lab.mars.rl.util.log.debug
6 | import lab.mars.rl.util.math.rand
7 | import lab.mars.rl.util.matrix.Matrix
8 | import lab.mars.rl.util.matrix.MatrixSpec
9 | import lab.mars.rl.util.matrix.times
10 | import lab.mars.rl.util.matrix.Σ
11 | import kotlin.math.exp
12 |
13 | fun <E> MDP.`Actor-Critic with Eligibility Traces (continuing)`(
14 | h: LinearFunc<E>, α_θ: Double, λ_θ: Double,
15 | v: ApproximateFunction<E>, α_w: Double, λ_w: Double, η: Double,
16 | episodes: Int,
17 | z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) }) {
18 | for (episode in 1..episodes) {
19 | log.debug { "$episode/$episodes" }
20 | var step = 0
21 | var s = started()
22 | val z_θ = z_maker(h.w.size, 1)
23 | val z_w = z_maker(v.w.size, 1)
24 | var averageR = 0.0
25 | while (s.isNotTerminal) {
26 | step++
27 | val a = rand(s.actions) { exp(h(s, it)) }
28 | val (s_next, reward) = a.sample()
29 | val δ = reward - averageR + γ * (if (s_next.isTerminal) 0.0 else v(s_next)) - v(s)
30 | averageR += η * δ
31 | z_w `=` λ_w * z_w + v.`∇`(s)
32 | val `∇` = h.x(s, a) - Σ(s.actions) { b ->
33 | val tmp = h(s, b)
34 | h.x(s, b) / s.actions.sumByDouble { exp(h(s, it) - tmp) }
35 | }
36 | z_θ `=` λ_θ * z_θ + `∇`
37 | v.w += α_w * δ * z_w
38 | h.w += α_θ * δ * z_θ
39 | s = s_next
40 | }
41 | }
42 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/Actor-Critic with Eligibility Traces (episodic).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.policy_gradient
2 |
3 | import lab.mars.rl.model.*
4 | import lab.mars.rl.model.impl.func.LinearFunc
5 | import lab.mars.rl.util.log.debug
6 | import lab.mars.rl.util.math.rand
7 | import lab.mars.rl.util.matrix.Matrix
8 | import lab.mars.rl.util.matrix.MatrixSpec
9 | import lab.mars.rl.util.matrix.times
10 | import lab.mars.rl.util.matrix.Σ
11 | import kotlin.math.exp
12 |
13 | fun <E> MDP.`Actor-Critic with Eligibility Traces (episodic)`(
14 | h: LinearFunc<E>, α_θ: Double, λ_θ: Double,
15 | v: ApproximateFunction<E>, α_w: Double, λ_w: Double,
16 | episodes: Int,
17 | z_maker: (Int, Int) -> MatrixSpec = { m, n -> Matrix(m, n) },
18 | maxStep: Int = Int.MAX_VALUE,
19 | episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
20 | stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
21 | for (episode in 1..episodes) {
22 | log.debug { "$episode/$episodes" }
23 | var step = 0
24 | var G = 0.0
25 | var s = started()
26 | val z_θ = z_maker(h.w.size, 1)
27 | val z_w = z_maker(v.w.size, 1)
28 | var γ_t = 1.0
29 | while (s.isNotTerminal) {
30 | step++
31 | val a = rand(s.actions) { exp(h(s, it)) }
32 | val (s_next, reward) = a.sample()
33 | G += γ_t * reward
34 | val δ = reward + γ * (if (s_next.isTerminal) 0.0 else v(s_next)) - v(s)
35 | z_w *= γ * λ_w
36 | z_w += γ_t * v.`∇`(s)
37 | // z_w `=` γ * λ_w * z_w + γ_t * v.`∇`(s)
38 | val `∇` = h.x(s, a) - Σ(s.actions) { b ->
39 | val tmp = h(s, b)
40 | h.x(s, b) / s.actions.sumByDouble { exp(h(s, it) - tmp) }
41 | }
42 | z_θ *= γ * λ_θ
43 | z_θ += γ_t * `∇`
44 | // z_θ `=` γ * λ_θ * z_θ + γ_t * `∇`
45 | v.w += α_w * δ * z_w
46 | h.w += α_θ * δ * z_θ
47 | γ_t *= γ
48 | stepListener(episode, step, s, a)
49 | s = s_next
50 | if (step >= maxStep) break
51 | }
52 | episodeListener(episode, step, s, G)
53 | }
54 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/One-step Actor-Critic (episodic).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.policy_gradient
2 |
3 | import lab.mars.rl.model.*
4 | import lab.mars.rl.model.impl.func.LinearFunc
5 | import lab.mars.rl.util.log.debug
6 | import lab.mars.rl.util.math.rand
7 | import lab.mars.rl.util.matrix.times
8 | import lab.mars.rl.util.matrix.Σ
9 | import kotlin.math.exp
10 |
11 | fun <E> MDP.`One-step Actor-Critic (episodic)`(
12 | h: LinearFunc<E>, α_θ: Double,
13 | v: ApproximateFunction<E>, α_w: Double,
14 | episodes: Int) {
15 | for (episode in 1..episodes) {
16 | log.debug { "$episode/$episodes" }
17 | var step = 0
18 | var s = started()
19 | var γ_t = 1.0
20 | while (s.isNotTerminal) {
21 | step++
22 | val a = rand(s.actions) { exp(h(s, it)) }
23 | val (s_next, reward) = a.sample()
24 | val δ = reward + γ * (if (s_next.isTerminal) 0.0 else v(s_next)) - v(s)
25 | v.w += α_w * γ_t * δ * v.`∇`(s)
26 | val `∇` = h.x(s, a) - Σ(s.actions) { b ->
27 | val tmp = h(s, b)
28 | h.x(s, b) / s.actions.sumByDouble { exp(h(s, it) - tmp) }
29 | }
30 | h.w += α_θ * γ_t * δ * `∇`
31 | γ_t *= γ
32 | s = s_next
33 | }
34 | }
35 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/REINFORCE with Baseline (episodic).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.policy_gradient
2 |
3 | import lab.mars.rl.model.*
4 | import lab.mars.rl.model.impl.func.LinearFunc
5 | import lab.mars.rl.util.buf.newBuf
6 | import lab.mars.rl.util.log.debug
7 | import lab.mars.rl.util.math.rand
8 | import lab.mars.rl.util.matrix.times
9 | import lab.mars.rl.util.matrix.Σ
10 | import kotlin.math.exp
11 |
12 | fun <E> MDP.`REINFORCE with Baseline (episodic)`(
13 | h: LinearFunc<E>, α_θ: Double,
14 | v: ApproximateFunction<E>, α_w: Double,
15 | episodes: Int,
16 | episodeListener: (Int, Int, State, Double) -> Unit = { _, _, _, _ -> },
17 | stepListener: (Int, Int, State, Action<State>) -> Unit = { _, _, _, _ -> }) {
18 | for (episode in 1..episodes) {
19 | log.debug { "$episode/$episodes" }
20 | var step = 0
21 | var s = started()
22 | var a = rand(s.actions) { exp(h(s, it)) }
23 | val S = newBuf<State>()
24 | val A = newBuf<Action<State>>()
25 | val R = newBuf<Double>()
26 |
27 | S.append(s)
28 | R.append(0.0)
29 | var accu = 0.0
30 | var T: Int
31 | while (true) {
32 | step++
33 | A.append(a)
34 | val (s_next, reward) = a.sample()
35 | accu += reward
36 | stepListener(episode, step, s, a)
37 | R.append(accu)
38 | S.append(s_next)
39 | s = s_next
40 | if (s_next.isTerminal) {
41 | T = step
42 | break
43 | }
44 |
45 | a = rand(s.actions) { exp(h(s, it)) }
46 | }
47 | var γ_t = 1.0
48 | for (t in 0 until T) {
49 | val G = accu - R[t]
50 | val δ = G - v(S[t])
51 | v.w += α_w * γ_t * δ * v.`∇`(S[t])
52 | val `∇` = h.x(S[t], A[t]) - Σ(S[t].actions) { b ->
53 | val tmp = h(S[t], b)
54 | h.x(S[t], b) / S[t].actions.sumByDouble { exp(h(S[t], it) - tmp) }
55 | }
56 | h.w += α_θ * γ_t * δ * `∇`
57 | γ_t *= γ
58 | }
59 | episodeListener(episode, T, s, accu)
60 | }
61 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/policy_gradient/REINFORCE.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.policy_gradient
2 |
3 | import lab.mars.rl.model.*
4 | import lab.mars.rl.model.impl.func.LinearFunc
5 | import lab.mars.rl.util.buf.newBuf
6 | import lab.mars.rl.util.log.debug
7 | import lab.mars.rl.util.math.rand
8 | import lab.mars.rl.util.matrix.times
9 | import lab.mars.rl.util.matrix.Σ
10 | import kotlin.math.exp
11 |
12 | fun <E> MDP.REINFORCE(h: LinearFunc<E>, α: Double, episodes: Int) {
13 | for (episode in 1..episodes) {
14 | log.debug { "$episode/$episodes" }
15 | var step = 0
16 | var s = started()
17 | var a = rand(s.actions) { exp(h(s, it)) }
18 | val S = newBuf<State>()
19 | val A = newBuf<Action<State>>()
20 | val R = newBuf<Double>()
21 |
22 | S.append(s)
23 | R.append(0.0)
24 | var accu = 0.0
25 | var T: Int
26 | while (true) {
27 | step++
28 | A.append(a)
29 | val (s_next, reward) = a.sample()
30 | accu += reward
31 | R.append(accu)
32 | S.append(s_next)
33 | s = s_next
34 | if (s_next.isTerminal) {
35 | T = step
36 | break
37 | }
38 | a = rand(s.actions) { exp(h(s, it)) }
39 | }
40 | var γ_t = 1.0
41 | for (t in 0 until T) {
42 | val G = accu - R[t]
43 | val `∇` = h.x(S[t], A[t]) - Σ(S[t].actions) { b ->
44 | val tmp = h(S[t], b)
45 | h.x(S[t], b) / S[t].actions.sumByDouble { exp(h(S[t], it) - tmp) }
46 | }
47 | h.w += α * γ_t * G * `∇`
48 | γ_t *= γ
49 | }
50 | }
51 | }
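52 |
53 | /* For the softmax policy π(a|s) = exp h(s,a) / Σ_c exp h(s,c), the eligibility vector is
54 |  *   ∇ln π(a|s) = x(s,a) - Σ_b π(b|s) x(s,b).
55 |  * The inner sumByDouble evaluates Σ_c exp(h(s,c) - h(s,b)) = (Σ_c exp h(s,c)) / exp h(s,b),
56 |  * so dividing x(s,b) by it yields exactly π(b|s)·x(s,b); shifting by h(s,b) inside exp avoids overflow. */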
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/DoubleQLearning.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.V_from_Q
4 | import lab.mars.rl.model.impl.mdp.*
5 | import lab.mars.rl.model.isNotTerminal
6 | import lab.mars.rl.model.log
7 | import lab.mars.rl.util.log.debug
8 | import lab.mars.rl.util.math.Rand
9 | import lab.mars.rl.util.math.argmax
10 | import lab.mars.rl.util.tuples.tuple3
11 |
12 | fun IndexedMDP.DoubleQLearning(
13 | ε: Double,
14 | α: (IndexedState, IndexedAction) -> Double,
15 | episodes: Int): OptimalSolution {
16 | fun `ε-greedy`(s: IndexedState, Q1: ActionValueFunction, Q2: ActionValueFunction, π: IndexedPolicy) {
17 | val a_opt = argmax(s.actions) { Q1[s, it] + Q2[s, it] }
18 | val size = s.actions.size
19 | for (a in s.actions) {
20 | π[s, a] = when {
21 | a === a_opt -> 1 - ε + ε / size
22 | else -> ε / size
23 | }
24 | }
25 | }
26 |
27 | val π = IndexedPolicy(QFunc { 0.0 })
28 | var Q1 = QFunc { 0.0 }
29 | var Q2 = QFunc { 0.0 }
30 |
31 | for (episode in 1..episodes) {
32 | log.debug { "$episode/$episodes" }
33 | var s = started()
34 | while (true) {
35 | `ε-greedy`(s, Q1, Q2, π)
36 | val a = π(s)
37 | val (s_next, reward) = a.sample()
38 | if (Rand().nextBoolean()) {
39 | val tmp = Q1
40 | Q1 = Q2
41 | Q2 = tmp
42 | }
43 | if (s_next.isNotTerminal) {
44 | Q1[s, a] += α(s, a) * (reward + γ * Q2[s_next, argmax(s_next.actions) { Q1[s_next, it] }] - Q1[s, a])
45 | s = s_next
46 | } else {
47 | Q1[s, a] += α(s, a) * (reward + γ * 0.0 - Q1[s, a])//Q[terminalState,*]=0.0
48 | break
49 | }
50 | }
51 | }
52 | val V = VFunc { 0.0 }
53 | val result = tuple3(π, V, Q1)
54 | V_from_Q(states, result)
55 | return result
56 | }
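57 |
58 | /* The coin flip swaps the two tables, so each estimator is updated on roughly half the steps
59 |  * while the other supplies the evaluation in Q2[s', argmax_a Q1[s', a]]. Decoupling action
60 |  * selection from action evaluation is what removes Q-learning's maximization bias. */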
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/ExpectedSarsa.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.V_from_Q
4 | import lab.mars.rl.algo.`ε-greedy`
5 | import lab.mars.rl.model.impl.mdp.*
6 | import lab.mars.rl.model.isNotTerminal
7 | import lab.mars.rl.model.log
8 | import lab.mars.rl.util.log.debug
9 | import lab.mars.rl.util.math.Σ
10 | import lab.mars.rl.util.tuples.tuple3
11 |
12 | fun IndexedMDP.expectedSarsa(
13 | ε: Double,
14 | α: (IndexedState, IndexedAction) -> Double,
15 | episodes: Int): OptimalSolution {
16 | val π = IndexedPolicy(QFunc { 0.0 })
17 | val Q = QFunc { 0.0 }
18 |
19 | for (episode in 1..episodes) {
20 | log.debug { "$episode/$episodes" }
21 | var s = started()
22 | while (s.isNotTerminal) {
23 | `ε-greedy`(s, Q, π, ε)
24 | val a = π(s)
25 | val (s_next, reward) = a.sample()
26 | Q[s, a] += α(s, a) * (reward + γ * Σ(s_next.actions) { π[s_next, it] * Q[s_next, it] } - Q[s, a])
27 | s = s_next
28 | }
29 | }
30 | val V = VFunc { 0.0 }
31 | val result = tuple3(π, V, Q)
32 | V_from_Q(states, result)
33 | return result
34 | }
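35 |
36 | /* The target R + γ·Σ_a π(a|S')·Q(S', a) replaces Sarsa's single sampled successor action with
37 |  * its expectation under the current ε-greedy π, removing that source of sampling variance. */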
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/QLearning.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.V_from_Q
4 | import lab.mars.rl.algo.`ε-greedy`
5 | import lab.mars.rl.model.impl.mdp.*
6 | import lab.mars.rl.model.isNotTerminal
7 | import lab.mars.rl.model.log
8 | import lab.mars.rl.util.log.debug
9 | import lab.mars.rl.util.math.max
10 | import lab.mars.rl.util.tuples.tuple3
11 |
12 | fun IndexedMDP.QLearning(
13 | ε: Double,
14 | α: (IndexedState, IndexedAction) -> Double,
15 | episodes: Int): OptimalSolution {
16 | val π = IndexedPolicy(QFunc { 0.0 })
17 | val Q = QFunc { 0.0 }
18 |
19 | for (episode in 1..episodes) {
20 | log.debug { "$episode/$episodes" }
21 | var s = started()
22 | while (s.isNotTerminal) {
23 | `ε-greedy`(s, Q, π, ε)
24 | val a = π(s)
25 | val (s_next, reward) = a.sample()
26 | Q[s, a] += α(s, a) * (reward + γ * max(s_next.actions, 0.0) { Q[s_next, it] } - Q[s, a])
27 | s = s_next
28 | }
29 | }
30 | val V = VFunc { 0.0 }
31 | val result = tuple3(π, V, Q)
32 | V_from_Q(states, result)
33 | return result
34 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/Sarsa.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.V_from_Q
4 | import lab.mars.rl.algo.`ε-greedy`
5 | import lab.mars.rl.model.impl.mdp.*
6 | import lab.mars.rl.model.isNotTerminal
7 | import lab.mars.rl.model.log
8 | import lab.mars.rl.util.log.debug
9 | import lab.mars.rl.util.tuples.tuple3
10 |
11 | fun IndexedMDP.sarsa(
12 | ε: Double,
13 | α: (IndexedState, IndexedAction) -> Double,
14 | episodes: Int): OptimalSolution {
15 | val π = IndexedPolicy(QFunc { 0.0 })
16 | val Q = QFunc { 0.0 }
17 |
18 | for (episode in 1..episodes) {
19 | log.debug { "$episode/$episodes" }
20 | var s = started()
21 | `ε-greedy`(s, Q, π, ε)
22 | var a = π(s)
23 | while (true) {
24 | val (s_next, reward) = a.sample()
25 | if (s_next.isNotTerminal) {
26 | `ε-greedy`(s_next, Q, π, ε)
27 | val a_next = π(s_next)
28 | Q[s, a] += α(s, a) * (reward + γ * Q[s_next, a_next] - Q[s, a])
29 | s = s_next
30 | a = a_next
31 | } else {
32 | Q[s, a] += α(s, a) * (reward + γ * 0.0 - Q[s, a])//Q[terminalState,*]=0.0
33 | break
34 | }
35 | }
36 | }
37 | val V = VFunc { 0.0 }
38 | val result = tuple3(π, V, Q)
39 | V_from_Q(states, result)
40 | return result
41 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/algo/td/Tabular TD(0).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.model.impl.mdp.IndexedMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
5 | import lab.mars.rl.model.impl.mdp.StateValueFunction
6 | import lab.mars.rl.model.isNotTerminal
7 | import lab.mars.rl.model.log
8 | import lab.mars.rl.util.log.debug
9 |
10 | fun IndexedMDP.`Tabular TD(0)`(π: IndexedPolicy, α: Double, episodes: Int): StateValueFunction {
11 | val V = VFunc { 0.0 }
12 | for (episode in 1..episodes) {
13 | log.debug { "$episode/$episodes" }
14 | var s = started()
15 | while (s.isNotTerminal) {
16 | val a = π(s)
17 | val (s_next, reward) = a.sample()
18 | V[s] += α * (reward + γ * V[s_next] - V[s])
19 | s = s_next
20 | }
21 | }
22 | return V
23 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/ApproximateFunction.kt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/main/kotlin/lab/mars/rl/model/ApproximateFunction.kt
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/MDP.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NOTHING_TO_INLINE", "OVERRIDE_BY_INLINE", "UNCHECKED_CAST")
2 |
3 | package lab.mars.rl.model
4 |
5 | import lab.mars.rl.model.impl.mdp.IndexedAction
6 | import lab.mars.rl.model.impl.mdp.IndexedPossible
7 | import lab.mars.rl.model.impl.mdp.IndexedState
8 | import lab.mars.rl.model.impl.mdp.PossibleSet
9 | import lab.mars.rl.util.buf.DefaultIntBuf
10 | import lab.mars.rl.util.collection.emptyNSet
11 | import org.slf4j.LoggerFactory
12 |
13 | /**
14 | *
15 | * Created on 2017-08-31.
16 | *
17 | *
18 | * @author wumo
19 | */
20 |
21 | interface MDP {
22 | val γ: Double
23 | val started: () -> State
24 | }
25 |
26 | interface Policy {
27 | /**sample action when in state [s]*/
28 | operator fun invoke(s: State): Action<State>
29 |
30 | /**probability of taking action [a] when in state [s]*/
31 | operator fun get(s: State, a: Action<State>): Double
32 |
33 | fun greedy(s: State): Action<State>
34 | }
35 |
36 | interface RandomIterable<E>: Iterable<E> {
37 | fun rand(): E
38 | val size: Int
39 | }
40 |
41 | interface State {
42 | val actions: RandomIterable<Action<State>>
43 | }
44 |
45 | inline val State.isTerminal
46 | get() = !isNotTerminal
47 |
48 | inline val State.isNotTerminal
49 | get() = actions.any()
50 |
51 | interface Action<out S: State> {
52 | val sample: () -> Possible<S>
53 | }
54 |
55 | open class Possible<out S: State>(val next: S, val reward: Double) {
56 | open operator fun component1() = next
57 | open operator fun component2() = reward
58 | }
59 |
60 | val log = LoggerFactory.getLogger(MDP::class.java)!!
61 | val null_index = DefaultIntBuf.of(-1)
62 | val null_state = IndexedState(null_index)
63 | val null_action = IndexedAction(null_index)
64 | val null_possible = IndexedPossible(null_state, 0.0, 0.0)
65 | val emptyPossibleSet: PossibleSet = emptyNSet()
66 |
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/LinearFunc.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.func
2 |
3 | import lab.mars.rl.model.ApproximateFunction
4 | import lab.mars.rl.util.matrix.Matrix
5 | import lab.mars.rl.util.matrix.MatrixSpec
6 | import org.apache.commons.math3.util.FastMath.*
7 |
8 | abstract class Feature<E>(val conv: (Array<out Any>) -> E) {
9 | operator fun invoke(vararg args: Any): MatrixSpec = _invoke(conv(args))
10 |
11 | abstract fun _invoke(s: E): MatrixSpec
12 | abstract val numOfComponents: Int
13 | }
14 |
15 | operator fun DoubleArray.times(elements: DoubleArray): Double {
16 | var result = 0.0
17 | for (i in 0..lastIndex)
18 | result += this[i] * elements[i]
19 | return result
20 | }
21 |
22 | class SimplePolynomial(override val numOfComponents: Int, conv: (Array<out Any>) -> Double) : Feature<Double>(conv) {
23 | override fun _invoke(s: Double) = Matrix.column(numOfComponents) {
24 | pow(s, it)
25 | }
26 | }
27 |
28 | class SimpleFourier(override val numOfComponents: Int, conv: (Array<out Any>) -> Double) : Feature<Double>(conv) {
29 | override fun _invoke(s: Double) = Matrix.column(numOfComponents) {
30 | cos(it * PI * s)
31 | }
32 | }
33 |
34 | class LinearFunc<E>(val x: Feature<E>) : ApproximateFunction<E>(x.conv) {
35 | override fun `_∇`(input: E) = x._invoke(input)
36 |
37 | override val w = Matrix.column(x.numOfComponents)
38 |
39 | override fun _invoke(input: E) = (w `T*` x._invoke(input)).toScalar
40 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/SimpleCoarseCoding.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.func
2 |
3 | import lab.mars.rl.util.matrix.Matrix
4 |
5 | val ClosedRange<Double>.size: Double
6 | get() = endInclusive - start
7 |
8 | class SimpleCoarseCoding(featureWidth: Double, domain: ClosedRange<Double>,
9 | override val numOfComponents: Int, conv: (Array<out Any>) -> Double): Feature<Double>(conv) {
10 | val features: Array<ClosedRange<Double>>
11 |
12 | init {
13 | val step = (domain.size - featureWidth) / (numOfComponents - 1)
14 | var left = domain.start
15 | features = Array(numOfComponents) {
16 | (left..(left + featureWidth)).apply { left += step }
17 | }
18 | }
19 |
20 | override fun _invoke(s: Double) = Matrix.column(numOfComponents) {
21 | if (features[it].contains(s)) 1.0 //quantize the interval
22 | else 0.0
23 | }
24 |
25 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/SimpleTileCoding.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NAME_SHADOWING")
2 |
3 | package lab.mars.rl.model.impl.func
4 |
5 | import lab.mars.rl.util.matrix.Matrix
6 |
7 | class SimpleTileCoding(numOfTilings: Int,
8 | _tilingSize: Int,
9 | val tileWidth: Int,
10 | val tilingOffset: Double, conv: (Array<out Any>) -> Double): Feature<Double>(conv) {
11 | val tilingSize = _tilingSize + 1
12 | override val numOfComponents = numOfTilings * tilingSize
13 |
14 | override fun _invoke(s: Double): Matrix {
15 | return Matrix.column(numOfComponents) {
16 | val tilingIdx = it / tilingSize
17 | val tileIdx = it % tilingSize
18 | val start = -tileWidth + tilingIdx * tilingOffset + tileIdx * tileWidth
19 | if (start <= s && s < start + tileWidth) 1.0 else 0.0
20 | }
21 | }
22 |
23 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/StateAggregation.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.func
2 |
3 | import lab.mars.rl.model.ApproximateFunction
4 | import lab.mars.rl.util.matrix.Matrix
5 | import org.apache.commons.math3.util.FastMath.ceil
6 |
7 | class StateAggregation(numStates: Int, val numOfGroups: Int, conv: (Array<out Any>) -> Int): ApproximateFunction<Int>(conv) {
8 | override fun `_∇`(input: Int): Matrix {
9 | val groupIdx = input / groupSize
10 | return Matrix.column(numOfGroups) { if (it == groupIdx) 1.0 else 0.0 }
11 | }
12 |
13 | override val w = Matrix.column(numOfGroups) { 0.0 }
14 | val groupSize = ceil(numStates.toDouble() / numOfGroups).toInt()
15 |
16 | override fun _invoke(input: Int): Double {
17 | val groupIdx = input / groupSize
18 | return w[groupIdx]
19 | }
20 | }
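21 |
22 | /* A minimal sketch (hypothetical numbers, in the spirit of the 1000-state random walk):
23 |  * aggregate 1000 states into 10 groups of 100, each group sharing a single weight:
24 |  *   val v = StateAggregation(numStates = 1000, numOfGroups = 10) { args -> (args[0] as IndexedState)[0] }
25 |  *   val value = v(s)   // s: IndexedState; reads the one weight of s's group
26 |  */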
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/func/SuttonTileCoding.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.func
2 |
3 | import lab.mars.rl.util.matrix.MatrixSpec
4 | import lab.mars.rl.util.matrix.SparseMatrix
5 | import lab.mars.rl.util.tuples.tuple2
6 | import org.apache.commons.math3.util.FastMath.*
7 |
8 | val MAXIMUM_CAPACITY = 1 shl 30
9 |
10 | private val emptyDoubleArray = DoubleArray(0)
11 |
12 | /**
13 |  * @param unit_scales scale of each input unit in tile-coding units, chosen to get the expected resolution; usually #(grid tilings)/(range of data).
14 |  */
15 | class SuttonTileCoding(numTilesPerTiling: Int, _numTilings: Int, val unit_scales: DoubleArray = emptyDoubleArray, val allowCollisions: Boolean = false,
16 | conv: (Array<out Any>) -> tuple2<DoubleArray, IntArray>) : Feature<tuple2<DoubleArray, IntArray>>(conv) {
17 | val numTilings = tableSizeFor(_numTilings)
18 | override val numOfComponents = numTilings * (numTilesPerTiling + 1)
19 | override fun _invoke(s: tuple2<DoubleArray, IntArray>): MatrixSpec {
20 | val (floats, ints) = s
21 | val activeTiles = tiles(floats, ints)
22 | val x = SparseMatrix(numOfComponents, 1)
23 | for (activeTile in activeTiles)
24 | x[activeTile] = 1.0
25 | return x
26 | }
27 |
28 | val data = HashMap<ArrayList<Double>, Int>(ceil(numOfComponents / 0.75).toInt())
29 |
30 | private fun tiles(floats: DoubleArray, ints: IntArray): IntArray {
31 | val qfloats = DoubleArray(floats.size) {
32 | floor(floats[it] * (if (it <= unit_scales.lastIndex) unit_scales[it] else 1.0) * numTilings)
33 | }
34 | val result = IntArray(numTilings)
35 | for (tiling in 0 until numTilings) {
36 | val tilingX2 = tiling * 2
37 | val coords = ArrayList<Double>(1 + floats.size + ints.size)
38 | coords.add(tiling.toDouble())
39 | var b = tiling
40 | for (q in qfloats) {
41 | coords.add(floor(((q + b) / numTilings)))
42 | b += tilingX2
43 | }
44 | for (int in ints)
45 | coords.add(int.toDouble())
46 | if (data.size < numOfComponents)
47 | result[tiling] = data.getOrPut(coords, { data.size })
48 | else if (allowCollisions)
49 | result[tiling] = abs(coords.hashCode()) % numOfComponents
50 | }
51 | return result
52 | }
53 |
54 | /** Returns a power of two size for the given target capacity.*/
55 | fun tableSizeFor(cap: Int): Int {
56 | var n = cap - 1
57 | n = n or n.ushr(1)
58 | n = n or n.ushr(2)
59 | n = n or n.ushr(4)
60 | n = n or n.ushr(8)
61 | n = n or n.ushr(16)
62 | return if (n < 0) 1 else if (n >= MAXIMUM_CAPACITY) MAXIMUM_CAPACITY else n + 1
63 | }
64 | }
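65 |
66 | /* A minimal sketch (hypothetical scales; the numbers are illustrative, not from this repo's demos):
67 |  * 8 tilings over a 2-D continuous input plus one discrete component, wrapped in a linear function:
68 |  *   val feature = SuttonTileCoding(numTilesPerTiling = 511, _numTilings = 8,
69 |  *                                  unit_scales = doubleArrayOf(8 / 1.7, 8 / 0.14)) { args ->
70 |  *     tuple2(doubleArrayOf(args[0] as Double, args[1] as Double), intArrayOf(args[2] as Int))
71 |  *   }
72 |  *   val q = LinearFunc(feature)
73 |  * tableSizeFor rounds numTilings up to a power of two, and each call to q(s, a) activates exactly
74 |  * numTilings components of the sparse feature vector. */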
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/DefaultAction.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 |
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.Possible
5 | import lab.mars.rl.model.State
6 |
7 | class DefaultAction<E, S: State>(val value: E, override val sample: () -> Possible<S>): Action<S>
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/DefaultMDP.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 |
3 | import lab.mars.rl.model.MDP
4 | import lab.mars.rl.model.State
5 |
6 | class DefaultMDP(override val γ: Double, override val started: () -> State): MDP
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/EpsilonGreedyFunctionPolicy.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 |
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.ApproximateFunction
5 | import lab.mars.rl.model.Policy
6 | import lab.mars.rl.model.State
7 | import lab.mars.rl.util.math.Rand
8 | import lab.mars.rl.util.math.argmax_tie_random
9 | import lab.mars.rl.util.math.max_count
10 |
11 | class EpsilonGreedyFunctionPolicy(val q: ApproximateFunction<*>, val ε: Double = 0.1): Policy {
12 | override fun invoke(s: State): Action<State> {
13 | return if (Rand().nextDouble() < ε)
14 | s.actions.rand()
15 | else
16 | argmax_tie_random(s.actions) { q(s, it) }
17 | }
18 |
19 | override fun get(s: State, a: Action<State>): Double {
20 | val (m, c) = max_count(s.actions) { q(s, it) }
21 | return if (q(s, a) == m) ((1.0 - ε) / c + ε / s.actions.size) else ε / s.actions.size
22 | }
23 |
24 | override fun greedy(s: State) = argmax_tie_random(s.actions) { q(s, it) }
25 | }
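26 |
27 | /* With c actions tied at the maximal q-value, an ε-greedy policy takes each maximal action with
28 |  * probability (1-ε)/c + ε/|A(s)| and every other action with probability ε/|A(s)|; these are
29 |  * exactly the two cases get() returns, and they sum to 1 over the action set. */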
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedAction.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("OVERRIDE_BY_INLINE", "NOTHING_TO_INLINE", "UNCHECKED_CAST")
2 |
3 | package lab.mars.rl.model.impl.mdp
4 |
5 | import lab.mars.rl.model.Action
6 | import lab.mars.rl.util.buf.Index
7 | import lab.mars.rl.util.buf.IntBuf
8 | import lab.mars.rl.util.collection.emptyNSet
9 | import lab.mars.rl.util.exception.NoMoreElementsException
10 | import lab.mars.rl.util.math.Rand
11 |
12 | class IndexedAction(val index: IntBuf): Index(), Action<IndexedState> {
13 | inline override val size: Int
14 | get() = index.size
15 |
16 | inline override operator fun get(idx: Int) = index[idx]
17 |
18 | var possibles: PossibleSet = emptyNSet()
19 |
20 | override var sample = outer@ {
21 | if (possibles.isEmpty()) throw NoMoreElementsException()
22 | val p = Rand().nextDouble()
23 | var acc = 0.0
24 | for (possible in possibles) {
25 | acc += possible.probability
26 | if (p <= acc)
27 | return@outer possible
28 | }
29 | throw IllegalArgumentException("random=$p, but accumulation=$acc")
30 | }
31 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedMDP.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("UNCHECKED_CAST")
2 |
3 | package lab.mars.rl.model.impl.mdp
4 |
5 | import lab.mars.rl.model.MDP
6 | import lab.mars.rl.model.isNotTerminal
7 | import lab.mars.rl.util.buf.Index
8 | import lab.mars.rl.util.collection.IndexedCollection
9 |
10 | /**
11 | *
12 | * @property states state set
13 | * @property γ decay factor
14 | * @property state_function [state_function] generator
15 | * @property state_action_function [state_action_function] generator
16 | */
17 | class IndexedMDP(
18 | override val γ: Double,
19 | val states: StateSet,
20 | private val state_function: ((Index) -> Any) -> IndexedCollection<Any>,
21 | private val state_action_function: ((Index) -> Any) -> IndexedCollection<Any>): MDP {
22 | override var started = { states.rand() }
23 | /**
24 | *
25 | * create state function indexed by [IndexedState]
26 | */
27 | fun <T: Any> VFunc(element_maker: (Index) -> T) =
28 | state_function(element_maker) as IndexedCollection<T>
29 |
30 | /**
31 | *
32 | * create state action function indexed by an ([IndexedState], [IndexedAction]) pair
33 | */
34 | fun <T: Any> QFunc(element_maker: (Index) -> T) =
35 | state_action_function(element_maker) as IndexedCollection<T>
36 |
37 | /**
38 | * equiprobable random policy
39 | */
40 | fun equiprobablePolicy(): IndexedPolicy {
41 | val policy = QFunc { 0.0 }
42 | for (s in states.filter { it.isNotTerminal }) {
43 | val prob = 1.0 / s.actions.size
44 | for (a in s.actions)
45 | policy[s, a] = prob
46 | }
47 | return IndexedPolicy(policy)
48 | }
49 | }
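50 |
51 | /* VFunc and QFunc are the factories used throughout the algo package, e.g.
52 |  *   val V = VFunc { 0.0 }          // one Double per state
53 |  *   val Q = QFunc { Double.NaN }   // one Double per (state, action) pair
54 |  * The element_maker lambda receives the Index of the slot being initialized. */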
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedPolicy.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 |
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.Policy
5 | import lab.mars.rl.model.State
6 | import lab.mars.rl.util.collection.IndexedCollection
7 | import lab.mars.rl.util.collection.emptyNSet
8 | import lab.mars.rl.util.math.argmax
9 |
10 | class IndexedPolicy(val p: IndexedCollection<Double>, val ε: Double = 0.1): Policy {
11 |
12 | override fun invoke(s: State): IndexedAction {
13 | val eval = p(s as IndexedState)
14 | return s.actions.rand { eval[it] }
15 | }
16 |
17 | override fun get(s: State, a: Action<State>)
18 | = p[s as IndexedState, a as IndexedAction]
19 |
20 | operator fun set(s: IndexedState, a: IndexedAction, v: Double) {
21 | p[s, a] = v
22 | }
23 |
24 | operator fun set(s: IndexedState, newaction: IndexedAction) {
25 | for (a in s.actions)
26 | p[s, a] = 0.0
27 | p[s, newaction] = 1.0
28 | }
29 |
30 | override fun greedy(s: State): IndexedAction {
31 | s as IndexedState
32 | return argmax(s.actions) { get(s, it) }
33 | }
34 | }
35 |
36 | val null_policy = IndexedPolicy(emptyNSet())
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedPossible.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 |
3 | import lab.mars.rl.model.Possible
4 |
5 | class IndexedPossible(next: IndexedState, reward: Double, var probability: Double): Possible(next, reward) {
6 | override operator fun component1() = next
7 | override operator fun component2() = reward
8 | operator fun component3() = probability
9 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/IndexedState.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("OVERRIDE_BY_INLINE", "NOTHING_TO_INLINE", "UNCHECKED_CAST")
2 |
3 | package lab.mars.rl.model.impl.mdp
4 |
5 | import lab.mars.rl.model.State
6 | import lab.mars.rl.util.buf.Index
7 | import lab.mars.rl.util.buf.IntBuf
8 | import lab.mars.rl.util.collection.IndexedCollection
9 | import lab.mars.rl.util.collection.emptyNSet
10 |
11 | class IndexedState(val index: IntBuf): Index(), State {
12 | override inline val size: Int
13 | get() = index.size
14 |
15 | override inline operator fun get(idx: Int) = index[idx]
16 |
17 | override var actions: IndexedCollection<IndexedAction> = emptyNSet()
18 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/NSetMDP.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("NOTHING_TO_INLINE")
2 |
3 | package lab.mars.rl.model.impl.mdp
4 |
5 | import lab.mars.rl.util.buf.IntBuf
6 | import lab.mars.rl.util.dimension.*
7 |
8 | /**
9 | *
10 | * Created on 2017-09-14.
11 | *
12 | *
13 | * @author wumo
14 | */
15 |
16 | /**
17 | * @param gamma `γ` decay
18 | * @param state_dim state dimension
19 | * @param action_dim action dimension
20 | * @return mdp with the given state dimension and the same action dimension for every state
21 | */
22 | inline fun NSetMDP(gamma: Double, state_dim: Any, action_dim: Any): IndexedMDP {
23 | val a_dim = action_dim.toDim()
24 | return NSetMDP(gamma, state_dim.toDim(), { a_dim })
25 | }
26 |
27 | /**
28 | * @param gamma `γ` decay factor
29 | * @param state_dim state dimension
30 | * @param action_dim different action dimension according to specific state
31 | * @return mdp with same state dimension but different action dimension
32 | */
33 | fun NSetMDP(gamma: Double, state_dim: Any, action_dim: (IntBuf) -> Any): IndexedMDP {
34 | val s_dim = state_dim.toDim() as GeneralDimension
35 | val s_a_dim = s_dim.copy() x action_dim
36 | return IndexedMDP(
37 | γ = gamma,
38 | states = nsetFrom(s_dim) {
39 | IndexedState(it.copy()).apply { actions = nsetFrom(action_dim(it).toDim()) { IndexedAction(it.copy()) } }
40 | },
41 | state_function = { element_maker -> nsetFrom(s_dim, element_maker) },
42 | state_action_function = { element_maker -> nsetFrom(s_a_dim, element_maker) })
43 | }
44 |
45 | /**
46 | * Note: the dimension shouldn't be 0; if it needs to be 0, set `emptyNSet()` after construction.
47 | * @param gamma `γ` decay factor
48 | * @param state_dim state dimension
49 | * @param action_dim action dimension
50 | * @return mdp with the given state dimension and the same action dimension for every state
51 | */
52 | inline fun CNSetMDP(gamma: Double, state_dim: Any, action_dim: Any): IndexedMDP {
53 | val a_dim = action_dim.toDim() as GeneralDimension
54 | return CNSetMDP(gamma, state_dim.toDim(), { a_dim })
55 | }
56 |
57 | /**
58 | * Note: the dimension shouldn't be 0; if it needs to be 0, set `emptyNSet()` after construction.
59 | * @param gamma `γ` decay factor
60 | * @param state_dim state dimension
61 | * @param action_dim different action dimension according to specific state
62 | * @return mdp with same state dimension but different action dimension
63 | */
64 | fun CNSetMDP(gamma: Double, state_dim: Any, action_dim: (IntBuf) -> Any): IndexedMDP {
65 | val s_dim = state_dim.toDim() as GeneralDimension
66 | val states = cnsetFrom(s_dim) {
67 | IndexedState(it.copy()).apply { actions = cnsetFrom(action_dim(it).toDim()) { IndexedAction(it.copy()) } }
68 | }
69 | val s_a_dim = s_dim.copy() x action_dim
70 | return IndexedMDP(
71 | γ = gamma,
72 | states = states,
73 | state_function = { element_maker -> states.copycat(element_maker) },
74 | state_action_function = { element_maker -> cnsetFrom(s_a_dim, element_maker) })
75 | }
76 |
77 | inline fun mdpOf(gamma: Double, state_dim: Any, action_dim: Any)
78 | = CNSetMDP(gamma, state_dim, action_dim)
79 |
80 | inline fun mdpOf(gamma: Double, state_dim: Any, noinline action_dim: (IntBuf) -> Any)
81 | = CNSetMDP(gamma, state_dim, action_dim)
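82 |
83 | /* A minimal sketch (hypothetical dimensions): a 4x4 grid world with 4 actions in every state:
84 |  *   val mdp = mdpOf(gamma = 0.9, state_dim = 4 x 4, action_dim = 4)
85 |  * or, with a state-dependent action set:
86 |  *   val mdp = mdpOf(gamma = 0.9, state_dim = 4 x 4) { s -> if (s[0] == 0) 1 else 4 }
87 |  */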
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/SoftmaxpPolicy.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 |
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.ApproximateFunction
5 | import lab.mars.rl.model.Policy
6 | import lab.mars.rl.model.State
7 | import lab.mars.rl.util.math.rand
8 | import kotlin.math.exp
9 |
10 | class SoftmaxpPolicy(val π: ApproximateFunction<*>) : Policy {
11 | override fun invoke(s: State): Action<State> {
12 | return rand(s.actions) { exp(π(s, it)) }
13 | }
14 |
15 | override fun get(s: State, a: Action<State>) = exp(π(s, a))
16 |
17 | override fun greedy(s: State) = rand(s.actions) { exp(π(s, it)) }
18 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/model/impl/mdp/package.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.model.impl.mdp
2 |
3 | import lab.mars.rl.util.collection.IndexedCollection
4 | import lab.mars.rl.util.tuples.tuple3
5 |
6 | typealias StateSet = IndexedCollection<IndexedState>
7 | typealias PossibleSet = IndexedCollection<IndexedPossible>
8 | typealias StateValueFunction = IndexedCollection<Double>
9 | typealias ActionValueFunction = IndexedCollection<Double>
10 | typealias OptimalSolution = tuple3<IndexedPolicy, StateValueFunction, ActionValueFunction>
11 |
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/1000-state RandomWalk.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
6 | import lab.mars.rl.model.impl.mdp.IndexedPossible
7 | import lab.mars.rl.util.collection.emptyNSet
8 | import lab.mars.rl.util.math.Rand
9 |
10 | /**
11 | *
12 | * Created on 2017-10-10.
13 | *
14 | *
15 | * @author wumo
16 | */
17 | object `1000-state RandomWalk` {
18 | val num_states = 1000
19 | val step_range = 100
20 | fun make(): Pair<IndexedMDP, IndexedPolicy> {
21 | val mdp = CNSetMDP(1.0, num_states + 2, 1)
22 | mdp.apply {
23 | val last = num_states + 1
24 | states[0].actions = emptyNSet()
25 | states[last].actions = emptyNSet()
26 | started = { states(num_states / 2).rand() }
27 | for (a in 1 until last)
28 | states[a].actions[0].sample = {
29 | val move = Rand().nextInt(1, step_range + 1) *
30 | (if (Rand().nextBoolean()) 1 else -1)
31 | val next = (a + move).coerceIn(0, last)
32 | IndexedPossible(states[next],
33 | when (next) {
34 | 0 -> -1.0
35 | last -> 1.0
36 | else -> 0.0
37 | }, 1.0)
38 | }
39 | }
40 | val policy = IndexedPolicy(mdp.QFunc { 1.0 })
41 | return Pair(mdp, policy)
42 | }
43 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/19-state RandomWalk.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.collection.cnsetOf
7 | import lab.mars.rl.util.collection.emptyNSet
8 |
9 | /**
10 | *
11 | * Created on 2017-10-10.
12 | *
13 | *
14 | * @author wumo
15 | */
16 | object `19-state RandomWalk` {
17 | val num_states = 19
18 | fun make(): IndexedProblem {
19 | val mdp = CNSetMDP(1.0, num_states + 2, 1)
20 | mdp.apply {
21 | val last = num_states + 1
22 | states[0].actions = emptyNSet()
23 | states[last].actions = emptyNSet()
24 | started = { states((num_states + 1) / 2).rand() }
25 | for (a in 1 until last) {
26 | states[a].actions[0].apply {
27 | val left = a - 1
28 | val right = a + 1
29 | possibles = cnsetOf(IndexedPossible(states[left], if (left == 0) -1.0 else 0.0, 0.5),
30 | IndexedPossible(states[right], if (right == last) 1.0 else 0.0, 0.5))
31 | }
32 | }
33 | }
34 | val policy = mdp.QFunc { 1.0 }
35 | return Pair(mdp, IndexedPolicy(policy))
36 | }
37 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/AccessControl.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.dimension.cnsetFrom
7 | import lab.mars.rl.util.dimension.x
8 | import lab.mars.rl.util.math.Rand
9 | import lab.mars.rl.util.math.binomial
10 | import lab.mars.rl.util.math.pow
11 |
12 | object AccessControl {
13 | val k = 10
14 | val p = 0.06
15 | val priorities = 0..3
16 | val rewards = pow(2.0, priorities)
17 | val reject = 0
18 | val accept = 1
19 | fun make(): IndexedMDP {
20 | val mdp = CNSetMDP(gamma = 0.9, state_dim = (k + 1) x 4, action_dim = { (fs) ->
21 | if (fs == 0) 1 else 2
22 | })
23 |
24 | return mdp.apply {
25 | started = { states[k, Rand().nextInt(4)] }
26 | for (s in states) {
27 | var (freeServers, priority) = s
28 | for (a in s.actions) {
29 | var reward = 0.0
30 | if (freeServers > 0 && a[0] == accept) {
31 | freeServers--
32 | reward = rewards[priority]
33 | }
34 | val busyServers = k - freeServers
35 | a.possibles = cnsetFrom((busyServers + 1) x 4) { (released, pr) ->
36 | IndexedPossible(states[freeServers + released, pr],
37 | reward,
38 | binomial(busyServers, released, p) * (1 / 4.0))
39 | }
40 | }
41 | }
42 | }
43 | }
44 | }
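For intuition: each busy server frees up independently with p = 0.06 per step, and the next customer's priority is uniform over the four classes, so a successor's weight is the Binomial(busyServers, p) mass at `released`, times 1/4. With 3 busy servers, for example, P(exactly one frees) = C(3,1)·0.06·0.94² ≈ 0.159, giving each (released = 1, priority) successor weight ≈ 0.040.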
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/CliffWalking.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.collection.cnsetOf
7 | import lab.mars.rl.util.collection.emptyNSet
8 | import lab.mars.rl.util.collection.fork
9 | import lab.mars.rl.util.collection.map
10 | import lab.mars.rl.util.dimension.x
11 |
12 | object CliffWalking {
13 | val world_height = 4
14 | val world_width = 12
15 | val move = arrayOf(
16 | intArrayOf(0, 1), //up
17 | intArrayOf(0, -1), //down
18 | intArrayOf(-1, 0), //left
19 | intArrayOf(1, 0)//right
20 | )
21 | val desc_move = arrayOf("↑", "↓", "←", "→")
22 | fun make(): IndexedMDP {
23 | val mdp = CNSetMDP(gamma = 1.0,
24 | state_dim = world_width x world_height,
25 | action_dim = 4)
26 | return mdp.apply {
27 | val goal = states[11, 0]
28 | goal.actions = emptyNSet()
29 | started = { states(0, 0).rand() }
30 | val startedState = states[0, 0]
31 |
32 | //cliff
33 | for (x in 1 until world_width - 1)
34 | states[x, 0].actions = emptyNSet()
35 |
36 | for ((s, a) in states.fork { it.actions }) {
37 | val m = move[a[0]]
38 | val _x = (s[0] + m[0]).coerceIn(0, world_width - 1)
39 | val _y = (s[1] + m[1]).coerceIn(0, world_height - 1)
40 | val next = states[_x, _y]
41 | a.possibles = cnsetOf(IndexedPossible(next, if (next === goal) 0.0 else -1.0, 1.0))
42 | }
43 | startedState.actions[3].possibles = cnsetOf(IndexedPossible(startedState, -100.0, 1.0))
44 | for ((s, a) in (1 until world_width - 1).map { states[it, 1] }.fork { it.actions }) {
45 | val m = move[a[0]]
46 | var _x = s[0] + m[0]
47 | var _y = s[1] + m[1]
48 | if (_y == 0) {
49 | _x = 0
50 | _y = 0
51 | }
52 | val next = states[_x, _y]
53 | a.possibles = cnsetOf(IndexedPossible(next, if (next === startedState) -100.0 else -1.0, 1.0))
54 | }
55 | }
56 | }
57 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/DynaMaze.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.buf.DefaultIntBuf
7 | import lab.mars.rl.util.buf.IntBuf
8 | import lab.mars.rl.util.collection.cnsetOf
9 | import lab.mars.rl.util.collection.emptyNSet
10 | import lab.mars.rl.util.dimension.x
11 |
12 | object DynaMaze {
13 | private val move = arrayOf(
14 | intArrayOf(-1, 0), //left
15 | intArrayOf(1, 0), //right
16 | intArrayOf(0, 1), //up
17 | intArrayOf(0, -1)//down
18 | )
19 | val desc_move = arrayOf("←", "→", " ↑", " ↓")
20 | val wall = hashSetOf<IntBuf>()
21 | val obstacle = hashSetOf<IntBuf>()
22 |
23 | init {
24 | for (x in -1..9) {
25 | wall += DefaultIntBuf.of(x, -1)
26 | wall += DefaultIntBuf.of(x, 6)
27 | }
28 | for (y in -1..6) {
29 | wall += DefaultIntBuf.of(-1, y)
30 | wall += DefaultIntBuf.of(9, y)
31 | }
32 | obstacle += DefaultIntBuf.of(2, 2)
33 | obstacle += DefaultIntBuf.of(2, 3)
34 | obstacle += DefaultIntBuf.of(2, 4)
35 |
36 | obstacle += DefaultIntBuf.of(5, 1)
37 |
38 | obstacle += DefaultIntBuf.of(7, 3)
39 | obstacle += DefaultIntBuf.of(7, 4)
40 | obstacle += DefaultIntBuf.of(7, 5)
41 |
42 | wall += obstacle
43 | }
44 |
45 | fun make(): IndexedMDP {
46 | val mdp = CNSetMDP(gamma = 0.95,
47 | state_dim = 9 x 6,
48 | action_dim = 4)
49 | return mdp.apply {
50 | for (s in states)
51 | for (action in s.actions) {
52 | val tmp = DefaultIntBuf.of(0, 0)
53 | tmp[0] = s[0] + move[action[0]][0]
54 | tmp[1] = s[1] + move[action[0]][1]
55 | if (tmp in wall) {
56 | tmp[0] = s[0]
57 | tmp[1] = s[1]
58 | }
59 | val reward = if (tmp[0] == 8 && tmp[1] == 5) 1.0 else 0.0
60 | action.possibles = cnsetOf(IndexedPossible(states[tmp], reward, 1.0))
61 |
62 | }
63 | states[8, 5].actions = emptyNSet()
64 | for (o in obstacle)
65 | states[o].actions = emptyNSet()
66 | started = { states(0, 3).rand() }
67 | }
68 |
69 | }
70 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/Gambler.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.collection.cnsetOf
7 | import org.apache.commons.math3.util.FastMath.min
8 |
9 | /**
10 | *
11 | * Created on 2017-09-13.
12 | *
13 | *
14 | * @author wumo
15 | */
16 | object Gambler {
17 | val goal_coin = 100
18 |
19 | fun make(p_head: Double): IndexedMDP {
20 | val mdp = CNSetMDP(gamma = 1.0,
21 | state_dim = goal_coin + 1,
22 | action_dim = { min(it[0], goal_coin - it[0]) + 1 })
23 | mdp.apply {
24 | for (s in states) {
25 | val capital = s[0]
26 | val max_stake = min(capital, goal_coin - capital)
27 | for (action in s.actions) {
28 | val stake = action[0]
29 | action.possibles = if (max_stake == 0)
30 | cnsetOf(IndexedPossible(states[capital], 0.0, 1.0))
31 | else
32 | cnsetOf(IndexedPossible(states[capital - stake], 0.0, 1 - p_head), //lose
33 | IndexedPossible(states[capital + stake], if (capital + stake == goal_coin) 1.0 else 0.0, p_head))//win
34 | }
35 | }
36 | }
37 | return mdp
38 | }
39 | }
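A usage sketch mirroring `Test Value Iteration` later in this dump; the import path assumes `ValueIteration` is the extension defined in lab.mars.rl.algo.dp (the test calls it unqualified from inside that package):

  import lab.mars.rl.algo.dp.ValueIteration
  import lab.mars.rl.problem.Gambler

  fun main() {
    val prob = Gambler.make(p_head = 0.4)
    val (_, V, _) = prob.ValueIteration()
    for (s in prob.states)
      println(V[s])  // value of holding s[0] coins under the optimal policy
  }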
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/GridWorld.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.model.isNotTerminal
7 | import lab.mars.rl.util.collection.cnsetOf
8 | import lab.mars.rl.util.collection.emptyNSet
9 | import lab.mars.rl.util.collection.filter
10 | import lab.mars.rl.util.collection.fork
11 | import lab.mars.rl.util.dimension.x
12 |
13 | /**
14 | *
15 | * Created on 2017-09-05.
16 | *
17 | *
18 | * @author wumo
19 | */
20 | object GridWorld {
21 | private const val n = 4
22 | private const val m = 4
23 | private val move = arrayOf(
24 | intArrayOf(-1, 0), //up
25 | intArrayOf(1, 0), //down
26 | intArrayOf(0, 1), //right
27 | intArrayOf(0, -1)//left
28 | )
29 | val desc_move = arrayOf(" ↑", " ↓", "→", "←")
30 | fun make(): IndexedMDP {
31 | val mdp = CNSetMDP(gamma = 0.9,
32 | state_dim = n x n,
33 | action_dim = m)
34 | mdp.apply {
35 | for ((s, action) in states.filter { it.isNotTerminal }.fork { it.actions }) {
36 | val (s0, s1) = s
37 | val (a) = action
38 | var x = s0 + move[a][0]
39 | var y = s1 + move[a][1]
40 | if (x < 0 || x >= n || y < 0 || y >= n) {
41 | x = s0
42 | y = s1
43 | }
44 | action.possibles = cnsetOf(IndexedPossible(states[x, y], -1.0, 1.0))
45 | }
46 | states[0, 0].actions = emptyNSet()
47 | states[n - 1, n - 1].actions = emptyNSet()
48 | }
49 |
50 | return mdp
51 | }
52 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/MaximizationBias.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.collection.emptyNSet
7 | import lab.mars.rl.util.math.Rand
8 |
9 | object MaximizationBias {
10 | val mean = -0.1
11 | val actionsOfB = 10
12 | fun make(): IndexedMDP {
13 | val mdp = CNSetMDP(gamma = 1.0,
14 | state_dim = 4,
15 | action_dim = {
16 | when (it[0]) {
17 | 1 -> actionsOfB
18 | 2 -> 2
19 | else -> 1
20 | }
21 | })
22 | mdp.apply {
23 | states[0].actions = emptyNSet()
24 | states[3].actions = emptyNSet()
25 | started = { states(2).rand() }
26 | for (a in states[2].actions)
27 | a.sample = {
28 | val next = if (a[0] == 0) 1 else 3
29 | IndexedPossible(states[next], 0.0, 1.0)
30 | }
31 | for (a in states[1].actions)
32 | a.sample = {
33 | IndexedPossible(states[0], Rand().nextGaussian() + mean, 1.0)
34 | }
35 | }
36 | return mdp
37 | }
38 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/MountainCar.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.Possible
5 | import lab.mars.rl.model.RandomIterable
6 | import lab.mars.rl.model.State
7 | import lab.mars.rl.model.impl.mdp.DefaultAction
8 | import lab.mars.rl.model.impl.mdp.DefaultMDP
9 | import lab.mars.rl.util.collection.emptyNSet
10 | import lab.mars.rl.util.dimension.cnsetFrom
11 | import lab.mars.rl.util.math.Rand
12 | import org.apache.commons.math3.util.FastMath.cos
13 |
14 | object MountainCar {
15 | class CarState(val position: Double, val velocity: Double) : State {
16 | override val actions: RandomIterable<Action<CarState>> =
17 | if (position == POSITION_MAX) emptyNSet()
18 | else cnsetFrom(3) {
19 | val a = it[0] - 1
20 | DefaultAction(a) {
21 | var newVelocity = (velocity + 0.001 * a - 0.0025 * cos(3 * position))
22 | .coerceIn(VELOCITY_MIN, VELOCITY_MAX)
23 | val newPosition = (position + newVelocity).coerceIn(POSITION_MIN, POSITION_MAX)
24 | if (newPosition == POSITION_MIN) newVelocity = 0.0
25 | Possible(CarState(newPosition, newVelocity), -1.0)
26 | }
27 | }
28 | }
29 |
30 | const val POSITION_MIN = -1.2
31 | const val POSITION_MAX = 0.5
32 | const val VELOCITY_MIN = -0.07
33 | const val VELOCITY_MAX = 0.07
34 | fun make() = DefaultMDP(1.0) {
35 | CarState(Rand().nextDouble(-0.6, -0.4), 0.0)
36 | }
37 | }
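These are the standard Sutton & Barto mountain-car dynamics encoded above: a ∈ {-1, 0, 1} (reverse, coast, forward), v ← clip(v + 0.001a − 0.0025·cos(3x), ±0.07), x ← clip(x + v, [-1.2, 0.5]), with the velocity zeroed when the car hits the left wall and a reward of −1 on every step until POSITION_MAX is reached (where `actions` is empty, i.e. terminal).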
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/RandomWalk.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.collection.cnsetOf
7 | import lab.mars.rl.util.collection.emptyNSet
8 |
9 | /**
10 | *
11 | * Created on 2017-10-10.
12 | *
13 | *
14 | * @author wumo
15 | */
16 | object RandomWalk {
17 | fun make(): IndexedProblem {
18 | val mdp = CNSetMDP(1.0, 7, 1)
19 | mdp.apply {
20 | states[0].actions = emptyNSet()
21 | states[6].actions = emptyNSet()
22 | started = { states(3).rand() }
23 | for (a in 1..5) {
24 | states[a].actions[0].apply {
25 | possibles = cnsetOf(IndexedPossible(states[a - 1], 0.0, 0.5),
26 | IndexedPossible(states[a + 1], if (a == 5) 1.0 else 0.0, 0.5))
27 | }
28 | }
29 | }
30 | val policy = mdp.QFunc { 1.0 }
31 | return Pair(mdp, IndexedPolicy(policy))
32 | }
33 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/SquareWave.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.Action
4 | import lab.mars.rl.model.RandomIterable
5 | import lab.mars.rl.model.State
6 | import lab.mars.rl.util.collection.emptyNSet
7 | import lab.mars.rl.util.math.Rand
8 | import lab.mars.rl.util.tuples.tuple2
9 |
10 | class WaveState(val x: Double): State {
11 | override var actions: RandomIterable<Action<WaveState>> = emptyNSet()
12 | }
13 |
14 | object SquareWave {
15 | val domain = 0.0..2.0
16 | val maxResolution = 100
17 | fun invoke(x: Double) = if (x in 0.5..1.5) 1.0 else 0.0
18 | fun sample(): tuple2 {
19 | val x = Rand().nextDouble(domain.start, domain.endInclusive)
20 | val y = invoke(x)
21 | return tuple2(WaveState(x), y)
22 | }
23 | }
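SquareWave is a supervised-learning target rather than an MDP: `sample()` draws x uniformly from `domain` and labels it with the square wave 1[0.5 ≤ x ≤ 1.5]; the tuple2 destructures directly:

  val (s, y) = SquareWave.sample()  // s.x ∈ [0, 2), y ∈ {0.0, 1.0}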
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/WindyGridworld.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.CNSetMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedMDP
5 | import lab.mars.rl.model.impl.mdp.IndexedPossible
6 | import lab.mars.rl.util.collection.cnsetOf
7 | import lab.mars.rl.util.collection.emptyNSet
8 | import lab.mars.rl.util.collection.fork
9 | import lab.mars.rl.util.dimension.x
10 |
11 | object WindyGridworld {
12 | val world_height = 7
13 | val world_width = 10
14 | val wind = intArrayOf(0, 0, 0, 1, 1, 1, 2, 2, 1, 0)//wind strength for each column
15 | val move = arrayOf(
16 | intArrayOf(0, 1), //up
17 | intArrayOf(0, -1), //down
18 | intArrayOf(-1, 0), //left
19 | intArrayOf(1, 0)//right
20 | )
21 | val kingMove = arrayOf(
22 | intArrayOf(0, 1), //up
23 | intArrayOf(0, -1), //down
24 | intArrayOf(-1, 0), //left
25 | intArrayOf(1, 0),//right
26 | intArrayOf(-1, 1), //up-left
27 | intArrayOf(1, 1), //up-right
28 | intArrayOf(1, -1), //down-right
29 | intArrayOf(-1, -1)//down-left
30 | )
31 | val desc_move = arrayOf(" ↑", " ↓", "←", "→")
32 | val desc_king_move = arrayOf(" ↑", " ↓", "←", "→", "↖", "↗", "↘", "↙")
33 | fun make(KingMove: Boolean = false): IndexedMDP {
34 | val mdp = CNSetMDP(gamma = 1.0,
35 | state_dim = world_width x world_height,
36 | action_dim = if (KingMove) 8 else 4)
37 | return mdp.apply {
38 | val goal = states[7, 3]
39 | goal.actions = emptyNSet()
40 | started = { states(0, 3).rand() }
41 | for ((s, a) in states.fork { it.actions }) {
42 | val m = (if (KingMove) kingMove else move)[a[0]]
43 | val x = (s[0] + m[0]).coerceIn(0, world_width - 1)
44 | val y = (s[1] + wind[s[0]] + m[1]).coerceIn(0, world_height - 1)
45 | val next = states[x, y]
46 | a.possibles = cnsetOf(IndexedPossible(next, if (next === goal) 0.0 else -1.0, 1.0))
47 | }
48 | }
49 | }
50 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/problem/package.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.problem
2 |
3 | import lab.mars.rl.model.impl.mdp.IndexedMDP
4 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
5 |
6 | typealias IndexedProblem = Pair<IndexedMDP, IndexedPolicy>
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/Buf.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.buf
2 |
3 | import lab.mars.rl.util.math.Rand
4 |
5 | /**
6 | *
7 | * Created on 2017-09-28.
8 | *
9 | *
10 | * @author wumo
11 | */
12 | interface Buf<T>: Iterable<T> {
13 | /** [end]>=[start] */
14 | operator fun get(start: Int, end: Int): Buf<T>
15 |
16 | fun toTypedArray(): Array<T>
17 | fun copy(): Buf<T>
18 |
19 | val size: Int
20 | val isEmpty: Boolean
21 | get() = size == 0
22 | val writePtr: Int
23 | get() = size
24 | val lastIndex: Int
25 | get() = size - 1
26 | val last: T
27 | get() = get(lastIndex)
28 |
29 | /**
30 | * Get the value at the given [idx]
31 | */
32 | operator fun get(idx: Int): T
33 |
34 | fun forEach(start: Int = 0, end: Int = lastIndex, block: (Int, T) -> Unit) {
35 | for (i in start..end)
36 | block(i, get(i))
37 | }
38 |
39 | fun equals(other: Buf<T>): Boolean {
40 | if (this === other) return true
41 | if (size != other.size) return false
42 | for (i in 0..lastIndex)
43 | if (get(i) != other[i]) return false
44 | return true
45 | }
46 |
47 | override fun iterator() = object: Iterator<T> {
48 | var a = 0
49 | override fun hasNext() = a < size
50 |
51 | override fun next() = get(a++)
52 | }
53 |
54 | fun rand() = get(Rand().nextInt(size))
55 | }
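A hedged sketch of the contract; `DefaultBuf` is the concrete buffer used by HashMapRAC below, and both the `new<Int>()` type argument and DefaultBuf implementing MutableBuf's `append` are assumptions:

  val buf = DefaultBuf.new<Int>()  // assumed factory, as in HashMapRAC
  buf.append(1); buf.append(2); buf.append(3)
  val slice = buf[0, 1]            // inclusive slice; requires end >= start
  val any = buf.rand()             // uniform over the current elements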
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/IntBuf.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.buf
2 |
3 | /**
4 | *
5 | * Created on 2017-09-28.
6 | *
7 | *
8 | * @author wumo
9 | */
10 | abstract class IntBuf: Index() {
11 | /** [end]>=[start] */
12 | abstract operator fun get(start: Int, end: Int): IntBuf
13 |
14 | abstract fun toIntArray(): IntArray
15 | abstract fun copy(): IntBuf
16 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/MutableBuf.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.buf
2 |
3 | /**
4 | *
5 | * Created on 2017-09-28.
6 | *
7 | *
8 | * @author wumo
9 | */
10 | interface MutableBuf<T>: Buf<T> {
11 | val cap: Int
12 |
13 | override fun get(start: Int, end: Int): MutableBuf<T>
14 |
15 | operator fun set(idx: Int, s: T)
16 |
17 | /** [end]>=[start] */
18 | operator fun set(start: Int, end: Int, s: T)
19 |
20 | fun unfold(num: Int)
21 |
22 | fun ensure(minCap: Int)
23 |
24 | fun prepend(s: T)
25 | fun prepend(num: Int, s: T)
26 | fun prepend(another: Buf<T>)
27 |
28 | fun append(s: T)
29 | fun append(num: Int, s: T)
30 | fun append(another: Buf<T>)
31 |
32 | fun remove(range: IntRange) {
33 | remove(range.start, range.endInclusive)
34 | }
35 |
36 | /** [end]>=[start] */
37 | fun remove(start: Int, end: Int)
38 |
39 | fun remove(index: Int) = remove(index, index)
40 | fun removeFirst(num: Int = 1) {
41 | if (num == 0) return
42 | remove(0, num - 1)
43 | }
44 |
45 | fun removeLast(num: Int) {
46 | if (num == 0) return
47 | remove(lastIndex - num + 1, lastIndex)
48 | }
49 |
50 | fun clear() {
51 | removeLast(size)
52 | }
53 |
54 | fun reuseBacked(): Buf<T>
55 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/buf/MutableIntBuf.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.buf
2 |
3 | /**
4 | *
5 | * Created on 2017-09-28.
6 | *
7 | *
8 | * @author wumo
9 | */
10 | abstract class MutableIntBuf: IntBuf() {
11 | abstract val cap: Int
12 | abstract operator fun set(idx: Int, s: Int)
13 |
14 | /** [end]>=[start] */
15 | abstract operator fun set(start: Int, end: Int, s: Int)
16 |
17 | abstract fun ensure(minCap: Int)
18 |
19 | abstract fun prepend(s: Int)
20 | abstract fun prepend(num: Int, s: Int)
21 | abstract fun prepend(another: Index)
22 |
23 | abstract fun append(s: Int)
24 | abstract fun append(num: Int, s: Int)
25 | abstract fun append(another: Index)
26 |
27 | fun remove(range: IntRange) {
28 | remove(range.start, range.endInclusive)
29 | }
30 |
31 | /** [end]>=[start] */
32 | abstract fun remove(start: Int, end: Int)
33 |
34 | fun remove(index: Int) = remove(index, index)
35 | fun removeFirst(num: Int) {
36 | if (num == 0) return
37 | remove(0, num - 1)
38 | }
39 |
40 | fun removeLast(num: Int) {
41 | if (num == 0) return
42 | remove(lastIndex - num + 1, lastIndex)
43 | }
44 |
45 | fun clear() {
46 | removeLast(size)
47 | }
48 |
49 | abstract fun reuseBacked(): IntBuf
50 | abstract fun append(data: IntArray)
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/collection/Gettable.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.collection
2 |
3 | interface Gettable<K, V> {
4 | operator fun get(k: K): V
5 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/collection/HashMapRAC.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.collection
2 |
3 | import lab.mars.rl.util.buf.DefaultBuf
4 | import lab.mars.rl.util.buf.Index
5 | import lab.mars.rl.util.math.Rand
6 | import lab.mars.rl.util.tuples.tuple2
7 |
8 | class HashMapRAC<E>(): IndexedCollection<E> {
9 | private val raw = hashMapOf<Index, E>()
10 | private val contigus = DefaultBuf.new<E>()
11 |
12 | override fun <T> copycat(element_maker: (Index) -> T): IndexedCollection<T> {
13 | TODO()
14 | }
15 |
16 | override fun indices() = raw.keys.iterator()
17 |
18 | override fun withIndices(): Iterator<tuple2<Index, E>> {
19 | val iter = raw.entries.iterator()
20 | return object: Iterator<tuple2<Index, E>> {
21 | override fun hasNext() = iter.hasNext()
22 |
23 | override fun next(): tuple2<Index, E> {
24 | val entry = iter.next()
25 | return tuple2(entry.key, entry.value)
26 | }
27 | }
28 | }
29 |
30 | override fun rand() = contigus[Rand().nextInt(contigus.size)]
31 |
32 | override fun get(dim: Index) = raw[dim]!!
33 |
34 | override fun invoke(subset_dim: Index): IndexedCollection<E> {
35 | TODO()
36 | }
37 |
38 | override fun set(dim: Index, s: E) {
39 | raw.put(dim, s) ?: contigus.append(s) // put returns the previous value: null means a new key, so also track it for rand()
40 | }
41 |
42 | override fun iterator() = raw.values.iterator()
43 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/collection/extensions.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("UNCHECKED_CAST", "NOTHING_TO_INLINE")
2 |
3 | package lab.mars.rl.util.collection
4 |
5 | inline fun <E, T> Iterable<E>.fork(crossinline subset: (E) -> Iterable<T>)
6 | = asSequence().flatMap { s -> subset(s).asSequence().map { s to it } }
7 |
8 | inline fun <E, T> Sequence<E>.fork(crossinline subset: (E) -> Iterable<T>)
9 | = flatMap { s -> subset(s).asSequence().map { s to it } }
10 |
11 | inline fun <E> Iterable<E>.filter(crossinline predicate: (E) -> Boolean)
12 | = asSequence().filter { predicate(it) }
13 |
14 | inline fun <T, R> Iterable<T>.map(crossinline transform: (T) -> R)
15 | = asSequence().map { transform(it) }
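Usage, as in GridWorld.make() above: `fork` pairs every element with each member of a derived sub-collection, so

  for ((s, a) in states.filter { it.isNotTerminal }.fork { it.actions }) { /* ... */ }

lazily enumerates all (state, action) pairs of the non-terminal states.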
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/exception/IndexOutOfDimensionException.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.exception
2 |
3 | class IndexOutOfDimensionException: Exception()
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/exception/NoMoreElementsException.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.exception
2 |
3 | class NoMoreElementsException: Exception()
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/log/LoggerHelpers.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.log
2 |
3 | import org.slf4j.Logger
4 |
5 | inline fun Logger.info(block: () -> String) {
6 | if (isInfoEnabled) info(block())
7 | }
8 |
9 | inline fun Logger.debug(block: () -> String) {
10 | if (isDebugEnabled) debug(block())
11 | }
12 |
13 | inline fun Logger.warn(block: () -> String) {
14 | if (isWarnEnabled) warn(block())
15 | }
16 |
17 | inline fun Logger.error(block: () -> String) {
18 | if (isErrorEnabled) error(block())
19 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/math/Binomial.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.math
2 |
3 | import org.apache.commons.math3.util.FastMath.exp
4 |
5 | fun binomial(trial: Int, x: Int, p: Double): Double {
6 | if (trial == 0) return if (x == 0) 1.0 else 0.0
7 | if (x < 0 || x > trial) return 0.0
8 | return exp(logBinomialProbability(x, trial, p, 1.0 - p))
9 | }
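binomial(trial, x, p) is the binomial pmf C(trial, x)·p^x·(1−p)^(trial−x), evaluated in log space (via `logBinomialProbability`, presumably defined alongside after commons-math's SaddlePointExpansion) to stay stable for large `trial`. Sanity check: binomial(3, 1, 0.06) = 3·0.06·0.94² ≈ 0.159, the server-release probability used by AccessControl above.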
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/math/Vector.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.math
2 |
3 | import kotlin.math.PI
4 | import kotlin.math.sqrt
5 |
6 |
7 | data class Vector2(var x: Double = 0.0, var y: Double = 0.0) {
8 | companion object {
9 | fun zero() = Vector2(0.0, 0.0)
10 | val ZERO = zero()
11 | }
12 |
13 | fun set(v: Vector2) {
14 | x = v.x
15 | y = v.y
16 | }
17 |
18 | fun set(x: Double, y: Double) {
19 | this.x = x
20 | this.y = y
21 | }
22 |
23 | operator fun plus(v: Vector2) = Vector2(x + v.x, y + v.y)
24 | operator fun plusAssign(v: Vector2) {
25 | x += v.x
26 | y += v.y
27 | }
28 |
29 | operator fun minus(v: Vector2) = Vector2(x - v.x, y - v.y)
30 | operator fun minusAssign(v: Vector2) {
31 | x -= v.x
32 | y -= v.y
33 | }
34 |
35 | operator fun times(s: Double) = Vector2(x * s, y * s)
36 | operator fun timesAssign(s: Double) {
37 | x *= s
38 | y *= s
39 | }
40 |
41 | operator fun div(s: Double) = Vector2(x / s, y / s)
42 | operator fun divAssign(s: Double) {
43 | x /= s
44 | y /= s
45 | }
46 |
47 | fun norm(): Vector2 {
48 | val v = dist()
49 | x /= v
50 | y /= v
51 | return this
52 | }
53 |
54 | fun rot90L(): Vector2 {
55 | val tmp = x
56 | x = -y
57 | y = tmp
58 | return this
59 | }
60 |
61 | fun rot90R(): Vector2 {
62 | val tmp = x
63 | x = y
64 | y = -tmp
65 | return this
66 | }
67 |
68 | fun copy() = Vector2(x, y)
69 | fun dist() = sqrt(x * x + y * y)
70 | fun dist(v: Vector2) = sqrt((x - v.x) * (x - v.x) + (y - v.y) * (y - v.y))
71 |
72 | /** @return the angle in degrees of this vector (point) relative to the x-axis. Angles are towards the positive y-axis
73 | * (typically counter-clockwise) and between 0 and 360. */
74 | fun angle(): Double {
75 | var angle = Math.atan2(y, x) * 180.0 / PI
76 | if (angle < 0) angle += 360.0
77 | return angle
78 | }
79 |
80 | fun outOf(_x: Double, _y: Double, width: Double, height: Double): Boolean {
81 | return x < _x || x > _x + width || y < _y || y > _y + height
82 | }
83 |
84 | fun rotate(degrees: Double): Vector2 {
85 | val radians = degrees * PI / 180.0 // degrees to radians
86 | val cos = Math.cos(radians)
87 | val sin = Math.sin(radians)
88 |
89 | val newX = x * cos - y * sin
90 | val newY = x * sin + y * cos
91 |
92 | x = newX
93 | y = newY
94 |
95 | return this
96 | }
97 | }
98 |
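With the radian conversion fixed, quick checks: Vector2(1.0, 1.0).angle() == 45.0, and Vector2(1.0, 0.0).rotate(90.0) lands on (0, 1) up to floating-point error; rot90L/rot90R are the exact 90° rotations that skip the trig round-trip.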
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/resource/ClasspathLocation.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.resource
2 |
3 | import java.io.InputStream
4 | import java.net.URL
5 |
6 | /**
7 | * A resource location that searches the classpath
8 | *
9 | * @author kevin
10 | */
11 | class ClasspathLocation : ResourceLocation {
12 | override fun getResource(ref: String): URL {
13 | val cpRef = ref.replace('\\', '/')
14 | return ResourceLoader::class.java.classLoader.getResource(cpRef)
15 | }
16 |
17 | override fun getResourceAsStream(ref: String): InputStream {
18 | val cpRef = ref.replace('\\', '/')
19 | return ResourceLoader::class.java.classLoader.getResourceAsStream(cpRef)
20 | }
21 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/resource/FileSystemLocation.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.resource
2 |
3 | import java.io.File
4 | import java.io.FileInputStream
5 | import java.io.IOException
6 | import java.io.InputStream
7 | import java.net.URL
8 |
9 | /**
10 | * A resource loading location that searches the file system
11 | *
12 | * @author kevin
13 | */
14 |
15 | /**
16 | * Create a new resource location based on the file system
17 | *
18 | * @param root The root of the file system to search
19 | */
20 | class FileSystemLocation(private val root: File) : ResourceLocation {
21 |
22 | /**
23 | * @see ResourceLocation.getResource
24 | */
25 | override fun getResource(ref: String): URL? {
26 | return try {
27 | var file = File(root, ref)
28 | if (!file.exists()) {
29 | file = File(ref)
30 | }
31 | if (!file.exists()) {
32 | null
33 | } else file.toURI().toURL()
34 | } catch (e: IOException) {
35 | null
36 | }
37 | }
38 |
39 | /**
40 | * @see ResourceLocation.getResourceAsStream
41 | */
42 | override fun getResourceAsStream(ref: String): InputStream? {
43 | return try {
44 | var file = File(root, ref)
45 | if (!file.exists()) {
46 | file = File(ref)
47 | }
48 | FileInputStream(file)
49 | } catch (e: IOException) {
50 | null
51 | }
52 | }
53 |
54 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/resource/ResourceLocation.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.resource
2 |
3 | import java.io.InputStream
4 | import java.net.URL
5 |
6 | /**
7 | * A location from which resources can be loaded
8 | *
9 | * @author kevin
10 | */
11 | interface ResourceLocation {
12 | /**
13 | * Get a resource as an input stream
14 | *
15 | * @param ref The reference to the resource to retrieve
16 | * @return A stream from which the resource can be read or
17 | * null if the resource can't be found in this location
18 | */
19 | fun getResourceAsStream(ref: String): InputStream?
20 |
21 | /**
22 | * Get a resource as a URL
23 | *
24 | * @param ref The reference to the resource to retrieve
25 | * @return A URL from which the resource can be read
26 | */
27 | fun getResource(ref: String): URL?
28 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple2.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 |
3 | data class tuple2<A, B>(var _1: A, var _2: B) {
4 | override fun toString(): String = "($_1,$_2)"
5 |
6 | operator fun invoke(a: A, b: B): tuple2<A, B> {
7 | _1 = a
8 | _2 = b
9 | return this
10 | }
11 | }
12 |
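Unlike Kotlin's immutable Pair, tuple2 is deliberately mutable: the invoke operator overwrites _1/_2 in place, so hot loops can reuse one instance instead of allocating:

  val t = tuple2(0, 0.0)
  t(1, 0.5)  // same instance, now (1,0.5)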
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple3.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 |
3 | data class tuple3<A, B, C>(var _1: A, var _2: B, var _3: C) {
4 | override fun toString(): String {
5 | return "($_1,$_2,$_3)"
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple4.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 |
3 | data class tuple4<A, B, C, D>(var _1: A, var _2: B, var _3: C, var _4: D) {
4 | override fun toString(): String {
5 | return "($_1,$_2,$_3,$_4)"
6 | }
7 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple5.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 |
3 | data class tuple5<A, B, C, D, E>(var _1: A, var _2: B, var _3: C, var _4: D, var _5: E) {
4 | override fun toString(): String {
5 | return "($_1,$_2,$_3,$_4,$_5)"
6 | }
7 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/tuples/tuple6.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.tuples
2 |
3 | data class tuple6<A, B, C, D, E, F>(var _1: A, var _2: B, var _3: C, var _4: D, var _5: E, var _6: F) {
4 | override fun toString(): String {
5 | return "($_1,$_2,$_3,$_4,$_5,$_6)"
6 | }
7 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/ChartApp.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.ui
2 |
3 | import javafx.scene.chart.NumberAxis
4 | import tornadofx.*
5 | import java.util.concurrent.ConcurrentLinkedQueue
6 |
7 | class Line(val description: String, val data: MutableMap<Number, Number> = hashMapOf()) {
8 | operator fun set(x: Number, y: Number) = data.put(x, y)
9 | }
10 |
11 | class LineChart(val title: String, val xAxisLabel: String, val yAxisLabel: String,
12 | val lines: MutableCollection<Line> = ConcurrentLinkedQueue(),
13 | val xAxisConfig: NumberAxis.() -> Unit = {}, val yAxisConfig: NumberAxis.() -> Unit = {},
14 | val linesSortor: Array<Line>.() -> Unit = {}) {
15 | operator fun plusAssign(line: Line) {
16 | lines += line
17 | }
18 | }
19 |
20 | class D2DChart: View() {
21 | companion object {
22 | val charts = mutableListOf<LineChart>()
23 | }
24 |
25 | override val root = stackpane {
26 | flowpane {
27 | for (chart in charts)
28 | chart.apply {
29 | linechart(title, NumberAxis(), NumberAxis()) {
30 | (xAxis as NumberAxis).apply {
31 | isForceZeroInRange = false
32 | isAutoRanging = true
33 | label = xAxisLabel
34 | xAxisConfig(this)
35 | }
36 | (yAxis as NumberAxis).apply {
37 | isForceZeroInRange = false
38 | isAutoRanging = true
39 | label = yAxisLabel
40 | yAxisConfig(this)
41 | }
42 | val lines = chart.lines.toTypedArray()
43 | linesSortor(lines)
44 | for (line in lines)
45 | series(line.description) {
46 | for ((k, v) in line.data)
47 | data(k, v)
48 | }
49 | createSymbols = false
50 | }
51 | }
52 | }
53 | }
54 | }
55 |
56 | class ChartApp: App(D2DChart::class)
57 |
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/D2DGameUI.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.ui
2 |
3 | import javafx.application.Application
4 | import javafx.application.Platform
5 | import javafx.collections.FXCollections
6 | import javafx.geometry.Orientation
7 | import javafx.scene.Scene
8 | import javafx.scene.canvas.Canvas
9 | import javafx.scene.canvas.GraphicsContext
10 | import javafx.scene.chart.LineChart
11 | import javafx.scene.chart.NumberAxis
12 | import javafx.scene.chart.XYChart
13 | import javafx.scene.layout.FlowPane
14 | import javafx.stage.Stage
15 | import lab.mars.rl.util.resource.ResourceLoader
16 | import java.util.concurrent.CyclicBarrier
17 |
18 | class D2DGameUI : Application() {
19 | class ChartDescription(val title: String,
20 | val xAxisLabel: String, val yAxisLabel: String,
21 | val numSeries: Int = 1,
22 | val xForceZeroInRange: Boolean = true,
23 | val yForceZeroInRange: Boolean = true) {
24 | val data = Array(numSeries) { FXCollections.observableArrayList<XYChart.Data<Number, Number>>()!! }
25 | }
26 |
27 | lateinit var canvas: Canvas
28 | lateinit var primaryStage: Stage
29 |
30 | companion object {
31 | var width = 1000.0
32 | var height = 800.0
33 | var canvas_width = 600.0
34 | var canvas_height = 800.0
35 | var title = ""
36 | val charts = FXCollections.observableArrayList<ChartDescription>()!!
37 | var afterStartup: (GraphicsContext) -> Unit = {}
38 | lateinit var render: ((GraphicsContext) -> Unit) -> Unit
39 |
40 | }
41 |
42 | override fun start(ps: Stage?) {
43 | primaryStage = ps!!
44 |
45 | primaryStage.title = title
46 | val root = FlowPane(Orientation.HORIZONTAL)
47 | canvas = Canvas(canvas_width, canvas_height)
48 | root.children.add(canvas)
49 | for (c in charts) {
50 | val chart = LineChart(NumberAxis().apply { label = c.xAxisLabel; isForceZeroInRange = c.xForceZeroInRange },
51 | NumberAxis().apply { label = c.yAxisLabel; isForceZeroInRange = c.yForceZeroInRange },
52 | FXCollections.observableArrayList<XYChart.Series<Number, Number>>().apply {
53 | var i = 0
54 | for (d in c.data)
55 | add(XYChart.Series("${i++}", d))
56 | }).apply {
57 | title = c.title
58 | createSymbols = false
59 | // isLegendVisible = false
60 | animated = false
61 | stylesheets.add(ResourceLoader.getResource("StockLineChart.css").toExternalForm())
62 | }
63 | root.children.add(chart)
64 | }
65 |
66 | primaryStage.scene = Scene(root, width, height)
67 | primaryStage.show()
68 | render = this::render
69 | afterStartup(canvas.graphicsContext2D)
70 | }
71 |
72 | val barrier = CyclicBarrier(2)
73 | fun render(draw: (GraphicsContext) -> Unit = {}) {
74 | barrier.reset()
75 | Platform.runLater {
76 | val gc = canvas.graphicsContext2D
77 | draw(gc)
78 | primaryStage.title = title
79 | barrier.await()
80 | }
81 | barrier.await()
82 | }
83 |
84 | }
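The CyclicBarrier(2) handshake above keeps the training thread in lock-step with the UI: `reset()` arms the barrier, the block posted via Platform.runLater draws the frame and then parks on `await()`, and the caller's trailing `await()` releases both parties only once the frame is on screen, so the simulation cannot outrun rendering.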
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/GridWorldUI.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.ui
2 |
3 | import javafx.application.Application
4 | import javafx.application.Platform
5 | import javafx.scene.Group
6 | import javafx.scene.Scene
7 | import javafx.scene.canvas.Canvas
8 | import javafx.scene.paint.Color
9 | import javafx.stage.Stage
10 | import lab.mars.rl.model.impl.mdp.ActionValueFunction
11 | import lab.mars.rl.model.impl.mdp.IndexedState
12 | import lab.mars.rl.model.impl.mdp.StateValueFunction
13 | import java.util.concurrent.CyclicBarrier
14 |
15 | class GridWorldUI: Application() {
16 | lateinit var canvas: Canvas
17 |
18 | companion object {
19 | var after: () -> Unit = {}
20 | var render: (ActionValueFunction, IndexedState) -> Unit = { _, _ -> }
21 | var width = 450.0
22 | var height = 300.0
23 | var grid_x = 9
24 | var grid_y = 6
25 | }
26 |
27 | override fun start(ps: Stage?) {
28 | val primaryStage = ps!!
29 | primaryStage.title = "Drawing Operations Test"
30 | val root = Group()
31 | canvas = Canvas(width, height)
32 | root.children.add(canvas)
33 | primaryStage.scene = Scene(root)
34 | primaryStage.show()
35 | render = this::render
36 | after()
37 | }
38 |
39 | val barrier = CyclicBarrier(2)
40 | var max = 1.0
41 | var min = 0.0
42 | fun render(V: StateValueFunction, s: IndexedState) {
43 | barrier.reset()
44 | Platform.runLater {
45 | val gc = canvas.graphicsContext2D
46 | gc.clearRect(0.0, 0.0, width, height)
47 | gc.stroke = Color.BLACK
48 | val u_x = width / grid_x
49 | val u_y = height / grid_y
50 | for ((dim, value) in V.withIndices()) {
51 | max = maxOf(max, value)
52 | min = minOf(min, value)
53 | val nx = dim[0]
54 | val ny = dim[1]
55 | gc.fill = Color.BLUE.interpolate(Color.RED, if (max == min) 0.5 else (value - min) / (max - min))
56 | val x = u_x * nx
57 | val y = u_y * ny
58 | gc.fillRect(x, y, u_x, u_y)
59 | }
60 | gc.fill = Color.GREEN
61 | gc.fillRect(s[0] * u_x, s[1] * u_y, u_x, u_y)
62 | for ((dim, value) in V.withIndices()) {
63 | max = maxOf(max, value)
64 | val nx = dim[0]
65 | val ny = dim[1]
66 | val x = u_x * nx
67 | val y = u_y * ny
68 | gc.strokeRect(x, y, u_x, u_y)
69 | }
70 | barrier.await()
71 | }
72 | barrier.await()
73 | }
74 |
75 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/MountainCarUI.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.ui
2 |
3 | import javafx.application.Application
4 | import javafx.application.Platform
5 | import javafx.scene.Group
6 | import javafx.scene.Scene
7 | import javafx.scene.canvas.Canvas
8 | import javafx.scene.paint.Color
9 | import javafx.stage.Stage
10 | import lab.mars.rl.model.impl.mdp.DefaultAction
11 | import lab.mars.rl.problem.MountainCar
12 | import lab.mars.rl.problem.MountainCar.CarState
13 | import java.util.concurrent.CyclicBarrier
14 | import kotlin.math.PI
15 | import kotlin.math.sin
16 |
17 | class MountainCarUI: Application() {
18 | lateinit var canvas: Canvas
19 |
20 | companion object {
21 | var render: (Int, Int, CarState, DefaultAction) -> Unit = { _, _, _, _ -> }
22 | var after: () -> Unit = {}
23 | var width = 450.0
24 | var height = 300.0
25 | }
26 |
27 | override fun start(ps: Stage?) {
28 | val primaryStage = ps!!
29 | primaryStage.title = "Mountain Car"
30 | val root = Group()
31 | canvas = Canvas(width, height)
32 | root.children.add(canvas)
33 | primaryStage.scene = Scene(root)
34 | primaryStage.show()
35 | render = this::render
36 | after()
37 | }
38 |
39 | val barrier = CyclicBarrier(2)
40 | fun tx(x: Double) = (x + PI / 2) / (2 * PI / 3) * width
41 | fun ty(y: Double) = (-y + 1) / 2 * height
42 | fun render(episode: Int, step: Int, s: CarState, a: DefaultAction) {
43 | barrier.reset()
44 | Platform.runLater {
45 | val gc = canvas.graphicsContext2D
46 | gc.clearRect(0.0, 0.0, width, height)
47 | gc.stroke = Color.BLACK
48 | gc.strokeText("episode:$episode\nstep:$step", width / 2 - 50, height / 2)
49 | for (i in 0..40) {
50 | val x1 = i / 40.0 * 2 * PI / 3
51 | val y1 = sin(3 * (x1 + PI / 6))
52 | val x2 = (i + 1) / 40.0 * 2 * PI / 3
53 | val y2 = sin(3 * (x2 + PI / 6))
54 | gc.strokeLine(i / 40.0 * width, ty(y1), (i + 1) / 40.0 * width, ty(y2))
55 | }
56 | val min_x = tx(MountainCar.POSITION_MIN)
57 | val min_y = ty(sin(3 * MountainCar.POSITION_MIN))
58 | gc.strokeLine(min_x, min_y, min_x + 10, min_y)
59 | val ball_x = tx(s.position)
60 | val ball_y = ty(sin(3 * s.position))
61 | gc.strokeOval(ball_x, ball_y, 10.0, 10.0)
62 | gc.stroke = Color.RED
63 | gc.strokeLine(ball_x, ball_y, ball_x + a.value * 40, ball_y)
64 | barrier.await()
65 | }
66 | Thread.sleep(30)
67 | barrier.await()
68 | }
69 | }
--------------------------------------------------------------------------------
/src/main/kotlin/lab/mars/rl/util/ui/RodManeuveringUI.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.ui
2 |
3 | import javafx.application.Application
4 | import javafx.application.Platform.runLater
5 | import javafx.scene.Group
6 | import javafx.scene.Scene
7 | import javafx.scene.canvas.Canvas
8 | import javafx.scene.paint.Color
9 | import javafx.stage.Stage
10 | import lab.mars.rl.model.impl.mdp.IndexedState
11 | import lab.mars.rl.model.impl.mdp.StateValueFunction
12 | import lab.mars.rl.problem.RodManeuvering
13 | import lab.mars.rl.problem.RodManeuvering.currentStatus
14 | import lab.mars.rl.problem.RodManeuvering.height
15 | import lab.mars.rl.problem.RodManeuvering.resolution
16 | import lab.mars.rl.problem.RodManeuvering.rodEdges
17 | import lab.mars.rl.problem.RodManeuvering.rotate
18 | import lab.mars.rl.problem.RodManeuvering.rotation_resolution
19 | import lab.mars.rl.problem.RodManeuvering.unit_x
20 | import lab.mars.rl.problem.RodManeuvering.unit_y
21 | import lab.mars.rl.problem.RodManeuvering.width
22 | import lab.mars.rl.util.math.max
23 | import java.util.concurrent.CyclicBarrier
24 |
25 | class RodManeuveringUI: Application() {
26 | lateinit var canvas: Canvas
27 |
28 | companion object {
29 | var after: () -> Unit = {}
30 | var render: (StateValueFunction, IndexedState) -> Unit = { _, _ -> }
31 | }
32 |
33 | override fun start(ps: Stage?) {
34 | val primaryStage = ps!!
35 | // primaryStage.title = "Drawing Operations Test"
36 | val root = Group()
37 | canvas = Canvas(width, height)
38 | drawMap()
39 | root.children.add(canvas)
40 | primaryStage.scene = Scene(root)
41 | primaryStage.show()
42 | render = this::render
43 | after()
44 | }
45 |
46 | fun drawMap() {
47 | val gc = canvas.graphicsContext2D
48 | gc.stroke = Color.BLACK
49 | for (o in RodManeuvering.obstacles) {
50 | o.v.apply {
51 | val xPoints = DoubleArray(size) { this[it].x }
52 | val yPoints = DoubleArray(size) { this[it].y }
53 | gc.strokePolygon(xPoints,
54 | yPoints, size)
55 | }
56 | }
57 | }
58 |
59 | val barrier = CyclicBarrier(2)
60 | var max = 1.0
61 | var min = 0.0
62 | fun render(V: StateValueFunction, s: IndexedState) {
63 | barrier.reset()
64 | runLater {
65 | val (x, y, rotation) = currentStatus(s)
66 | val gc = canvas.graphicsContext2D
67 | gc.clearRect(0.0, 0.0, width, height)
68 | gc.stroke = Color.BLACK
69 | for (nx in 0 until resolution)
70 | for (ny in 0 until resolution) {
71 | val value = max(0 until rotation_resolution) { V[nx, ny, it] }
72 | max = maxOf(max, value)
73 | min = minOf(min, value)
74 | gc.fill = Color.BLUE.interpolate(Color.RED, if (max == min) 0.5 else (value - min) / (max - min))
75 | gc.fillRect(nx * unit_x, ny * unit_y, unit_x, unit_y)
76 | }
77 | gc.fill = Color.GREEN
78 | for (edge in rodEdges) {
79 | val p1 = edge._1.rotate(rotation).add(x, y)
80 | val p2 = edge._2.rotate(rotation).add(x, y)
81 | gc.strokeLine(p1.x, p1.y, p2.x, p2.y)
82 | }
83 |
84 | drawMap()
85 | barrier.await()
86 | }
87 | barrier.await()
88 | }
89 |
90 | }
--------------------------------------------------------------------------------
/src/main/resources/StockLineChart.css:
--------------------------------------------------------------------------------
1 | .chart-series-line {
2 | -fx-stroke-width: 0.5px;
3 | -fx-effect: null;
4 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dp/Test Value Iteration.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.dp
2 |
3 | import lab.mars.rl.problem.CarRental
4 | import lab.mars.rl.problem.GridWorld
5 | import lab.mars.rl.util.format
6 | import org.junit.Assert
7 | import org.junit.Test
8 |
9 | class `Test Value Iteration` {
10 | @Test
11 | fun `GridWorld Problem`() {
12 | val prob = GridWorld.make()
13 | val (_, V, _) = prob.ValueIteration()
14 | for (s in prob.states) {
15 | println(V[s])
16 | }
17 | }
18 |
19 | @Test
20 | fun `Car Rental Value Iteration`() {
21 | val prob = CarRental.make(false)
22 | val (_, V, _) = prob.ValueIteration()
23 | var i = 0
24 | for (a in CarRental.max_car downTo 0)
25 | for (b in 0..CarRental.max_car)
26 | Assert.assertEquals(`Car Rental Result`[i++], V[prob.states[a, b]].format(2))
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Dyna-Q on-policy.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.dyna
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.algo.average_α
5 | import lab.mars.rl.model.isNotTerminal
6 | import lab.mars.rl.problem.Blackjack
7 | import lab.mars.rl.problem.DynaMaze
8 | import lab.mars.rl.problem.RodManeuvering
9 | import lab.mars.rl.util.math.argmax
10 | import lab.mars.rl.util.printBlackjack
11 | import lab.mars.rl.util.ui.GridWorldUI
12 | import lab.mars.rl.util.ui.RodManeuveringUI
13 | import org.junit.Test
14 | import java.util.concurrent.CountDownLatch
15 | import kotlin.concurrent.thread
16 |
17 | class `Test Optimal Dyna-Q on-policy` {
18 | @Test
19 | fun `Blackjack`() {
20 | val (prob) = Blackjack.make()
21 | val (π, V) = prob.`Dyna-Q-OnPolicy`(
22 | n = 10,
23 | ε = 0.1,
24 | α = average_α(prob),
25 | episodes = 1000000)
26 | printBlackjack(prob, π, V)
27 | }
28 |
29 | @Test
30 | fun `Dyna Maze UI`() {
31 | val prob = DynaMaze.make()
32 | val latch = CountDownLatch(1)
33 |
34 | thread {
35 | latch.await()
36 | val (π) = prob.`Dyna-Q-OnPolicy`(
37 | n = 20,
38 | ε = 0.1,
39 | α = average_α(prob),
40 | episodes = 1000,
41 | stepListener = { V, s ->
42 | GridWorldUI.render(V, s)
43 | })
44 | var s = prob.started()
45 | var count = 0
46 | print(s)
47 | while (s.isNotTerminal) {
48 | val a = argmax(s.actions) { π[s, it] }
49 | val possible = a.sample()
50 | s = possible.next
51 | count++
52 | print("${DynaMaze.desc_move[a[0]]}$s")
53 | }
54 | println("\nsteps=$count")//optimal=14
55 | }
56 | GridWorldUI.after = { latch.countDown() }
57 | Application.launch(GridWorldUI::class.java)
58 | }
59 |
60 | @Test
61 | fun `Rod Maneuvering UI`() {
62 | val prob = RodManeuvering.make()
63 | val latch = CountDownLatch(1)
64 |
65 | thread {
66 | latch.await()
67 | val (π) = prob.`Dyna-Q-OnPolicy`(
68 | n = 20,
69 | ε = 0.1,
70 | α = average_α(prob),
71 | episodes = 1000,
72 | stepListener = { V, s ->
73 | RodManeuveringUI.render(V, s)
74 | })
75 | var s = prob.started()
76 | var count = 0
77 | print(s)
78 | while (s.isNotTerminal) {
79 | val a = argmax(s.actions) { π[s, it] }
80 | val possible = a.sample()
81 | s = possible.next
82 | count++
83 | print("$a$s")
84 | }
85 | println("\nsteps=$count")//optimal=39
86 | }
87 | RodManeuveringUI.after = { latch.countDown() }
88 | Application.launch(RodManeuveringUI::class.java)
89 | }
90 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Dyna-Q+.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.dyna
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.model.isNotTerminal
5 | import lab.mars.rl.problem.DynaMaze
6 | import lab.mars.rl.util.math.argmax
7 | import lab.mars.rl.util.ui.GridWorldUI
8 | import org.junit.Test
9 | import java.util.concurrent.CountDownLatch
10 | import kotlin.concurrent.thread
11 |
12 | class `Test Optimal Dyna-Q+` {
13 | @Test
14 | fun `Dyna Maze`() {
15 | val prob = DynaMaze.make()
16 | val latch = CountDownLatch(1)
17 |
18 | thread {
19 | latch.await()
20 | val (π) = prob.`Dyna-Q+`(
21 | n = 10,
22 | α = { _, _ -> 0.1 },
23 | ε = 0.1,
24 | κ = 1e-4,
25 | episodes = 1000,
26 | stepListener = { V, s ->
27 | GridWorldUI.render(V, s)
28 | })
29 | var s = prob.started()
30 | var count = 0
31 | print(s)
32 | while (s.isNotTerminal) {
33 | val a = argmax(s.actions) { π[s, it] }
34 | val possible = a.sample()
35 | s = possible.next
36 | count++
37 | print("${DynaMaze.desc_move[a[0]]}$s")
38 | }
39 | println("\nsteps=$count")//optimal=14
40 | }
41 | GridWorldUI.after = { latch.countDown() }
42 | Application.launch(GridWorldUI::class.java)
43 | }
44 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Dyna-Q.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.dyna
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.algo.average_α
5 | import lab.mars.rl.model.isNotTerminal
6 | import lab.mars.rl.problem.Blackjack
7 | import lab.mars.rl.problem.DynaMaze
8 | import lab.mars.rl.util.math.argmax
9 | import lab.mars.rl.util.printBlackjack
10 | import lab.mars.rl.util.ui.GridWorldUI
11 | import org.junit.Test
12 | import java.util.concurrent.CountDownLatch
13 | import kotlin.concurrent.thread
14 |
15 | class `Test Optimal Dyna-Q` {
16 | @Test
17 | fun `Blackjack`() {
18 | val (prob) = Blackjack.make()
19 | val (π, V) = prob.DynaQ(
20 | n = 10,
21 | ε = 0.1,
22 | α = average_α(prob),
23 | episodes = 100000)
24 | printBlackjack(prob, π, V)
25 | }
26 |
27 | @Test
28 | fun `Dyna Maze`() {
29 | val prob = DynaMaze.make()
30 | val (π) = prob.DynaQ(
31 | n = 10,
32 | ε = 0.1,
33 | α = average_α(prob),
34 | episodes = 100000)
35 | var s = prob.started()
36 | var count = 0
37 | print(s)
38 | while (s.isNotTerminal) {
39 | val a = argmax(s.actions) { π[s, it] }
40 | val possible = a.sample()
41 | s = possible.next
42 | count++
43 | print("${DynaMaze.desc_move[a[0]]}$s")
44 | }
45 | println("\nsteps=$count")//optimal=14
46 | }
47 |
48 | @Test
49 | fun `Dyna Maze UI`() {
50 | val prob = DynaMaze.make()
51 |
52 | val latch = CountDownLatch(1)
53 |
54 | thread {
55 | latch.await()
56 | val (π) = prob.DynaQ(
57 | n = 10,
58 | ε = 0.1,
59 | α = average_α(prob),
60 | episodes = 100000,
61 | stepListener = { V, s ->
62 | GridWorldUI.render(V, s)
63 | })
64 | var s = prob.started()
65 | var count = 0
66 | print(s)
67 | while (s.isNotTerminal) {
68 | val a = argmax(s.actions) { π[s, it] }
69 | val possible = a.sample()
70 | s = possible.next
71 | count++
72 | print("${DynaMaze.desc_move[a[0]]}$s")
73 | }
74 | println("\nsteps=$count")//optimal=14
75 | }
76 | GridWorldUI.after = { latch.countDown() }
77 | Application.launch(GridWorldUI::class.java)
78 | }
79 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal Prioritized Sweeping Stochastic.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.dyna
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.algo.average_α
5 | import lab.mars.rl.model.isNotTerminal
6 | import lab.mars.rl.problem.Blackjack
7 | import lab.mars.rl.problem.DynaMaze
8 | import lab.mars.rl.util.math.argmax
9 | import lab.mars.rl.util.printBlackjack
10 | import lab.mars.rl.util.ui.GridWorldUI
11 | import org.junit.Test
12 | import java.util.concurrent.CountDownLatch
13 | import kotlin.concurrent.thread
14 |
15 | class `Test Optimal Prioritized Sweeping Stochastic` {
16 | @Test
17 | fun `Blackjack`() {
18 | val (prob) = Blackjack.make()
19 | val (π, V) = prob.PrioritizedSweepingStochasticEnv(
20 | n = 10,
21 | θ = 0.0,
22 | ε = 0.1,
23 | α = average_α(prob),
24 | episodes = 100000)
25 | printBlackjack(prob, π, V)
26 | }
27 |
28 | @Test
29 | fun `Dyna Maze UI`() {
30 | val prob = DynaMaze.make()
31 | val latch = CountDownLatch(1)
32 |
33 | thread {
34 | latch.await()
35 | val (π) = prob.PrioritizedSweepingStochasticEnv(
36 | n = 10,
37 | θ = 0.0,
38 | ε = 0.1,
39 | α = { _, _ -> 0.1 },
40 | episodes = 1000,
41 | stepListener = { V, s ->
42 | GridWorldUI.render(V, s)
43 | })
44 | var s = prob.started()
45 | var count = 0
46 | print(s)
47 | while (s.isNotTerminal) {
48 | val a = argmax(s.actions) { π[s, it] }
49 | val possible = a.sample()
50 | s = possible.next
51 | count++
52 | print("${DynaMaze.desc_move[a[0]]}$s")
53 | }
54 | println("\nsteps=$count")//optimal=14
55 | }
56 | GridWorldUI.after = { latch.countDown() }
57 | Application.launch(GridWorldUI::class.java)
58 | }
59 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/dyna/Test Optimal RandomSampleOneStepTabularQLearning.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.dyna
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.problem.Blackjack
5 | import lab.mars.rl.util.printBlackjack
6 | import org.junit.Test
7 |
8 | class `Test Optimal RandomSampleOneStepTabularQLearning` {
9 | @Test
10 | fun `Blackjack`() {
11 | val (prob) = Blackjack.make()
12 | val (π, V) = prob.RandomSampleOneStepTabularQLearning(
13 | ε = 0.1,
14 | α = average_α(prob),
15 | episodes = 1000000)
16 | printBlackjack(prob, π, V)
17 | }
18 |
19 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Test Prediction Off-line λ-return.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.eligibility_trace.prediction
2 |
3 | import ch.qos.logback.classic.Level
4 | import javafx.application.Application
5 | import kotlinx.coroutines.runBlocking
6 | import lab.mars.rl.model.impl.func.LinearFunc
7 | import lab.mars.rl.model.impl.func.SimpleTileCoding
8 | import lab.mars.rl.model.impl.mdp.IndexedState
9 | import lab.mars.rl.problem.`19-state RandomWalk`
10 | import lab.mars.rl.util.asyncs
11 | import lab.mars.rl.util.await
12 | import lab.mars.rl.util.listOf
13 | import lab.mars.rl.util.logLevel
14 | import lab.mars.rl.util.tuples.tuple2
15 | import lab.mars.rl.util.ui.ChartApp
16 | import lab.mars.rl.util.ui.D2DChart
17 | import lab.mars.rl.util.ui.Line
18 | import lab.mars.rl.util.ui.LineChart
19 | import org.apache.commons.math3.util.FastMath.pow
20 | import org.apache.commons.math3.util.FastMath.sqrt
21 | import org.junit.Test
22 |
23 | class `Test Prediction Off-line λ-return` {
24 | @Test
25 | fun `Performance`() {
26 | logLevel(Level.ERROR)
27 |
28 | val (prob, π) = `19-state RandomWalk`.make()
29 | val realV = listOf(-20..20 step 2) { it / 20.0 }
30 | realV[0] = 0.0
31 | realV[20] = 0.0
32 |
33 | val λs = listOf(0.0, 0.4, 0.8, 0.9, 0.95, 0.975, 0.99, 1.0)
34 | val αs = listOf(0.0, 0.01, 0.02, 0.04, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
35 |
36 | val episodes = 10
37 | val runs = 100
38 | val truncateValue = 0.55
39 |
40 | val chart = LineChart("Off-line λ-return", "α", "Average RMS")
41 | runBlocking {
42 | for (λ in λs) {
43 | val line = Line("λ=$λ")
44 | chart += line
45 | asyncs(αs) { α ->
46 | var rms_sum = 0.0
47 | asyncs(runs) { run ->
48 | val func = LinearFunc(
49 | SimpleTileCoding(1,
50 | prob.states.size,
51 | 1,
52 | 0.0) { (s) -> (s as IndexedState)[0].toDouble() })
53 | var rms = 0.0
54 | prob.`Off-line λ-return`(
55 | V = func, π = π,
56 | α = α, λ = λ,
57 | episodes = episodes,
58 | episodeListener = { _, _ ->
59 | var error = 0.0
60 | for (s in prob.states)
61 | error += pow(func(s) - realV[s[0]], 2)
62 | error /= prob.states.size
63 | rms += sqrt(error)
64 | })
65 | println("finish λ=$λ α=$α run=$run")
66 | rms
67 | }.await { rms_sum += it }
68 | println("finish λ=$λ α=$α")
69 | tuple2(α, rms_sum / (episodes * runs))
70 | }.await { (α, rms) ->
71 | if (rms < truncateValue)
72 | line[α] = rms
73 | }
74 | println("finish λ=$λ")
75 | }
76 | }
77 | D2DChart.charts += chart
78 | Application.launch(ChartApp::class.java)
79 | }
80 |
81 | }
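
Note: the statistic accumulated by the episodeListener above is the root-mean-square error of the value estimate over all states, summed once per episode and finally divided by episodes × runs. A minimal sketch of that computation:

import kotlin.math.sqrt

// RMS error of an estimate against known true values (the quantity the
// listener above accumulates once per episode).
fun rmsError(estimate: DoubleArray, truth: DoubleArray): Double {
  var sumSq = 0.0
  for (i in estimate.indices) {
    val d = estimate[i] - truth[i]
    sumSq += d * d
  }
  return sqrt(sumSq / estimate.size)
}
// The charted value is then rms_sum / (episodes * runs): the RMS averaged
// over every episode of every run.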
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Test Prediction Semi-gradient TD(λ).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.eligibility_trace.prediction
2 |
3 | import ch.qos.logback.classic.Level
4 | import javafx.application.Application
5 | import kotlinx.coroutines.runBlocking
6 | import lab.mars.rl.model.impl.func.LinearFunc
7 | import lab.mars.rl.model.impl.func.SimpleTileCoding
8 | import lab.mars.rl.model.impl.mdp.IndexedState
9 | import lab.mars.rl.problem.`19-state RandomWalk`
10 | import lab.mars.rl.util.*
11 | import lab.mars.rl.util.tuples.tuple2
12 | import lab.mars.rl.util.ui.ChartApp
13 | import lab.mars.rl.util.ui.D2DChart
14 | import lab.mars.rl.util.ui.Line
15 | import lab.mars.rl.util.ui.LineChart
16 | import org.apache.commons.math3.util.FastMath
17 | import org.junit.Test
18 |
19 | class `Test Prediction Semi-gradient TDλ` {
20 | @Test
21 | fun `Performance`() {
22 | logLevel(Level.ERROR)
23 |
24 | val (prob, π) = `19-state RandomWalk`.make()
25 | val realV = listOf(-20..20 step 2) { it / 20.0 }
26 | realV[0] = 0.0
27 | realV[20] = 0.0
28 |
29 | val λs = listOf(0.0, 0.4, 0.8, 0.9, 0.95, 0.975, 0.99, 1.0)
30 | val αs = listOf(100) { it * 0.01 }
31 |
32 | val episodes = 10
33 | val runs = 100
34 | val truncateValue = 0.6
35 |
36 | val chart = LineChart("Semi-gradient TD(λ)", "α", "Average RMS")
37 | runBlocking {
38 | for (λ in λs) {
39 | val line = Line("λ=$λ")
40 | chart += line
41 | asyncs(αs) { α ->
42 | var rms_sum = 0.0
43 | asyncs(runs) { run ->
44 | val func = LinearFunc(
45 | SimpleTileCoding(1,
46 | prob.states.size,
47 | 1,
48 | 0.0) { (s) -> (s as IndexedState)[0].toDouble() }
49 | )
50 | var rms = 0.0
51 | prob.`Semi-gradient TD(λ) prediction`(
52 | V = func, π = π,
53 | α = α, λ = λ,
54 | episodes = episodes,
55 | episodeListener = { _, _ ->
56 | var error = 0.0
57 | for (s in prob.states)
58 | error += FastMath.pow(func(s) - realV[s[0]], 2)
59 | error /= prob.states.size
60 | rms += FastMath.sqrt(error)
61 | })
62 | println("finish λ=${λ.format(2)} α=$α run=$run")
63 | rms
64 | }.await { rms_sum += it }
65 | println("finish λ=$λ α=$α")
66 | tuple2(α, rms_sum / (episodes * runs))
67 | }.await { (α, rms) ->
68 | if (rms < truncateValue)
69 | line[α] = rms
70 | }
71 | println("finish λ=$λ")
72 | }
73 | }
74 | D2DChart.charts += chart
75 | Application.launch(ChartApp::class.java)
76 | }
77 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/eligibility_trace/prediction/Test Prediction True Online TD(λ).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.eligibility_trace.prediction
2 |
3 | import ch.qos.logback.classic.Level
4 | import javafx.application.Application
5 | import kotlinx.coroutines.runBlocking
6 | import lab.mars.rl.model.impl.func.LinearFunc
7 | import lab.mars.rl.model.impl.func.SimpleTileCoding
8 | import lab.mars.rl.model.impl.mdp.IndexedState
9 | import lab.mars.rl.problem.`19-state RandomWalk`
10 | import lab.mars.rl.util.*
11 | import lab.mars.rl.util.tuples.tuple2
12 | import lab.mars.rl.util.ui.ChartApp
13 | import lab.mars.rl.util.ui.D2DChart
14 | import lab.mars.rl.util.ui.Line
15 | import lab.mars.rl.util.ui.LineChart
16 | import org.apache.commons.math3.util.FastMath.pow
17 | import org.apache.commons.math3.util.FastMath.sqrt
18 | import org.junit.Test
19 |
20 | class `Test Prediction True Online TDλ` {
21 | @Test
22 | fun `Performance`() {
23 | logLevel(Level.ERROR)
24 |
25 | val (prob, π) = `19-state RandomWalk`.make()
26 | val realV = listOf(-20..20 step 2) { it / 20.0 }
27 | realV[0] = 0.0
28 | realV[20] = 0.0
29 |
30 | val λs = listOf(0.0, 0.4, 0.8, 0.9, 0.95, 0.975, 0.99, 1.0)
31 | val αs = listOf(100) { it * 0.01 }
32 |
33 | val episodes = 10
34 | val runs = 100
35 | val truncateValue = 0.6
36 |
37 | val chart = LineChart("True Online TD(λ)", "α", "Average RMS")
38 | runBlocking {
39 | for (λ in λs) {
40 | val line = Line("λ=$λ")
41 | chart += line
42 | asyncs(αs) { α ->
43 | var rms_sum = 0.0
44 | asyncs(runs) { run ->
45 | val func = LinearFunc(
46 | SimpleTileCoding(1,
47 | prob.states.size,
48 | 1,
49 | 0.0) { (s) -> (s as IndexedState)[0].toDouble() }
50 | )
51 | var rms = 0.0
52 | prob.`True Online TD(λ) prediction`(
53 | Vfunc = func, π = π,
54 | α = α, λ = λ,
55 | episodes = episodes,
56 | episodeListener = { _, _ ->
57 | var error = 0.0
58 | for (s in prob.states)
59 | error += pow(func(s) - realV[s[0]], 2)
60 | error /= prob.states.size
61 | rms += sqrt(error)
62 | })
63 | println("finish λ=$λ α=${α.format(2)} run=$run")
64 | rms
65 | }.await { rms_sum += it }
66 | println("finish λ=$λ α=$α")
67 | tuple2(α, rms_sum / (episodes * runs))
68 | }.await { (α, rms) ->
69 | if (rms < truncateValue)
70 | line[α] = rms
71 | }
72 | println("finish λ=$λ")
73 | }
74 | }
75 | D2DChart.charts += chart
76 | Application.launch(ChartApp::class.java)
77 | }
78 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/on_policy/Test Optimal Episodic Semi-gradient QLearning control.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("UNCHECKED_CAST", "NAME_SHADOWING")
2 |
3 | package lab.mars.rl.algo.func_approx.on_policy
4 |
5 | import javafx.application.Application
6 | import lab.mars.rl.model.impl.func.LinearFunc
7 | import lab.mars.rl.model.impl.func.SuttonTileCoding
8 | import lab.mars.rl.model.impl.mdp.DefaultAction
9 | import lab.mars.rl.model.impl.mdp.EpsilonGreedyFunctionPolicy
10 | import lab.mars.rl.problem.MountainCar
11 | import lab.mars.rl.problem.MountainCar.CarState
12 | import lab.mars.rl.util.tuples.tuple2
13 | import lab.mars.rl.util.ui.MountainCarUI
14 | import org.junit.Test
15 | import java.util.concurrent.CountDownLatch
16 | import kotlin.concurrent.thread
17 |
18 | class `Test Optimal Episodic Semi-gradient QLearning control` {
19 |
20 | @Test
21 | fun `Mountain Car UI`() {
22 | val prob = MountainCar.make()
23 | val feature = SuttonTileCoding(511, 8, doubleArrayOf(8 / (MountainCar.POSITION_MAX - MountainCar.POSITION_MIN),
24 | 8 / (MountainCar.VELOCITY_MAX - MountainCar.VELOCITY_MIN))) { (s, a) ->
25 | s as CarState
26 | a as DefaultAction
27 | tuple2(doubleArrayOf(s.position, s.velocity), intArrayOf(a.value))
28 | }
29 | val func = LinearFunc(feature)
30 |
31 | val episodes = intArrayOf(1, 12, 104, 1000, 9000)
32 | val latch = CountDownLatch(1)
33 | thread {
34 | latch.await()
35 | prob.`Episodic semi-gradient QLearning control`(
36 | Q = func,
37 | π = EpsilonGreedyFunctionPolicy(func, 0.0),
38 | α = 0.3 / 8,
39 | episodes = 9000,
40 | stepListener = step@{ episode, step, s, a ->
41 | if (episode !in episodes) return@step
42 | MountainCarUI.render(episode, step, s as CarState, a as DefaultAction)
43 | })
44 | }
45 | MountainCarUI.after = { latch.countDown() }
46 | Application.launch(MountainCarUI::class.java)
47 | }
48 |
49 | }
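
Note: the doubleArrayOf(...) passed to SuttonTileCoding above scales each state dimension by numTilings / (max − min), expressing raw values in tile widths. A sketch using the classic Mountain Car bounds (the repo's POSITION_/VELOCITY_ constants are assumed to match them):

// With n tilings over [min, max], scaling by n / (max - min) puts the input
// in "tile units", so each tiling's fractional offset shifts the grid correctly.
fun tileScale(numTilings: Int, min: Double, max: Double): Double =
    numTilings / (max - min)

fun main() {
  println(tileScale(8, -1.2, 0.5))    // position scale ≈ 4.71
  println(tileScale(8, -0.07, 0.07))  // velocity scale ≈ 57.14
}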
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Function Approximator Coarse Coding.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.func_approx.prediction
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.model.impl.func.LinearFunc
5 | import lab.mars.rl.model.impl.func.SimpleCoarseCoding
6 | import lab.mars.rl.problem.SquareWave
7 | import lab.mars.rl.problem.WaveState
8 | import lab.mars.rl.util.format
9 | import lab.mars.rl.util.matrix.times
10 | import lab.mars.rl.util.ui.ChartApp
11 | import lab.mars.rl.util.ui.D2DChart
12 | import lab.mars.rl.util.ui.Line
13 | import lab.mars.rl.util.ui.LineChart
14 | import org.junit.Test
15 |
16 | class `Coarse Coding` {
17 | @Test
18 | fun `Coarse Coding`() {
19 | val alpha = 0.2
20 | val numOfSamples = listOf(10, 40, 160, 2560, 10240)
21 | val featureWidths = listOf(0.2, .4, 1.0)
22 | for (numOfSample in numOfSamples) {
23 | val chart = LineChart("$numOfSample samples", "state", "value")
24 | for (featureWidth in featureWidths) {
25 | val line = Line("feature width: ${featureWidth.format(1)}")
26 | val feature = SimpleCoarseCoding(featureWidth,
27 | SquareWave.domain, 50) { (s) -> (s as WaveState).x }
28 | val func = LinearFunc(feature)
29 | repeat(numOfSample) {
30 | val (s, y) = SquareWave.sample()
31 | func.w += alpha / feature.features.sumBy { if (it.contains(feature.conv(arrayOf(s)))) 1 else 0 } * (y - func(s)) * func.`∇`(s)
32 | }
33 | for (i in 0 until SquareWave.maxResolution) {
34 | val s = WaveState(i * 2.0 / SquareWave.maxResolution)
35 | val y = func(s)
36 | line[i * 2.0 / SquareWave.maxResolution] = y
37 | }
38 | chart += line
39 | }
40 | D2DChart.charts += chart
41 | }
42 | Application.launch(ChartApp::class.java)
43 | }
44 | }
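
Note: the update inside the repeat-block above divides α by the number of receptive fields active at s before taking a gradient step. A self-contained sketch of the same step for binary features (w, active, target are hypothetical names):

// One gradient step for a linear value function with binary features:
// the step size is split evenly among the features active at the sample.
fun coarseCodingStep(w: DoubleArray, active: IntArray, target: Double, alpha: Double) {
  var prediction = 0.0
  for (i in active) prediction += w[i]                   // w · x(s), with x binary
  val step = alpha / active.size * (target - prediction)
  for (i in active) w[i] += step                         // gradient is 1 on active features
}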
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction Gradient MC.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.func_approx.prediction
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
5 | import lab.mars.rl.model.impl.func.StateAggregation
6 | import lab.mars.rl.model.impl.mdp.IndexedState
7 | import lab.mars.rl.problem.`1000-state RandomWalk`
8 | import lab.mars.rl.util.format
9 | import lab.mars.rl.util.ui.ChartApp
10 | import lab.mars.rl.util.ui.D2DChart
11 | import lab.mars.rl.util.ui.Line
12 | import lab.mars.rl.util.ui.LineChart
13 | import org.junit.Test
14 |
15 | class `Test Prediction Gradient MC` {
16 | @Test
17 | fun `1000-state Random walk`() {
18 | val chart = LineChart("V", "state", "value")
19 | val (prob, π) = `1000-state RandomWalk`.make()
20 | val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
21 | prob.apply {
22 | val line = Line("TD")
23 | for (s in states) {
24 | println("${V[s].format(2)} ")
25 | line[s[0]] = V[s]
26 | }
27 | chart += line
28 | }
29 |
30 | val func = StateAggregation(`1000-state RandomWalk`.num_states + 2, 10) { (s) -> (s as IndexedState)[0] }
31 | prob.`Gradient Monte Carlo algorithm`(
32 | v = func, π = π,
33 | α = 2e-5,
34 | episodes = 100000
35 | )
36 | prob.apply {
37 | val line = Line("gradient MC")
38 | for (s in states) {
39 | println("${func(s).format(2)} ")
40 | line[s[0]] = func(s)
41 | }
42 | chart += line
43 | }
44 | D2DChart.charts += chart
45 | Application.launch(ChartApp::class.java)
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction LSTD.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.func_approx.prediction
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
5 | import lab.mars.rl.model.impl.func.LinearFunc
6 | import lab.mars.rl.model.impl.func.SimpleTileCoding
7 | import lab.mars.rl.model.impl.mdp.IndexedState
8 | import lab.mars.rl.problem.`1000-state RandomWalk`
9 | import lab.mars.rl.util.format
10 | import lab.mars.rl.util.ui.ChartApp
11 | import lab.mars.rl.util.ui.D2DChart
12 | import lab.mars.rl.util.ui.Line
13 | import lab.mars.rl.util.ui.LineChart
14 | import org.apache.commons.math3.util.FastMath.ceil
15 | import org.junit.Test
16 |
17 | class `Test Prediction LSTD` {
18 | @Test
19 | fun `1000-state RandomWalk`() {
20 | val chart = LineChart("V", "state", "value")
21 | val (prob, π) = `1000-state RandomWalk`.make()
22 | val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
23 | prob.apply {
24 | val line = Line("TD")
25 | for (s in states) {
26 | println("${V[s].format(2)} ")
27 | line[s[0]] = V[s]
28 | }
29 | chart += line
30 | }
31 |
32 | val numOfTilings = 50
33 | val feature = SimpleTileCoding(numOfTilings,
34 | 5,
35 | ceil(`1000-state RandomWalk`.num_states / 5.0).toInt(),
36 | 4.0) { (s) -> ((s as IndexedState)[0] - 1).toDouble() }
37 | val func = LinearFunc(feature)
38 | prob.LSTD(vFunc = func, π = π, ε = 1.0, episodes = 100)
39 | prob.apply {
40 | val line = Line("LSTD")
41 | for (s in states) {
42 | println("${func(s).format(2)} ")
43 | line[s[0]] = func(s)
44 | }
45 | chart += line
46 | }
47 | D2DChart.charts += chart
48 | Application.launch(ChartApp::class.java)
49 | }
50 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction Semi-gradient TD(0).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.func_approx.prediction
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
5 | import lab.mars.rl.model.impl.func.StateAggregation
6 | import lab.mars.rl.model.impl.mdp.IndexedState
7 | import lab.mars.rl.problem.`1000-state RandomWalk`
8 | import lab.mars.rl.util.format
9 | import lab.mars.rl.util.ui.ChartApp
10 | import lab.mars.rl.util.ui.D2DChart
11 | import lab.mars.rl.util.ui.Line
12 | import lab.mars.rl.util.ui.LineChart
13 | import org.junit.Test
14 |
15 | class `Test Prediction Semi-gradient TD(0)` {
16 | @Test
17 | fun `1000-state Random walk`() {
18 | val chart = LineChart("V", "state", "value")
19 | val (prob, π) = `1000-state RandomWalk`.make()
20 | val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
21 | prob.apply {
22 | val line = Line("TD")
23 | for (s in states) {
24 | println("${V[s].format(2)} ")
25 | line[s[0]] = V[s]
26 | }
27 | chart += line
28 | }
29 |
30 | val func = StateAggregation(`1000-state RandomWalk`.num_states + 2,
31 | 10) { (s) -> (s as IndexedState)[0] }
32 | prob.`Semi-gradient TD(0)`(v = func, π = π, α = 2e-4, episodes = 100000)
33 | prob.apply {
34 | val line = Line("Semi-gradient TD(0)")
35 | for (s in states) {
36 | println("${func(s).format(2)} ")
37 | line[s[0]] = func(s)
38 | }
39 | chart += line
40 | }
41 | D2DChart.charts += chart
42 | Application.launch(ChartApp::class.java)
43 | }
44 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/func_approx/prediction/Test Prediction n-step Semi-gradient TD.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.func_approx.prediction
2 |
3 | import javafx.application.Application
4 | import lab.mars.rl.algo.td.`Tabular TD(0)`
5 | import lab.mars.rl.model.impl.func.StateAggregation
6 | import lab.mars.rl.model.impl.mdp.IndexedState
7 | import lab.mars.rl.problem.`1000-state RandomWalk`
8 | import lab.mars.rl.util.format
9 | import lab.mars.rl.util.ui.ChartApp
10 | import lab.mars.rl.util.ui.D2DChart
11 | import lab.mars.rl.util.ui.Line
12 | import lab.mars.rl.util.ui.LineChart
13 | import org.junit.Test
14 |
15 | class `Test Prediction n-step Semi-gradient TD` {
16 | @Test
17 | fun `1000-state Random walk`() {
18 | val chart = LineChart("V", "state", "value")
19 | val (prob, π) = `1000-state RandomWalk`.make()
20 | val V = prob.`Tabular TD(0)`(π = π, episodes = 100000, α = 0.1)
21 | prob.apply {
22 | val line = Line("TD")
23 | for (s in states) {
24 | println("${V[s].format(2)} ")
25 | line[s[0]] = V[s]
26 | }
27 | chart += line
28 | }
29 |
30 | val func = StateAggregation(`1000-state RandomWalk`.num_states + 2,
31 | 10) { (s) -> (s as IndexedState)[0] }
32 | prob.`n-step semi-gradient TD`(
33 | v = func, π = π, n = 10,
34 | α = 2e-4,
35 | episodes = 100000)
36 | prob.apply {
37 | val line = Line("n-step semi-gradient TD")
38 | for (s in states) {
39 | println("${func(s).format(2)} ")
40 | line[s[0]] = func(s)
41 | }
42 | chart += line
43 | }
44 | D2DChart.charts += chart
45 | Application.launch(ChartApp::class.java)
46 | }
47 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Optimal MC Exploring Starts.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.mc
2 |
3 | import lab.mars.rl.problem.Blackjack
4 | import lab.mars.rl.util.printBlackjack
5 | import org.junit.Test
6 |
7 | class `Test Optimal MC Exploring Starts` {
8 | @Test
9 | fun `Blackjack`() {
10 | val (prob, π) = Blackjack.make()
11 | val (PI, V) = prob.`Monte Carlo Exploring Starts`(π, 1000_000)
12 | printBlackjack(prob, PI, V)
13 | }
14 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Optimal MC Off-policy.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.mc
2 |
3 | import lab.mars.rl.problem.Blackjack
4 | import lab.mars.rl.util.printBlackjack
5 | import org.junit.Test
6 |
7 | class `Test Optimal MC Off-policy` {
8 | @Test
9 | fun `Blackjack`() {
10 | val (prob) = Blackjack.make()
11 | val (π, V) = prob.`Off-policy MC Optimal`(1000_000)
12 | printBlackjack(prob, π, V)
13 | }
14 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Optimal MC On-policy first-visit.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.mc
2 |
3 | import lab.mars.rl.problem.Blackjack
4 | import lab.mars.rl.util.printBlackjack
5 | import org.junit.Test
6 |
7 | class `Test Optimal MC On-policy first-visit` {
8 | @Test
9 | fun `Blackjack`() {
10 | val (prob) = Blackjack.make()
11 | val (π, V) = prob.`On-policy first-visit MC control`(1000_000)
12 | printBlackjack(prob, π, V)
13 | }
14 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Prediction MC Off-policy.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.mc
2 |
3 | import lab.mars.rl.problem.Blackjack
4 | import lab.mars.rl.util.printBlackjack
5 | import org.junit.Test
6 |
7 | class `Test Monte Carlo Off-policy prediction` {
8 | @Test
9 | fun `Blackjack`() {
10 | val (prob, π) = Blackjack.make()
11 | val V = prob.`Off-policy MC prediction`(π, 500_000)
12 | printBlackjack(prob, π, V)
13 | }
14 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/mc/Test Prediction Monte Carlo Prediction.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.mc
2 |
3 | import lab.mars.rl.problem.Blackjack
4 | import lab.mars.rl.problem.RandomWalk
5 | import lab.mars.rl.util.format
6 | import lab.mars.rl.util.printBlackjack
7 | import org.junit.Test
8 |
9 | class `Test Prediction Monte Carlo Prediction` {
10 | @Test
11 | fun `Blackjack`() {
12 | val (prob, π) = Blackjack.make()
13 | val V = prob.`First Visit Monte Carlo Prediction`(π, 500_000)
14 | printBlackjack(prob, π, V)
15 | }
16 |
17 | @Test
18 | fun `RandomWalk`() {
19 | val (prob, π) = RandomWalk.make()
20 | val V = prob.`First Visit Monte Carlo Prediction`(π, 1000)
21 | prob.apply {
22 | for (s in states) {
23 | println("${V[s].format(2)} ")
24 | }
25 | }
26 | }
27 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Off-policy Sarsa.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.ntd
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.model.isNotTerminal
5 | import lab.mars.rl.problem.Blackjack
6 | import lab.mars.rl.problem.CliffWalking
7 | import lab.mars.rl.problem.WindyGridworld
8 | import lab.mars.rl.util.math.argmax
9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 |
12 | class `Test Optimal n-TD Off-policy Sarsa` {
13 |
14 | @Test
15 | fun `Blackjack constant alpha`() {
16 | val (prob) = Blackjack.make()
17 | val (π, V) = prob.`N-step off-policy sarsa`(
18 | n = Int.MAX_VALUE,
19 | ε = 0.1,
20 | α = { _, _ -> 0.1 },
21 | episodes = 1000000)
22 | printBlackjack(prob, π, V)
23 | }
24 |
25 | @Test
26 | fun `Blackjack average alpha`() {
27 | val (prob) = Blackjack.make()
28 | val (π, V) = prob.`N-step off-policy sarsa`(
29 | n = Int.MAX_VALUE,
30 | ε = 0.1,
31 | α = average_α(prob),
32 | episodes = 1000000)
33 | printBlackjack(prob, π, V)
34 | }
35 |
36 | @Test
37 | fun `Cliff Walking`() {
38 | val prob = CliffWalking.make()
39 | val (π) = prob.`N-step off-policy sarsa`(
40 | n = 10,
41 | ε = 0.1,
42 | α = { _, _ -> 0.5 },
43 | episodes = 10000)
44 | var s = prob.started()
45 | var sum = 0.0
46 | print(s)
47 | while (s.isNotTerminal) {
48 | val a = argmax(s.actions) { π[s, it] }
49 | val possible = a.sample()
50 | s = possible.next
51 | sum += possible.reward
52 | print("${WindyGridworld.desc_move[a[0]]}$s")
53 | }
54 | println("\nreturn=$sum")//optimal=-12
55 | }
56 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Q(σ).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.ntd
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.problem.Blackjack
5 | import lab.mars.rl.util.math.Rand
6 | import lab.mars.rl.util.printBlackjack
7 | import org.junit.Test
8 |
9 | class `Test Optimal n-TD Q(σ)` {
10 | @Test
11 | fun `Blackjack σ=0`() {
12 | val (prob) = Blackjack.make()
13 | val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
14 | n = Int.MAX_VALUE,
15 | σ = { 0 },//same as treebackup
16 | ε = 0.1,
17 | α = average_α(prob),
18 | episodes = 1000000)
19 | printBlackjack(prob, π, V)
20 | }
21 |
22 | @Test
23 | fun `Blackjack σ=1`() {
24 | val (prob) = Blackjack.make()
25 | val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
26 | n = Int.MAX_VALUE,
27 | σ = { 1 },//like off-policy sarsa
28 | ε = 0.1,
29 | α = average_α(prob),
30 | episodes = 1000000)
31 | printBlackjack(prob, π, V)
32 | }
33 |
34 | @Test
35 | fun `Blackjack σ=%2`() {
36 | val (prob) = Blackjack.make()
37 | val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
38 | n = Int.MAX_VALUE,
39 | σ = { it % 2 },
40 | ε = 0.1,
41 | α = average_α(prob),
42 | episodes = 1000000)
43 | printBlackjack(prob, π, V)
44 | }
45 |
46 | @Test
47 | fun `Blackjack σ=random`() {
48 | val (prob) = Blackjack.make()
49 | val (π, V) = prob.`N-step off-policy n-step Q(σ)`(
50 | n = Int.MAX_VALUE,
51 | σ = { Rand().nextInt(2) },
52 | ε = 0.1,
53 | α = average_α(prob),
54 | episodes = 1000000)
55 | printBlackjack(prob, π, V)
56 | }
57 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Sarsa.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.ntd
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.model.isNotTerminal
5 | import lab.mars.rl.problem.Blackjack
6 | import lab.mars.rl.problem.WindyGridworld
7 | import lab.mars.rl.util.math.argmax
8 | import lab.mars.rl.util.printBlackjack
9 | import org.junit.Test
10 |
11 | class `Test Optimal n-TD Sarsa` {
12 | @Test
13 | fun `Blackjack constant alpha`() {
14 | val (prob) = Blackjack.make()
15 | val (π, V) = prob.`N-step Sarsa`(
16 | n = Int.MAX_VALUE,
17 | ε = 0.1,
18 | α = { _, _ -> 0.1 },
19 | episodes = 1000000)
20 | printBlackjack(prob, π, V)
21 | }
22 |
23 | @Test
24 | fun `Blackjack average alpha`() {
25 | val (prob) = Blackjack.make()
26 | val (π, V) = prob.`N-step Sarsa`(
27 | n = Int.MAX_VALUE,
28 | ε = 0.1,
29 | α = average_α(prob),
30 | episodes = 1000000)
31 | printBlackjack(prob, π, V)
32 | }
33 |
34 | @Test
35 | fun `WindyGridworld`() {
36 | val prob = WindyGridworld.make()
37 | val (π) = prob.`N-step Sarsa`(
38 | n = 10,
39 | ε = 0.1,
40 | α = average_α(prob),
41 | episodes = 1000000)
42 | var s = prob.started()
43 | var sum = 0.0
44 | print(s)
45 | while (s.isNotTerminal) {
46 | val a = argmax(s.actions) { π[s, it] }
47 | val possible = a.sample()
48 | s = possible.next
49 | sum += possible.reward
50 | print("${WindyGridworld.desc_move[a[0]]}$s")
51 | }
52 | println("\nreturn=$sum")//optimal=-14
53 | }
54 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Optimal n-TD Treebackup.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.ntd
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.problem.Blackjack
5 | import lab.mars.rl.util.printBlackjack
6 | import org.junit.Test
7 |
8 | class `Test Optimal n-TD Treebackup` {
9 | @Test
10 | fun `Blackjack constant alpha`() {
11 | val (prob) = Blackjack.make()
12 | val (π, V) = prob.`N-step Treebackup`(
13 | n = 4, ε = 0.1,
14 | α = { _, _ -> 0.1 },
15 | episodes = 1000000)
16 | printBlackjack(prob, π, V)
17 | }
18 |
19 | @Test
20 | fun `Blackjack average alpha`() {
21 | val (prob) = Blackjack.make()
22 | val (π, V) = prob.`N-step Treebackup`(
23 | n = Int.MAX_VALUE, ε = 0.1,
24 | α = average_α(prob),
25 | episodes = 1000000)
26 | printBlackjack(prob, π, V)
27 | }
28 |
29 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/ntd/Test Prediction n-TD.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.ntd
2 |
3 | import ch.qos.logback.classic.Level
4 | import javafx.application.Application
5 | import kotlinx.coroutines.runBlocking
6 | import lab.mars.rl.problem.Blackjack
7 | import lab.mars.rl.problem.`19-state RandomWalk`
8 | import lab.mars.rl.util.*
9 | import lab.mars.rl.util.tuples.tuple2
10 | import lab.mars.rl.util.ui.ChartApp
11 | import lab.mars.rl.util.ui.D2DChart
12 | import lab.mars.rl.util.ui.Line
13 | import lab.mars.rl.util.ui.LineChart
14 | import org.apache.commons.math3.util.FastMath.pow
15 | import org.apache.commons.math3.util.FastMath.sqrt
16 | import org.junit.Test
17 |
18 | class `Test Prediction n-TD` {
19 | @Test
20 | fun `Blackjack`() {
21 | val (prob, π) = Blackjack.make()
22 | val V = prob.`N-step TD prediction`(
23 | n = 102400, π = π,
24 | α = 0.1, episodes = 500000)
25 | printBlackjack(prob, π, V)
26 | }
27 |
28 | @Test
29 | fun `RandomWalk`() {
30 | val (prob, π) = `19-state RandomWalk`.make()
31 | val V = prob.`N-step TD prediction`(
32 | n = 8, π = π,
33 | α = 0.1,
34 | episodes = 1000)
35 | prob.apply {
36 | for (s in states) {
37 | println("${V[s].format(2)} ")
38 | }
39 | }
40 | }
41 |
42 | @Test
43 | fun `RandomWalk RMS`() {
44 | logLevel(Level.ERROR)
45 |
46 | val (prob, π) = `19-state RandomWalk`.make()
47 | val realV = listOf(-20..20 step 2) { it / 20.0 }
48 | realV[0] = 0.0
49 | realV[20] = 0.0
50 |
51 | val ns = listOf(10) { pow(2.0, it).toInt() }
52 | val αs = listOf(110) { it * 0.01 }
53 |
54 | val episodes = 10
55 | val runs = 100
56 | val truncateValue = 0.55
57 |
58 | val chart = LineChart("RMS", "α", "Average RMS")
59 | runBlocking {
60 | for (n in ns) {
61 | val line = Line("n=$n")
62 | chart += line
63 | asyncs(αs) { α ->
64 | var rms_sum = 0.0
65 | asyncs(runs) {
66 | var rms = 0.0
67 | prob.`N-step TD prediction`(
68 | n = n, π = π,
69 | α = α,
70 | episodes = episodes,
71 | episodeListener = { _, V ->
72 | var error = 0.0
73 | for (s in prob.states)
74 | error += pow(V[s] - realV[s[0]], 2)
75 | error /= prob.states.size
76 | rms += sqrt(error)
77 | })
78 | rms
79 | }.await { rms_sum += it }
80 | tuple2(α, rms_sum / (episodes * runs))
81 | }.await { (α, rms) ->
82 | if (rms < truncateValue)
83 | line[α] = rms
84 | }
85 | println("finish n=$n")
86 | }
87 | }
88 | D2DChart.charts += chart
89 | Application.launch(ChartApp::class.java)
90 | }
91 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Double Q-Learning.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.model.isNotTerminal
5 | import lab.mars.rl.problem.Blackjack
6 | import lab.mars.rl.problem.CliffWalking
7 | import lab.mars.rl.problem.MaximizationBias
8 | import lab.mars.rl.problem.WindyGridworld
9 | import lab.mars.rl.util.math.argmax
10 | import lab.mars.rl.util.printBlackjack
11 | import org.junit.Test
12 |
13 | class `Test Optimal TD Double Q-Learning` {
14 | @Test
15 | fun `Blackjack constant alpha`() {
16 | val (prob) = Blackjack.make()
17 | val (π, V) = prob.DoubleQLearning(ε = 0.1, α = { _, _ -> 0.1 }, episodes = 1000000)
18 | printBlackjack(prob, π, V)
19 | }
20 |
21 | @Test
22 | fun `Blackjack average alpha`() {
23 | val (prob) = Blackjack.make()
24 | val (π, V) = prob.DoubleQLearning(ε = 0.1, α = average_α(prob), episodes = 1000000)
25 | printBlackjack(prob, π, V)
26 | }
27 |
28 | @Test
29 | fun `Cliff Walking`() {
30 | val prob = CliffWalking.make()
31 | val (π) = prob.DoubleQLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 10000)
32 | var s = prob.started()
33 | var sum = 0.0
34 | print(s)
35 | while (s.isNotTerminal) {
36 | val a = argmax(s.actions) { π[s, it] }
37 | val possible = a.sample()
38 | s = possible.next
39 | sum += possible.reward
40 | print("${WindyGridworld.desc_move[a[0]]}$s")
41 | }
42 | println("\nreturn=$sum")//optimal=-12
43 | }
44 |
45 | @Test
46 | fun `Maximization Bias Double Q-Learning`() {
47 | val prob = MaximizationBias.make()
48 | val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.1 }, episodes = 10)
49 | val A = prob.started()
50 | println(π(A))
51 |
52 | val (π2) = prob.DoubleQLearning(ε = 0.1, α = { _, _ -> 0.1 }, episodes = 10)
53 | println(π2(A))
54 |
55 | }
56 | }
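
Note: the last test above contrasts plain Q-Learning with Double Q-Learning on the MaximizationBias problem. The point of the doubled tables is that one table selects the argmax action while the other evaluates it, removing the upward bias of max_a Q(s′, a) under noise. A tabular sketch (hypothetical arrays indexed [state][action], γ as the discount):

import kotlin.random.Random

// One Double Q-Learning backup: flip a coin, select with one table,
// evaluate with the other.
fun doubleQStep(q1: Array<DoubleArray>, q2: Array<DoubleArray>,
                s: Int, a: Int, r: Double, sNext: Int, γ: Double, α: Double) {
  if (Random.nextBoolean()) {
    val aStar = q1[sNext].indices.maxByOrNull { q1[sNext][it] }!! // select with Q1
    q1[s][a] += α * (r + γ * q2[sNext][aStar] - q1[s][a])         // evaluate with Q2
  } else {
    val aStar = q2[sNext].indices.maxByOrNull { q2[sNext][it] }!! // select with Q2
    q2[s][a] += α * (r + γ * q1[sNext][aStar] - q2[s][a])         // evaluate with Q1
  }
}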
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Expected sarsa.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.model.isNotTerminal
5 | import lab.mars.rl.problem.Blackjack
6 | import lab.mars.rl.problem.CliffWalking
7 | import lab.mars.rl.problem.WindyGridworld
8 | import lab.mars.rl.util.math.argmax
9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 |
12 | class `Test Optimal TD Expected sarsa` {
13 | @Test
14 | fun `Blackjack constant alpha`() {
15 | val (prob) = Blackjack.make()
16 | val (π, V) = prob.expectedSarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000_000)
17 | printBlackjack(prob, π, V)
18 | }
19 |
20 | @Test
21 | fun `Blackjack average alpha`() {
22 | val (prob) = Blackjack.make()
23 | val (π, V) = prob.expectedSarsa(ε = 0.1, α = average_α(prob), episodes = 1000_000)
24 | printBlackjack(prob, π, V)
25 | }
26 |
27 | @Test
28 | fun `Cliff Walking TD Expected Sarsa`() {
29 | val prob = CliffWalking.make()
30 | val (PI) = prob.expectedSarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000_000)
31 | var s = prob.started()
32 | var sum = 0.0
33 | print(s)
34 | while (s.isNotTerminal) {
35 | val a = argmax(s.actions) { PI[s, it] }
36 | val possible = a.sample()
37 | s = possible.next
38 | sum += possible.reward
39 | print("${WindyGridworld.desc_move[a[0]]}$s")
40 | }
41 | println("\nreturn=$sum")//optimal=-12
42 | }
43 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Q-Learning.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.model.isNotTerminal
5 | import lab.mars.rl.problem.Blackjack
6 | import lab.mars.rl.problem.CliffWalking
7 | import lab.mars.rl.problem.WindyGridworld
8 | import lab.mars.rl.util.math.argmax
9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 |
12 | class `Test Optimal TD Q-Learning` {
13 | @Test
14 | fun `Blackjack constant alpha`() {
15 | val (prob) = Blackjack.make()
16 | val (π, V) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 100000)
17 | printBlackjack(prob, π, V)
18 | }
19 |
20 | @Test
21 | fun `Blackjack average alpha`() {
22 | val (prob) = Blackjack.make()
23 | val (π, V) = prob.QLearning(ε = 0.1, α = average_α(prob), episodes = 1000000)
24 | printBlackjack(prob, π, V)
25 | }
26 |
27 | @Test
28 | fun `WindyGridworld`() {
29 | val prob = WindyGridworld.make()
30 | val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
31 | var s = prob.started()
32 | var sum = 0.0
33 | print(s)
34 | while (s.isNotTerminal) {
35 | val a = argmax(s.actions) { π[s, it] }
36 | val possible = a.sample()
37 | s = possible.next
38 | sum += possible.reward
39 | print("${WindyGridworld.desc_move[a[0]]}$s")
40 | }
41 | println("\nreturn=$sum")//optimal=-14
42 | }
43 |
44 | @Test
45 | fun `WindyGridworld King's Move`() {
46 | val prob = WindyGridworld.make(true)
47 | val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
48 | var s = prob.started()
49 | var sum = 0.0
50 | print(s)
51 | while (s.isNotTerminal) {
52 | val a = argmax(s.actions) { π[s, it] }
53 | val possible = a.sample()
54 | s = possible.next
55 | sum += possible.reward
56 | print("${WindyGridworld.desc_king_move[a[0]]}$s")
57 | }
58 | println("\nreturn=$sum")//optimal=-6
59 | }
60 |
61 | @Test
62 | fun `Cliff Walking`() {
63 | val prob = CliffWalking.make()
64 | val (π) = prob.QLearning(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
65 | var s = prob.started()
66 | var sum = 0.0
67 | print(s)
68 | while (s.isNotTerminal) {
69 | val a = argmax(s.actions) { π[s, it] }
70 | val possible = a.sample()
71 | s = possible.next
72 | sum += possible.reward
73 | print("${WindyGridworld.desc_move[a[0]]}$s")
74 | }
75 | println("\nreturn=$sum")//optimal=-12
76 | }
77 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Optimal TD Sarsa.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.algo.average_α
4 | import lab.mars.rl.model.isNotTerminal
5 | import lab.mars.rl.problem.Blackjack
6 | import lab.mars.rl.problem.CliffWalking
7 | import lab.mars.rl.problem.WindyGridworld
8 | import lab.mars.rl.util.math.argmax
9 | import lab.mars.rl.util.printBlackjack
10 | import org.junit.Test
11 |
12 | class `Test Optimal TD Sarsa` {
13 |
14 | @Test
15 | fun `Blackjack constant alpha`() {
16 | val (prob) = Blackjack.make()
17 | val (π, V) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 100000)
18 | printBlackjack(prob, π, V)
19 | }
20 |
21 | @Test
22 | fun `Blackjack average alpha`() {
23 | val (prob) = Blackjack.make()
24 | val (π, V) = prob.sarsa(ε = 0.1, α = average_α(prob), episodes = 100000)
25 | printBlackjack(prob, π, V)
26 | }
27 |
28 | @Test
29 | fun `WindyGridworld`() {
30 | val prob = WindyGridworld.make()
31 | val (π) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
32 | var s = prob.started()
33 | var sum = 0.0
34 | print(s)
35 | while (s.isNotTerminal) {
36 | val a = argmax(s.actions) { π[s, it] }
37 | val possible = a.sample()
38 | s = possible.next
39 | sum += possible.reward
40 | print("${WindyGridworld.desc_move[a[0]]}$s")
41 | }
42 | println("\nreturn=$sum")//optimal=-14
43 | }
44 |
45 | @Test
46 | fun `WindyGridworld King's Move`() {
47 | val prob = WindyGridworld.make(true)
48 | val (π) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 1000)
49 | var s = prob.started()
50 | var sum = 0.0
51 | print(s)
52 | while (s.isNotTerminal) {
53 | val a = argmax(s.actions) { π[s, it] }
54 | val possible = a.sample()
55 | s = possible.next
56 | sum += possible.reward
57 | print("${WindyGridworld.desc_king_move[a[0]]}$s")
58 | }
59 | println("\nreturn=$sum")//optimal=-6
60 | }
61 |
62 | @Test
63 | fun `Cliff Walking`() {
64 | val prob = CliffWalking.make()
65 | val (π) = prob.sarsa(ε = 0.1, α = { _, _ -> 0.5 }, episodes = 100000)
66 | var s = prob.started()
67 | var sum = 0.0
68 | print(s)
69 | while (s.isNotTerminal) {
70 | val a = argmax(s.actions) { π[s, it] }
71 | val possible = a.sample()
72 | s = possible.next
73 | sum += possible.reward
74 | print("${WindyGridworld.desc_move[a[0]]}$s")
75 | }
76 | println("\nreturn=$sum")//optimal=-12
77 | }
78 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/algo/td/Test Prediction Tabular TD(0).kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.algo.td
2 |
3 | import lab.mars.rl.problem.Blackjack
4 | import lab.mars.rl.problem.RandomWalk
5 | import lab.mars.rl.problem.`1000-state RandomWalk`
6 | import lab.mars.rl.util.format
7 | import lab.mars.rl.util.printBlackjack
8 | import org.junit.Test
9 |
10 | class `Test Prediction Tabular TD(0)` {
11 | @Test
12 | fun `Blackjack`() {
13 | val (prob, π) = Blackjack.make()
14 | val V = prob.`Tabular TD(0)`(π = π, α = 0.1, episodes = 500000)
15 | printBlackjack(prob, π, V)
16 | }
17 |
18 | @Test
19 | fun `RandomWalk`() {
20 | val (prob, π) = RandomWalk.make()
21 | val V = prob.`Tabular TD(0)`(π = π, α = 0.1, episodes = 1000)
22 | prob.apply {
23 | for (s in states) {
24 | println("${V[s].format(2)} ")
25 | }
26 | }
27 | }
28 |
29 | @Test
30 | fun `1000-state RandomWalk`() {
31 | val (prob, π) = `1000-state RandomWalk`.make()
32 | val V = prob.`Tabular TD(0)`(π = π, α = 0.1, episodes = 10000)
33 | prob.apply {
34 | for (s in states) {
35 | println("${V[s].format(2)} ")
36 | }
37 | }
38 | }
39 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/problem/Test Mountain Car with Actor-Critic.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("UNCHECKED_CAST")
2 |
3 | package lab.mars.rl.problem
4 |
5 | import javafx.application.Application
6 | import lab.mars.rl.algo.policy_gradient.`Actor-Critic with Eligibility Traces (episodic)`
7 | import lab.mars.rl.model.impl.func.LinearFunc
8 | import lab.mars.rl.model.impl.func.SuttonTileCoding
9 | import lab.mars.rl.model.impl.mdp.DefaultAction
10 | import lab.mars.rl.util.matrix.SparseMatrix
11 | import lab.mars.rl.util.tuples.tuple2
12 | import lab.mars.rl.util.ui.MountainCarUI
13 | import org.junit.Test
14 | import java.util.concurrent.CountDownLatch
15 | import kotlin.concurrent.thread
16 |
17 | class `Test Mountain Car with Actor-Critic` {
18 | val numTilings = 8
19 | val positionScale = numTilings / (MountainCar.POSITION_MAX - MountainCar.POSITION_MIN)
20 | val velocityScale = numTilings / (MountainCar.VELOCITY_MAX - MountainCar.VELOCITY_MIN)
21 |
22 | fun func(): LinearFunc<tuple2<DoubleArray, IntArray>> {
23 | val feature = SuttonTileCoding(511, numTilings) { (s, a) ->
24 | s as MountainCar.CarState
25 | a as DefaultAction
26 | tuple2(doubleArrayOf(positionScale * s.position, velocityScale * s.velocity),
27 | intArrayOf(a.value))
28 | }
29 | return LinearFunc(feature)
30 | }
31 |
32 | @Test
33 | fun `Mountain Car UI`() {
34 | val prob = MountainCar.make()
35 |
36 | val policyFeature = SuttonTileCoding(511, numTilings) { (s, a) ->
37 | s as MountainCar.CarState
38 | a as DefaultAction
39 | tuple2(doubleArrayOf(positionScale * s.position, velocityScale * s.velocity),
40 | intArrayOf(a.value))
41 | }
42 | val h = LinearFunc(policyFeature)
43 | val emptyIntArray = IntArray(0)
44 | val valueFeature = SuttonTileCoding(511, numTilings) { (s) ->
45 | s as MountainCar.CarState
46 | tuple2(doubleArrayOf(positionScale * s.position, velocityScale * s.velocity), emptyIntArray)
47 | }
48 | val v = LinearFunc(valueFeature)
49 |
50 | val episodes = intArrayOf(1, 12, 104, 1000, 9000)
51 | val latch = CountDownLatch(1)
52 | thread {
53 | latch.await()
54 | prob.`Actor-Critic with Eligibility Traces (episodic)`(
55 | h = h, α_θ = 2e-9 / numTilings, λ_θ = 0.96,
56 | v = v, α_w = 0.6 / numTilings, λ_w = 0.96,
57 | episodes = 9000,
58 | z_maker = { m, n -> SparseMatrix(m, n) },
59 | stepListener = step@{ episode, step, s, a ->
60 | if (episode !in episodes) return@step
61 | MountainCarUI.render(episode, step, s as MountainCar.CarState, a as DefaultAction)
62 | })
63 | }
64 | MountainCarUI.after = { latch.countDown() }
65 | Application.launch(MountainCarUI::class.java)
66 | }
67 | }
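
Note: the z_maker hook above swaps a SparseMatrix in for the eligibility-trace vector. With tile coding only numTilings features fire per step, so the trace stays mostly zero and a dense vector would waste both memory and decay work. A self-contained sketch of the idea (not the repo's SparseMatrix API):

// Accumulating eligibility trace kept sparse in a map: decay existing
// entries, drop negligible ones, then bump the currently active features.
class SparseTrace(private val decay: Double) {
  val z = HashMap<Int, Double>()
  fun step(activeFeatures: IntArray) {
    val iter = z.entries.iterator()
    while (iter.hasNext()) {
      val e = iter.next()
      e.setValue(e.value * decay)       // z ← γλ z
      if (e.value < 1e-8) iter.remove() // keep the map sparse
    }
    for (i in activeFeatures)
      z[i] = (z[i] ?: 0.0) + 1.0        // accumulate on active tiles
  }
}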
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/problem/`Test FlyPlane Problem with REINFORCE`.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("UNCHECKED_CAST")
2 |
3 | package lab.mars.rl.problem
4 |
5 | import javafx.application.Application
6 | import lab.mars.rl.algo.func_approx.play
7 | import lab.mars.rl.algo.policy_gradient.`REINFORCE with Baseline (episodic)`
8 | import lab.mars.rl.model.impl.func.LinearFunc
9 | import lab.mars.rl.model.impl.func.SuttonTileCoding
10 | import lab.mars.rl.model.impl.mdp.DefaultAction
11 | import lab.mars.rl.model.impl.mdp.SoftmaxpPolicy
12 | import lab.mars.rl.util.tuples.tuple2
13 | import lab.mars.rl.util.ui.D2DGameUI
14 | import org.junit.Test
15 | import java.util.concurrent.CountDownLatch
16 | import kotlin.concurrent.thread
17 |
18 | class `Test FlyPlane Problem with REINFORCE` {
19 | @Test
20 | fun `Fly Plane UI`() {
21 | val numTilings = 10
22 | val emptyIntArray = IntArray(0)
23 | val valueFeature = SuttonTileCoding(1000, numTilings, doubleArrayOf(1 / 100.0, 1 / 100.0, 1 / 10.0, 1 / 10.0)) { (s) ->
24 | s as FlyPlane.PlaneState
25 | tuple2(doubleArrayOf(s.loc.x, s.loc.y, s.vel.x, s.vel.y), emptyIntArray)
26 | }
27 | val v = LinearFunc(valueFeature)
28 | val policyFeature = SuttonTileCoding(1000, numTilings, doubleArrayOf(1 / 100.0, 1 / 100.0, 1 / 10.0, 1 / 10.0)) { (s, a) ->
29 | s as FlyPlane.PlaneState
30 | a as DefaultAction
31 | tuple2(doubleArrayOf(s.loc.x, s.loc.y, s.vel.x, s.vel.y), intArrayOf(a.value))
32 | }
33 | val h = LinearFunc(policyFeature)
34 | val resolution = 100
35 | val unit = FlyPlane.fieldWidth / resolution
36 | val qvalue = Array(resolution) { Array(resolution + 1) { Double.NEGATIVE_INFINITY } }
37 | var accuG = 0.0
38 | var wins = 0.0
39 | var win_step = 0.0
40 | val episode_round = 100
41 | val step_round = 10
42 | val max_episode = 100000
43 | var episode_base = 0
44 | var animate = false
45 | val latch = CountDownLatch(1)
46 | thread {
47 | latch.await()
48 |
49 | while (true) {
50 | val prob = FlyPlane.makeRand()
51 | animate = false
52 | prob.`REINFORCE with Baseline (episodic)`(
53 | h = h, α_θ = 1e-12,
54 | v = v, α_w = 1e-3,
55 | episodes = max_episode
56 | )
57 | animate = true
58 | prob.play(
59 | π = SoftmaxpPolicy(h),
60 | episodes = 10,
61 | stepListener = { _, _, s, a ->
62 | Thread.sleep(Math.floor(1000 / 60.0).toLong())
63 | }
64 | )
65 | episode_base += max_episode
66 | }
67 | }
68 | D2DGameUI.apply {
69 | canvas_width = FlyPlane.fieldWidth
70 | canvas_height = FlyPlane.fieldWidth
71 | width = 1200.0
72 | height = 800.0
73 | charts.addAll(D2DGameUI.ChartDescription("average return per $episode_round episodes", "episode", "average return"),
74 | D2DGameUI.ChartDescription("win rate per $episode_round episodes", "episode", "win rate"),
75 | D2DGameUI.ChartDescription("average win step per $episode_round episodes", "episode", "average win step",
76 | yForceZeroInRange = false))
77 | afterStartup = { gc ->
78 | latch.countDown()
79 | }
80 | }
81 | Application.launch(D2DGameUI::class.java)
82 | }
83 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/TestBase.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util
2 |
3 | import ch.qos.logback.classic.Level
4 | import ch.qos.logback.classic.LoggerContext
5 | import lab.mars.rl.model.impl.mdp.IndexedMDP
6 | import lab.mars.rl.model.impl.mdp.IndexedPolicy
7 | import lab.mars.rl.model.impl.mdp.StateValueFunction
8 | import org.slf4j.Logger
9 | import org.slf4j.LoggerFactory
10 |
11 | /**
12 | *
13 | * Created on 2017-09-06.
14 | *
15 | *
16 | * @author wumo
17 | */
18 | val ANSI_BLACK = "\u001B[30m"
19 | val ANSI_RED = "\u001B[31m"
20 | val ANSI_GREEN = "\u001B[32m"
21 | val ANSI_YELLOW = "\u001B[33m"
22 | val ANSI_BLUE = "\u001B[34m"
23 | val ANSI_PURPLE = "\u001B[35m"
24 | val ANSI_CYAN = "\u001B[36m"
25 | val ANSI_WHITE = "\u001B[37m"
26 | val ANSI_RESET = "\u001B[0m"
27 | val ANSI_BLACK_BACKGROUND = "\u001B[40m"
28 | val ANSI_RED_BACKGROUND = "\u001B[41m"
29 | val ANSI_GREEN_BACKGROUND = "\u001B[42m"
30 | val ANSI_YELLOW_BACKGROUND = "\u001B[43m"
31 | val ANSI_BLUE_BACKGROUND = "\u001B[44m"
32 | val ANSI_PURPLE_BACKGROUND = "\u001B[45m"
33 | val ANSI_CYAN_BACKGROUND = "\u001B[46m"
34 | val ANSI_WHITE_BACKGROUND = "\u001B[47m"
35 |
36 | val colors = arrayOf(
37 | ANSI_WHITE_BACKGROUND + ANSI_WHITE,
38 | ANSI_BLACK_BACKGROUND + ANSI_BLACK,
39 | ANSI_RED_BACKGROUND + ANSI_RED,
40 | ANSI_GREEN_BACKGROUND + ANSI_GREEN,
41 | ANSI_YELLOW_BACKGROUND + ANSI_YELLOW,
42 | ANSI_BLUE_BACKGROUND + ANSI_BLUE,
43 | ANSI_PURPLE_BACKGROUND + ANSI_PURPLE,
44 | ANSI_CYAN_BACKGROUND + ANSI_CYAN)
45 |
46 | fun color(idx: Int): String {
47 | if (idx in 0..colors.lastIndex)
48 | return colors[idx]
49 | return idx.toString()
50 | }
51 |
52 | fun reset() = ANSI_RESET
53 | fun Double.format(digits: Int) = String.format("%.${digits}f", this)
54 |
55 | fun logLevel(level: Level) {
56 | val loggerContext: LoggerContext = LoggerFactory.getILoggerFactory() as LoggerContext
57 | val rootLogger = loggerContext.getLogger(Logger.ROOT_LOGGER_NAME)
58 | rootLogger.level = level
59 | }
60 |
61 | fun printBlackjack(prob: IndexedMDP, π: IndexedPolicy, V: StateValueFunction) {
62 | println("---------------------Usable Ace--------------------------")
63 | for (a in 9 downTo 0) {
64 | for (b in 0 until 10) {
65 | val s = prob.states[1, 1, b, a]
66 | print("${color(π.greedy(s)[0])} ${reset()}")
67 | }
68 | println()
69 | }
70 | println("---------------------No Usable Ace--------------------------")
71 | for (a in 9 downTo 0) {
72 | for (b in 0 until 10) {
73 | val s = prob.states[1, 0, b, a]
74 | print("${color(π.greedy(s)[0])} ${reset()}")
75 | }
76 | println()
77 | }
78 | for (a in 0 until 10) {
79 | for (b in 0 until 10)
80 | print("${V[1, 1, a, b].format(2)} ")
81 | println()
82 | }
83 | println("------------------------------------------------------------")
84 | for (a in 0 until 10) {
85 | for (b in 0 until 10)
86 | print("${V[1, 0, a, b].format(2)} ")
87 | println()
88 | }
89 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/TestIndex.kt:
--------------------------------------------------------------------------------
1 | @file:Suppress("UNCHECKED_CAST")
2 |
3 | package lab.mars.rl.util
4 |
5 | import lab.mars.rl.util.buf.DefaultIntBuf
6 | import lab.mars.rl.util.buf.Index
7 | import lab.mars.rl.util.buf.MultiIndex
8 | import org.junit.Assert.assertEquals
9 | import org.junit.Test
10 |
11 | /**
12 | *
13 | * Created on 2017-09-18.
14 | *
15 | *
16 | * @author wumo
17 | */
18 | class TestIndex {
19 | @Test
20 | fun `range forEach`() {
21 | val indices = arrayOf(DefaultIntBuf.of(0),
22 | DefaultIntBuf.of(1, 2, 3),
23 | DefaultIntBuf.of(4, 5, 6, 7))
24 | val _idx = MultiIndex(indices as Array<Index>)
25 | val expected = IntArray(8) { it }
26 | _idx.forEach(0, 0) { idx, value ->
27 | assertEquals(expected[idx], value)
28 | }
29 | _idx.forEach(4, 7) { idx, value ->
30 | assertEquals(expected[idx], value)
31 | }
32 | _idx.forEach { idx, value ->
33 | assertEquals(expected[idx], value)
34 | }
35 | _idx.forEach(2, 5) { idx, value ->
36 | assertEquals(expected[idx], value)
37 | }
38 | _idx.forEach(0, 5) { idx, value ->
39 | assertEquals(expected[idx], value)
40 | }
41 | }
42 | }
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/extensions.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util
2 |
3 | import kotlinx.coroutines.Deferred
4 | import kotlinx.coroutines.GlobalScope
5 | import kotlinx.coroutines.async
6 |
7 | inline fun <R> listOf(size: Int, init: (Int) -> R): ArrayList<R> {
8 | val list = ArrayList<R>()
9 | for (i in 0 until size)
10 | list += init(i)
11 | return list
12 | }
13 |
14 | inline fun <I, R> listOf(iter: Iterable<I>, init: (I) -> R): ArrayList<R> {
15 | val list = ArrayList<R>()
16 | for (i in iter)
17 | list += init(i)
18 | return list
19 | }
20 |
21 | inline fun <I1, I2, R> listOf(iter1: Iterable<I1>, iter2: Iterable<I2>, init: (I1, I2) -> R): List<R> {
22 | val list = mutableListOf<R>()
23 | for (i in iter1)
24 | for (j in iter2)
25 | list += init(i, j)
26 | return list
27 | }
28 |
29 | fun <R> asyncs(size: Int, init: suspend (Int) -> R): ArrayList<Deferred<R>> {
30 | val list = ArrayList<Deferred<R>>()
31 | for (i in 0 until size)
32 | list += GlobalScope.async {
33 | init(i)
34 | }
35 |
36 | return list
37 | }
38 |
39 | fun <I, R> asyncs(iter: Iterable<I>, init: suspend (I) -> R): ArrayList<Deferred<R>> {
40 | val list = ArrayList<Deferred<R>>()
41 | for (i in iter)
42 | list += GlobalScope.async {
43 | init(i)
44 | }
45 |
46 | return list
47 | }
48 |
49 | fun <I1, I2, R> asyncs(iter1: Iterable<I1>, iter2: Iterable<I2>, init: suspend (I1, I2) -> R): ArrayList<Deferred<R>> {
50 | val list = ArrayList<Deferred<R>>()
51 | for (i in iter1)
52 | for (j in iter2)
53 | list += GlobalScope.async { init(i, j) }
54 | return list
55 | }
56 |
57 | suspend fun <R> ArrayList<Deferred<R>>.await(process: suspend (R) -> Unit = {}) {
58 | forEach { process(it.await()) }
59 | }
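
Note: a usage sketch for the helpers above, in the shape the RMS tests use them (assumes the listOf/asyncs/await definitions above are in scope):

import kotlinx.coroutines.runBlocking

fun main() = runBlocking {
  val αs = listOf(5) { it * 0.1 } // the custom listOf above: [0.0, 0.1, 0.2, 0.3, 0.4]
  var total = 0.0
  asyncs(αs) { α -> α * α }       // one coroutine per α
      .await { total += it }      // awaits in order, folding the results
  println(total)                  // ≈ 0.30
}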
--------------------------------------------------------------------------------
/src/test/kotlin/lab/mars/rl/util/range/DoubleProgression.kt:
--------------------------------------------------------------------------------
1 | package lab.mars.rl.util.range
2 |
3 | import kotlin.math.sign
4 |
5 | class DoubleProgression(val start: Double,
6 | val endInclusive: Double,
7 | val step: Double): Iterable<Double> {
8 | override fun iterator() = object: Iterator<Double> {
9 | var current = start
10 | override fun hasNext() = step.sign * (endInclusive - current) >= 0
11 |
12 | override fun next() = current.apply { current += step }
13 | }
14 | }
15 |
16 | operator fun Double.rangeTo(that: Double)
17 | = DoubleProgression(this, that, 0.1)
18 |
19 | infix fun DoubleProgression.step(step: Double)
20 | = DoubleProgression(start, endInclusive, step)
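
Note: a usage sketch for the progression above. On raw doubles, 0.0..1.0 resolves to the stdlib member rangeTo (members win over extensions in Kotlin), so the constructor is called explicitly here; also, since next() accumulates by repeated addition, steps that are not exactly representable in binary can drift at the endpoint:

fun main() {
  val xs = DoubleProgression(0.0, 1.0, 0.1) step 0.25
  for (x in xs) println(x) // 0.0, 0.25, 0.5, 0.75, 1.0 (0.25 is exact in binary)
}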
--------------------------------------------------------------------------------
/src/test/resources/Figure 10.1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 10.1.PNG
--------------------------------------------------------------------------------
/src/test/resources/Figure 10.4.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 10.4.PNG
--------------------------------------------------------------------------------
/src/test/resources/Figure 12.10.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.10.PNG
--------------------------------------------------------------------------------
/src/test/resources/Figure 12.11.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.11.PNG
--------------------------------------------------------------------------------
/src/test/resources/Figure 12.3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.3.PNG
--------------------------------------------------------------------------------
/src/test/resources/Figure 12.6.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.6.PNG
--------------------------------------------------------------------------------
/src/test/resources/Figure 12.8.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 12.8.PNG
--------------------------------------------------------------------------------
/src/test/resources/Figure 7.2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wumo/Reinforcement-Learning-An-Introduction/d419efccdca9a7f34b7805929b41651b172a3ee7/src/test/resources/Figure 7.2.PNG
--------------------------------------------------------------------------------
/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 | <configuration> <!-- original XML markup was lost in extraction; minimal placeholder -->
2 | <root level="INFO"/>
3 | </configuration>