├── data
│   └── pacman
│       └── Q.json
├── .gitignore
├── project
│   ├── build.properties
│   └── plugins.sbt
├── .scalafmt.conf
├── src
│   └── main
│       └── scala
│           └── rl
│               ├── core
│               │   ├── package.scala
│               │   ├── ActionResult.scala
│               │   ├── AgentBehaviour.scala
│               │   ├── StateConversion.scala
│               │   ├── Environment.scala
│               │   └── QLearning.scala
│               ├── pacman
│               │   ├── training
│               │   │   ├── QKeyValue.scala
│               │   │   └── PacmanTraining.scala
│               │   ├── ui
│               │   │   └── PacmanUI.scala
│               │   └── core
│               │       └── PacmanProblem.scala
│               ├── gridworld
│               │   ├── core
│               │   │   └── GridworldProblem.scala
│               │   └── ui
│               │       └── GridworldUI.scala
│               └── polecart
│                   ├── ui
│                   │   ├── HumanUI.scala
│                   │   └── QLearningUI.scala
│                   └── core
│                       └── PoleBalancingProblem.scala
├── pacman.html
├── polecart-human.html
├── README.md
├── gridworld.html
├── index.html
└── polecart-qlearning.html

/data/pacman/Q.json:
--------------------------------------------------------------------------------
[]

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
pacman-training/

--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=1.2.3

--------------------------------------------------------------------------------
/.scalafmt.conf:
--------------------------------------------------------------------------------
align = true
maxColumn = 100

--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("org.scala-js" % "sbt-scalajs" % "0.6.25")
addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.5.1")

--------------------------------------------------------------------------------
/src/main/scala/rl/core/package.scala:
--------------------------------------------------------------------------------
package rl

package object core {

  // In reinforcement learning the reward is always numeric
  type Reward = Double

}

--------------------------------------------------------------------------------
/src/main/scala/rl/core/ActionResult.scala:
--------------------------------------------------------------------------------
package rl.core

/**
  * The results of the agent taking an action:
  * it receives a reward and ends up in a new state.
  */
case class ActionResult[State](reward: Reward, nextState: State)

--------------------------------------------------------------------------------
/src/main/scala/rl/core/AgentBehaviour.scala:
--------------------------------------------------------------------------------
package rl.core

trait AgentBehaviour[AgentData, State, Action] {

  /**
    * Given an agent and the current state, asks the agent to choose the next action.
    *
    * Returns two things:
    *
    * 1. the action that the agent chose
    * 2. a function that, given the result of taking that action,
    *    uses it to improve the agent's policy and thus returns a new version of the agent
    */
  def chooseAction(agentData: AgentData,
                   state: State,
                   validActions: List[Action]): (Action, ActionResult[State] => AgentData)

}

--------------------------------------------------------------------------------
/src/main/scala/rl/core/StateConversion.scala:
--------------------------------------------------------------------------------
package rl.core

trait StateConversion[EnvState, AgentState] {

  /**
    * Convert from the "true", complete state as known by the environment
    * into a simplified state that we give to the agent.
    *
    * This is a chance to do two things:
    *
    * 1. If the problem includes any constraints that say the agent should have incomplete
    *    knowledge of the environment, we can encode that here.
    *
    * 2. We can discard some information in order to reduce the agent's state space,
    *    e.g. by bucketing a large number of environment states into a single agent state.
    */
  def convertState(envState: EnvState): AgentState

}

--------------------------------------------------------------------------------
/src/main/scala/rl/core/Environment.scala:
--------------------------------------------------------------------------------
package rl.core

trait Environment[State, Action] {

  /**
    * Given the current state, what are the legal actions the agent can take?
    */
  def possibleActions(currentState: State): List[Action]

  /**
    * Given the current state and the action chosen by the agent,
    * what state does the agent move into and what reward does it get?
    *
    * Things to note:
    * - The reward might be positive, negative or zero.
    * - The next state might be the same as the current state.
    * - Both the state transition function and the reward function may be stochastic,
    *   meaning they follow some probability distribution and do not always
    *   give the same output for a given input.
    */
  def step(currentState: State, actionTaken: Action): (State, Reward)

  /**
    * Is the given state terminal or not?
    * For continuous (non-episodic) problems, this will always be false.
    */
  def isTerminal(state: State): Boolean

}

--------------------------------------------------------------------------------
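To show how these core traits fit together, here is a minimal sketch of an episode loop. It is not a file from this repository; the name runEpisode and the way the pieces are wired together are assumptions made purely for illustration.

import rl.core._

import scala.annotation.tailrec

object EpisodeLoopSketch {

  // Illustrative only: drive one episode, asking the agent for an action at each step
  // and feeding the observed reward and next state back into it.
  def runEpisode[EnvState, AgentState, Action, AgentData](
      env: Environment[EnvState, Action],
      conv: StateConversion[EnvState, AgentState],
      behaviour: AgentBehaviour[AgentData, AgentState, Action],
      initialAgent: AgentData,
      initialState: EnvState
  ): AgentData = {
    @tailrec
    def loop(agent: AgentData, envState: EnvState): AgentData =
      if (env.isTerminal(envState)) agent
      else {
        val agentState             = conv.convertState(envState)
        val validActions           = env.possibleActions(envState)
        val (action, updateAgent)  = behaviour.chooseAction(agent, agentState, validActions)
        val (nextEnvState, reward) = env.step(envState, action)
        // the returned function folds the observed result back into the agent's policy
        val improvedAgent = updateAgent(ActionResult(reward, conv.convertState(nextEnvState)))
        loop(improvedAgent, nextEnvState)
      }
    loop(initialAgent, initialState)
  }
}

For a continuous problem (where isTerminal never returns true), a driver like this would instead stop after a fixed number of steps.

--------------------------------------------------------------------------------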
/src/main/scala/rl/pacman/training/QKeyValue.scala:
--------------------------------------------------------------------------------
package rl.pacman.training

import io.circe.{Decoder, Encoder, KeyDecoder, KeyEncoder}
import io.circe.generic.auto._
import io.circe.generic.semiauto._
import rl.pacman.core.PacmanProblem.{AgentState, Move}

/*
 This is just an artifact of the way we encode the Q-values as JSON.
 Q is a Map[AgentState, Map[Move, Double]], so it has non-String keys.
 When we write it to the JSON file we turn it into a List[(AgentState, Map[Move, Double])].
 */
case class QKeyValue(key: AgentState, value: Map[Move, Double])

object QKeyValue {

  implicit val moveEncoder: KeyEncoder[Move] = (move: Move) => move.toString
  implicit val moveDecoder: KeyDecoder[Move] = {
    case "Left"  => Some(Move.Left)
    case "Right" => Some(Move.Right)
    case "Up"    => Some(Move.Up)
    case "Down"  => Some(Move.Down)
    case _       => None
  }

  implicit val encoder: Encoder[QKeyValue] = deriveEncoder
  implicit val decoder: Decoder[QKeyValue] = deriveDecoder

}

--------------------------------------------------------------------------------
/pacman.html:
--------------------------------------------------------------------------------
(Only the page skeleton survived tag-stripping here: the demo's maze and Q-value tables are
rendered at runtime by the compiled Scala.js code. The same is true of the other demo pages
that followed in the dump; the only legible residue was the gridworld legend, "A → jump to A',
reward = 10" and "B → jump to B', reward = 5".)

--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
This site contains the demos for my 'Reinforcement Learning in Scala' talk.

The slides for the talk are available here.

The source code for all the demos is available on GitHub.

There are 3 demos, all of which use the same RL algorithm known as Q-learning.
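As a rough illustration of that shared algorithm, a single tabular Q-learning update looks like the sketch below. This is not the repo's QLearning.scala; the function name, the Map-based table and the default alpha and gamma values are assumptions.

object QLearningSketch {

  // One tabular Q-learning update:
  //   Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max over a' of Q(s', a') - Q(s, a))
  def qLearningUpdate[S, A](
      q: Map[(S, A), Double],
      state: S,
      action: A,
      reward: Double,
      nextState: S,
      nextActions: List[A],
      alpha: Double = 0.1, // learning rate (assumed value)
      gamma: Double = 0.9  // discount factor (assumed value)
  ): Map[(S, A), Double] = {
    val current = q.getOrElse((state, action), 0.0)
    val maxNext =
      if (nextActions.isEmpty) 0.0 // no actions available: treat as terminal
      else nextActions.map(a => q.getOrElse((nextState, a), 0.0)).max
    q.updated((state, action), current + alpha * (reward + gamma * maxNext - current))
  }
}

The three demos differ only in how they define their states, actions and rewards; the update rule itself stays the same.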
Gridworld

This is a continuous (non-episodic) problem with very simple rules:

- The agent occupies one cell of a small grid and can move up, down, left or right.
- If the agent is in the square marked A and moves in any direction, it jumps to A' and gets a reward of 10.
- If the agent is in the square marked B and moves in any direction, it jumps to B' and gets a reward of 5.

The sketch below shows how rules like these can be expressed with the Environment trait.
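It is not the repo's GridworldProblem: the grid size, the coordinates chosen for A, A', B and B', and the -1 penalty for walking off the edge are all assumptions, chosen only to illustrate the shape of an Environment implementation.

import rl.core.{Environment, Reward}

final case class Cell(x: Int, y: Int)

sealed trait GridMove
case object Up    extends GridMove
case object Down  extends GridMove
case object Left  extends GridMove
case object Right extends GridMove

// Sketch only: cell positions, grid size and the edge penalty are made up.
object SketchGridworld extends Environment[Cell, GridMove] {
  private val A: Cell      = Cell(1, 0)
  private val APrime: Cell = Cell(1, 4)
  private val B: Cell      = Cell(3, 0)
  private val BPrime: Cell = Cell(3, 2)

  def possibleActions(currentState: Cell): List[GridMove] = List(Up, Down, Left, Right)

  def step(currentState: Cell, actionTaken: GridMove): (Cell, Reward) =
    currentState match {
      case `A` => (APrime, 10.0) // any move from A jumps to A' for +10
      case `B` => (BPrime, 5.0)  // any move from B jumps to B' for +5
      case _ =>
        val moved = actionTaken match {
          case Up    => currentState.copy(y = currentState.y - 1)
          case Down  => currentState.copy(y = currentState.y + 1)
          case Left  => currentState.copy(x = currentState.x - 1)
          case Right => currentState.copy(x = currentState.x + 1)
        }
        val onGrid = moved.x >= 0 && moved.x < 5 && moved.y >= 0 && moved.y < 5
        if (onGrid) (moved, 0.0)
        else (currentState, -1.0) // assumption: bumping into the edge costs -1
    }

  // the gridworld is continuous (non-episodic), so no state is terminal
  def isTerminal(state: Cell): Boolean = false
}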
Of course, the optimal policy is to always move towards A in order to pick up the reward of 10.
If you run the demo, you should see the agent gradually learn this policy.

It may get stuck in a local optimum (i.e. preferring the B cell) for a while,
but it is guaranteed to eventually converge on the optimal policy.
This is because the agent constantly explores the state space using the ε-greedy algorithm.
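ε-greedy simply means: with a small probability ε the agent picks a random action (exploration), and otherwise it picks the action with the highest current Q-value (exploitation). A sketch follows; the value of ε and the shape of the Q-value map are assumptions rather than the repo's implementation.

import scala.util.Random

object EpsilonGreedySketch {

  def chooseAction[A](qValuesForState: Map[A, Double],
                      validActions: List[A],
                      epsilon: Double = 0.1, // assumed exploration rate
                      rng: Random = new Random): A =
    if (rng.nextDouble() < epsilon)
      validActions(rng.nextInt(validActions.size)) // explore: any valid action
    else
      validActions.maxBy(a => qValuesForState.getOrElse(a, 0.0)) // exploit: best-known action
}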
The big table under the grid shows the agent's current Q(s, a) for all state-action pairs.
This is the agent's estimate of the value of being in state s and taking action a.

The smaller table shows the same information summarised as a policy.
In other words, for a given state, it shows which action(s) the agent currently believes to be the best.
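Deriving that smaller table from the big one is just an argmax over the actions of each state. Here is a sketch, using the nested-Map shape described in QKeyValue.scala (Q as a Map of states to Maps of action values); the helper name is illustrative.

object PolicySketch {

  // For each state, keep the action(s) with the highest estimated value
  // (ties are kept, hence a Set, which is why the demo can show several best actions).
  def greedyPolicy[S, A](q: Map[S, Map[A, Double]]): Map[S, Set[A]] =
    q.collect {
      case (state, actionValues) if actionValues.nonEmpty =>
        val best = actionValues.values.max
        state -> actionValues.collect { case (a, v) if v == best => a }.toSet
    }
}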
Pole balancing

This episodic problem is a classic in the RL literature.

At every time step the agent must push the cart either to the left or the right.
The goal is to stop the pole from toppling too far either to the left or the right,
whilst also ensuring the cart does not crash into the walls.

The rules are as follows:
- the agent's only two actions are to push the cart to the left or to the right;
- the episode ends as soon as the pole topples too far to either side, or the cart crashes into one of the walls.

It's fascinating to see how quickly the agent learns, especially bearing in mind:

- the agent is given no model of the physics involved; it learns purely from trial and error, using the rewards it receives;
- it only ever sees a coarsely discretised version of the true state: cart velocity and pole velocity are each bucketed into three ranges, and the pole angle into six (see the sketch after this section).

To get a feel for the problem, you might want to try it yourself first.
Use the Left and Right arrow keys on your keyboard to move the cart.

Next you can watch the agent learn.
Use the buttons to run through a single time step, a single episode, or continuously.
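The coarse discretisation mentioned above comes from bucketing continuous physical quantities before handing them to the agent. Below is a sketch of that idea for the pole angle alone, using the six bucket names from the demo's state table; the type names, field names and numeric thresholds are assumptions, not the repo's pole-balancing code.

import rl.core.StateConversion

// Hypothetical "full" state as the environment might see it (field names are assumptions).
final case class FullPoleState(cartVelocity: Double, poleAngle: Double, poleVelocity: Double)

// The six angle buckets shown in the demo's state table.
sealed trait AngleBucket
case object VeryLeft      extends AngleBucket
case object QuiteLeft     extends AngleBucket
case object SlightlyLeft  extends AngleBucket
case object SlightlyRight extends AngleBucket
case object QuiteRight    extends AngleBucket
case object VeryRight     extends AngleBucket

object AngleBucketing extends StateConversion[FullPoleState, AngleBucket] {

  // The thresholds below are made up; cart velocity and pole velocity would be
  // bucketed in the same spirit (fast left / slow / fast right).
  def convertState(envState: FullPoleState): AngleBucket = {
    val a = envState.poleAngle // radians
    if (a < -0.1) VeryLeft
    else if (a < -0.03) QuiteLeft
    else if (a < 0.0) SlightlyLeft
    else if (a < 0.03) SlightlyRight
    else if (a < 0.1) QuiteRight
    else VeryRight
  }
}

Fewer, coarser buckets mean a smaller Q-table and faster learning, at the cost of a less precise policy.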
Pacman

This one is an exercise for the reader.

The demo shows a very "dumb" agent. Its state space is enormous, so it has no chance of doing any meaningful learning.

See if you can improve the agent by redesigning its state space and putting it through some training.
A sketch of one possible direction for that redesign follows below.

Take a look at the README for more details.
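To give a flavour of what redesigning the state space might look like, here is one possible shape for it. Every type, field and threshold below is hypothetical (the real game state and AgentState live in PacmanProblem.scala), and a good solution will need rather more care than this.

import rl.core.StateConversion

// Hypothetical full game state; the real one lives in rl.pacman.core.PacmanProblem.
final case class Position(x: Int, y: Int)
final case class GameState(pacman: Position, ghosts: List[Position], food: Set[Position])

sealed trait Danger
case object GhostNearby extends Danger
case object Safe        extends Danger

// A deliberately tiny agent state: the direction of the nearest food plus a danger flag.
final case class SimplifiedState(foodDx: Int, foodDy: Int, danger: Danger)

object SimplifyPacman extends StateConversion[GameState, SimplifiedState] {

  private def manhattan(a: Position, b: Position): Int =
    math.abs(a.x - b.x) + math.abs(a.y - b.y)

  // Assumes there is still food left in the maze.
  def convertState(env: GameState): SimplifiedState = {
    val nearestFood = env.food.minBy(manhattan(env.pacman, _))
    val danger =
      if (env.ghosts.exists(g => manhattan(env.pacman, g) <= 2)) GhostNearby else Safe
    SimplifiedState(
      foodDx = math.signum(nearestFood.x - env.pacman.x), // direction only, not distance
      foodDy = math.signum(nearestFood.y - env.pacman.y),
      danger = danger
    )
  }
}

The point is simply that the agent's state space shrinks from every possible board configuration down to a handful of combinations, which is what makes tabular Q-learning feasible again.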
--------------------------------------------------------------------------------
(Remainder of the dump: only empty table skeletons from the pole-balancing pages survived
tag-stripping. Their headings show the agent's discretised state space: cart velocity and
pole velocity each bucketed as "fast left / slow / fast right", and pole angle bucketed as
"very left / quite left / slightly left / slightly right / quite right / very right".
The cells themselves are filled in at runtime by the compiled Scala.js code.)