├── .gitignore ├── .travis.yml ├── COPYING ├── LICENSE ├── README.md ├── build.sbt ├── docs ├── build.sbt └── src │ └── main │ └── paradox │ ├── back-end.md │ ├── experimenting.md │ ├── front-end.md │ ├── images │ ├── cluster-nodes.png │ ├── cluster-nodes.svg │ ├── master-worker-message-flow.png │ ├── master-worker-message-flow.svg │ ├── singleton-manager.png │ └── singleton-manager.svg │ ├── index.md │ ├── master-in-detail.md │ ├── next-steps.md │ └── worker.md ├── project ├── build.properties ├── default.properties ├── giter8.sbt └── paradox.sbt └── src └── main └── g8 ├── build.sbt ├── default.properties ├── project └── build.properties └── src ├── main ├── resources │ ├── application.conf │ └── logback.xml └── scala │ └── worker │ ├── FrontEnd.scala │ ├── Main.scala │ ├── Master.scala │ ├── MasterSingleton.scala │ ├── MasterWorkerProtocol.scala │ ├── Work.scala │ ├── WorkExecutor.scala │ ├── WorkResultConsumer.scala │ ├── WorkState.scala │ └── Worker.scala └── test └── scala └── worker └── DistributedWorkerSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *# 2 | *.iml 3 | *.ipr 4 | *.iws 5 | *.pyc 6 | *.tm.epoch 7 | *.vim 8 | */project/boot 9 | */project/build/target 10 | */project/project.target.config-classes 11 | *-shim.sbt 12 | *~ 13 | .#* 14 | .*.swp 15 | .DS_Store 16 | .cache 17 | .cache 18 | .classpath 19 | .codefellow 20 | .ensime* 21 | .eprj 22 | .history 23 | .idea 24 | .manager 25 | .multi-jvm 26 | .project 27 | .scala_dependencies 28 | .scalastyle 29 | .settings 30 | .tags 31 | .tags_sorted_by_file 32 | .target 33 | .worksheet 34 | Makefile 35 | TAGS 36 | _akka_cluster/ 37 | _dump 38 | _mb 39 | activemq-data 40 | akka-contrib/rst_preprocessed/ 41 | akka-docs/_build/ 42 | akka-docs/exts/ 43 | akka-docs/rst_preprocessed/ 44 | akka-osgi/src/main/resources/*.conf 45 | akka.sublime-project 46 | akka.sublime-workspace 47 | akka.tmproj 48 | beanstalk/ 49 | bin/ 50 | data 51 | deploy/*.jar 52 | etags 53 | lib_managed 54 | logs 55 | manifest.mf 56 | mongoDB/ 57 | multiverse.log 58 | out 59 | project/akka-build.properties 60 | project/boot/* 61 | project/plugins/project 62 | redis/ 63 | reports 64 | run-codefellow 65 | schoir.props 66 | semantic.cache 67 | src_managed 68 | storage 69 | tags 70 | target 71 | tm*.lck 72 | tm*.log 73 | tm.out 74 | worker*.log 75 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.12.6 4 | jdk: 5 | - openjdk8 6 | 7 | script: 8 | - sbt -Dfile.encoding=UTF8 -J-XX:ReservedCodeCacheSize=256M test docs/paradox 9 | 10 | before_cache: 11 | - find $HOME/.ivy2 -name "ivydata-*.properties" -print -delete 12 | - find $HOME/.sbt -name "*.lock" -print -delete 13 | 14 | cache: 15 | directories: 16 | - $HOME/.ivy2/cache 17 | - $HOME/.sbt/boot 18 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. 
CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Activator Template by Typesafe
2 |
3 | Licensed under Public Domain (CC0)
4 |
5 | To the extent possible under law, the person who associated CC0 with
6 | this Activator Template has waived all copyright and related or neighboring
7 | rights to this Activator Template.
8 |
9 | You should have received a copy of the CC0 legalcode along with this
10 | work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Akka Distributed Workers Quickstart
2 |
3 | ## Project has moved
4 |
5 | This example has moved to [akka-samples](https://github.com/akka/akka-samples/tree/2.6/akka-sample-distributed-workers-scala)
6 | and is not maintained here anymore.
7 |
8 | ## Template license
9 |
10 | Written in 2017 by Lightbend, Inc.
11 |
12 | To the extent possible under law, the author(s) have dedicated all copyright and related and
13 | neighboring rights to this template to the public domain worldwide. This template is distributed
14 | without any warranty. See http://creativecommons.org/publicdomain/zero/1.0/.
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | // This build is for this Giter8 template.
2 | // To test the template run `g8` or `g8Test` from the sbt session.
3 | // See http://www.foundweekends.org/giter8/testing.html#Using+the+Giter8Plugin for more details.
4 | lazy val root = (project in file("."))
5 |   .settings(
6 |     name := "akka-scala-seed",
7 |     test in Test := {
8 |       val _ = (g8Test in Test).toTask("").value
9 |     },
10 |     scriptedLaunchOpts ++= List("-Xms1024m", "-Xmx1024m", "-XX:ReservedCodeCacheSize=128m", "-XX:MaxPermSize=256m", "-Xss2m", "-Dfile.encoding=UTF-8")
11 |   )
12 |
13 | // Documentation for this project:
14 | // sbt "project docs" "~ paradox"
15 | // open docs/target/paradox/site/main/index.html
16 | lazy val docs = (project in file("docs"))
17 |   .enablePlugins(ParadoxPlugin)
18 |
19 | resolvers in ThisBuild += Resolver.url("typesafe", url("http://repo.typesafe.com/typesafe/ivy-releases/"))(Resolver.ivyStylePatterns)
20 |
--------------------------------------------------------------------------------
/docs/build.sbt:
--------------------------------------------------------------------------------
1 | // Uses the out of the box generic theme.
2 | paradoxTheme := Some(builtinParadoxTheme("generic"))
3 |
4 | scalaVersion := "2.12.6"
5 |
6 | paradoxProperties in Compile ++= Map(
7 |   "snip.g8root.base_dir" -> "../../../../src/main/g8",
8 |   "snip.g8src.base_dir" -> "../../../../src/main/g8/src/main/",
9 |   "snip.g8srctest.base_dir" -> "../../../../src/main/g8/src/test/"
10 | )
--------------------------------------------------------------------------------
/docs/src/main/paradox/back-end.md:
--------------------------------------------------------------------------------
1 | # The Back-End Nodes
2 |
3 | The back-end nodes host the `Master` actor, which manages work, keeps track of available workers, and notifies registered workers when new work is available. The single `Master` actor is the heart of the solution, with built-in resilience provided by the [Akka Cluster Singleton](http://doc.akka.io/docs/akka/current/scala/guide/modules.html#cluster-singleton).
4 |
5 | ## The Master singleton
6 |
7 | The [Cluster Singleton](http://doc.akka.io/docs/akka/current/scala/guide/modules.html#cluster-singleton) tool in Akka makes sure that an actor runs on only one node, within the subset of nodes marked with the role `back-end`, at any given time. It will run on the oldest back-end node. If the node on which the 'Master' is running is removed from the cluster, Akka starts a new `Master` on the next oldest node. Other nodes in the cluster interact with the `Master` through the `ClusterSingletonProxy` without knowing its explicit location. You can see this interaction in the `FrontEnd` and `Worker` actors.
8 |
9 | If the node running the master crashes and is removed from the cluster, another master actor is automatically started on the node that is then the oldest.
10 |
11 | ![Managed Singleton](images/singleton-manager.png)
12 |
13 | You can see how the master singleton is started in the method `startSingleton`
14 | in `MasterSingleton`:
15 |
16 | @@snip [MasterSingleton.scala]($g8src$/scala/worker/MasterSingleton.scala) { #singleton }
17 |
18 | The singleton manager accepts the `Props` of the actual singleton actor, as well as configuration that lets us specify that the singleton actor should only run on nodes with the role `back-end`.
19 |
20 | The proxy is similarly configured, with the role where the singleton will be running, and also a path where the singleton manager actor can be found:
21 |
22 | @@snip [MasterSingleton.scala]($g8src$/scala/worker/MasterSingleton.scala) { #proxy }
23 |
24 |
25 | If the node hosting the master is lost, the state of the master is recovered on the standby node through event sourcing.
26 |
27 | An alternative to event sourcing and the singleton master would be to keep track of all jobs in a central database, but that is more complicated and not as scalable. At the end of the tutorial we will describe how multiple masters can be supported with a small adjustment.
28 |
29 | Let's now explore the implementation of the `Master` actor in depth.
30 |
--------------------------------------------------------------------------------
/docs/src/main/paradox/experimenting.md:
--------------------------------------------------------------------------------
1 | ## Experimenting with the example
2 |
3 | When you run the application without parameters, it starts a six-node cluster within the same JVM and starts a Cassandra database. It can be more interesting to run the nodes in separate processes. Open four terminal windows.
4 |
5 | In the first terminal window, start the Cassandra database with the following command:
6 |
7 | ```bash
8 | sbt "runMain worker.Main cassandra"
9 | ```
10 |
11 | The Cassandra database will stay alive as long as you do not kill this process; when you want to stop it you can do that with `Ctrl + C`. Without the database the back-end nodes will not be able to start up.
12 |
13 | You could also run your own local installation of Cassandra, provided that it runs on the default port on localhost and does not require a password.
14 |
15 |
16 | With the database running, go to the second terminal window and start the first seed node with the following command:
17 |
18 | ```bash
19 | sbt "runMain worker.Main 2551"
20 | ```
21 |
22 | 2551 corresponds to the port of the first seed-nodes element in the configuration. In the log output you see that the cluster node has been started and has changed status to 'Up'.
23 |
24 | In the third terminal window, start the front-end node with the following command:
25 |
26 | ```bash
27 | sbt "runMain worker.Main 3001"
28 | ```
29 |
30 | 3001 corresponds to the port of the node. In the log output you see that the cluster node has been started, joins the 2551 node and becomes a member of the cluster. Its status changes to 'Up'.
31 |
32 | Switch over to the second terminal window and see in the log output that the member joined. So far, no `Worker` has been started, i.e. jobs are produced and accepted but not processed.
33 |
34 | In the fourth terminal window, start a worker node with the following command:
35 |
36 | ```bash
37 | sbt "runMain worker.Main 5001 3"
38 | ```
39 |
40 | 5001 means the node will be a worker node, and the second parameter `3` means that it will host three worker actors.
41 |
42 | Look at the log output in the different terminal windows. In the third window (the front-end node) you should see that the produced jobs are processed and logged as `"Consumed result"`.
43 |
44 | Take a look at the logging that is done in `FrontEnd`, `Master` and `Worker`. Identify the corresponding log entries in the three terminal windows that run Akka nodes.
45 |
46 | Shut down the worker node (fourth terminal window) with `ctrl-c`. Observe how the `"Consumed result"` logs in the front-end node (third terminal window) stop. Start the worker node again.
47 |
48 | ```bash
49 | sbt "runMain worker.Main 5001 3"
50 | ```
51 |
52 | You can also start more such worker nodes in new terminal windows.
53 |
54 | You can start more cluster back-end nodes using port numbers between 2000-2999.
55 |
56 | ```bash
57 | sbt "runMain worker.Main 2552"
58 | ```
59 |
60 | The nodes with port 2551 to 2554 are configured to be used as "seed nodes" in this sample; if you shut down all of them or start none of them, the other nodes will not know how to join the cluster. If all four are shut down and 2551 is started, it will join itself and form a new cluster.
61 |
62 | As long as one of the four seed nodes is alive the cluster will keep working. You can read more about this in the [Akka documentation section on seed nodes](http://doc.akka.io/docs/akka/current/scala/cluster-usage.html).
63 |
64 | You can start more cluster front-end nodes using port numbers between 3000-3999:
65 |
66 | ```bash
67 | sbt "runMain worker.Main 3002"
68 | ```
69 |
70 | Any port outside these ranges creates a worker node, for which you can also play around with the number of worker actors using the second parameter.
71 |
72 | ```bash
73 | sbt "runMain worker.Main 5009 4"
74 | ```
75 |
76 | ## The journal
77 |
78 | The files of the Cassandra database are saved in the `target` directory, and when you restart the application the state is recovered. You can clean the state with:
79 |
80 | ```bash
81 | sbt clean
82 | ```
--------------------------------------------------------------------------------
/docs/src/main/paradox/front-end.md:
--------------------------------------------------------------------------------
1 | # Front-End Nodes
2 |
3 | Typically in systems built with Akka, clients submit requests using a RESTful API. Either [Akka HTTP](http://doc.akka.io/docs/akka-http/current/scala/http/introduction.html) or [Play Framework](https://www.playframework.com) is a great choice for implementing an HTTP API for the front-end. To limit the scope of this example, we have chosen to emulate client activity with two ordinary actors:
4 |
5 | * The `FrontEnd` actor generates payloads at random intervals and sends them to the 'Master' actor.
6 | * The `WorkResultConsumerActor` consumes results and logs them.
7 |
8 |
9 | The `FrontEnd` actor only concerns itself with posting workloads, and does not care when the work has been completed. When a workload has been processed successfully and the result has been passed back to the `Master` actor, the `Master` publishes the result to all interested cluster nodes through Distributed Pub-Sub.
10 |
11 | The `WorkResultConsumerActor` subscribes to the completion events and logs when a workload has completed.
12 |
13 | Now, let's take a look at the code that accomplishes this front-end behavior.
14 |
15 | ## The Front-end Actor
16 |
17 | @@snip [FrontEnd.scala]($g8src$/scala/worker/FrontEnd.scala) { #front-end }
18 |
19 | Note in the source code that as the 'FrontEnd' actor starts up, it:
20 |
21 | 1. Schedules 'Tick' messages to itself.
22 | 1. Each 'Tick' message:
23 |     1. Triggers creation of a new 'Work' message.
24 |     1. Sends the 'Work' message to the 'Master' actor of a 'back-end' node.
25 |     1. Switches to a new 'busy' behavior.
26 |
27 | As you can see, the `FrontEnd` actor schedules `Tick` messages to itself when starting up. The `Tick` message then triggers creation of a new `Work`, sending the work to the `Master` actor on a `back-end` node and switching to a new `busy` behavior.
28 |
29 | The cluster contains one `Master` actor. The `FrontEnd` actor does not need to know the exact location because it sends work to the master using the `ClusterSingletonProxy`.
30 |
31 | The 'Master' actor can accept or deny a work request, and we also need to deal with unexpected errors:
32 |
33 | * If the 'Master' accepts the request, the actor schedules a new tick to itself and toggles back to the idle behavior.
34 | * To deal with failures, the work is sent using the [ask pattern](http://doc.akka.io/docs/akka/current/scala/actors.html#ask-send-and-receive-future) to add a timeout to wait for a reply. If the timeout expires before the master responds, the returned 'Future' fails with an akka.pattern.AskTimeoutException.
35 | * We transform timeouts or denials from the 'Master' into a `NotOk` value. The 'Future' is piped to the 'FrontEnd' actor as a message with the completed value, either the successful result or `NotOk`. If the work is not accepted or there is no response, for example if the message or response got lost, the `FrontEnd` actor backs off a bit and then sends the work again, as sketched just below.
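For reference, this is essentially the `sendWork` method from `FrontEnd.scala` (included in full further down in this repository), condensed to show the ask/recover/pipe combination. It assumes the surrounding actor already defines `masterProxy` and `NotOk` and has `import context.dispatcher` in scope:

```scala
import akka.pattern.{ask, pipe}
import akka.util.Timeout
import scala.concurrent.duration._

def sendWork(work: Work): Unit = {
  implicit val timeout = Timeout(5.seconds)
  (masterProxy ? work).recover {
    // a timed-out or otherwise failed ask becomes a NotOk message sent back to this actor
    case _ => NotOk
  } pipeTo self
}
```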
36 |
37 | The future is then `pipe`d to the actor itself, meaning that when it completes, the value it completed with is sent back to the actor as a message.
38 |
39 | When a workload has been acknowledged by the master, the actor schedules a new tick to itself and toggles back to the `idle` behavior.
40 |
41 | If the work is not accepted or there is no response, for example if the message or response got lost, the `FrontEnd` actor backs off a bit and then sends the work again.
42 |
43 | You can see how the actors on a front-end node are started in the method `Main.startFrontEnd`:
44 |
45 | @@snip [Main.scala]($g8src$/scala/worker/Main.scala) { #front-end }
46 |
47 | ## The Work Result Consumer Actor
48 | As mentioned in the introduction, results are published using Distributed Pub-Sub. The 'WorkResultConsumerActor' subscribes to completion events and logs when a workload has completed:
49 |
50 | @@snip [WorkResultConsumer.scala]($g8src$/scala/worker/WorkResultConsumer.scala) { #work-result-consumer }
51 |
52 | In an actual application you would probably want a way for clients to poll or stream the status changes of the submitted work.
53 |
--------------------------------------------------------------------------------
/docs/src/main/paradox/images/cluster-nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akka/akka-distributed-workers-scala.g8/55fb750cfafdfec68444c967d047f1f72a5ac231/docs/src/main/paradox/images/cluster-nodes.png
--------------------------------------------------------------------------------
/docs/src/main/paradox/images/cluster-nodes.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 | Produced by OmniGraffle 6.6.2 2017-07-27 09:39:38 +0000Canvas 1Layer 1MasterStandbyBack-end Node 1Back-end Node 2Front-end Node 1Worker Node 1Worker Worker Worker Front EndFront EndMasterWorker Worker Node 2Front-end Node 2
4 |
--------------------------------------------------------------------------------
/docs/src/main/paradox/images/master-worker-message-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akka/akka-distributed-workers-scala.g8/55fb750cfafdfec68444c967d047f1f72a5ac231/docs/src/main/paradox/images/master-worker-message-flow.png
--------------------------------------------------------------------------------
/docs/src/main/paradox/images/master-worker-message-flow.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 | Produced by OmniGraffle 6.6.2 2017-07-21 14:46:15 +0000Canvas 1Layer 1MasterWorker5. WorkIsReadyWork Executor6. WorkerRequestsWork7. Work8.
Work
4 |
--------------------------------------------------------------------------------
/docs/src/main/paradox/images/singleton-manager.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akka/akka-distributed-workers-scala.g8/55fb750cfafdfec68444c967d047f1f72a5ac231/docs/src/main/paradox/images/singleton-manager.png
--------------------------------------------------------------------------------
/docs/src/main/paradox/images/singleton-manager.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 | Produced by OmniGraffle 6.6.2 2017-07-27 09:41:21 +0000Canvas 1Layer 1Cluster-Singleton-ManagerMasterCluster-Singleton-ManagerMasterStandbyBack-end Node 1Back-end Node 2
4 |
--------------------------------------------------------------------------------
/docs/src/main/paradox/index.md:
--------------------------------------------------------------------------------
1 | # Akka Distributed Workers with Scala Guide
2 |
3 | Akka is a toolkit and runtime for building highly concurrent, distributed, and fault-tolerant event-driven applications on the JVM. Akka can be used with both Java and Scala.
4 |
5 | This guide introduces Akka clusters by describing the Scala version of a distributed workers example.
6 |
7 | A Java version of the guide is not yet available but will be soon, so check back in a while!
8 |
9 | The guide contains advanced usage of Akka and requires familiarity with Akka and Actors. If you have no previous experience with Actors you should start with the [Akka Quickstart with Scala](http://developer.lightbend.com/guides/akka-quickstart-scala/), which goes through the basics.
10 |
11 | ## Example overview
12 |
13 | To be reactive, distributed applications must deal gracefully with temporary and prolonged outages as well as have the ability to scale up and down to make the best use of resources. Akka clustering provides these capabilities so that you don't have to implement them yourself. The distributed workers example demonstrates the following Akka clustering capabilities:

14 |
15 | * elastic addition and removal of the front-end actors that accept client requests
16 | * elastic addition and removal of the back-end actors that perform the work, as well as distribution of actors across different nodes
17 | * how jobs are re-tried in the face of failures
18 |
19 | But before we dive into how the example accomplishes these goals, download the example and try it out!
20 |
21 | The design is based on Derek Wyatt's blog post [Balancing Workload Across Nodes with Akka 2](http://letitcrash.com/post/29044669086/balancing-workload-across-nodes-with-akka-2) from 2012, which is a bit old, but still a good description of the advantages of letting the workers pull work from the master instead of pushing work to the workers.
22 |
23 | ## Downloading the example
24 |
25 | The Akka Distributed Workers example for Scala is a zipped project that includes a distribution of sbt (build tool). You can run it on Linux, MacOS, or Windows. The only prerequisite is Java 8.
26 |
27 | Download and unzip the example:
28 |
29 | 1. Download the zip file from [Lightbend Tech Hub](https://developer.lightbend.com/start/?group=akka&project=akka-distributed-workers-scala) by clicking `CREATE A PROJECT FOR ME`.
30 | 1. Extract the zip file to a convenient location:
31 | - On Linux and MacOS systems, open a terminal and use the command `unzip akka-distributed-workers-scala.zip`.
Note: On MacOS, if you unzip using Archiver, you also have to make the sbt files executable: 32 | ``` 33 | $ chmod u+x ./sbt 34 | $ chmod u+x ./sbt-dist/bin/sbt 35 | ``` 36 | - On Windows, use a tool such as File Explorer to extract the project. 37 | 38 | ## Running the example 39 | 40 | To run the sample application, which starts a small cluster inside of the same JVM instance: 41 | 42 | 1. In a console, change directories to the top level of the unzipped project. 43 | 44 | For example, if you used the default project name, akka-distributed-workers-scala, and extracted the project to your root directory, 45 | from the root directory, enter: `cd akka-distributed-workers-scala` 46 | 47 | 1. Enter `./sbt` on MacOS/Linux or `sbt.bat` on Windows to start sbt. 48 | 49 | sbt downloads project dependencies. The `>` prompt indicates sbt has started in interactive mode. 50 | 51 | 1. At the sbt prompt, enter `run`. 52 | 53 | sbt builds the project and runs the `Main` of the project: 54 | 55 | After waiting a few seconds for the cluster to form the output should start look _something_ like this (scroll all the way to the right to see the Actor output): 56 | 57 | ``` 58 | [INFO] [07/21/2017 17:41:53.320] [ClusterSystem-akka.actor.default-dispatcher-16] [akka.tcp://ClusterSystem@127.0.0.1:51983/user/producer] Produced work: 3 59 | [INFO] [07/21/2017 17:41:53.322] [ClusterSystem-akka.actor.default-dispatcher-3] [akka.tcp://ClusterSystem@127.0.0.1:2551/user/master/singleton] Accepted work: 3bce4d6d-eaae-4da6-b316-0c6f566f2399 60 | [INFO] [07/21/2017 17:41:53.328] [ClusterSystem-akka.actor.default-dispatcher-3] [akka.tcp://ClusterSystem@127.0.0.1:2551/user/master/singleton] Giving worker 2b646020-6273-437c-aa0d-4aad6f12fb47 some work 3bce4d6d-eaae-4da6-b316-0c6f566f2399 61 | [INFO] [07/21/2017 17:41:53.328] [ClusterSystem-akka.actor.default-dispatcher-2] [akka.tcp://ClusterSystem@127.0.0.1:51980/user/worker] Got work: 3 62 | [INFO] [07/21/2017 17:41:53.328] [ClusterSystem-akka.actor.default-dispatcher-16] [akka.tcp://ClusterSystem@127.0.0.1:51980/user/worker] Work is complete. Result 3 * 3 = 9. 63 | [INFO] [07/21/2017 17:41:53.329] [ClusterSystem-akka.actor.default-dispatcher-19] [akka.tcp://ClusterSystem@127.0.0.1:2551/user/master/singleton] Work 3bce4d6d-eaae-4da6-b316-0c6f566f2399 is done by worker 2b646020-6273-437c-aa0d-4aad6f12fb47 64 | ``` 65 | 66 | Congratulations, you just ran your first Akka Cluster app. Now take a look at what happened under the covers. 67 | 68 | ## What happens when you run it 69 | 70 | When `Main` is run without any parameters, it starts six `ActorSystem`s in the same JVM. These six `ActorSystem`s form a single cluster. The six nodes include two each that perform front-end, back-end, and worker tasks: 71 | 72 | * The front-end nodes simulate an external interface, such as a REST API, that accepts workloads from clients. 73 | * The worker nodes have worker actors that accept and process workloads. 74 | * The back-end nodes contain a Master actor that coordinates workloads, keeps track of the workers, and delegates work to available workers. One of the nodes is active and one is on standby. If the active Master goes down, the standby takes over. 75 | 76 | A bird's eye perspective of the architecture looks like this: 77 | 78 | ![Overview](images/cluster-nodes.png) 79 | 80 | Let's look at the details of each part of the application, starting with the front-end. 
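For reference, the six nodes (plus the local Cassandra journal) are wired together by `startClusterInSameJvm` in `src/main/g8/src/main/scala/worker/Main.scala`, which appears in full later in this repository:

```scala
def startClusterInSameJvm(): Unit = {
  startCassandraDatabase()

  // two backend nodes
  startBackEnd(2551)
  startBackEnd(2552)
  // two front-end nodes
  startFrontEnd(3000)
  startFrontEnd(3001)
  // two worker nodes with two worker actors each
  startWorker(5001, 2)
  startWorker(5002, 2)
}
```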
81 |
82 | @@@index
83 |
84 | * [The Front-end Nodes](front-end.md)
85 | * [The Back-end Nodes](back-end.md)
86 | * [The Master Actor in Detail](master-in-detail.md)
87 | * [The Worker Nodes](worker.md)
88 | * [Experimenting with the example](experimenting.md)
89 | * [Next Steps](next-steps.md)
90 |
91 | @@@
92 |
--------------------------------------------------------------------------------
/docs/src/main/paradox/master-in-detail.md:
--------------------------------------------------------------------------------
1 | # The Master Actor in Detail
2 |
3 | The `Master` actor is without question the most involved component in this example. This is because it is designed to deal with failures. While the Akka cluster takes care of restarting the `Master` in case of a failure, we also want to make sure that the new `Master` can arrive at the same state as the failed `Master`. We use event sourcing and Akka Persistence to achieve this.
4 |
5 |
6 |
7 |
8 |
9 | If the `back-end` node hosting the `Master` actor crashes, the Akka Cluster Singleton makes sure a new `Master` starts up on a different node, but we also want it to reach the exact same state as the `Master` on the crashed node. This is achieved through the use of event sourcing and [Akka Persistence](http://doc.akka.io/docs/akka/current/scala/persistence.html).
10 |
11 | ## Tracking current work items
12 |
13 | The current set of work items is modelled in the `WorkState` class. It keeps track of the current set of work that is pending, has been accepted by a worker, has been completed, and so on. Every change to the `WorkState` is modelled as a domain event:
14 |
15 | @@snip [WorkState.scala]($g8src$/scala/worker/WorkState.scala) { #events }
16 |
17 | This allows us to capture and store each such event that happens; we can later replay all of them on an empty model and arrive at the exact same state. This is how event sourcing and [Akka Persistence](http://doc.akka.io/docs/akka/current/scala/persistence.html) allow the actor to start on any node and reach the same state as a previous instance.
18 |
19 | If the `Master` fails and is restarted, the replacement `Master` replays events from the log to retrieve the current state. This means that when the WorkState is modified, the `Master` must persist the event before acting on it. When the event is successfully stored, we can modify the state. Otherwise, if a failure occurs before the event is persisted, the replacement `Master` will not be able to attain the same state as the failed `Master`.
20 |
21 | Let's look at how a command to process a work item from the front-end comes in:
22 |
23 | @@snip [Master.scala]($g8src$/scala/worker/Master.scala) { #persisting }
24 |
25 | The first thing you might notice is the comment saying _idempotent_: this means that the same work message may arrive multiple times, but regardless of how many times the same work arrives, it should only be executed once. This is needed since the `FrontEnd` actor re-sends work in case of the `Work` or `Ack` messages getting lost (Akka does not provide any guarantee of delivery, [see details in the docs](http://doc.akka.io/docs/akka/current/scala/general/message-delivery-reliability.html#discussion-why-no-guaranteed-delivery-)).
26 |
27 | To make the logic idempotent we simply check if the work id is already known, and if it is we just `Ack` it without further logic.
If the work is previously unknown, we start by transforming it into a `WorkAccepted` event, which we persist, and only in the `handler`-function passed to `persist` do we actually update the `workState`, send an `Ack` back to the `FrontEnd` and trigger a search for available workers.
28 |
29 |
30 | ## Implementation items required for Akka Persistence
31 |
32 | In a "normal" Actor the only thing we have to do is to implement `receive`, which is then invoked for each incoming message. In a `PersistentActor` there are three things that need to be implemented:
33 |
34 | 1. `persistenceId` is a global identifier for the actor. We must make sure that there is never more than one Actor instance with the same `persistenceId` running globally, or else we would possibly mess up its journal.
35 | 1. `receiveCommand` corresponds to the `receive` method of regular actors. Messages sent to the actor end up here.
36 | 1. `receiveRecover` is invoked with the recorded events of the actor when it starts up.
37 |
38 | ## Tracking workers
39 |
40 | Unlike the `Master` actor, the example system contains multiple workers that can be stopped and restarted frequently. We do not need to save their state since the `Master` is tracking work and will simply send work to another worker if the original fails to respond. So, rather than persisting a list of available workers, the example uses the following strategy:
41 |
42 | * Running workers periodically register with the master using a `RegisterWorker` message. If a `back-end` node fails and the `Master` is started on a new node, the registrations go automatically to the new node.
43 | * Any type of failure -- whether from the network, worker actor, or node -- that prevents a `RegisterWorker` message from arriving within the `consider-worker-dead-after` timeout causes the 'Master' actor to remove the worker from its list.
44 |
45 | @@snip [Master.scala]($g8src$/scala/worker/Master.scala) { #pruning }
46 |
47 | When it is stopped, a `Worker` actor still tries to gracefully remove itself using the `DeRegisterWorker` message, but in the case of a crash it will have no chance to communicate that to the master node.
48 |
49 | @@snip [Master.scala]($g8src$/scala/worker/Master.scala) { #graceful-remove }
50 |
51 | Now let's move on to the last piece of the puzzle, the worker nodes.
--------------------------------------------------------------------------------
/docs/src/main/paradox/next-steps.md:
--------------------------------------------------------------------------------
1 | # Next Steps
2 |
3 | The following are some ideas for where to take this sample next. Implementation of each idea is left up to you.
4 |
5 | ## Using a different serializer
6 |
7 | To simplify things, this sample uses the default Java serializer, but in a real application it should not be used. For passing messages across the network, Java serialization has serious security implications (worst case, the ability to remotely execute code) and does not have good performance characteristics. For storing the domain events in a persistent journal, there is also the problem of how to deal with versioning, which is hard if not impossible using Java serialization.
8 |
9 | A few options to look into are listed in the [Akka Docs section on Serialization](http://doc.akka.io/docs/akka/current/scala/serialization.html#external-akka-serializers)
10 |
11 | ## An HTTP-Based API
12 |
13 | The `FrontEnd` in this sample is a dummy that automatically generates work.
A real application could for example use [Akka HTTP](http://doc.akka.io/docs/akka-http/current/scala/http/introduction.html) to provide an HTTP REST (or other) API for external clients.
14 |
15 | ## Scaling better with many masters
16 |
17 | If the singleton master becomes a bottleneck we could start several master actors and shard the jobs among them. This could be achieved by using [Akka Cluster Sharding](http://doc.akka.io/docs/akka/current/scala/cluster-sharding.html) with many `Master` actors as entities and a hash of some sort on the payload to decide which master it should go to.
18 |
19 | ## More tools for building distributed systems
20 |
21 | In this example we have used
22 | [Cluster Singleton](http://doc.akka.io/docs/akka/current/scala/cluster-singleton.html#cluster-singleton)
23 | and
24 | [Distributed Publish Subscribe](http://doc.akka.io/docs/akka/current/scala/distributed-pub-sub.html)
25 | but those are not the only tools in Akka Cluster.
26 |
27 | You can also find a good overview of the various modules that make up Akka in
28 | [this section of the official documentation](http://doc.akka.io/docs/akka/current/scala/guide/modules.html#cluster-singleton)
--------------------------------------------------------------------------------
/docs/src/main/paradox/worker.md:
--------------------------------------------------------------------------------
1 | # The Worker Nodes
2 |
3 | `Worker` actors and the `Master` actor interact as follows:
4 |
5 | 1. `Worker` actors register with the `Master` so the master knows they are available and ready to take on work.
6 | 1. When the `Master` actor has work, it sends a 'WorkIsReady' message to all workers it thinks are not busy.
7 | `Worker` actors that are `idle` will reply to the `Master` with a `WorkerRequestsWork` message.
8 | 1. The `Master` picks the first reply and assigns the work to that worker. This achieves back pressure, because the `Master` does not push work onto workers that are already busy, which would overwhelm their mailboxes.
9 | 1. When the worker receives work from the master, it delegates the actual processing to a child actor, `WorkExecutor`. This allows the worker to be responsive while its child executes the work.
10 |
11 | ![Master to Worker Message Flow](images/master-worker-message-flow.png)
12 |
13 | You can see how a worker node and a number of worker actors are started in the method `Main.startWorker`:
14 |
15 | @@snip [Main.scala]($g8src$/scala/worker/Main.scala) { #worker }
16 |
17 | Now that we have covered all the details, we can experiment with different sets of nodes for the cluster.
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.16
2 |
--------------------------------------------------------------------------------
/project/default.properties:
--------------------------------------------------------------------------------
1 | name=akka-distributed-workers-scala
2 | description=Akka is a toolkit and runtime for building highly concurrent, distributed, and fault tolerant event-driven apps. This application will get you started building distributed systems with Scala.
-------------------------------------------------------------------------------- /project/giter8.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.foundweekends.giter8" %% "sbt-giter8" % "0.11.0") 2 | -------------------------------------------------------------------------------- /project/paradox.sbt: -------------------------------------------------------------------------------- 1 | // sbt-paradox, used for documentation 2 | addSbtPlugin("com.lightbend.paradox" % "sbt-paradox" % "0.2.11") -------------------------------------------------------------------------------- /src/main/g8/build.sbt: -------------------------------------------------------------------------------- 1 | name := "akka-distributed-workers" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "$scala_version$" 6 | lazy val akkaVersion = "$akka_version$" 7 | lazy val cassandraPluginVersion = "$akka_persistence_cassandra_version$" 8 | 9 | fork in Test := true 10 | 11 | libraryDependencies ++= Seq( 12 | "com.typesafe.akka" %% "akka-cluster" % akkaVersion, 13 | "com.typesafe.akka" %% "akka-cluster-tools" % akkaVersion, 14 | "com.typesafe.akka" %% "akka-persistence" % akkaVersion, 15 | "com.typesafe.akka" %% "akka-persistence-cassandra" % cassandraPluginVersion, 16 | // this allows us to start cassandra from the sample 17 | "com.typesafe.akka" %% "akka-persistence-cassandra-launcher" % cassandraPluginVersion, 18 | 19 | "com.typesafe.akka" %% "akka-slf4j" % akkaVersion, 20 | "ch.qos.logback" % "logback-classic" % "1.2.3", 21 | 22 | // test dependencies 23 | "com.typesafe.akka" %% "akka-testkit" % akkaVersion % "test", 24 | "org.scalatest" %% "scalatest" % "3.0.1" % "test", 25 | "commons-io" % "commons-io" % "2.4" % "test") 26 | -------------------------------------------------------------------------------- /src/main/g8/default.properties: -------------------------------------------------------------------------------- 1 | name=akka-distributed-workers-scala 2 | description=Akka is a toolkit and runtime for building highly concurrent, distributed, and fault tolerant event-driven apps. This application will get you started building distributed systems with Scala. 
3 | verbatim=*.scala 4 | scala_version=2.12.7 5 | akka_version=2.5.26 6 | akka_persistence_cassandra_version=maven(com.typesafe.akka, akka-persistence-cassandra_2.12) 7 | -------------------------------------------------------------------------------- /src/main/g8/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.16 2 | -------------------------------------------------------------------------------- /src/main/g8/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | # This is the main configuration file for our application, it provides overrides to the default values 2 | # provided in the reference.conf of the modules from Akka 3 | akka { 4 | actor { 5 | # Must be set like this to use Akka Cluster 6 | provider = cluster 7 | 8 | # Only for convenience in the quickstart, Java serialization should not be used for actual applications 9 | warn-about-java-serializer-usage = off 10 | } 11 | 12 | # Use slf4j (backed by logback) for logging, additional configuration 13 | # can be done in logback.xml 14 | loggers = ["akka.event.slf4j.Slf4jLogger"] 15 | logging-filter = "akka.event.slf4j.Slf4jLoggingFilter" 16 | loglevel = INFO 17 | 18 | # For the sample, just bind to loopback and do not allow access from the network 19 | remote.netty.tcp.hostname=127.0.0.1 20 | # the port is overridden by the logic in Main.scala 21 | remote.netty.tcp.port=0 22 | 23 | cluster { 24 | # Seed nodes are a way to have a node join the cluster (or form a new cluster) from configuration. 25 | seed-nodes = [ 26 | "akka.tcp://ClusterSystem@127.0.0.1:2551", 27 | "akka.tcp://ClusterSystem@127.0.0.1:2552", 28 | "akka.tcp://ClusterSystem@127.0.0.1:2553", 29 | "akka.tcp://ClusterSystem@127.0.0.1:2554"] 30 | 31 | # Only for convenience in the quickstart, auto-downing should not be used for actual applications. 
32 | # Read more here: http://doc.akka.io/docs/akka/current/scala/cluster-usage.html#auto-downing-do-not-use- 33 | auto-down-unreachable-after = 10s 34 | 35 | # Needed when running many actor systems in the same JVM 36 | jmx.multi-mbeans-in-same-jvm = on 37 | } 38 | 39 | # use Cassandra to store both snapshots and the events of the persistent actors 40 | persistence { 41 | journal.plugin = "cassandra-journal" 42 | snapshot-store.plugin = "cassandra-snapshot-store" 43 | } 44 | 45 | # Run the pubsub mediator on all nodes, without any code starting it up 46 | extensions = ["akka.cluster.pubsub.DistributedPubSub"] 47 | } 48 | 49 | # Configuration related to the app is in its own namespace 50 | distributed-workers { 51 | # Each worker pings the master with this interval 52 | # to let it know that it is alive 53 | worker-registration-interval = 10s 54 | # If a worker hasn't gotten in touch in this long 55 | # it is removed from the set of workers 56 | consider-worker-dead-after = 60s 57 | 58 | # If a workload hasn't finished in this long it 59 | # is considered failed and is retried 60 | work-timeout = 10s 61 | 62 | } -------------------------------------------------------------------------------- /src/main/g8/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/FrontEnd.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import java.util.UUID 4 | import java.util.concurrent.ThreadLocalRandom 5 | 6 | import akka.actor.{Actor, ActorLogging, Cancellable, Props, Timers} 7 | import akka.pattern._ 8 | import akka.util.Timeout 9 | 10 | import scala.concurrent.duration._ 11 | 12 | /** 13 | * Dummy front-end that periodically sends a workload to the master. 
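 * It schedules Tick messages to itself, sends each new Work item to the master via the
 * ClusterSingletonProxy, and switches to a busy behavior until the master acknowledges the
 * work with an Ack; if no Ack arrives within the ask timeout the work is re-sent after a delay.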
14 | */ 15 | object FrontEnd { 16 | 17 | def props: Props = Props(new FrontEnd) 18 | 19 | private case object NotOk 20 | private case object Tick 21 | private case object Retry 22 | } 23 | 24 | // #front-end 25 | class FrontEnd extends Actor with ActorLogging with Timers { 26 | import FrontEnd._ 27 | import context.dispatcher 28 | 29 | val masterProxy = context.actorOf( 30 | MasterSingleton.proxyProps(context.system), 31 | name = "masterProxy") 32 | 33 | var workCounter = 0 34 | 35 | def nextWorkId(): String = UUID.randomUUID().toString 36 | 37 | override def preStart(): Unit = { 38 | timers.startSingleTimer("tick", Tick, 5.seconds) 39 | } 40 | 41 | def receive = idle 42 | 43 | def idle: Receive = { 44 | case Tick => 45 | workCounter += 1 46 | log.info("Produced work: {}", workCounter) 47 | val work = Work(nextWorkId(), workCounter) 48 | context.become(busy(work)) 49 | } 50 | 51 | def busy(workInProgress: Work): Receive = { 52 | sendWork(workInProgress) 53 | 54 | { 55 | case Master.Ack(workId) => 56 | log.info("Got ack for workId {}", workId) 57 | val nextTick = ThreadLocalRandom.current.nextInt(3, 10).seconds 58 | timers.startSingleTimer(s"tick", Tick, nextTick) 59 | context.become(idle) 60 | 61 | case NotOk => 62 | log.info("Work {} not accepted, retry after a while", workInProgress.workId) 63 | timers.startSingleTimer("retry", Retry, 3.seconds) 64 | 65 | case Retry => 66 | log.info("Retrying work {}", workInProgress.workId) 67 | sendWork(workInProgress) 68 | } 69 | } 70 | 71 | def sendWork(work: Work): Unit = { 72 | implicit val timeout = Timeout(5.seconds) 73 | (masterProxy ? work).recover { 74 | case _ => NotOk 75 | } pipeTo self 76 | } 77 | 78 | } 79 | // #front-end -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/Main.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import java.io.File 4 | import java.util.concurrent.CountDownLatch 5 | 6 | import akka.actor.ActorSystem 7 | import akka.persistence.cassandra.testkit.CassandraLauncher 8 | import com.typesafe.config.{Config, ConfigFactory} 9 | 10 | object Main { 11 | 12 | // note that 2551 and 2552 are expected to be seed nodes though, even if 13 | // the back-end starts at 2000 14 | val backEndPortRange = 2000 to 2999 15 | 16 | val frontEndPortRange = 3000 to 3999 17 | 18 | def main(args: Array[String]): Unit = { 19 | args.headOption match { 20 | 21 | case None => 22 | startClusterInSameJvm() 23 | 24 | case Some(portString) if portString.matches("""\d+""") => 25 | val port = portString.toInt 26 | if (backEndPortRange.contains(port)) startBackEnd(port) 27 | else if (frontEndPortRange.contains(port)) startFrontEnd(port) 28 | else startWorker(port, args.lift(1).map(_.toInt).getOrElse(1)) 29 | 30 | case Some("cassandra") => 31 | startCassandraDatabase() 32 | println("Started Cassandra, press Ctrl + C to kill") 33 | new CountDownLatch(1).await() 34 | 35 | } 36 | } 37 | 38 | def startClusterInSameJvm(): Unit = { 39 | startCassandraDatabase() 40 | 41 | // two backend nodes 42 | startBackEnd(2551) 43 | startBackEnd(2552) 44 | // two front-end nodes 45 | startFrontEnd(3000) 46 | startFrontEnd(3001) 47 | // two worker nodes with two worker actors each 48 | startWorker(5001, 2) 49 | startWorker(5002, 2) 50 | } 51 | 52 | /** 53 | * Start a node with the role backend on the given port. 
(This may also 54 | * start the shared journal, see below for details) 55 | */ 56 | def startBackEnd(port: Int): Unit = { 57 | val system = ActorSystem("ClusterSystem", config(port, "back-end")) 58 | MasterSingleton.startSingleton(system) 59 | } 60 | 61 | /** 62 | * Start a front end node that will submit work to the backend nodes 63 | */ 64 | // #front-end 65 | def startFrontEnd(port: Int): Unit = { 66 | val system = ActorSystem("ClusterSystem", config(port, "front-end")) 67 | system.actorOf(FrontEnd.props, "front-end") 68 | system.actorOf(WorkResultConsumer.props, "consumer") 69 | } 70 | // #front-end 71 | 72 | /** 73 | * Start a worker node, with n actual workers that will accept and process workloads 74 | */ 75 | // #worker 76 | def startWorker(port: Int, workers: Int): Unit = { 77 | val system = ActorSystem("ClusterSystem", config(port, "worker")) 78 | val masterProxy = system.actorOf( 79 | MasterSingleton.proxyProps(system), 80 | name = "masterProxy") 81 | 82 | (1 to workers).foreach(n => 83 | system.actorOf(Worker.props(masterProxy), s"worker-$n") 84 | ) 85 | } 86 | // #worker 87 | 88 | def config(port: Int, role: String): Config = 89 | ConfigFactory.parseString(s""" 90 | akka.remote.netty.tcp.port=$port 91 | akka.cluster.roles=[$role] 92 | """).withFallback(ConfigFactory.load()) 93 | 94 | /** 95 | * To make the sample easier to run we kickstart a Cassandra instance to 96 | * act as the journal. Cassandra is a great choice of backend for Akka Persistence but 97 | * in a real application a pre-existing Cassandra cluster should be used. 98 | */ 99 | def startCassandraDatabase(): Unit = { 100 | val databaseDirectory = new File("target/cassandra-db") 101 | CassandraLauncher.start( 102 | databaseDirectory, 103 | CassandraLauncher.DefaultTestConfigResource, 104 | clean = false, 105 | port = 9042 106 | ) 107 | 108 | // shut the cassandra instance down when the JVM stops 109 | sys.addShutdownHook { 110 | CassandraLauncher.stop() 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/Master.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import akka.actor.{ActorLogging, ActorRef, Cancellable, Props, Timers} 4 | import akka.cluster.pubsub.{DistributedPubSub, DistributedPubSubMediator} 5 | import akka.persistence.{PersistentActor, RecoveryCompleted, SnapshotOffer} 6 | 7 | import scala.concurrent.duration.{Deadline, FiniteDuration, _} 8 | 9 | /** 10 | * The master actor keep tracks of all available workers, and all scheduled and ongoing work items 11 | */ 12 | object Master { 13 | 14 | val ResultsTopic = "results" 15 | 16 | def props(workTimeout: FiniteDuration): Props = 17 | Props(new Master(workTimeout)) 18 | 19 | case class Ack(workId: String) 20 | 21 | private sealed trait WorkerStatus 22 | private case object Idle extends WorkerStatus 23 | private case class Busy(workId: String, deadline: Deadline) extends WorkerStatus 24 | private case class WorkerState(ref: ActorRef, status: WorkerStatus, staleWorkerDeadline: Deadline) 25 | 26 | private case object CleanupTick 27 | 28 | } 29 | 30 | class Master(workTimeout: FiniteDuration) extends Timers with PersistentActor with ActorLogging { 31 | import Master._ 32 | import WorkState._ 33 | import context.dispatcher 34 | 35 | override val persistenceId: String = "master" 36 | 37 | val considerWorkerDeadAfter: FiniteDuration = 38 | 
context.system.settings.config.getDuration("distributed-workers.consider-worker-dead-after").getSeconds.seconds 39 | def newStaleWorkerDeadline(): Deadline = considerWorkerDeadAfter.fromNow 40 | 41 | timers.startPeriodicTimer("cleanup", CleanupTick, workTimeout / 2) 42 | 43 | val mediator: ActorRef = DistributedPubSub(context.system).mediator 44 | 45 | // the set of available workers is not event sourced as it depends on the current set of workers 46 | private var workers = Map[String, WorkerState]() 47 | 48 | // workState is event sourced to be able to make sure work is processed even in case of crash 49 | private var workState = WorkState.empty 50 | 51 | 52 | override def receiveRecover: Receive = { 53 | 54 | case SnapshotOffer(_, workStateSnapshot: WorkState) => 55 | // If we would have logic triggering snapshots in the actor 56 | // we would start from the latest snapshot here when recovering 57 | log.info("Got snapshot work state") 58 | workState = workStateSnapshot 59 | 60 | case event: WorkDomainEvent => 61 | // only update current state by applying the event, no side effects 62 | workState = workState.updated(event) 63 | log.info("Replayed {}", event.getClass.getSimpleName) 64 | 65 | case RecoveryCompleted => 66 | log.info("Recovery completed") 67 | 68 | } 69 | 70 | override def receiveCommand: Receive = { 71 | case MasterWorkerProtocol.RegisterWorker(workerId) => 72 | if (workers.contains(workerId)) { 73 | workers += (workerId -> workers(workerId).copy(ref = sender(), staleWorkerDeadline = newStaleWorkerDeadline())) 74 | } else { 75 | log.info("Worker registered: {}", workerId) 76 | val initialWorkerState = WorkerState( 77 | ref = sender(), 78 | status = Idle, 79 | staleWorkerDeadline = newStaleWorkerDeadline()) 80 | workers += (workerId -> initialWorkerState) 81 | 82 | if (workState.hasWork) 83 | sender() ! MasterWorkerProtocol.WorkIsReady 84 | } 85 | 86 | // #graceful-remove 87 | case MasterWorkerProtocol.DeRegisterWorker(workerId) => 88 | workers.get(workerId) match { 89 | case Some(WorkerState(_, Busy(workId, _), _)) => 90 | // there was a workload assigned to the worker when it left 91 | log.info("Busy worker de-registered: {}", workerId) 92 | persist(WorkerFailed(workId)) { event ⇒ 93 | workState = workState.updated(event) 94 | notifyWorkers() 95 | } 96 | case Some(_) => 97 | log.info("Worker de-registered: {}", workerId) 98 | case _ => 99 | } 100 | workers -= workerId 101 | // #graceful-remove 102 | 103 | case MasterWorkerProtocol.WorkerRequestsWork(workerId) => 104 | if (workState.hasWork) { 105 | workers.get(workerId) match { 106 | case Some(workerState @ WorkerState(_, Idle, _)) => 107 | val work = workState.nextWork 108 | persist(WorkStarted(work.workId)) { event => 109 | workState = workState.updated(event) 110 | log.info("Giving worker {} some work {}", workerId, work.workId) 111 | val newWorkerState = workerState.copy( 112 | status = Busy(work.workId, Deadline.now + workTimeout), 113 | staleWorkerDeadline = newStaleWorkerDeadline()) 114 | workers += (workerId -> newWorkerState) 115 | sender() ! work 116 | } 117 | case _ => 118 | } 119 | } 120 | 121 | case MasterWorkerProtocol.WorkIsDone(workerId, workId, result) => 122 | // idempotent - redelivery from the worker may cause duplicates, so it needs to be 123 | if (workState.isDone(workId)) { 124 | // previous Ack was lost, confirm again that this is done 125 | sender() ! 
MasterWorkerProtocol.Ack(workId) 126 | } else if (!workState.isInProgress(workId)) { 127 | log.info("Work {} not in progress, reported as done by worker {}", workId, workerId) 128 | } else { 129 | log.info("Work {} is done by worker {}", workId, workerId) 130 | changeWorkerToIdle(workerId, workId) 131 | persist(WorkCompleted(workId, result)) { event ⇒ 132 | workState = workState.updated(event) 133 | mediator ! DistributedPubSubMediator.Publish(ResultsTopic, WorkResult(workId, result)) 134 | // Ack back to original sender 135 | sender ! MasterWorkerProtocol.Ack(workId) 136 | } 137 | } 138 | 139 | case MasterWorkerProtocol.WorkFailed(workerId, workId) => 140 | if (workState.isInProgress(workId)) { 141 | log.info("Work {} failed by worker {}", workId, workerId) 142 | changeWorkerToIdle(workerId, workId) 143 | persist(WorkerFailed(workId)) { event ⇒ 144 | workState = workState.updated(event) 145 | notifyWorkers() 146 | } 147 | } 148 | 149 | // #persisting 150 | case work: Work => 151 | // idempotent 152 | if (workState.isAccepted(work.workId)) { 153 | sender() ! Master.Ack(work.workId) 154 | } else { 155 | log.info("Accepted work: {}", work.workId) 156 | persist(WorkAccepted(work)) { event ⇒ 157 | // Ack back to original sender 158 | sender() ! Master.Ack(work.workId) 159 | workState = workState.updated(event) 160 | notifyWorkers() 161 | } 162 | } 163 | // #persisting 164 | 165 | // #pruning 166 | case CleanupTick => 167 | workers.foreach { 168 | case (workerId, WorkerState(_, Busy(workId, timeout), _)) if timeout.isOverdue() => 169 | log.info("Work timed out: {}", workId) 170 | workers -= workerId 171 | persist(WorkerTimedOut(workId)) { event ⇒ 172 | workState = workState.updated(event) 173 | notifyWorkers() 174 | } 175 | 176 | 177 | case (workerId, WorkerState(_, Idle, lastHeardFrom)) if lastHeardFrom.isOverdue() => 178 | log.info("Too long since heard from worker {}, pruning", workerId) 179 | workers -= workerId 180 | 181 | case _ => // this one is a keeper! 182 | } 183 | // #pruning 184 | } 185 | 186 | def notifyWorkers(): Unit = 187 | if (workState.hasWork) { 188 | workers.foreach { 189 | case (_, WorkerState(ref, Idle, _)) => ref ! 
MasterWorkerProtocol.WorkIsReady 190 | case _ => // busy 191 | } 192 | } 193 | 194 | def changeWorkerToIdle(workerId: String, workId: String): Unit = 195 | workers.get(workerId) match { 196 | case Some(workerState @ WorkerState(_, Busy(`workId`, _), _)) ⇒ 197 | val newWorkerState = workerState.copy(status = Idle, staleWorkerDeadline = newStaleWorkerDeadline()) 198 | workers += (workerId -> newWorkerState) 199 | case _ ⇒ 200 | // ok, might happen after standby recovery, worker state is not persisted 201 | } 202 | 203 | def tooLongSinceHeardFrom(lastHeardFrom: Long) = 204 | System.currentTimeMillis() - lastHeardFrom > considerWorkerDeadAfter.toMillis 205 | 206 | } -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/MasterSingleton.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import akka.actor.{ActorSystem, PoisonPill} 4 | import akka.cluster.singleton._ 5 | 6 | import scala.concurrent.duration._ 7 | 8 | object MasterSingleton { 9 | 10 | private val singletonName = "master" 11 | private val singletonRole = "back-end" 12 | 13 | // #singleton 14 | def startSingleton(system: ActorSystem) = { 15 | val workTimeout = system.settings.config.getDuration("distributed-workers.work-timeout").getSeconds.seconds 16 | 17 | system.actorOf( 18 | ClusterSingletonManager.props( 19 | Master.props(workTimeout), 20 | PoisonPill, 21 | ClusterSingletonManagerSettings(system).withRole(singletonRole) 22 | ), 23 | singletonName) 24 | } 25 | // #singleton 26 | 27 | // #proxy 28 | def proxyProps(system: ActorSystem) = ClusterSingletonProxy.props( 29 | settings = ClusterSingletonProxySettings(system).withRole(singletonRole), 30 | singletonManagerPath = s"/user/$singletonName") 31 | // #proxy 32 | } 33 | -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/MasterWorkerProtocol.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | object MasterWorkerProtocol { 4 | // Messages from Workers 5 | case class RegisterWorker(workerId: String) 6 | case class DeRegisterWorker(workerId: String) 7 | case class WorkerRequestsWork(workerId: String) 8 | case class WorkIsDone(workerId: String, workId: String, result: Any) 9 | case class WorkFailed(workerId: String, workId: String) 10 | 11 | // Messages to Workers 12 | case object WorkIsReady 13 | case class Ack(id: String) 14 | } -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/Work.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | case class Work(workId: String, job: Any) 4 | 5 | case class WorkResult(workId: String, result: Any) -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/WorkExecutor.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import java.util.concurrent.ThreadLocalRandom 4 | 5 | import akka.actor.{Actor, Props} 6 | 7 | import scala.concurrent.duration._ 8 | 9 | /** 10 | * Work executor is the actor actually performing the work. 
11 | */ 12 | object WorkExecutor { 13 | def props = Props(new WorkExecutor) 14 | 15 | case class DoWork(n: Int) 16 | case class WorkComplete(result: String) 17 | } 18 | 19 | class WorkExecutor extends Actor { 20 | import WorkExecutor._ 21 | import context.dispatcher 22 | 23 | def receive = { 24 | case DoWork(n: Int) => 25 | val n2 = n * n 26 | val result = s"$n * $n = $n2" 27 | 28 | // simulate that the processing time varies 29 | val randomProcessingTime = ThreadLocalRandom.current.nextInt(1, 3).seconds 30 | context.system.scheduler.scheduleOnce(randomProcessingTime, sender(), WorkComplete(result)) 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/WorkResultConsumer.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import akka.actor.{Actor, ActorLogging, Props} 4 | import akka.cluster.pubsub.DistributedPubSub 5 | import akka.cluster.pubsub.DistributedPubSubMediator 6 | 7 | object WorkResultConsumer { 8 | def props: Props = Props(new WorkResultConsumer) 9 | } 10 | 11 | // #work-result-consumer 12 | class WorkResultConsumer extends Actor with ActorLogging { 13 | 14 | val mediator = DistributedPubSub(context.system).mediator 15 | mediator ! DistributedPubSubMediator.Subscribe(Master.ResultsTopic, self) 16 | 17 | def receive = { 18 | case _: DistributedPubSubMediator.SubscribeAck => 19 | case WorkResult(workId, result) => 20 | log.info("Consumed result: {}", result) 21 | } 22 | 23 | } 24 | // #work-result-consumer -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/WorkState.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import scala.collection.immutable.Queue 4 | 5 | object WorkState { 6 | 7 | def empty: WorkState = WorkState( 8 | pendingWork = Queue.empty, 9 | workInProgress = Map.empty, 10 | acceptedWorkIds = Set.empty, 11 | doneWorkIds = Set.empty) 12 | 13 | trait WorkDomainEvent 14 | // #events 15 | case class WorkAccepted(work: Work) extends WorkDomainEvent 16 | case class WorkStarted(workId: String) extends WorkDomainEvent 17 | case class WorkCompleted(workId: String, result: Any) extends WorkDomainEvent 18 | case class WorkerFailed(workId: String) extends WorkDomainEvent 19 | case class WorkerTimedOut(workId: String) extends WorkDomainEvent 20 | // #events 21 | } 22 | 23 | case class WorkState private ( 24 | private val pendingWork: Queue[Work], 25 | private val workInProgress: Map[String, Work], 26 | private val acceptedWorkIds: Set[String], 27 | private val doneWorkIds: Set[String]) { 28 | 29 | import WorkState._ 30 | 31 | def hasWork: Boolean = pendingWork.nonEmpty 32 | def nextWork: Work = pendingWork.head 33 | def isAccepted(workId: String): Boolean = acceptedWorkIds.contains(workId) 34 | def isInProgress(workId: String): Boolean = workInProgress.contains(workId) 35 | def isDone(workId: String): Boolean = doneWorkIds.contains(workId) 36 | 37 | def updated(event: WorkDomainEvent): WorkState = event match { 38 | case WorkAccepted(work) ⇒ 39 | copy( 40 | pendingWork = pendingWork enqueue work, 41 | acceptedWorkIds = acceptedWorkIds + work.workId) 42 | 43 | case WorkStarted(workId) ⇒ 44 | val (work, rest) = pendingWork.dequeue 45 | require(workId == work.workId, s"WorkStarted expected workId $workId == ${work.workId}") 46 | copy( 47 | pendingWork = rest, 48 | workInProgress = workInProgress + (workId -> 
work)) 49 | 50 | case WorkCompleted(workId, result) ⇒ 51 | copy( 52 | workInProgress = workInProgress - workId, 53 | doneWorkIds = doneWorkIds + workId) 54 | 55 | case WorkerFailed(workId) ⇒ 56 | copy( 57 | pendingWork = pendingWork enqueue workInProgress(workId), 58 | workInProgress = workInProgress - workId) 59 | 60 | case WorkerTimedOut(workId) ⇒ 61 | copy( 62 | pendingWork = pendingWork enqueue workInProgress(workId), 63 | workInProgress = workInProgress - workId) 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/g8/src/main/scala/worker/Worker.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import java.util.UUID 4 | 5 | import akka.actor.SupervisorStrategy.{Restart, Stop} 6 | import akka.actor._ 7 | 8 | import scala.concurrent.duration._ 9 | 10 | /** 11 | * The worker is actually more of a middle manager, delegating the actual work 12 | * to the WorkExecutor, supervising it and keeping itself available to interact with the work master. 13 | */ 14 | object Worker { 15 | 16 | def props(masterProxy: ActorRef): Props = Props(new Worker(masterProxy)) 17 | 18 | } 19 | 20 | class Worker(masterProxy: ActorRef) 21 | extends Actor with Timers with ActorLogging { 22 | import MasterWorkerProtocol._ 23 | import context.dispatcher 24 | 25 | 26 | val workerId = UUID.randomUUID().toString 27 | val registerInterval = context.system.settings.config.getDuration("distributed-workers.worker-registration-interval").getSeconds.seconds 28 | 29 | val registerTask = context.system.scheduler.schedule(0.seconds, registerInterval, masterProxy, RegisterWorker(workerId)) 30 | 31 | val workExecutor = createWorkExecutor() 32 | 33 | var currentWorkId: Option[String] = None 34 | def workId: String = currentWorkId match { 35 | case Some(workId) => workId 36 | case None => throw new IllegalStateException("Not working") 37 | } 38 | 39 | def receive = idle 40 | 41 | def idle: Receive = { 42 | case WorkIsReady => 43 | // this is the only state where we reply to WorkIsReady 44 | masterProxy ! WorkerRequestsWork(workerId) 45 | 46 | case Work(workId, job: Int) => 47 | log.info("Got work: {}", job) 48 | currentWorkId = Some(workId) 49 | workExecutor ! WorkExecutor.DoWork(job) 50 | context.become(working) 51 | 52 | } 53 | 54 | def working: Receive = { 55 | case WorkExecutor.WorkComplete(result) => 56 | log.info("Work is complete. Result {}.", result) 57 | masterProxy ! WorkIsDone(workerId, workId, result) 58 | context.setReceiveTimeout(5.seconds) 59 | context.become(waitForWorkIsDoneAck(result)) 60 | 61 | case _: Work => 62 | log.warning("Yikes. Master told me to do work, while I'm already working.") 63 | 64 | } 65 | 66 | def waitForWorkIsDoneAck(result: Any): Receive = { 67 | case Ack(id) if id == workId => 68 | masterProxy ! WorkerRequestsWork(workerId) 69 | context.setReceiveTimeout(Duration.Undefined) 70 | context.become(idle) 71 | 72 | case ReceiveTimeout => 73 | log.info("No ack from master, resending work result") 74 | masterProxy ! 
WorkIsDone(workerId, workId, result) 75 | 76 | } 77 | 78 | def createWorkExecutor(): ActorRef = 79 | // in addition to starting the actor we also watch it, so that 80 | // if it stops this worker will also be stopped 81 | context.watch(context.actorOf(WorkExecutor.props, "work-executor")) 82 | 83 | override def supervisorStrategy = OneForOneStrategy() { 84 | case _: ActorInitializationException => Stop 85 | case _: Exception => 86 | currentWorkId foreach { workId => masterProxy ! WorkFailed(workerId, workId) } 87 | context.become(idle) 88 | Restart 89 | } 90 | 91 | override def postStop(): Unit = { 92 | registerTask.cancel() 93 | masterProxy ! DeRegisterWorker(workerId) 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/main/g8/src/test/scala/worker/DistributedWorkerSpec.scala: -------------------------------------------------------------------------------- 1 | package worker 2 | 3 | import java.io.File 4 | 5 | import akka.actor.{Actor, ActorLogging, ActorRef, ActorSystem, PoisonPill, Props} 6 | import akka.cluster.Cluster 7 | import akka.cluster.ClusterEvent.{CurrentClusterState, MemberUp} 8 | import akka.cluster.pubsub.DistributedPubSub 9 | import akka.cluster.pubsub.DistributedPubSubMediator.{CurrentTopics, GetTopics, Subscribe, SubscribeAck} 10 | import akka.cluster.singleton.{ClusterSingletonManager, ClusterSingletonManagerSettings} 11 | import akka.testkit.{ImplicitSender, TestKit, TestProbe} 12 | import com.typesafe.config.ConfigFactory 13 | import org.apache.commons.io.FileUtils 14 | import org.scalatest.{BeforeAndAfterAll, FlatSpecLike, Matchers} 15 | 16 | import scala.concurrent.duration._ 17 | import scala.concurrent.{Await, Future} 18 | 19 | object DistributedWorkerSpec { 20 | 21 | val clusterConfig = ConfigFactory.parseString(""" 22 | akka { 23 | persistence { 24 | journal.plugin = "akka.persistence.journal.inmem" 25 | snapshot-store { 26 | plugin = "akka.persistence.snapshot-store.local" 27 | local.dir = "target/test-snapshots" 28 | } 29 | } 30 | extensions = ["akka.cluster.pubsub.DistributedPubSub"] 31 | } 32 | distributed-workers.consider-worker-dead-after = 10s 33 | distributed-workers.worker-registration-interval = 1s 34 | """).withFallback(ConfigFactory.load()) 35 | 36 | class FlakyWorkExecutor extends Actor with ActorLogging { 37 | var i = 0 38 | 39 | override def postRestart(reason: Throwable): Unit = { 40 | i = 3 41 | super.postRestart(reason) 42 | } 43 | 44 | def receive = { 45 | case WorkExecutor.DoWork(n: Int) => 46 | i += 1 47 | if (i == 3) { 48 | log.info("Cannot be trusted, crashing") 49 | throw new RuntimeException("Flaky worker") 50 | } else if (i == 5) { 51 | log.info("Cannot be trusted, stopping myself") 52 | context.stop(self) 53 | } else { 54 | val n2 = n * n 55 | val result = s"$n * $n = $n2" 56 | log.info("Cannot be trusted, but did complete work: {}", result) 57 | sender() ! WorkExecutor.WorkComplete(result) 58 | } 59 | } 60 | } 61 | 62 | class FastWorkExecutor extends Actor with ActorLogging { 63 | def receive = { 64 | case WorkExecutor.DoWork(n: Int) => 65 | val n2 = n * n 66 | val result = s"$n * $n = $n2" 67 | sender() ! 
WorkExecutor.WorkComplete(result) 68 | } 69 | } 70 | 71 | class RemoteControllableFrontend extends FrontEnd { 72 | 73 | var currentWorkIdAndSender: Option[(String, ActorRef)] = None 74 | 75 | override def idle: Receive = { 76 | // just to be able to send one message at a time from the test 77 | currentWorkIdAndSender match { 78 | case Some((workId, originalSender)) => originalSender ! "ok-" + workId 79 | case None => 80 | } 81 | currentWorkIdAndSender = None 82 | 83 | { 84 | case work: Work => 85 | log.debug("Forwarding some work: {}", work) 86 | sendWork(work) 87 | currentWorkIdAndSender = Some((work.workId, sender())) 88 | context.become(busy(work)) 89 | } 90 | } 91 | } 92 | } 93 | 94 | class DistributedWorkerSpec(_system: ActorSystem) 95 | extends TestKit(_system) 96 | with Matchers 97 | with FlatSpecLike 98 | with BeforeAndAfterAll 99 | with ImplicitSender { 100 | 101 | import DistributedWorkerSpec._ 102 | 103 | val workTimeout = 3.seconds 104 | 105 | def this() = this(ActorSystem("DistributedWorkerSpec", DistributedWorkerSpec.clusterConfig)) 106 | 107 | val backendSystem: ActorSystem = { 108 | val config = ConfigFactory.parseString("akka.cluster.roles=[back-end]").withFallback(clusterConfig) 109 | ActorSystem("DistributedWorkerSpec", config) 110 | } 111 | 112 | val workerSystem: ActorSystem = ActorSystem("DistributedWorkerSpec", clusterConfig) 113 | 114 | val storageLocations = List( 115 | "akka.persistence.journal.leveldb.dir", 116 | "akka.persistence.snapshot-store.local.dir").map(s => new File(system.settings.config.getString(s))) 117 | 118 | override def beforeAll(): Unit = { 119 | // make sure we do not use persisted data from a previous run 120 | storageLocations.foreach(dir => FileUtils.deleteDirectory(dir)) 121 | } 122 | 123 | "Distributed workers" should "perform work and publish results" in { 124 | val clusterAddress = Cluster(backendSystem).selfAddress 125 | val clusterProbe = TestProbe() 126 | Cluster(backendSystem).subscribe(clusterProbe.ref, classOf[MemberUp]) 127 | clusterProbe.expectMsgType[CurrentClusterState] 128 | Cluster(backendSystem).join(clusterAddress) 129 | clusterProbe.expectMsgType[MemberUp] 130 | 131 | backendSystem.actorOf( 132 | ClusterSingletonManager.props( 133 | Master.props(workTimeout), 134 | PoisonPill, 135 | ClusterSingletonManagerSettings(system).withRole("back-end")), 136 | "master") 137 | 138 | Cluster(workerSystem).join(clusterAddress) 139 | 140 | val masterProxy = workerSystem.actorOf( 141 | MasterSingleton.proxyProps(workerSystem), 142 | name = "masterProxy") 143 | val fastWorkerProps = Props(new Worker(masterProxy) { 144 | override def createWorkExecutor(): ActorRef = context.actorOf(Props(new FastWorkExecutor), "fast-executor") 145 | }) 146 | 147 | for (n <- 1 to 3) 148 | workerSystem.actorOf(fastWorkerProps, "worker-" + n) 149 | 150 | val flakyWorkerProps = Props(new Worker(masterProxy) { 151 | override def createWorkExecutor(): ActorRef = { 152 | context.actorOf(Props(new FlakyWorkExecutor), "flaky-executor") 153 | } 154 | }) 155 | val flakyWorker = workerSystem.actorOf(flakyWorkerProps, "flaky-worker") 156 | 157 | Cluster(system).join(clusterAddress) 158 | clusterProbe.expectMsgType[MemberUp] 159 | 160 | // allow posting work from the outside 161 | 162 | val frontend = system.actorOf(Props[RemoteControllableFrontend], "front-end") 163 | 164 | val results = TestProbe() 165 | DistributedPubSub(system).mediator ! 
Subscribe(Master.ResultsTopic, results.ref) 166 | expectMsgType[SubscribeAck] 167 | 168 | // make sure pub sub topics are replicated over to the back-end system before triggering any work 169 | within(10.seconds) { 170 | awaitAssert { 171 | DistributedPubSub(backendSystem).mediator ! GetTopics 172 | expectMsgType[CurrentTopics].getTopics() should contain(Master.ResultsTopic) 173 | } 174 | } 175 | 176 | // make sure we can get one piece of work through to fail fast if it doesn't 177 | frontend ! Work("1", 1) 178 | expectMsg("ok-1") 179 | within(10.seconds) { 180 | awaitAssert { 181 | results.expectMsgType[WorkResult].workId should be("1") 182 | } 183 | } 184 | 185 | 186 | // and then send in some actual work 187 | for (n <- 2 to 100) { 188 | frontend ! Work(n.toString, n) 189 | expectMsg(s"ok-$n") 190 | } 191 | system.log.info("99 work items sent") 192 | 193 | results.within(20.seconds) { 194 | val ids = results.receiveN(99).map { case WorkResult(workId, _) => workId } 195 | // nothing lost, and no duplicates 196 | ids.toVector.map(_.toInt).sorted should be((2 to 100).toVector) 197 | } 198 | 199 | } 200 | 201 | override def afterAll(): Unit = { 202 | import scala.concurrent.ExecutionContext.Implicits.global 203 | val allTerminated = Future.sequence(Seq( 204 | system.terminate(), 205 | backendSystem.terminate(), 206 | workerSystem.terminate() 207 | )) 208 | 209 | Await.ready(allTerminated, Duration.Inf) 210 | 211 | storageLocations.foreach(dir => FileUtils.deleteDirectory(dir)) 212 | } 213 | 214 | } --------------------------------------------------------------------------------