├── .github ├── dependabot.yml └── workflows │ └── maven.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── bin ├── analyze-log.sh ├── client.sh ├── compile.sh ├── counter-perf.sh ├── counter.sh ├── jmh.sh ├── migrate_leveldb.java ├── probe.sh ├── release-perform.sh ├── release-prepare.sh ├── remove-elections.sh ├── replication-perf.sh ├── rsm-client.sh ├── rsm.sh ├── run.sh └── test-run.sh ├── conf ├── jgroups-raft.yaml ├── jni.json ├── log4j2-test.xml ├── log4j2.xml ├── raft.xml ├── reflection.json └── rsm.yaml ├── doc ├── design.adoc ├── design │ ├── AppendEntries.txt │ ├── Election.txt │ ├── Election2.adoc │ ├── LearnerNodes.adoc │ └── Log.txt ├── manual │ ├── blocks.adoc │ ├── manual.adoc │ ├── migration.adoc │ ├── overview.adoc │ ├── protocols-template.adoc │ └── using.adoc └── readme.adoc ├── pom.xml ├── src └── org │ └── jgroups │ ├── protocols │ └── raft │ │ ├── AppendEntriesRequest.java │ │ ├── AppendEntriesResponse.java │ │ ├── AppendResult.java │ │ ├── CLIENT.java │ │ ├── DynamicMembership.java │ │ ├── ELECTION.java │ │ ├── ELECTION2.java │ │ ├── FileBasedLog.java │ │ ├── Follower.java │ │ ├── InMemoryLog.java │ │ ├── InstallSnapshotRequest.java │ │ ├── InternalCommand.java │ │ ├── Leader.java │ │ ├── Learner.java │ │ ├── LevelDBLog.java │ │ ├── Log.java │ │ ├── LogEntries.java │ │ ├── LogEntry.java │ │ ├── NO_DUPES.java │ │ ├── PersistentState.java │ │ ├── RAFT.java │ │ ├── REDIRECT.java │ │ ├── RaftHeader.java │ │ ├── RaftImpl.java │ │ ├── RaftLeaderException.java │ │ ├── Role.java │ │ ├── election │ │ ├── BaseElection.java │ │ ├── LeaderElected.java │ │ ├── PreVoteRequest.java │ │ ├── PreVoteResponse.java │ │ ├── VoteRequest.java │ │ └── VoteResponse.java │ │ └── state │ │ └── RaftState.java │ └── raft │ ├── Options.java │ ├── RaftHandle.java │ ├── Settable.java │ ├── StateMachine.java │ ├── blocks │ ├── AsyncCounterImpl.java │ ├── CounterService.java │ ├── RaftAsyncCounter.java │ ├── RaftCounter.java │ ├── RaftSyncCounter.java │ └── 
ReplicatedStateMachine.java │ ├── client │ ├── Client.java │ ├── ClientStub.java │ └── ReplicatedStateMachineClient.java │ ├── demos │ ├── CounterServiceDemo.java │ ├── ProgrammaticRSM.java │ └── ReplicatedStateMachineDemo.java │ ├── filelog │ ├── FilePositionCache.java │ ├── FileStorage.java │ ├── LogEntryStorage.java │ └── MetadataStorage.java │ ├── testfwk │ ├── BlockingMessageInterceptor.java │ ├── MockRaftCluster.java │ ├── PartitionedRaftCluster.java │ ├── RaftCluster.java │ ├── RaftNode.java │ └── RaftTestUtils.java │ └── util │ ├── AnalyzeLog.java │ ├── ArrayRingBuffer.java │ ├── CommitTable.java │ ├── CounterStateMachine.java │ ├── LogCache.java │ ├── LongHelper.java │ ├── PropsToAsciidoc.java │ ├── ReplStateMachine.java │ ├── RequestTable.java │ ├── Utils.java │ └── pmem │ ├── FileProvider.java │ └── PmemUtilWrapper.java └── tests ├── benchmark └── org │ └── jgroups │ └── perf │ ├── CommandLineOptions.java │ ├── Main.java │ ├── counter │ ├── AsyncCounterBenchmark.java │ ├── CounterBenchmark.java │ ├── CounterPerf.java │ ├── HistogramUtil.java │ └── SyncBenchmark.java │ ├── harness │ ├── AbstractRaftBenchmark.java │ └── RaftBenchmark.java │ ├── jmh │ ├── DataReplicationBenchmark.java │ ├── LogJmhBenchmark.java │ └── StorageAppenderBenchmark.java │ └── replication │ ├── AsyncReplicationBenchmark.java │ ├── ReplicationPerf.java │ └── SyncReplicationBenchmark.java ├── junit-functional └── org │ └── jgroups │ └── tests │ ├── AppendEntriesTest.java │ ├── CommitTableTest.java │ ├── CompletableFutureTest.java │ ├── DummyStateMachine.java │ ├── DynamicMembershipTest.java │ ├── ElectionsTest.java │ ├── LearnerMemberTest.java │ ├── LogEntriesTest.java │ ├── LogTest.java │ ├── LongHelperTest.java │ ├── MaintenanceClusterTest.java │ ├── MergeTest.java │ ├── PartialConnectivityTest.java │ ├── RaftHeaderTest.java │ ├── RaftTest.java │ ├── ReplicatedStateMachineTest.java │ ├── RequestTableTest.java │ ├── SyncElectionTests.java │ ├── SyncElectionWithRestrictionTest.java │ 
├── SyncLeaderCrashTest.java │ ├── SynchronousTests.java │ ├── TimeoutTest.java │ ├── UtilsTest.java │ ├── VoteTest.java │ ├── blocks │ └── CounterTest.java │ ├── election │ ├── DelayedElectedLeaderMessageTest.java │ ├── DetermineLeaderBreakdownTest.java │ ├── LeaderLeavingTest.java │ ├── NetworkPartitionChannelTest.java │ ├── NetworkPartitionElectionTest.java │ ├── ViewChangeElectionTest.java │ └── VotingThreadBreakdownTest.java │ ├── harness │ ├── AbstractRaftTest.java │ ├── BaseRaftChannelTest.java │ ├── BaseRaftClusterTest.java │ ├── BaseRaftElectionTest.java │ ├── BaseStateMachineTest.java │ ├── CheckPoint.java │ └── RaftAssertion.java │ └── utils │ ├── ArrayRingBufferTest.java │ └── JUnitXMLReporter.java └── resources └── raft-benchmark.xml /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: maven 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | timezone: Brazil/East 13 | time: "15:00" 14 | open-pull-requests-limit: 10 15 | assignees: 16 | - "jabolina" 17 | reviewers: 18 | - "jabolina" 19 | - package-ecosystem: github-actions 20 | directory: "/" 21 | schedule: 22 | interval: daily 23 | open-pull-requests-limit: 10 24 | -------------------------------------------------------------------------------- /.github/workflows/maven.yml: -------------------------------------------------------------------------------- 1 | name: Java CI with Maven 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | pull_request: 8 | branches: 9 | - '*' 10 | workflow_dispatch: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | os: 22 | - "ubuntu-latest" 23 | - "windows-latest" 24 | - "macos-latest" 25 | # Keep this list as: all supported LTS JDKs, the latest GA JDK, and optionally the latest EA JDK (if available). 26 | # https://www.oracle.com/java/technologies/java-se-support-roadmap.html 27 | java: [ 11, 17, 21, 23 ] 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v4 31 | - name: Set up JDK ${{ matrix.java }} 32 | uses: actions/setup-java@v4 33 | with: 34 | java-version: ${{ matrix.java }} 35 | distribution: temurin 36 | cache: maven 37 | - name: Build with Maven 38 | timeout-minutes: 10 39 | id: test_runner 40 | run: mvn --batch-mode --no-transfer-progress package -Dgroups=functional 41 | - name: Generate test reports 42 | # Only generate reports if tests failed. 
43 | if: failure() 44 | run: mvn --batch-mode --no-transfer-progress surefire-report:report-only 45 | - name: Generate and upload test reports 46 | # Only upload reports if tests failed. 47 | if: failure() 48 | uses: actions/upload-artifact@v4 49 | with: 50 | name: Test reports 51 | # Once SUREFIRE-2219 is finished we can upload only the html report. 52 | path: | 53 | tmp/html/ 54 | tmp/test-results/xml/ 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.iws 3 | *.ipr 4 | *.iml 5 | *.html 6 | .project 7 | .classpath 8 | .factorypath 9 | .settings/ 10 | .idea/ 11 | .idea 12 | .DS_Store 13 | classes/ 14 | build.properties 15 | dist/ 16 | atlassian* 17 | keystore/ 18 | tmp/ 19 | bla*.java 20 | doc/manual/target 21 | doc/manual/*.css 22 | doc/manual/build/* 23 | doc/manual/*.tmp 24 | doc/manual/*-generated.adoc 25 | doc/tutorial/target 26 | conf/MANIFEST.MF 27 | target/ 28 | lib/ 29 | *.log 30 | *.db 31 | *.properties 32 | .ant-targets-build.xml 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ## Builds an image containing jgroups-raft 2 | 3 | ## *************************************************************** 4 | ## Make sure you have jgroups-raft compiled (mvn clean package) before doing so! 5 | ## *************************************************************** 6 | 7 | ## The first stage is used to prepare/update the OS. 8 | ## The second stage copies the local files (lib:classes) to the image 9 | # Build: docker build -f Dockerfile -t belaban/jgroups-raft . 
10 | # Push: docker push belaban/jgroups-raft 11 | 12 | 13 | FROM adoptopenjdk/openjdk11:jre as build-stage 14 | RUN apt-get update ; apt-get install -y git ant net-tools netcat iputils-ping 15 | 16 | # For the runtime, we only need a JRE (smaller footprint) 17 | FROM adoptopenjdk/openjdk11:jre 18 | LABEL maintainer="Bela Ban (belaban@mailbox.org)" 19 | RUN useradd --uid 1000 --home /opt/jgroups --create-home --shell /bin/bash jgroups 20 | RUN echo root:root | chpasswd ; echo jgroups:jgroups | chpasswd 21 | RUN printf "\njgroups ALL=(ALL) NOPASSWD: ALL\n" >> /etc/sudoers 22 | # EXPOSE 7800-7900:7800-7900 9000-9100:9000-9100 23 | EXPOSE 1965-1975:2065-2075 8787 24 | ENV HOME /opt/jgroups 25 | ENV PATH $PATH:$HOME/jgroups-raft/bin 26 | ENV JGROUPS_RAFT_HOME=$HOME/jgroups-raft 27 | WORKDIR /opt/jgroups 28 | 29 | COPY --from=build-stage /bin/ping /bin/netstat /bin/nc /bin/ 30 | COPY --from=build-stage /sbin/ifconfig /sbin/ 31 | COPY README.md $JGROUPS_RAFT_HOME/ 32 | COPY ./target/classes $JGROUPS_RAFT_HOME/classes 33 | COPY ./target/libs $JGROUPS_RAFT_HOME/lib 34 | COPY ./bin $JGROUPS_RAFT_HOME/bin 35 | COPY ./conf $JGROUPS_RAFT_HOME/conf 36 | 37 | RUN mkdir /mnt/data ; chown -R jgroups.jgroups /mnt/data $HOME/* 38 | 39 | # Run everything below as the jgroups user. Unfortunately, USER is only observed by RUN, *not* by ADD or COPY !! 
40 | USER jgroups 41 | 42 | RUN chmod u+x $HOME/* 43 | CMD clear && cat $HOME/jgroups-raft/README.md && /bin/bash 44 | 45 | 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | jgroups-raft 2 | ============ 3 | 4 | [![Last Build](https://img.shields.io/github/actions/workflow/status/jgroups-extras/jgroups-raft/maven.yml?style=for-the-badge&logo=github)](https://github.com/jgroups-extras/jgroups-raft/actions/workflows/maven.yml) 5 | [![Maven Central](https://img.shields.io/maven-central/v/org.jgroups/jgroups-raft?style=for-the-badge&logo=apache-maven&color=green)](https://central.sonatype.com/artifact/org.jgroups/jgroups-raft) 6 | [![License](https://img.shields.io/github/license/jgroups-extras/jgroups-raft?style=for-the-badge&logo=apache&color=green)](https://www.apache.org/licenses/LICENSE-2.0) 7 | 8 | jgroups-raft is an implementation of the [Raft](https://raft.github.io/) consensus algorithm in [JGroups](http://jgroups.org/). 9 | Users can use jgroups-raft embedded in their applications to build strongly consistent, highly available, fault-tolerant systems. 10 | 11 | 12 | ## Overview 13 | jgroups-raft is a library offering all the guarantees the Raft algorithm provides, with features including: 14 | 15 | * Configurable alternatives for leader election; 16 | * Dynamic membership changes in a single step; 17 | * Configurable log implementations for storage; 18 | * Member deduplication and request redirection; 19 | * Ready-to-use building blocks. 20 | 21 | By building on top of JGroups, jgroups-raft takes advantage of additional features with a mature and battle-tested network stack. 22 | jgroups-raft is [verified with Jepsen](https://github.com/jgroups-extras/jepsen-jgroups-raft) to identify linearizability violations in the building blocks. 
23 | 24 | 25 | ## Getting Started 26 | 27 | To get started developing with jgroups-raft: 28 | 29 | * Take a look at the complete [documentation](https://belaban.github.io/jgroups-raft/manual/index.html); 30 | * Details about the implementation are available in the [design documents](https://github.com/jgroups-extras/jgroups-raft/tree/main/doc/design). 31 | 32 | 33 | ## Contributing 34 | 35 | * Get in touch through the [discussion group](https://groups.google.com/forum/#!forum/jgroups-raft) or [GitHub discussions](https://github.com/jgroups-extras/jgroups-raft/discussions); 36 | * Open bug reports on [GitHub issues](https://github.com/jgroups-extras/jgroups-raft/issues); 37 | * Feel like coding? Look at the issues page and get in touch with any questions. 38 | -------------------------------------------------------------------------------- /bin/analyze-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | `dirname $0`/run.sh -ea -Dlog4j.configurationFile=log4j2.xml org.jgroups.raft.util.AnalyzeLog $* 5 | -------------------------------------------------------------------------------- /bin/client.sh: -------------------------------------------------------------------------------- 1 | ### Calls Client 2 | 3 | #!/bin/bash 4 | 5 | 6 | `dirname $0`/run.sh org.jgroups.raft.client.Client $* 7 | 8 | -------------------------------------------------------------------------------- /bin/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Creates a native image using the GraalVM compiler (needs to be on the path) 4 | 5 | LIB=`dirname $0`/../lib 6 | CLASSES=`dirname $0`/../classes 7 | CONF=`dirname $0`/../conf 8 | CLASSPATH="$CLASSES:$LIB/*" 9 | 10 | OPTIONS="-H:+JNI --no-server -H:+ReportExceptionStackTraces --features=org.graalvm.home.HomeFinderFeature" 11 | 12 | OPTIONS="$OPTIONS -H:+AllowVMInspection -H:TraceClassInitialization=true --no-fallback 
--allow-incomplete-classpath" 13 | 14 | OPTIONS="$OPTIONS -H:ReflectionConfigurationFiles=$CONF/reflection.json" 15 | 16 | OPTIONS="$OPTIONS -H:JNIConfigurationFiles=$CONF/jni.json" #,$CONF/jni-config.json" 17 | 18 | OPTIONS="$OPTIONS -H:ConfigurationFileDirectories=conf2/" 19 | 20 | # OPTIONS="$OPTIONS -H:+PrintAnalysisCallTree" 21 | 22 | 23 | OPTIONS="$OPTIONS -Dgraal.CompilationFailureAction=Diagnose" 24 | 25 | # OPTIONS="$OPTIONS -H:IncludeResources=/home/bela/logging.properties -Dfoo=bar -Dcom.sun.management.jmxremote" 26 | 27 | #OPTIONS="$OPTIONS --debug-attach=*:5000" 28 | 29 | #OPTIONS="$OPTIONS -J-server -J-XX:+UseG1GC -J-XX:+UseAdaptiveSizePolicy -J-XX:MinHeapFreeRatio=20 -J-XX:MaxHeapFreeRatio=20" 30 | 31 | OPTIONS="$OPTIONS --initialize-at-build-time=" 32 | 33 | OPTIONS="$OPTIONS -Dlog4j2.disable.jmx=true" ## Prevents log4j2 from creating an MBeanServer 34 | 35 | OPTIONS="$OPTIONS -Djgroups.use.jdk_logger=true" ## prevents log4j2 from being used 36 | 37 | #OPTIONS="$OPTIONS -H:GenerateDebugInfo=1" 38 | 39 | #OPTIONS="$OPTIONS --initialize-at-run-time=com.sun.jmx.mbeanserver.JmxMBeanServer" 40 | 41 | #OPTIONS="$OPTIONS --initialize-at-run-time=org.jgroups.protocols.FD_SOCK" 42 | 43 | native-image -cp $CLASSPATH $OPTIONS $* -------------------------------------------------------------------------------- /bin/counter-perf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASEDIR=$(dirname "$0") 4 | 5 | # shellcheck disable=SC2086,SC2048 6 | "$BASEDIR"/test-run.sh -ea org.jgroups.perf.counter.CounterPerf -props raft.xml $* 7 | 8 | -------------------------------------------------------------------------------- /bin/counter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | `dirname $0`/run.sh org.jgroups.raft.demos.CounterServiceDemo -props raft.xml $* 5 | 6 | 
-------------------------------------------------------------------------------- /bin/jmh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASEDIR=$(dirname "$0") 4 | 5 | function help() { 6 | echo "" 7 | echo "tip: pass '-h' for JMH options" 8 | exit 1 9 | } 10 | 11 | function list_benchmarks() { 12 | echo "Available benchmarks:" 13 | printf "\tDataReplicationBenchmark\n" 14 | printf "\tLogJmhBenchmark\n" 15 | printf "\tStorageAppenderBenchmark\n" 16 | help; 17 | } 18 | 19 | BENCHMARK="$1" 20 | shift; 21 | 22 | if [ "$BENCHMARK" = "-list" ]; then 23 | list_benchmarks 24 | exit 0; 25 | fi 26 | 27 | # shellcheck disable=SC2086,SC2048 28 | "$BASEDIR"/test-run.sh -ea org.openjdk.jmh.Main "$BENCHMARK" $* 29 | -------------------------------------------------------------------------------- /bin/migrate_leveldb.java: -------------------------------------------------------------------------------- 1 | ///usr/bin/env jbang "$0" "$@" ; exit $? 
2 | //DEPS info.picocli:picocli:4.6.3 3 | //DEPS org.jgroups:jgroups-raft:1.0.14.Final 4 | 5 | import org.jgroups.protocols.raft.LevelDBLog; 6 | import org.jgroups.protocols.raft.FileBasedLog; 7 | import org.jgroups.protocols.raft.LogEntries; 8 | 9 | import java.io.IOException; 10 | import java.nio.ByteBuffer; 11 | import java.nio.file.FileVisitResult; 12 | import java.nio.file.Files; 13 | import java.nio.file.Path; 14 | import java.nio.file.SimpleFileVisitor; 15 | import java.nio.file.attribute.BasicFileAttributes; 16 | import java.util.concurrent.Callable; 17 | 18 | import picocli.CommandLine; 19 | import picocli.CommandLine.Command; 20 | import picocli.CommandLine.Option; 21 | import picocli.CommandLine.Parameters; 22 | 23 | @Command(name = "migrate_leveldb", mixinStandardHelpOptions = true, version = "migrate_leveldb 0.1", 24 | description = "Migrates data created by LevelDBLog to the FileBasedLog format.") 25 | class migrate_leveldb implements Callable { 26 | private static final String ANIMATION = "|/-\\"; 27 | 28 | @Parameters(index = "0", description = "Path to the folder containing the LevelDBLog data") 29 | private String path; 30 | 31 | @Option(names = "--force", description = "Force the migration even with file-based data existent") 32 | private boolean force; 33 | 34 | public static void main(String... args) { 35 | int exitCode = new CommandLine(new migrate_leveldb()).execute(args); 36 | System.exit(exitCode); 37 | } 38 | 39 | @SuppressWarnings("removal") 40 | @Override 41 | public Integer call() throws Exception { 42 | System.out.printf("Verifying data located: %s%n", path); 43 | 44 | LevelDBLog src = new LevelDBLog(); 45 | FileBasedLog dst = new FileBasedLog(); 46 | 47 | src.init(path, null); 48 | dst.init(path, null); 49 | 50 | try (src; dst) { 51 | if (dst.sizeInBytes() > 0) { 52 | if (!force) { 53 | System.out.println("There is already a file-based storage with data in place. 
Aborting migration..."); 54 | return 1; 55 | } 56 | 57 | Path target = Path.of(path, "temp"); 58 | Path source = Path.of(path); 59 | 60 | if (Files.exists(target)) { 61 | System.out.printf("Temporary folder already exists at '%s'. Aborting migration...%n", target); 62 | return 1; 63 | } 64 | 65 | System.out.printf("Created data backup in: %s%n", target); 66 | 67 | Files.walkFileTree(source, new SimpleFileVisitor<>() { 68 | @Override 69 | public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { 70 | Files.createDirectories(target.resolve(source.relativize(dir).toString())); 71 | return FileVisitResult.CONTINUE; 72 | } 73 | 74 | @Override 75 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { 76 | Files.copy(file, target.resolve(source.relativize(file).toString())); 77 | return FileVisitResult.CONTINUE; 78 | } 79 | }); 80 | } 81 | 82 | ByteBuffer bb = src.getSnapshot(); 83 | if (bb != null) { 84 | System.out.println("Migrating snapshot to file-based log"); 85 | dst.setSnapshot(bb); 86 | } 87 | 88 | long first = src.firstAppended(); 89 | long last = src.lastAppended(); 90 | System.out.printf("Migrating entries in range: [%d; %d]%n", first, last); 91 | 92 | dst.reinitializeTo(first, src.get(first)); 93 | src.forEach((entry, idx) -> { 94 | if (idx == first) return; 95 | LogEntries entries = new LogEntries(); 96 | entries.add(entry); 97 | dst.append(idx, entries); 98 | printProgress(first, last, idx); 99 | }); 100 | } catch (Throwable t) { 101 | System.out.println("Failed migrating data"); 102 | t.printStackTrace(System.err); 103 | dst.delete(); 104 | return 1; 105 | } 106 | 107 | outWrite(new byte[] { '\r' }); 108 | return 0; 109 | } 110 | 111 | private static void printProgress(long start, long end, long curr) { 112 | float v = (float) (curr - start) / (end - start); 113 | int p = (int) (v * 100); 114 | String data = "\r" + ANIMATION.charAt(p % ANIMATION.length()) + ": Migrating " + p + 
"%"; 115 | outWrite(data.getBytes()); 116 | } 117 | 118 | private static void outWrite(byte[] bytes) { 119 | try { 120 | System.out.write(bytes); 121 | } catch (IOException ignore) { } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /bin/probe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Discovers all UDP-based members running on a certain mcast address (use -help for help) 4 | # Probe [-help] [-addr ] [-port ] [-ttl ] [-timeout ] 5 | 6 | 7 | 8 | `dirname $0`/run.sh org.jgroups.tests.Probe $* 9 | -------------------------------------------------------------------------------- /bin/release-perform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mvn clean release:perform -Darguments="-DskipTests=true -Dmaven.skip.javadoc=true -Dmaven.test.skip=true" 4 | 5 | -------------------------------------------------------------------------------- /bin/release-prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mvn clean release:prepare -Darguments="-DskipTests=true -Dmaven.skip.javadoc=true -Dmaven.test.skip=true" 4 | 5 | -------------------------------------------------------------------------------- /bin/remove-elections.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Removes the ELECTION protocols from all running nodes 4 | probe ELECTION.no_elections=true 5 | probe ELECTION.stopElectionTimer[] 6 | probe ELECTION.stopHeartbeatTimer[] 7 | probe rp=ELECTION 8 | probe pp 9 | -------------------------------------------------------------------------------- /bin/replication-perf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASEDIR=$(dirname "$0") 4 | 5 | # shellcheck disable=SC2086,SC2048 6 | 
"$BASEDIR"/test-run.sh -ea org.jgroups.perf.Main org.jgroups.perf.replication.ReplicationPerf -props raft-benchmark.xml $* 7 | -------------------------------------------------------------------------------- /bin/rsm-client.sh: -------------------------------------------------------------------------------- 1 | ### ReplicatedStateMachineClient 2 | 3 | #!/bin/bash 4 | 5 | 6 | `dirname $0`/run.sh org.jgroups.raft.client.ReplicatedStateMachineClient $* 7 | 8 | -------------------------------------------------------------------------------- /bin/rsm.sh: -------------------------------------------------------------------------------- 1 | ### ReplicatedStateMachineDemo 2 | 3 | #!/bin/bash 4 | 5 | 6 | `dirname $0`/run.sh org.jgroups.raft.demos.ReplicatedStateMachineDemo -props raft.xml $* 7 | 8 | -------------------------------------------------------------------------------- /bin/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### Configurable properties: 4 | 5 | ## bind address, set the network interface to use for clustering traffic 6 | #BIND_ADDR=192.168.1.5 7 | #BIND_ADDR=match-interface:en.* 8 | #BIND_ADDR=site_local 9 | 10 | ################# CHANGE THIS ############################## 11 | #BIND_ADDR=match-address:192.168.1.* 12 | #BIND_ADDR=127.0.0.1 13 | ############################################################ 14 | 15 | MCAST_ADDR=232.5.5.5 16 | BIN_DIR=`dirname $0` 17 | 18 | # Project built with Maven. 
19 | LIB=${BIN_DIR}/../target/libs 20 | CLASSES=${BIN_DIR}/../target/classes 21 | CONF=`dirname $0`/../conf 22 | 23 | CP=$CLASSES:$CONF:$LIB/* 24 | LOG="-Dlog4j.configurationFile=log4j2.xml" 25 | 26 | 27 | JG_FLAGS="-Djgroups.udp.mcast_addr=$MCAST_ADDR" 28 | JG_FLAGS="$JG_FLAGS -Djava.net.preferIPv4Stack=true" 29 | FLAGS="-server -Xmx600M -Xms600M" 30 | FLAGS="$FLAGS -XX:CompileThreshold=10000 -XX:ThreadStackSize=64K -XX:SurvivorRatio=8" 31 | FLAGS="$FLAGS -XX:TargetSurvivorRatio=90 -XX:MaxTenuringThreshold=15" 32 | FLAGS="$FLAGS -Xshare:off" 33 | #GC="-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" ## concurrent mark and sweep (CMS) collector 34 | 35 | JMX="-Dcom.sun.management.jmxremote" 36 | EXPERIMENTAL="$EXPERIMENTAL -XX:+EliminateLocks" 37 | #JMC="-XX:+UnlockCommercialFeatures -XX:+FlightRecorder" 38 | # DEBUG="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:8787" 39 | 40 | java -cp $CP $DEBUG $LOG $GC $JG_FLAGS $FLAGS $EXPERIMENTAL $JMX $JMC $* 41 | 42 | -------------------------------------------------------------------------------- /bin/test-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Same as `run.sh` but for running classes in the test sources. 4 | # check_if_exists_or_exit DIR: prints an error and exits with status 1 when DIR is not a directory. 5 | function check_if_exists_or_exit() { 6 | [ -d "$1" ] || { echo "Missing '$1' directory. Run: 'mvn clean package -DskipTests' before running" >&2; exit 1; } 7 | } 8 | 9 | MCAST_ADDR=232.5.5.5 10 | BIN_DIR=$(dirname "$0") 11 | 12 | # Project built with Maven. 
13 | LIB=${BIN_DIR}/../target/libs 14 | CLASSES=${BIN_DIR}/../target/classes 15 | TEST_CLASSES=${BIN_DIR}/../target/test-classes 16 | CONF=${BIN_DIR}/../conf 17 | 18 | check_if_exists_or_exit "${LIB}" 19 | check_if_exists_or_exit "${CLASSES}" 20 | check_if_exists_or_exit "${TEST_CLASSES}" 21 | 22 | CP="$CLASSES:$CONF:$TEST_CLASSES:$LIB/*" 23 | LOG="-Dlog4j.configurationFile=log4j2.xml" 24 | 25 | JG_FLAGS="-Djgroups.udp.mcast_addr=$MCAST_ADDR" 26 | JG_FLAGS="$JG_FLAGS -Djava.net.preferIPv4Stack=true" 27 | 28 | FLAGS="-server -Xmx600M -Xms600M" 29 | FLAGS="$FLAGS -XX:CompileThreshold=10000 -XX:ThreadStackSize=64K -XX:SurvivorRatio=8" 30 | FLAGS="$FLAGS -XX:TargetSurvivorRatio=90 -XX:MaxTenuringThreshold=15" 31 | FLAGS="$FLAGS -Xshare:off" 32 | 33 | #GC="-XX:+UseParNewGC -XX:+UseConcMarkSweepGC" ## concurrent mark and sweep (CMS) collector 34 | 35 | EXPERIMENTAL="$EXPERIMENTAL -XX:+EliminateLocks" 36 | #DEBUG="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:8787" 37 | 38 | # shellcheck disable=SC2086,SC2048 39 | java -cp $CP $DEBUG $LOG $GC $JG_FLAGS $FLAGS $EXPERIMENTAL $JAVA_OPTIONS $* 40 | -------------------------------------------------------------------------------- /conf/jgroups-raft.yaml: -------------------------------------------------------------------------------- 1 | ## 2 | ## Can be used to deploy 3 pods running with raft.xml and a service to Kubernetes 3 | ## 4 | apiVersion: apps/v1 5 | kind: StatefulSet 6 | metadata: 7 | name: jgroups-raft 8 | labels: 9 | run: jgroups-raft 10 | spec: 11 | replicas: 1 12 | podManagementPolicy: OrderedReady ## Parallel 13 | selector: 14 | matchLabels: 15 | run: jgroups-raft 16 | serviceName: "jgroups-raft" 17 | template: 18 | metadata: 19 | labels: 20 | run: jgroups-raft 21 | spec: 22 | terminationGracePeriodSeconds: 5 23 | containers: 24 | - image: belaban/jgroups-raft 25 | name: jgroups-raft 26 | command: ["run.sh"] 27 | args: ["org.jgroups.raft.demos.ReplicatedStateMachineDemo -nohup -listen"] 28 | 
volumeMounts: 29 | - name: jgroups-raft-data 30 | mountPath: /mnt/data 31 | env: 32 | - name: raft_members 33 | value: "jgroups-raft-0,jgroups-raft-1,jgroups-raft-2" 34 | - name: log_dir 35 | value: /mnt/data 36 | - name: DNS_QUERY 37 | value: "jgroups-raft.default.svc.cluster.local" 38 | - name: DNS_RECORD_TYPE 39 | value: A 40 | # - name: DEBUG 41 | # value: "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:8787" 42 | # - name: DNS_QUERY 43 | # value: "_ping._tcp.jgroups-raft.default.svc.cluster.local" 44 | # - name: DNS_RECORD_TYPE 45 | # value: SRV 46 | # - name: DNS_ADDRESS 47 | # value: 10.96.0.10 48 | # - name: DNS_PROBE_TRANSPORT_PORTS 49 | # value: "true" 50 | volumeClaimTemplates: 51 | - metadata: 52 | name: jgroups-raft-data 53 | spec: 54 | accessModes: [ "ReadWriteOnce" ] 55 | resources: 56 | requests: 57 | storage: 1Gi 58 | --- 59 | apiVersion: v1 60 | kind: Service 61 | metadata: 62 | name: jgroups-raft 63 | labels: 64 | run: jgroups-raft 65 | spec: 66 | publishNotReadyAddresses: true 67 | type: LoadBalancer 68 | selector: 69 | run: jgroups-raft 70 | # statefulset.kubernetes.io/pod-name: jgroups-raft-0 71 | ports: 72 | - protocol: TCP 73 | name: client 74 | port: 1965 75 | targetPort: 1965 # access CLIENT protocol via client.sh 76 | - protocol: TCP 77 | name: rsm 78 | port: 2065 79 | targetPort: 2065 # access rsh service via rsm-client.sh 80 | - protocol: TCP 81 | name: debug 82 | port: 8787 83 | targetPort: 8787 # to attach a debugger 84 | sessionAffinity: None 85 | status: 86 | loadBalancer: {} 87 | --- 88 | 89 | 90 | -------------------------------------------------------------------------------- /conf/jni.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "org.fusesource.leveldbjni.internal.NativeBuffer$NativeBufferJNI", 4 | "allDeclaredConstructors" : true, 5 | "allPublicConstructors" : true, 6 | "allDeclaredMethods" : true, 7 | "allPublicMethods" : true, 8 | 
"allDeclaredClasses" : true, 9 | "allPublicClasses" : true 10 | } 11 | ] -------------------------------------------------------------------------------- /conf/log4j2-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /conf/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /conf/raft.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 11 | 28 | 29 | 30 | 32 | 33 | 34 | 35 | 41 | 46 | 48 | 49 | 50 | 52 | 54 | 56 | 57 | 58 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /conf/reflection.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name" : "org.jgroups.protocols.TP", 4 | "fields" : [ 5 | { "name" : "bind_addr", "allowWrite" : true }, 6 | { "name" : "bind_port", "allowWrite" : true}, 7 | { "name" : "thread_pool_enabled", "allowWrite" : true} 8 | ] 9 | }, 10 | { 11 | "name": "org.jgroups.util.ObjectWrapper", 12 | "methods" : [ 13 | { "name" : "", "parameterTypes" : [] } 14 | ] 15 | }, 16 | { 17 | "name": "org.jgroups.tests.perf.PerfUtil$Config", 18 | "methods" : [ 19 | {"name" : "", "parameterTypes": []} 20 | ] 21 | }, 22 | { 23 | "name": "org.jgroups.tests.perf.PerfUtil$Results", 24 | "methods" : [ 25 | {"name" : "", "parameterTypes": []} 26 | ] 27 | }, 28 | { 29 | "name": "org.jgroups.tests.perf.PerfUtil$CustomCall", 30 | "methods" : [ 31 | {"name" : "", "parameterTypes": []} 32 | ] 33 | }, 34 | { 35 | "name": "org.jgroups.protocols.raft.LevelDBLog" 36 | }, 37 | { 38 | "name": 
"Lorg/iq80/leveldb/DB;" 39 | } 40 | ] -------------------------------------------------------------------------------- /conf/rsm.yaml: -------------------------------------------------------------------------------- 1 | ## 2 | ## Can be used to deploy 3 pods running with raft.xml and a service to Kubernetes 3 | ## 4 | apiVersion: apps/v1 5 | kind: StatefulSet 6 | metadata: 7 | name: jgroups-raft 8 | labels: 9 | run: jgroups-raft 10 | spec: 11 | replicas: 3 12 | podManagementPolicy: OrderedReady ## Parallel 13 | selector: 14 | matchLabels: 15 | run: jgroups-raft 16 | serviceName: "jgroups-raft" 17 | template: 18 | metadata: 19 | labels: 20 | run: jgroups-raft 21 | spec: 22 | terminationGracePeriodSeconds: 5 23 | containers: 24 | - image: belaban/jgroups-raft:blog 25 | name: jgroups-raft 26 | command: ["run.sh"] 27 | args: ["org.jgroups.raft.demos.ReplicatedStateMachineDemo -nohup -listen"] 28 | volumeMounts: 29 | - name: jgroups-raft-data 30 | mountPath: /mnt/data 31 | env: 32 | - name: raft_members 33 | value: "jgroups-raft-0,jgroups-raft-1,jgroups-raft-2" 34 | - name: log_dir 35 | value: /mnt/data 36 | - name: DNS_QUERY 37 | value: "jgroups-raft.default.svc.cluster.local" 38 | - name: DNS_RECORD_TYPE 39 | value: A 40 | volumeClaimTemplates: 41 | - metadata: 42 | name: jgroups-raft-data 43 | spec: 44 | accessModes: [ "ReadWriteOnce" ] 45 | resources: 46 | requests: 47 | storage: 1Gi 48 | --- 49 | apiVersion: v1 50 | kind: Service 51 | metadata: 52 | name: jgroups-raft 53 | labels: 54 | run: jgroups-raft 55 | spec: 56 | publishNotReadyAddresses: true 57 | type: LoadBalancer 58 | selector: 59 | run: jgroups-raft 60 | ports: 61 | - protocol: TCP 62 | name: client 63 | port: 1965 64 | targetPort: 1965 # access CLIENT protocol via client.sh 65 | - protocol: TCP 66 | name: rsm 67 | port: 2065 68 | targetPort: 2065 # access rsh service via rsm-client.sh 69 | - protocol: TCP 70 | name: debug 71 | port: 8787 72 | targetPort: 8787 # to attach a debugger 73 | sessionAffinity: 
None 74 | status: 75 | loadBalancer: {} 76 | --- 77 | 78 | 79 | -------------------------------------------------------------------------------- /doc/design.adoc: -------------------------------------------------------------------------------- 1 | 2 | = Design 3 | Author: Bela Ban 4 | 5 | 6 | == Architecture 7 | * Below is an overview of the architecture: 8 | 9 | ---- 10 | 11 | ------------------- 12 | | StateMachine | <--- 13 | | implementation | | 14 | ------------------- | 15 | | | 16 | | | 17 | ------------- | 18 | | Channel | | 19 | ------------- | 20 | | | 21 | ---------- | 22 | | REDIRECT | | 23 | ---------- | 24 | | | 25 | ----------- | 26 | | | ------ 27 | | RAFT | 28 | | | --> Log 29 | ----------- 30 | | 31 | ----------- 32 | | ELECTION | 33 | ------------ 34 | ---- 35 | 36 | * Protocols are `REDIRECT`, `RAFT` and `ELECTION` 37 | . `REDIRECT` handles the set(), get() and remove() events from above (e.g. a building block) and passes them down to the 38 | `RAFT` protocol. This protocol could also provide a REST interface, e.g. to handle requests from etcd 39 | . RAFT handles requests according to its role (follower, candidate, leader) and talks to Log and StateMachine 40 | ** It has references to a `Log` and a `StateMachine` 41 | ** E.g. a follower or candidate sends a redirect to a client if an operation is invoked 42 | . `ELECTION` performs leader election and passes up BECOME_ (e.g. BECOME_LEADER) events to RAFT. It also performs 43 | heartbeating (only a leader does this) to keep followers from becoming candidates 44 | * `Log` handles the persistent log 45 | ** By default, all prepares and commits are written to stable storage, but this protocol could be replaced by a 46 | pure in-memory implementation 47 | * `StateMachine` implements a state machine. Log entries can be applied to it 48 | 49 | 50 | == REDIRECT 51 | * Handles commands such as `set()`, `get()` and `remove()` sent from above 52 | * Handles redirection to leader (?) 
53 | ** Buffers commands if no leader is present ? 54 | 55 | 56 | 57 | 58 | == RAFT 59 | * This protocol handles leader election, heartbeating, election timeouts and log replication and safety 60 | ** The log reads and writes themselves are handled by `LOG` 61 | * The role pattern is used. We have 3 classes 62 | Follower:: Initial state. When it runs into the election timeout, it becomes Candidate and starts an election 63 | Candidate:: Waits for election responses. Becomes Leader if it wins the election, or goes back to Follower if it loses 64 | the election 65 | Leader:: Starts the heartbeat. Goes back to Follower if it sees a message with a higher term. Handles all client requests 66 | by sending them to the followers and committing them in case of a majority response. 67 | 68 | There's a common superclass `RaftImpl` which handles all of the common state and other ancillary functions (e.g. timers). 69 | 70 | === Properties 71 | heartbeat_interval:: Interval (in ms) at which the heartbeats are sent (default: 50 ms) 72 | 73 | election_min_interval:: Min interval for election timer (default: 150 ms) 74 | 75 | election_max_interval:: Max interval for election timer (default: 300 ms). The election timeout is a randomized value 76 | between `election_min_interval` and `election_max_interval`. 77 | majority:: The majority needed to win an election or commit a log entry. (Will be set dynamically when view 78 | changes are implemented, so this property will get removed then) 79 | 80 | === Fields 81 | view:: The current view 82 | 83 | leader:: The current leader (null if none has been elected yet) 84 | 85 | current_term:: The current term 86 | 87 | voted_for:: The address of the candidate this node voted for in the current term 88 | 89 | heartbeat_task:: Task which sends a heartbeat periodically. Only run in the leader.
90 | 91 | election_task:: Task which runs periodically to see if an election is needed 92 | 93 | role:: An instance of `RaftImpl`; `Follower`, `Candidate` or `Leader` 94 | 95 | 96 | === RaftImpl 97 | ==== Fields 98 | prot:: A reference to the RAFT protocol in which the instance is created. This way, fields of RAFT can be accessed 99 | 100 | ==== Behavior 101 | 102 | ===== On init 103 | * Start the election timer 104 | * Stop the heartbeat task 105 | 106 | ===== On reception of a heartbeat 107 | * Reset the election timer 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | === Follower 116 | 117 | ==== Behavior 118 | 119 | ===== On 120 | 121 | 122 | === Candidate 123 | 124 | 125 | 126 | === Leader 127 | 128 | ===== On init 129 | * Start the heartbeat task 130 | * Stop the election timer 131 | 132 | 133 | 134 | 135 | == LOG 136 | 137 | 138 | -------------------------------------------------------------------------------- /doc/design/AppendEntries.txt: -------------------------------------------------------------------------------- 1 | 2 | Handling of AppendEntries messages 3 | ================================== 4 | 5 | State 6 | ----- 7 | - The leader maintains 8 | - last_appended: index of the last entry added to the log (see Log.txt) 9 | - commit-index: index of the highest committed entry 10 | - The leader maintains (for each member P) 11 | - next-index: the index of the next entry to be sent to P (initialized to last_appended +1) 12 | - match-index: the index of the highest entry known to be replicated to P (initialized to 0) 13 | - commit-index: the index of the highest entry known to be committed by P. This is only needed because - contrary 14 | to the RAFT paper - AppendEntries requests are only sent until P.next_index > last_appended, and not 15 | continually (also acting as heartbeats). Initialized to 0. 
16 | 17 | 18 | Adding an entry at the leader 19 | ----------------------------- 20 | - The leader sets curr-index to ++last-appended and appends the entry to its log at index curr-index 21 | - A new entry for curr-index is added to request-table 22 | - An AppendEntries request is sent to all cluster members (except the leader) 23 | 24 | 25 | 26 | On reception of an AppendEntries request req(prev-index,prev-term) from P 27 | ------------------------------------------------------------------------- 28 | - On the followers only 29 | - If req.term < current-term -> send an AppendEntries(false) response to P and return 30 | - Set current-term to req.term and leader to req.leader 31 | - prev = log.get(prev-index) 32 | - curr-index=prev-index +1 33 | - If req is empty: 34 | - Apply all log entries from commit-index+1 to req.commit-index 35 | - Set commit-index to req.commit-index 36 | - Send an AppendEntries(true) response with commit-index to P 37 | - Else 38 | - If prev == null and prev-index > 0 39 | - Send AppendEntries(false) response with index=last-appended 40 | - Return 41 | - If prev-index == 0 or prev-term == prev.term 42 | - Get the entry at index curr-index 43 | - If not empty and terms don't match: delete entry at curr-index and all following entries 44 | - Append new entry at index curr-index 45 | - Send AppendEntries(true) response with index=curr-index 46 | - Else 47 | - If prev != null -> send AppendEntries(false) response with first index of conflicting term 48 | - Else -> send AppendEntries(true) response with curr-index 49 | 50 | 51 | On reception of an AppendEntries response rsp(index,commit-index) from P 52 | ------------------------------------------------------------------------ 53 | - Set P.commit-index to max(P.commit-index, rsp.commit-index) 54 | - Success: 55 | - Set P.match-index=rsp.index and P.next-index=rsp.index+1 56 | - Add response to request-table 57 | - If majority: 58 | - Apply log entries in range [commit-index+1 .. 
rsp.index] to state machine 59 | - Set commit-index to index of last successfully applied entry 60 | [optimization] 61 | - Send AppendEntries in range [P.match-index+1 .. last_appended] to P 62 | // 1 message ? otherwise we'll send all msgs in this range to P, which causes more traffic than necessary 63 | - Failure: 64 | - Set P.next-index=rsp.index 65 | 66 | 67 | Periodic resending of AppendEntries requests 68 | -------------------------------------------- 69 | - Performed on the *leader* only 70 | - Executed every RAFT.resend_interval ms 71 | - For each member P in the CommitTable (RAFT.commit_table): 72 | - If P.next-index < log.first_applied: // e.g. my log is [15 .. 120], but P.next-index is 10 73 | - Send snapshot to P 74 | - continue 75 | - If last_appended >= P.next-index: 76 | - Send AppendEntries request with entry at index P.next-index and commit-index 77 | - continue 78 | - If commit-index > P.commit-index: 79 | - Send AppendEntries message with empty entry and commit-index 80 | 81 | 82 | On reception of snapshot S from P 83 | --------------------------------- 84 | - Apply S to the state machine 85 | - Truncate the log at S.last_included_index and S.last_included_term 86 | - Set last-appended -------------------------------------------------------------------------------- /doc/design/Election.txt: -------------------------------------------------------------------------------- 1 | 2 | Election based on JGroups views 3 | =============================== 4 | Author: Bela Ban 5 | 6 | 7 | Motivation 8 | ---------- 9 | The current impl of jgroups-raft ignores the functionality (failure detection, view changes) JGroups offers and 10 | implements leader election more or less as described in [1]. 
11 | 12 | By rewriting that part to reuse JGroups functionality, the following advantages can be had: 13 | * More deterministic voting process, each round is _orchestrated_ by the coordinator 14 | * Code reduction, reuse of proven code 15 | * Many failure detection protocols are available 16 | * Ability to customize failure detection 17 | * No constant traffic by heartbeats; a leader establishes itself once and then remains leader until a new view starts a 18 | new leader election 19 | * No elections unless triggered by view changes -> this reduces competing elections 20 | 21 | 22 | In a nutshell 23 | ------------- 24 | * Leader election is started only on view changes 25 | * Contrary to [1], there are only leaders or followers, *no* candidates 26 | * Once a leader is elected, there won't be any elections until a new view (not including the leader, or dropping below 27 | the majority) is installed 28 | * New members get the term and the leader's address via a message from the current leader (LeaderElected) 29 | 30 | Voting is started by the coordinator; it increments its term and solicits the votes of all members (including itself). 31 | The vote responses contain the term of the last log entry and the last log index for the responding member. 32 | 33 | The coordinator picks the member with the highest last term / last log index. If all members have the same terms and 34 | indices, the coordinator picks the oldest member (itself) as leader. 
35 | 36 | If no responses from a majority of members have been received after a timeout, then voting continues; otherwise 37 | voting is stopped and a LeaderElected message (with the new term and leader) is sent to all members 38 | 39 | 40 | 41 | Implementation 42 | -------------- 43 | 44 | State 45 | ----- 46 | - term: the current term 47 | - leader: the address of the leader 48 | - voted-for: the address of the member we voted for in the current term 49 | 50 | 51 | On reception of view change 52 | --------------------------- 53 | - Coordinator: if the majority is reached or the leader left -> start the voting thread 54 | - Leader: new members joined -> send a LeaderElected msg to new members 55 | - If the majority is lost -> leader=null 56 | 57 | Voting thread 58 | ------------- 59 | - Periodically: increment term and send a VoteRequest(term) to all members (+self) 60 | - Terminates when LeaderElected message is received, or on shutdown 61 | 62 | On reception of VoteRequest(term) 63 | --------------------------------- 64 | - If term < current_term: reject (don't send a VoteResponse) 65 | - Else: 66 | - If term > current_term -> voted-for=null, current_term=term 67 | - If voted-for == null or voted-for == vote requester -> set voted-for=vote requester, send VoteResponse 68 | - Send a VoteResponse (containing the current term and last log index and last log term) back to the sender 69 | (Example current_term=26 (the number of elections held so far), last_log_index=2012, last_log_term=6) 70 | 71 | 72 | On reception of VoteResponse 73 | ---------------------------- 74 | (on coordinator) 75 | - If enough votes 76 | -> Determine the leader based on highest last log term / index / rank 77 | -> Set term to max of all received current terms 78 | -> Send LeaderElected(leader,term) message to all members (+self) 79 | -> Stop voting thread 80 | - Else 81 | -> Continue 82 | 83 | 84 | On reception of LeaderElected(leader,term) message 85 | 
-------------------------------------------------- 86 | - if(leader == self) -> become leader 87 | - Only on the coordinator: stop the Voting thread 88 | - Set term and leader 89 | 90 | 91 | Leader: on reception of any request(term) 92 | ----------------------------------------- 93 | - If term > current_term -> step down as leader and become follower 94 | - If term < current_term -> reject / send negative rsp (depending on role and protocol (RAFT,ELECTION)) 95 | 96 | 97 | 98 | [1] https://github.com/ongardie/dissertation 99 | -------------------------------------------------------------------------------- /doc/design/Log.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | The persistent log (org.jgroups.protocols.raft.Log) 4 | =================================================== 5 | Author: Bela Ban 6 | 7 | The persistent log maintains a list of log entries, where each entry has a term and a byte[] buffer to store the command 8 | to be applied to the state machine. 9 | 10 | A log starts at index 1, so the first element is always unused (marker element). Indices are monotonically increasing. 11 | Iteration works on elements in range [first_appended .. last_appended] 12 | 13 | 14 | Variables 15 | ---------- 16 | - first_appended: the index of the first appended item. Initial value: 0 17 | - last_appended: the index of the last added item. Initial value: 0 18 | - commit_index: the index of the last committed item. Always <= last_appended. 
Initial value: 0 19 | 20 | 21 | Initial state (at the leader) 22 | ----------------------------- 23 | The initial log looks like this (conceptually shown as an array): 24 | 25 | 0 1 2 3 4 26 | --------------------- 27 | | - | | | | | 28 | --------------------- 29 | ^ 30 | | 31 | first_appended ---- 32 | | 33 | last_appended ----- 34 | | 35 | commit_index ------ 36 | 37 | 38 | 39 | After adding A and B: 40 | 41 | 0 1 2 3 4 42 | --------------------- 43 | | - | A | B | | | 44 | --------------------- 45 | ^ ^ 46 | | | 47 | first_appended ---- | 48 | | | 49 | last_appended ------------- 50 | | 51 | commit_index ------ 52 | 53 | 54 | 55 | Now we got acks from a majority of members for index 2: 56 | 57 | 0 1 2 3 4 58 | --------------------- 59 | | - | A | B | | | 60 | --------------------- 61 | ^ ^ 62 | | | 63 | first_appended ---- | 64 | | 65 | last_appended ------------- 66 | | 67 | commit_index -------------- 68 | 69 | 70 | After adding a few more log entries: 71 | 72 | 0 1 2 3 4 5 6 7 8 73 | ------------------------------------- 74 | | - | A | B | C | D | D | E | F | G | 75 | ------------------------------------- 76 | ^ 77 | | 78 | first_appended ---- ^ ^ 79 | | | 80 | last_appended ------------------------------------- 81 | | 82 | commit_index ------------------------------ 83 | 84 | 85 | We now have 8 log entries, but thus far only committed 6. 86 | 87 | Now the log is truncated at the last committed entry (Log.truncate(6)): 88 | 89 | 6 7 8 9 10 11 12 13 14 90 | ------------------------------------- 91 | | E | F | G | | | | | | | 92 | ------------------------------------- 93 | ^ ^ 94 | | | 95 | first_appended ---- | 96 | | | 97 | last_appended ------------- 98 | | 99 | commit_index ------ 100 | 101 | 102 | - All entries from [1 .. 
6] were written to a snapshot 103 | - The array was wiped and the first entry is now the last committed entry written to the snapshot (6) 104 | - This is needed because we need that entry to compare to (prev entry) on the next log append 105 | - first_appended is set to 6 106 | 107 | Now a log entry H with index 9 is added: 108 | 6 7 8 9 10 11 12 13 14 109 | ------------------------------------- 110 | | E | F | G | H | | | | | | 111 | ------------------------------------- 112 | ^ ^ 113 | | | 114 | first_appended ---- | 115 | | | 116 | last_appended ----------------- 117 | | 118 | commit_index ------ 119 | 120 | - last_appended is advanced 121 | - commit_index will also advance when acks from the majority have been received 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /doc/manual/manual.adoc: -------------------------------------------------------------------------------- 1 | = Distributed consensus with JGroups Raft 2 | :author: Bela Ban 3 | :toc2: 4 | :toclevels: 3 5 | :icons: 6 | :homepage: http://belaban.github.io/jgroups-raft 7 | :source-highlighter: pygments 8 | :iconsdir: ./images/icons 9 | 10 | © Red Hat 2014 – 2025 11 | 12 | This document is licensed under the 13 | http://creativecommons.org/licenses/by-sa/3.0/us/legalcode["Creative Commons Attribution-ShareAlike (CC-BY-SA) 3.0"] 14 | license. 15 | 16 | 17 | This is the JGroups Raft manual. It provides information about 18 | 19 | * Design and architecture 20 | * Configuration and use 21 | 22 | of jgroups-raft. 
23 | 24 | Bela Ban, Kreuzlingen Switzerland 2017 25 | 26 | 27 | include::./overview.adoc[Overview] 28 | 29 | include::./using.adoc[Using jgroups-raft] 30 | 31 | include::./blocks.adoc[Building blocks] 32 | 33 | include::./protocols-generated.adoc[Protocol list] 34 | 35 | include::./migration.adoc[Migration guide] 36 | -------------------------------------------------------------------------------- /doc/manual/migration.adoc: -------------------------------------------------------------------------------- 1 | == Migration Guide 2 | 3 | Guide to migrate your applications between different versions. 4 | JGroups Raft follows the usual semantic versioning for releases. 5 | Patch releases should work out-of-the-box without further changes from the application's perspective. 6 | Major and minor releases might require some extra steps to continue working (migration scripts, public API updates, etc). 7 | 8 | === 1.1.0 9 | 10 | For users upgrading from 1.0.x to 1.1.x. 11 | 12 | ==== LevelDBLog is deprecated 13 | 14 | To remove the use of abandonware, we marked `LevelDBLog` as deprecated and its implementation now delegates to `FileBasedLog`. 15 | Since these logs utilize different formats, a migration procedure is needed to keep your data intact. 16 | We have developed a script utilizing https://www.jbang.dev/[JBang] to migrate from `LevelDBLog` to `FileBasedLog`. 17 | This procedure will migrate the existing files to the expected format and keep the original data. 18 | 19 | The migration script is located under the `bin` folder in the project root with name `migrate_leveldb.java`. 20 | These scripts are not shipped with the release. 21 | Suppose you utilized JGroups Raft with LevelDB pointing to folder `/home/raft/data`. 22 | You can utilize the migration script as: 23 | 24 | [source,bash] 25 | ---- 26 | $ ./bin/migrate_leveldb.java /home/raft/data 27 | ---- 28 | 29 | This will read the data files at the given folder and transform them to the expected format.
30 | If the procedure fails during the migration, the new files will be cleaned up. 31 | The new data will be placed in the same folder, so you can restart your application without changing properties. 32 | 33 | If there are already data files in the format expected by `FileBasedLog`, the script will exit without running the migration. 34 | To run the migration tool and ignore the existing files, use the `--force` flag when invoking the script. 35 | This will copy the existing data to a `temp` folder (`/home/raft/data/temp`, using the previous example) and will proceed with the migration. 36 | If there is a `temp` folder already, the script will not run. 37 | 38 | 39 | NOTE: JBang allows running remote scripts. See https://www.jbang.dev/documentation/guide/latest/usage.html#urls-from-trusted-sources[JBang] for more information. 40 | -------------------------------------------------------------------------------- /doc/manual/overview.adoc: -------------------------------------------------------------------------------- 1 | 2 | == Overview 3 | 4 | The https://github.com/jgroups-extras/jgroups-raft[jgroups-raft] project is an implementation of 5 | https://raftconsensus.github.io/[Raft] in http://www.jgroups.org[JGroups]. 6 | 7 | It provides a consensus based system where leader election and changes are committed by _consensus_ (majority agreement). 8 | A fixed number of nodes form a cluster and each node is a state machine. A leader is elected by consensus and all 9 | changes happen through the leader which replicates them to all nodes, which add them to their persistent log. 10 | 11 | Because Raft guarantees that there's only ever one leader at any time, and changes are identified uniquely, all state 12 | machines receive the same ordered stream of updates and thus have the exact same state. 13 | 14 | Raft favors _consistency_ over _availability_; in terms of the http://en.wikipedia.org/wiki/CAP_theorem[CAP theorem], 15 | jgroups-raft is a CP system.
This means jgroups-raft is highly consistent, and the data replicated to nodes will never 16 | diverge, even in the face of network partitions (split brains), or restarts. Or, on an extended version, jgroups-raft 17 | provides the means to build PC/EC systems concerning the https://en.wikipedia.org/wiki/PACELC_theorem[PACELC theorem]. 18 | 19 | In case of a network partition, in a cluster of `N` nodes, at least `N/2+1` nodes have to be running for the 20 | system to be available. 21 | 22 | If for example, in a 5 node cluster, 2 nodes go down, then the system can still commit changes 23 | and elect leaders as 3 is still the majority. However, if another node goes down, the system becomes unavailable and client 24 | requests will be rejected. (Depending on configuration, there may still be some limited form of read-only availability.) 25 | 26 | By implementing jgroups-raft in JGroups, the following benefits can be had: 27 | 28 | * Transports already available: UDP, TCP 29 | ** Contains thread pools, priority delivery (OOB), batching etc 30 | * Variety of discovery protocols 31 | * Encryption, authentication, compression 32 | * Fragmentation, reliability over UDP 33 | * Multicasting for larger clusters 34 | * Failure detection 35 | * Sync/async cluster RPCs 36 | 37 | The code required to be written for a full Raft implementation is smaller than if it had been implemented outside of JGroups. 38 | 39 | 40 | The feature set of jgroups-raft includes 41 | 42 | * Leader election and append entries functionality by consensus 43 | * Persistent log (using FileBasedLog) 44 | * Dynamic addition and removal of cluster nodes 45 | * Cluster wide atomic counters 46 | * Replicated hash maps (replicated state machines) 47 | 48 | 49 | 50 | 51 | === Architecture 52 | 53 | The architecture of jgroups-raft is shown below. 54 | 55 | [[ArchitectureFig]] 56 | .The architecture of jgroups-raft. 
57 | [ditaa, format="svg"] 58 | ---- 59 | +----------------+ 60 | | | 61 | +-+ StateMachine |<-+ 62 | | | | | 63 | | +----------------+ | 64 | | | 65 | set/get apply 66 | | | 67 | | +--------------+ | 68 | +->| RaftHandle +---+ 69 | +--------------+ 70 | 71 | +------------+ 72 | | Channel | 73 | +------------+ 74 | 75 | +------------+ 76 | | CLIENT | 77 | +------------+ 78 | | REDIRECT | 79 | +------------+ +-------+ 80 | | RAFT +--------+ Log | 81 | +------------+ +-------+ 82 | | ELECTION | 83 | +------------+ 84 | . 85 | . 86 | +------------+ 87 | | NO_DUPES | 88 | +------------+ 89 | . 90 | . 91 | ---- 92 | 93 | The components that make up jgroups-raft are 94 | 95 | * A JGroups protocol stack with jgroups-raft specific protocols added: 96 | ** `NO_DUPES`: makes sure that a jgroups-raft node does not appear in a view more than once 97 | ** `ELECTION`: handles leader election 98 | ** `RAFT`: implements the Raft algorithm, i.e. appending entries to the persistent log, committing them, syncing new members etc 99 | ** `REDIRECT`: redirects requests to the leader 100 | ** `CLIENT`: accepts client requests over a socket, executes them and sends the results back to the clients 101 | * `Channel`: this is a regular JGroups `JChannel` or `ForkChannel` 102 | * `RaftHandle`: the main class for users of jgroups-raft to interact with 103 | * `StateMachine`: an implementation of `StateMachine`. This is typically a replicated state machine. jgroups-raft 104 | ships with a number of building blocks implementing `StateMachine` such as `CounterService` or `ReplicatedStateMachine`. 105 | 106 | The figure above shows one node in a cluster, but the other nodes have the same setup except that every node is required 107 | to have a different `raft_id` (defined in `RAFT`). This is a string which defines one cluster member; all members 108 | need to have different raft_ids (more on this later). 
-------------------------------------------------------------------------------- /doc/manual/protocols-template.adoc: -------------------------------------------------------------------------------- 1 | [[protlist]] 2 | == List of protocols 3 | 4 | This chapter describes the most frequently used protocols, and their configuration. 5 | 6 | Meanwhile, we recommend that users should copy one of the predefined configurations (shipped with jgroups-raft), e.g. 7 | +raft.xml+, and make only minimal changes to it. 8 | 9 | 10 | 11 | 12 | 13 | [[NO_DUPES]] 14 | === NO_DUPES 15 | 16 | This protocol prevents duplicate members from joining the cluster. The protocol needs to be located somewhere below 17 | `GMS`. 18 | 19 | `NO_DUPES` catches JOIN requests from a joiner to the JGroups coordinator and checks if the joiner's `raft_id` is 20 | already contained in the current membership, and rejects the JOIN if this is the case. 21 | 22 | For example, if we have current members `{A,B}` and another member with `raft_id` "B" joins, then the joiner would 23 | get the following exception when trying to join the cluster: 24 | ---- 25 | ------------------------------------------------------------------- 26 | GMS: address=B, cluster=cntrs, physical address=127.0.0.1:64733 27 | ------------------------------------------------------------------- 28 | Exception in thread "main" java.lang.Exception: connecting to channel "cntrs" failed 29 | at org.jgroups.JChannel._connect(JChannel.java:570) 30 | at org.jgroups.JChannel.connect(JChannel.java:294) 31 | at org.jgroups.JChannel.connect(JChannel.java:279) 32 | at org.jgroups.raft.demos.CounterServiceDemo.start(CounterServiceDemo.java:32) 33 | at org.jgroups.raft.demos.CounterServiceDemo.main(CounterServiceDemo.java:163) 34 | Caused by: java.lang.SecurityException: join of B rejected as it would create a view with duplicate members (current view: [B|1] (2) [B, A]) 35 | at 
org.jgroups.protocols.pbcast.ClientGmsImpl.isJoinResponseValid(ClientGmsImpl.java:187) 36 | at org.jgroups.protocols.pbcast.ClientGmsImpl.installViewIfValidJoinRsp(ClientGmsImpl.java:153) 37 | at org.jgroups.protocols.pbcast.ClientGmsImpl.joinInternal(ClientGmsImpl.java:111) 38 | at org.jgroups.protocols.pbcast.ClientGmsImpl.join(ClientGmsImpl.java:41) 39 | at org.jgroups.protocols.pbcast.GMS.down(GMS.java:1087) 40 | at org.jgroups.protocols.FlowControl.down(FlowControl.java:353) 41 | at org.jgroups.protocols.FlowControl.down(FlowControl.java:353) 42 | at org.jgroups.protocols.FRAG2.down(FRAG2.java:136) 43 | at org.jgroups.protocols.RSVP.down(RSVP.java:153) 44 | at org.jgroups.protocols.pbcast.STATE_TRANSFER.down(STATE_TRANSFER.java:202) 45 | at org.jgroups.protocols.raft.ELECTION.down(ELECTION.java:112) 46 | at org.jgroups.protocols.raft.RAFT.down(RAFT.java:442) 47 | at org.jgroups.protocols.raft.REDIRECT.down(REDIRECT.java:103) 48 | at org.jgroups.stack.ProtocolStack.down(ProtocolStack.java:1038) 49 | at org.jgroups.JChannel.down(JChannel.java:791) 50 | at org.jgroups.JChannel._connect(JChannel.java:564) 51 | ... 4 more 52 | [mac] /Users/bela/jgroups-raft$ 53 | ---- 54 | 55 | The error message is `SecurityException: join of B rejected as it would create a view with duplicate members (current view: [B|1] (2) [B, A])`, 56 | which shows that view `{B,A}` already contains a member with `raft_id` `B`, and so the JOIN request of the new member 57 | is rejected. 58 | 59 | ${NO_DUPES} 60 | 61 | 62 | 63 | [[ELECTION]] 64 | === ELECTION 65 | 66 | `ELECTION` is the protocol which performs leader election, as defined by Raft. 67 | Its attributes define the election timeout and the heartbeat interval (see Raft for details). 68 | 69 | ${ELECTION} 70 | 71 | 72 | [[ELECTION2]] 73 | === ELECTION2 74 | 75 | `ELECTION2` is an alternative election algorithm. 76 | It builds on top of <<ELECTION>> to include a pre-vote mechanism. 77 | The pre-vote runs before delegating to the algorithm of <<ELECTION>>.
78 | 79 | By design, <<ELECTION>> uses view changes to start election rounds and should be stable without interruptions. 80 | `ELECTION2` is an alternative in networks with recurrent partitions that could lead to more disruptions with unnecessary election rounds. 81 | More information about how it works is available in the design documents. 82 | 83 | 84 | ${ELECTION2} 85 | 86 | 87 | [[RAFT]] 88 | === RAFT 89 | 90 | `RAFT` is the main protocol in jgroups-raft; it implements log appending and committing, snapshotting and log compaction, 91 | syncing of new members and so on. 92 | 93 | ${RAFT} 94 | 95 | 96 | [[REDIRECT]] 97 | === REDIRECT 98 | 99 | The `REDIRECT` protocol needs to be somewhere above `RAFT`. It keeps track of the current Raft leader and redirects 100 | requests to the right leader. If there is no leader, e.g. because there's no majority to elect one, an exception will 101 | be thrown. 102 | 103 | ${REDIRECT} 104 | 105 | 106 | [[CLIENT]] 107 | === CLIENT 108 | 109 | `CLIENT` listens on a socket for client requests. When a request is received, it is sent down where it will be forwarded 110 | (by `REDIRECT`) to the current leader which executes the request. The response is then sent back to the client. 111 | 112 | ${CLIENT} -------------------------------------------------------------------------------- /doc/readme.adoc: -------------------------------------------------------------------------------- 1 | = Raft in JGroups 2 | 3 | == Goals 4 | 5 | . Implement the RAFT consensus protocol in JGroups 6 | . Provide an API to RAFT 7 | . Implement etcd (?) [API and REST interface] 8 | . ZooKeeper impl using etcd impl (?) 9 | . View installation via consensus ?
10 | 11 | 12 | == Advantages of using JGroups as basis for a RAFT impl 13 | 14 | * Transports already available: UDP, TCP 15 | ** Contains thread pools, priority delivery (OOB), batching etc 16 | * Variety of discovery protocols 17 | * Encryption, authentication, compression 18 | * Fragmentation, reliability over UDP 19 | * Multicasting for larger clusters 20 | * Failure detection 21 | * Sync/async cluster RPC 22 | 23 | 24 | == Design overview 25 | 26 | * Raft building block over a channel 27 | ** Communicates with RAFT protocol via events (like COUNTER or CENTRAL_LOCK) 28 | 29 | 30 | == Design issues 31 | 32 | * Separate protocols for 33 | ** Leader election 34 | *** Could be omitted as JGroups already has a leader 35 | **** No, we cannot use this (see below under Issues) ! 36 | **** But we may be able to reuse the JGroups failure detection protocols for the heartbeat 37 | *** Majority partition makes progress 38 | ** Log replication 39 | ** Log safety 40 | ** Client interaction 41 | * These protocols would communicate via events 42 | * Heartbeat is not needed for failure detection, but only for log replication 43 | and log safety 44 | * Send heartbeat messages as NO_RELIABILITY ? -> Do we care about JGroups failure detection and view management ? 45 | ** Or could we use JGroups' failure detection ? 46 | *** Don't use RAFT's heartbeat mechanism 47 | *** Start an election when there's a view change which doesn't contain the current leader 48 | *** JGroups coordinator != RAFT leader 49 | 50 | 51 | == Replace RAFT's heartbeat mechanism 52 | 53 | * The problem is that the AppendEntries RPC is used for log shipping *and* heartbeating 54 | * The heartbeating part is done (ca. 
every 20-30 ms) even when no logs have to be shipped 55 | ** This is annoying as 56 | *** This causes unneeded traffic to all cluster nodes 57 | *** JGroups already does this (duplication of functionality) 58 | * Suggestion: drop the heartbeating (not the log replication) part of AppendEntries and replace it with the JGroups equivalent 59 | * The RAFT heartbeat is used to do the following: 60 | . Keep the followers from becoming candidates and starting their own elections 61 | . Send the current term to the followers 62 | . Send the identity of the leader to the followers 63 | * This could be replaced with the following: 64 | . JGroups failure detection. An election is only started when the current leader is removed from the view. Note that 65 | JGroups coordinator != RAFT leader 66 | . Current term: this can be done with a multicast to all followers, or by state transfer when a new follower starts. This state transfer 67 | is done anyway for new followers (InstallSnapshot) 68 | . Identity of the leader: same as above (multicast and/or sent as part of the state to a new follower) 69 | 70 | * The advantage of this would be that we 71 | ** Separate heartbeating from log replication (RAFT does both with the `AppendEntries` RPC) 72 | ** Eliminate constant traffic caused by heartbeating and 73 | ** Remove redundant functionality of RAFT that's already part of JGroups. In addition, JGroups provides a number of 74 | (customizable) failure detection protocols. 75 | * Issues: look at whether merging can be done with this mechanism, too 76 | 77 | 78 | == Misc 79 | 80 | * Separate GitHub project for now for RAFT protocol and building block 81 | ** May be moved into JGroups once it is stable 82 | ** But for now, with a separate project we can release quickly and independently 83 | * Separate project for etcd consuming RAFT and JGroups ? 
84 | * Mailing list on google 85 | ** Potential contributors from Lyon and Newcastle uni (MarkL) 86 | *** Julien Ponge: julien.ponge@insa-lyon.fr 87 | * Use of LevelDB / JDBM2 for persistence ? 88 | 89 | 90 | == Issues 91 | 92 | * What happens with client requests when no leader is elected ? 93 | ** Are they queued ? 94 | 95 | * Do clients block until consensus has been reached, before they get the result ? 96 | ** For a get() this makes sense, but for a write ? 97 | 98 | * Log replication message: sent to all, or only to those which have missing log entries ? 99 | ** Probably to all, as this also serves as heartbeat 100 | *** Not very efficient to send *all* missing log entries to *all* members ! 101 | 102 | * We cannot use JGroups leader election (coordinators) because *a new leader may not contain 103 | all of the committed log entries !* 104 | ** In RAFT's leader election algorithm, only candidates with all (or the most) committed entries can become leaders 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/AppendEntriesRequest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Header; 5 | import org.jgroups.util.Bits; 6 | import org.jgroups.util.Util; 7 | 8 | import java.io.DataInput; 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | import java.util.function.Supplier; 12 | 13 | /** 14 | * Used to send AppendEntries messages to cluster members. The log entries are contained in actual payload of the message, 15 | * not in this header. 
16 | * @author Bela Ban 17 | * @since 0.1 18 | */ 19 | public class AppendEntriesRequest extends RaftHeader { 20 | protected Address leader; // probably not needed as msg.src() contains the leader's address already 21 | 22 | // the term of the entry; this differs from term, e.g. when a LogEntry is resent with entry_term=25 and term=30 23 | protected long entry_term; 24 | protected long prev_log_index; 25 | protected long prev_log_term; 26 | protected long leader_commit; // the commit_index of the leader 27 | 28 | public AppendEntriesRequest() {} 29 | public AppendEntriesRequest(Address leader, long current_term, long prev_log_index, long prev_log_term, 30 | long entry_term, long leader_commit) { 31 | super(current_term); 32 | this.leader=leader; 33 | this.entry_term=entry_term; 34 | this.prev_log_index=prev_log_index; 35 | this.prev_log_term=prev_log_term; 36 | this.leader_commit=leader_commit; 37 | } 38 | 39 | public short getMagicId() { 40 | return RAFT.APPEND_ENTRIES_REQ; 41 | } 42 | 43 | public Supplier create() { 44 | return AppendEntriesRequest::new; 45 | } 46 | 47 | @Override 48 | public int serializedSize() { 49 | return super.serializedSize() + Util.size(leader) + Bits.size(entry_term) + Bits.size(prev_log_index) 50 | + Bits.size(prev_log_term) + Bits.size(leader_commit); 51 | } 52 | 53 | @Override 54 | public void writeTo(DataOutput out) throws IOException { 55 | super.writeTo(out); 56 | Util.writeAddress(leader, out); 57 | Bits.writeLongCompressed(entry_term, out); 58 | Bits.writeLongCompressed(prev_log_index, out); 59 | Bits.writeLongCompressed(prev_log_term, out); 60 | Bits.writeLongCompressed(leader_commit, out); 61 | } 62 | 63 | @Override 64 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 65 | super.readFrom(in); 66 | leader=Util.readAddress(in); 67 | entry_term=Bits.readLongCompressed(in); 68 | prev_log_index=Bits.readLongCompressed(in); 69 | prev_log_term=Bits.readLongCompressed(in); 70 | 
leader_commit=Bits.readLongCompressed(in); 71 | } 72 | 73 | @Override public String toString() { 74 | return String.format("%s, leader=%s, entry_term=%d, prev_log_index=%d, prev_log_term=%d, leader_commit=%d", 75 | super.toString(), leader, entry_term, prev_log_index, prev_log_term, leader_commit); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/AppendEntriesResponse.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.Header; 5 | import org.jgroups.util.Util; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | import java.util.function.Supplier; 11 | 12 | /** 13 | * @author Bela Ban 14 | * @since 0.1 15 | */ 16 | public class AppendEntriesResponse extends RaftHeader { 17 | protected AppendResult result; 18 | 19 | public AppendEntriesResponse() {} 20 | public AppendEntriesResponse(long term, AppendResult result) {super(term); this.result=result;} 21 | 22 | public short getMagicId() { 23 | return RAFT.APPEND_ENTRIES_RSP; 24 | } 25 | 26 | public Supplier create() { 27 | return AppendEntriesResponse::new; 28 | } 29 | 30 | @Override 31 | public int serializedSize() { 32 | return super.serializedSize() + Global.BYTE_SIZE + (result != null? 
result.size() : 0); 33 | } 34 | 35 | @Override 36 | public void writeTo(DataOutput out) throws IOException { 37 | super.writeTo(out); 38 | Util.writeStreamable(result, out); 39 | } 40 | 41 | @Override 42 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 43 | super.readFrom(in); 44 | result=Util.readStreamable(AppendResult::new, in); 45 | } 46 | 47 | @Override 48 | public String toString() { 49 | return super.toString() + ", result: " + result; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/AppendResult.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.util.Bits; 4 | import org.jgroups.util.Streamable; 5 | 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | /** 11 | * The result of an AppendEntries request 12 | * @author Bela Ban 13 | * @since 0.1 14 | */ 15 | public class AppendResult implements Streamable { 16 | 17 | public enum Result {OK, FAIL_ENTRY_NOT_FOUND, FAIL_CONFLICTING_PREV_TERM}; 18 | 19 | /** True if the append succeeded, false otherwise */ 20 | protected Result result; 21 | 22 | /** The index of the last appended entry if success == true. If success is false, the first index for 23 | * non-matching term. If index == 0, this means the follower doesn't have a log and needs to run the 24 | * InstallSnapshot protocol to fetch the initial snapshot */ 25 | protected long index; 26 | 27 | /** The commit_index of the follower */ 28 | protected long commit_index; 29 | 30 | /** Ignored if success == true. If success is false, the non-matching term. */ 31 | protected long non_matching_term; // todo: needed ? 
32 | 33 | public AppendResult() {} 34 | 35 | public AppendResult(Result result, long index) { 36 | this.result=result; 37 | this.index=index; 38 | } 39 | 40 | public AppendResult(Result result, long index, long non_matching_term) { 41 | this(result, index); 42 | this.non_matching_term=non_matching_term; 43 | } 44 | 45 | public boolean success() {return result != null && result == Result.OK;} 46 | public long index() {return index;} 47 | public long commitIndex() {return commit_index;} 48 | public long nonMatchingTerm() {return non_matching_term;} 49 | public AppendResult commitIndex(long ci) {this.commit_index=ci; return this;} 50 | 51 | public int size() { 52 | return Bits.size(result.ordinal()) + Bits.size(index) + Bits.size(commit_index) + Bits.size(non_matching_term); 53 | } 54 | 55 | public void writeTo(DataOutput out) throws IOException { 56 | Bits.writeIntCompressed(result.ordinal(), out); 57 | Bits.writeLongCompressed(index, out); 58 | Bits.writeLongCompressed(commit_index, out); 59 | Bits.writeLongCompressed(non_matching_term, out); 60 | } 61 | 62 | public void readFrom(DataInput in) throws IOException { 63 | int ordinal=Bits.readIntCompressed(in); 64 | result=Result.values()[ordinal]; 65 | index=Bits.readLongCompressed(in); 66 | commit_index=Bits.readLongCompressed(in); 67 | non_matching_term=Bits.readLongCompressed(in); 68 | } 69 | 70 | 71 | public String toString() { 72 | return String.format("%b%s, index=%d, commit-index=%d%s", 73 | success(), success()? "" : String.format(" (%s)", result), index, commit_index, 74 | non_matching_term> 0? 
String.format(", non-matching-term=%d", non_matching_term) : ""); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/DynamicMembership.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import java.util.concurrent.CompletableFuture; 4 | 5 | /** 6 | * Defines the contract to add and remove servers (RAFT.members). 7 | * @author Bela Ban 8 | * @since 0.2 9 | */ 10 | public interface DynamicMembership { 11 | CompletableFuture addServer(String name) throws Exception; 12 | CompletableFuture removeServer(String name) throws Exception; 13 | } 14 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/ELECTION.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.View; 5 | import org.jgroups.annotations.MBean; 6 | import org.jgroups.conf.ClassConfigurator; 7 | import org.jgroups.protocols.raft.election.BaseElection; 8 | import org.jgroups.raft.util.Utils; 9 | import org.jgroups.raft.util.Utils.Majority; 10 | 11 | import java.util.List; 12 | 13 | /** 14 | * The default leader election algorithm. 15 | *

16 | * Performs leader election. This implementation takes full advantage of JGroup's membership events with {@link View}. 17 | * When the current node is the view coordinator, it starts a voting thread to ask all members to send their information. 18 | * The voting thread stops when a new leader is elected. 19 | *

20 | * The process that starts the voting thread is not trying to elect itself. The process running the voting process 21 | * increases its term and asks all nodes about their term and log index information to select the new leader, in the 22 | * form of {@link org.jgroups.protocols.raft.election.VoteResponse}. For safety reasons, only the nodes with the most 23 | * up-to-date log can be elected a leader. With a response from the majority processes, the leader with the higher term 24 | * and log index is elected. The oldest process (view coordinator) in the system has a priority. Once decided, the 25 | * process sends a message reliably to everyone identifying the new leader, with the 26 | * {@link org.jgroups.protocols.raft.election.LeaderElected} message. 27 | *

28 | * After a leader is elected, a new election round starts on view changes only if the leader left the cluster. In case 29 | * of losing a majority, the leader steps down. 30 | *

31 | * This implementation is more robust than building with heartbeats, leading to fewer disruptions in the cluster with 32 | * unnecessary (competing) election rounds. This also means the leader is capable of stepping down. Referred to in 33 | * §6.2 of Ongaro's dissertation to prevent stale leadership information. 34 | *

35 | * More information is available in the design docs. 36 | * 37 | * @author Bela Ban 38 | * @since 0.1 39 | * @see Ongaro's dissertation 40 | */ 41 | @MBean(description="Protocol performing leader election according to the RAFT paper") 42 | public class ELECTION extends BaseElection { 43 | protected static final short ELECTION_ID = 520; 44 | 45 | static { 46 | ClassConfigurator.addProtocol(ELECTION_ID, ELECTION.class); 47 | } 48 | 49 | @Override 50 | protected void handleView(View v) { 51 | View previousView = this.view; 52 | this.view = v; 53 | Majority result=Utils.computeMajority(previousView, v, raft); 54 | log.debug("%s: existing view: %s, new view: %s, result: %s", local_addr, previousView, v, result); 55 | List

joiners=View.newMembers(previousView, v); 56 | boolean has_new_members=joiners != null && !joiners.isEmpty(); 57 | boolean coordinatorChanged = Utils.viewCoordinatorChanged(previousView, v); 58 | switch(result) { 59 | case no_change: 60 | // the leader resends its term/address for new members to set the term/leader. 61 | if(raft.isLeader() && has_new_members) 62 | sendLeaderElectedMessage(raft.leader(), raft.currentTerm()); 63 | 64 | // Handle cases where the previous coordinator left *before* a leader was elected. 65 | // See: https://github.com/jgroups-extras/jgroups-raft/issues/259 66 | else if (coordinatorChanged && isViewCoordinator() && isMajorityAvailable() && raft.leader() == null) 67 | startVotingThread(); 68 | break; 69 | case reached: 70 | case leader_lost: 71 | // In case the leader is lost, we stop everything *before* starting again. 72 | // This avoids cases where the leader is lost before the voting mechanism has stopped. 73 | // See: https://github.com/jgroups-extras/jgroups-raft/issues/259 74 | if(isViewCoordinator()) { 75 | log.trace("%s: starting voting process (reason: %s, view: %s)", local_addr, result, view); 76 | startVotingThread(); 77 | } 78 | break; 79 | case lost: 80 | stopVotingThread(); // if running, double-dutch 81 | raft.setLeaderAndTerm(null); 82 | break; 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/Follower.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.EmptyMessage; 5 | import org.jgroups.Message; 6 | import org.jgroups.raft.StateMachine; 7 | import org.jgroups.util.ByteArrayDataInputStream; 8 | import org.jgroups.util.Util; 9 | 10 | import java.io.DataInput; 11 | import java.nio.ByteBuffer; 12 | 13 | /** 14 | * Implements the behavior of a RAFT follower 15 | * @author Bela Ban 16 | * @since 0.1 17 | 
*/
public class Follower extends RaftImpl {
    public Follower(RAFT raft) {super(raft);}

    /**
     * Installs the snapshot carried in the payload of {@code msg} and acknowledges it.
     * The payload is stored as the snapshot of the log, then read into the internal RAFT state and the state
     * machine. Afterwards the log is re-initialized to {@code last_included_index} and an
     * {@link AppendEntriesResponse} with an OK result is sent down to {@code leader}. If no state machine is set,
     * an error is logged and nothing else happens; any exception during installation is logged, not rethrown.
     *
     * @param msg the message whose array/offset/length describe the snapshot bytes
     * @param leader the destination of the acknowledgement
     * @param last_included_index the index the log, last_appended and commit_index are set to
     * @param last_included_term the term of the dummy entry placed at {@code last_included_index}
     */
    @Override
    public void handleInstallSnapshotRequest(Message msg, Address leader,
                                             long last_included_index, long last_included_term) {
        final StateMachine state_machine=raft.state_machine;
        if(state_machine == null) {
            raft.getLog().error("%s: no state machine set, cannot install snapshot", raft.getAddress());
            return;
        }

        final Address sender=msg.src();
        try {
            // hand the raw payload to the log as its snapshot
            ByteBuffer snapshot=ByteBuffer.wrap(msg.getArray(), msg.getOffset(), msg.getLength());
            raft.log().setSnapshot(snapshot);

            // read the same payload: internal state first, state machine contents after it
            DataInput input=new ByteArrayDataInputStream(msg.getArray(), msg.getOffset(), msg.getLength());
            raft.internal_state.readFrom(input);
            state_machine.readContentFrom(input);

            // insert a dummy entry at last_included_index and set first/last/commit to it
            Log raft_log=raft.log();
            LogEntry placeholder=new LogEntry(last_included_term, null);
            raft_log.reinitializeTo(last_included_index, placeholder);
            raft.last_appended=last_included_index;
            raft.commit_index=last_included_index;

            raft.getLog().debug("%s: applied snapshot (%s) from %s; last_appended=%d, commit_index=%d",
                                raft.getAddress(), Util.printBytes(msg.getLength()), msg.src(), raft.lastAppended(),
                                raft.commitIndex());
            raft.num_snapshot_received++;

            // acknowledge with an OK result that carries our commit index
            AppendResult outcome=new AppendResult(AppendResult.Result.OK, last_included_index);
            outcome.commitIndex(raft.commitIndex());
            Message ack=new EmptyMessage(leader);
            ack=ack.putHeader(raft.getId(), new AppendEntriesResponse(raft.currentTerm(), outcome));
            raft.getDownProtocol().down(ack);
        }
        catch(Exception ex) {
            raft.getLog().error("%s: failed applying snapshot from %s: %s", raft.getAddress(), sender, ex);
        }
    }

}
--------------------------------------------------------------------------------
/src/org/jgroups/protocols/raft/InstallSnapshotRequest.java:
-------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Header; 5 | import org.jgroups.util.Bits; 6 | import org.jgroups.util.Util; 7 | 8 | import java.io.DataInput; 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | import java.util.function.Supplier; 12 | 13 | /** 14 | * @author Bela Ban 15 | * @since 0.1 16 | */ 17 | public class InstallSnapshotRequest extends RaftHeader { 18 | protected Address leader; 19 | protected long last_included_index; 20 | protected long last_included_term; 21 | 22 | public InstallSnapshotRequest() {} 23 | public InstallSnapshotRequest(long term) {super(term);} 24 | 25 | public InstallSnapshotRequest(long term, Address leader, long last_included_index, long last_included_term) { 26 | this(term); 27 | this.leader=leader; 28 | this.last_included_index=last_included_index; 29 | this.last_included_term=last_included_term; 30 | } 31 | 32 | public short getMagicId() { 33 | return RAFT.INSTALL_SNAPSHOT_REQ; 34 | } 35 | 36 | public Supplier create() { 37 | return InstallSnapshotRequest::new; 38 | } 39 | 40 | @Override 41 | public int serializedSize() { 42 | return super.serializedSize() + Util.size(leader) + Bits.size(last_included_index) + Bits.size(last_included_term); 43 | } 44 | 45 | @Override 46 | public void writeTo(DataOutput out) throws IOException { 47 | super.writeTo(out); 48 | Util.writeAddress(leader, out); 49 | Bits.writeLongCompressed(last_included_index, out); 50 | Bits.writeLongCompressed(last_included_term, out); 51 | } 52 | 53 | @Override 54 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 55 | super.readFrom(in); 56 | leader=Util.readAddress(in); 57 | last_included_index=Bits.readLongCompressed(in); 58 | last_included_term=Bits.readLongCompressed(in); 59 | } 60 | 61 | @Override 62 | public String toString() { 63 | return super.toString() + ", leader=" + 
leader + ", last_included_index=" + last_included_index + 64 | ", last_included_term=" + last_included_term; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/InternalCommand.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.util.Bits; 4 | import org.jgroups.util.Streamable; 5 | 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | /** 11 | * Internal command to be added to the log, e.g. adding or removing a server 12 | * @author Bela Ban 13 | * @since 0.2 14 | */ 15 | public class InternalCommand implements Streamable { 16 | protected Type type; 17 | protected String name; 18 | 19 | public InternalCommand() { // marshalling 20 | } 21 | 22 | public InternalCommand(Type type, String name) { 23 | this.type=type; 24 | this.name=name; 25 | } 26 | 27 | public Type type() {return type;} 28 | 29 | public void writeTo(DataOutput out) throws IOException { 30 | out.writeByte(type.ordinal()); 31 | Bits.writeString(name, out); 32 | } 33 | 34 | public void readFrom(DataInput in) throws IOException { 35 | type=Type.values()[in.readByte()]; 36 | name=Bits.readString(in); 37 | } 38 | 39 | public Object execute(RAFT raft) throws Exception { 40 | switch(type) { 41 | case addServer: 42 | raft._addServer(name); 43 | break; 44 | case removeServer: 45 | raft._removeServer(name); 46 | break; 47 | } 48 | return null; 49 | } 50 | 51 | @Override 52 | public String toString() { 53 | return type + (type == Type.noop? 
"" : "(" + name + ")"); 54 | } 55 | 56 | public enum Type {addServer, removeServer, noop}; 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/Leader.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Message; 5 | import org.jgroups.ObjectMessage; 6 | import org.jgroups.logging.Log; 7 | import org.jgroups.raft.util.CommitTable; 8 | import org.jgroups.raft.util.RequestTable; 9 | import org.jgroups.raft.util.Utils; 10 | 11 | import java.util.function.Supplier; 12 | 13 | /** 14 | * Implements the behavior of a RAFT leader 15 | * @author Bela Ban 16 | * @since 0.1 17 | */ 18 | public class Leader extends RaftImpl { 19 | protected final Supplier majority=() -> raft.majority(); 20 | 21 | public Leader(RAFT raft) { 22 | super(raft); 23 | } 24 | 25 | 26 | public void init() { 27 | super.init(); 28 | raft.createRequestTable(); 29 | raft.createCommitTable(); 30 | } 31 | 32 | public void destroy() { 33 | super.destroy(); 34 | RequestTable reqTable = raft.request_table; 35 | raft.request_table=null; 36 | raft.commit_table=null; 37 | 38 | if (reqTable != null) reqTable.destroy(raft.notCurrentLeader()); 39 | } 40 | 41 | 42 | @Override 43 | public void handleAppendEntriesResponse(Address sender, long term, AppendResult result) { 44 | RequestTable reqtab=raft.request_table; 45 | if(reqtab == null) 46 | throw new IllegalStateException("request table cannot be null in leader"); 47 | String sender_raft_id=Utils.extractRaftId(sender); 48 | Log log=raft.getLog(); 49 | if(log.isTraceEnabled()) 50 | log.trace("%s: received AppendEntries response from %s for term %d: %s", raft.getAddress(), sender, term, result); 51 | switch(result.result) { 52 | case OK: 53 | raft.commit_table.update(sender, result.index(), result.index() + 1, result.commit_index, false); 54 | 55 | // Learner 
members do not count to an entry commit. 56 | if (!Utils.isRaftMember(sender_raft_id, raft.members())) 57 | break; 58 | 59 | boolean done = reqtab.add(result.index, sender_raft_id, this.majority); 60 | if(done) { 61 | raft.commitLogTo(result.index, true); 62 | } 63 | // Send commits immediately. 64 | // Note that, an entry is committed by a MAJORITY, this means that some of the nodes doesn't know the entry exist yet. 65 | // This way, send the commit messages any time we handle an append response. 66 | if(raft.send_commits_immediately) { 67 | // Done is only true when reaching a majority threshold, we also need to check is committed to resend 68 | // to slower nodes. 69 | if (done || reqtab.isCommitted(result.index)) 70 | sendCommitMessageToFollowers(); 71 | } 72 | break; 73 | // todo: change 74 | case FAIL_ENTRY_NOT_FOUND: 75 | raft.commit_table.update(sender, result.index(), result.index()+1, result.commit_index, true); 76 | break; 77 | case FAIL_CONFLICTING_PREV_TERM: 78 | raft.commit_table.update(sender, result.index()-1, result.index(), result.commit_index, true, true); 79 | break; 80 | } 81 | } 82 | 83 | private void sendCommitMessageToFollowers() { 84 | raft.commit_table.forEach(this::sendCommitMessageToFollower); 85 | } 86 | 87 | private void sendCommitMessageToFollower(Address member, CommitTable.Entry entry) { 88 | if(raft.commit_index > entry.commitIndex()) { 89 | long cterm=raft.currentTerm(); 90 | short id=raft.getId(); 91 | Address leader=raft.getAddress(); 92 | Message msg=new ObjectMessage(member, null) 93 | .putHeader(id, new AppendEntriesRequest(leader, cterm, 0, 0, cterm, raft.commit_index)); 94 | raft.down(msg); 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/Learner.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | /** 4 | * Implements the behavior of a learner node. 
5 | * 6 | *

7 | * A learner nodes operates the same way as a {@link Follower}. However, the learner does not have voting rights for 8 | * committing an entry, for electing a leader, or to become a leader. 9 | *

10 | * 11 | * @author José Bolina 12 | * @since 1.1 13 | * @see Learner design 14 | */ 15 | public final class Learner extends Follower { 16 | 17 | public Learner(RAFT raft) { 18 | super(raft); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/LevelDBLog.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.logging.LogFactory; 4 | 5 | import java.util.Map; 6 | 7 | /** 8 | * Legacy implementation of {@link Log} which was using LevelDB JNI preserved for compatibility. Implementation simply 9 | * delegated to {@link FileBasedLog}. 10 | * 11 | * @author Ugo Landini 12 | * @deprecated Delegates to {@link FileBasedLog}. 13 | */ 14 | @Deprecated(since = "1.1.0", forRemoval = true) 15 | public class LevelDBLog extends FileBasedLog { 16 | 17 | protected final org.jgroups.logging.Log LOG = LogFactory.getLog(LevelDBLog.class); 18 | 19 | @Override 20 | public void init(String log_name, Map args) throws Exception { 21 | LOG.warn("LevelDBLog log implementation is deprecated. Verify the upgrade guide to migrate your data."); 22 | super.init(log_name, args); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/LogEntries.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.util.Bits; 4 | import org.jgroups.util.SizeStreamable; 5 | 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | import java.util.ArrayList; 10 | import java.util.Iterator; 11 | import java.util.Objects; 12 | 13 | /** 14 | * List of {@link LogEntry} elements, provides efficient serialization. Used mainly in {@link AppendEntriesRequest} 15 | * messages. 
* Note that this class is unsynchronized, as it is intended to be used by a single thread.
* <br/>
* Format: <pre>| num-elements | log-entry0 | log-entry1 ... | log-entryN |</pre>
* @author Bela Ban
* @since 1.0.8
*/
public class LogEntries implements SizeStreamable, Iterable<LogEntry> {
    /** Backing list; stays null until the first entry is added, read or iterated */
    protected ArrayList<LogEntry> entries;


    /**
     * Appends the given entries.
     * @param log_entries the entries to append; none of them may be null
     * @return this instance
     * @throws NullPointerException if an element of {@code log_entries} is null
     */
    public LogEntries add(LogEntry ... log_entries) {
        if(entries == null)
            entries=new ArrayList<>(log_entries.length);
        else // ensureCapacity() takes the minimum *total* capacity, not the number of elements to be added
            entries.ensureCapacity(entries.size() + log_entries.length);
        for(LogEntry le: log_entries)
            entries.add(Objects.requireNonNull(le));
        return this;
    }

    /** Creates a new instance holding the given entries. */
    public static LogEntries create(LogEntry... entries) {
        return new LogEntries().add(entries);
    }

    /** Removes all entries (if any) and returns this instance. */
    public LogEntries clear() {
        if(entries != null)
            entries.clear();
        return this;
    }

    /** Returns an iterator over the entries; allocates the (empty) backing list if it doesn't exist yet. */
    public Iterator<LogEntry> iterator() {
        if(entries == null)
            entries=new ArrayList<>();
        return entries.iterator();
    }

    /** @return the number of entries, 0 if the backing list has not been created */
    public int size() {
        return entries != null? entries.size() : 0;
    }

    // will be removed as soon as Log.append(term, LogEntry... entries) has been changed to append(term, LogEntries e)
    public LogEntry[] toArray() {
        if(entries == null)
            return new LogEntry[0];
        return entries.toArray(new LogEntry[0]);
    }

    /** @return the sum of the {@code length} fields of all non-null entries */
    public long totalSize() {
        final ArrayList<LogEntry> tmp=this.entries;
        if(tmp == null)
            return 0;
        long length=0;
        for(int i=0, size=tmp.size(); i < size; i++) {
            LogEntry entry=tmp.get(i);
            if(entry != null)
                length+=entry.length;
        }
        return length;
    }

    /** @return the size of the element count plus the serialized sizes of all entries */
    public int serializedSize() {
        int size=size();
        int retval=Bits.size(size);
        if(size > 0) {
            for(LogEntry le: entries)
                retval+=le.serializedSize();
        }
        return retval;
    }

    /** Writes the number of entries followed by each entry. */
    public void writeTo(DataOutput out) throws IOException {
        int size=size();
        Bits.writeIntCompressed(size, out);
        if(size > 0) {
            for(LogEntry le: entries)
                le.writeTo(out);
        }
    }

    /**
     * Replaces the contents of this instance with the entries read from {@code in}, in the format written by
     * {@link #writeTo(DataOutput)}.
     */
    public void readFrom(DataInput in) throws IOException, ClassNotFoundException {
        int size=Bits.readIntCompressed(in);
        if(size <= 0) {
            // an empty list was written: drop entries left over from an earlier use of this instance
            entries=null;
            return;
        }
        entries=new ArrayList<>(size);
        for(int i=0; i < size; i++) {
            LogEntry le=new LogEntry();
            le.readFrom(in);
            entries.add(le);
        }
    }

    public String toString() {
        return String.format("%d entries", size());
    }



}
-------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/LogEntry.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.util.Bits; 5 | import org.jgroups.util.SizeStreamable; 6 | import org.jgroups.util.Util; 7 | 8 | import java.io.DataInput; 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | 12 | /** 13 | * An element in a log. Captures the term and command to be applied to the state machine 14 | * @author Bela Ban 15 | * @since 0.1 16 | */ 17 | public class LogEntry implements SizeStreamable { 18 | protected long term; // the term of this entry 19 | protected byte[] command; // the command (interpreted by the state machine) 20 | protected int offset; // may get removed (always 0) 21 | protected int length; // may get removed (always command.length) 22 | protected boolean internal; // if true, the contents of the buffer are an InternalCommand 23 | 24 | 25 | public LogEntry() {} 26 | 27 | public LogEntry(long term, byte[] command) { 28 | this(term, command, 0, command != null?
command.length : 0); 29 | } 30 | 31 | public LogEntry(long term, byte[] command, int offset, int length) { 32 | this(term, command, offset, length, false); 33 | } 34 | 35 | 36 | public LogEntry(long term, byte[] command, int offset, int length, boolean internal) { 37 | this.term=term; 38 | this.command=command; 39 | this.offset=offset; 40 | this.length=length; 41 | this.internal=internal; 42 | } 43 | 44 | public long term() {return term;} 45 | public LogEntry term(long t) {term=t; return this;} 46 | public byte[] command() {return command;} 47 | public int offset() {return offset;} 48 | public int length() {return length;} 49 | public boolean internal() {return internal;} 50 | public LogEntry internal(boolean b) {internal=b; return this;} 51 | 52 | public int serializedSize() { 53 | int retval=Bits.size(term) + Global.BYTE_SIZE*2; 54 | if(command != null) 55 | retval+=Global.INT_SIZE + length(); 56 | return retval; 57 | } 58 | 59 | public void writeTo(DataOutput out) throws IOException { 60 | Bits.writeLongCompressed(term, out); 61 | Util.writeByteBuffer(command, offset, length, out); 62 | out.writeBoolean(internal); 63 | } 64 | 65 | public void readFrom(DataInput in) throws IOException { 66 | term=Bits.readLongCompressed(in); 67 | command=Util.readByteBuffer(in); 68 | offset=0; 69 | length=command != null? command.length : 0; 70 | internal=in.readBoolean(); 71 | } 72 | 73 | public String toString() { 74 | StringBuilder str = new StringBuilder(); 75 | str.append("term=").append(term).append(" (").append(command != null? 
command.length : 0).append(" bytes)"); 76 | if(internal) str.append(" [internal]"); 77 | return str.toString(); 78 | } 79 | 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/NO_DUPES.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.*; 4 | import org.jgroups.annotations.MBean; 5 | import org.jgroups.conf.ClassConfigurator; 6 | import org.jgroups.protocols.pbcast.GMS; 7 | import org.jgroups.protocols.pbcast.JoinRsp; 8 | import org.jgroups.stack.Protocol; 9 | import org.jgroups.util.ByteArray; 10 | import org.jgroups.util.ExtendedUUID; 11 | import org.jgroups.util.MessageBatch; 12 | import org.jgroups.util.Util; 13 | 14 | import java.util.Arrays; 15 | import java.util.Iterator; 16 | 17 | /** 18 | * Intercepts JOIN and MERGE requests on the coordinator and rejects members whose addition would lead to members 19 | * with duplicate raft-ids in the view.

20 | * Every member's address must be an {@link org.jgroups.util.ExtendedUUID} and have a "raft-id" key whose value 21 | * is the raft-id. When intercepting a JOIN request whose sender has a raft-id that's already in the view, we send 22 | * back a {@link org.jgroups.protocols.pbcast.JoinRsp} with a rejection message.

23 | * Very similar to {@link org.jgroups.protocols.AUTH}. 24 | * @author Bela Ban 25 | * @since 0.2 26 | */ 27 | @MBean(description="Rejects views with duplicate members (identical raft-ids)") 28 | public class NO_DUPES extends Protocol { 29 | protected static final short gms_id=ClassConfigurator.getProtocolId(GMS.class); 30 | protected volatile View view; 31 | 32 | public Object down(Event evt) { 33 | switch(evt.getType()) { 34 | case Event.VIEW_CHANGE: 35 | view=evt.getArg(); 36 | break; 37 | } 38 | return down_prot.down(evt); 39 | } 40 | 41 | public Object up(Message msg) { 42 | GMS.GmsHeader hdr=msg.getHeader(gms_id); 43 | if(hdr != null && !handleGmsHeader(hdr, msg.src())) 44 | return null; 45 | return up_prot.up(msg); 46 | } 47 | 48 | public void up(MessageBatch batch) { 49 | for(Iterator it=batch.iterator(); it.hasNext();) { 50 | Message msg=it.next(); 51 | GMS.GmsHeader hdr=msg.getHeader(gms_id); 52 | if(hdr != null && !handleGmsHeader(hdr, msg.src())) 53 | it.remove(); 54 | } 55 | if(!batch.isEmpty()) 56 | up_prot.up(batch); 57 | } 58 | 59 | /** 60 | * @return True if the message should be passed up, false if it should be discarded 61 | */ 62 | protected boolean handleGmsHeader(GMS.GmsHeader hdr, Address sender) { 63 | switch(hdr.getType()) { 64 | case GMS.GmsHeader.JOIN_REQ: 65 | case GMS.GmsHeader.JOIN_REQ_WITH_STATE_TRANSFER: 66 | Address joiner=hdr.getMember(); 67 | if(!(joiner instanceof ExtendedUUID)) { 68 | log.debug("joiner %s needs to have an ExtendedUUID but has a %s", sender, joiner.getClass().getSimpleName()); 69 | break; 70 | } 71 | View v=view; 72 | if(contains(v, (ExtendedUUID)joiner)) { 73 | String msg=String.format("join of %s rejected as it would create a view with duplicate members (current view: %s)", joiner, v); 74 | log.warn(msg); 75 | sendJoinRejectedMessageTo(sender, msg); 76 | return false; 77 | } 78 | break; 79 | case GMS.GmsHeader.MERGE_REQ: 80 | // to be done later when we know how to handle merges in jgroups-raft 81 | break; 
82 | } 83 | return true; 84 | } 85 | 86 | protected static boolean contains(View v, ExtendedUUID joiner) { 87 | byte[] raft_id=joiner.get(RAFT.raft_id_key); 88 | for(Address addr: v) { 89 | if(addr instanceof ExtendedUUID) { 90 | ExtendedUUID uuid=(ExtendedUUID)addr; 91 | byte[] tmp=uuid.get(RAFT.raft_id_key); 92 | // compare byte[] buffers to avoid the cost of deserialization 93 | if(Arrays.equals(raft_id, tmp)) 94 | return true; 95 | } 96 | } 97 | return false; 98 | } 99 | 100 | protected void sendJoinRejectedMessageTo(Address joiner, String reject_message) { 101 | try { 102 | // needs to be a BytesMessage for now (no ObjectMessage) as GMS itself also uses a BytesMessage; 103 | // once GMS has been changed in JGroups itself to use an ObjectMessage, we can change this here, too 104 | ByteArray buffer=Util.streamableToBuffer(new JoinRsp(reject_message)); 105 | Message msg=new BytesMessage(joiner, buffer).putHeader(gms_id, new GMS.GmsHeader(GMS.GmsHeader.JOIN_RSP)); 106 | down_prot.down(msg); 107 | } 108 | catch(Exception ex) { 109 | log.error("failed sending JoinRsp to %s: %s", joiner, ex); 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/PersistentState.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.util.SizeStreamable; 5 | import org.jgroups.util.Util; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | import java.util.Collection; 12 | import java.util.HashSet; 13 | import java.util.List; 14 | 15 | /** 16 | * Responsible for a node's internal state. 17 | * 18 | *

The data in this class is serialized and stored at the beginning of a snapshot file. This class does not 19 | * hold information that the {@link RAFT} protocol requires to be persistent, e.g., terms and votes.

20 | * 21 | * @author Jose Bolina 22 | * @since 1.0.11 23 | */ 24 | public class PersistentState implements SizeStreamable { 25 | private final List members = new ArrayList<>(); 26 | 27 | public List getMembers() { 28 | return new ArrayList<>(members); 29 | } 30 | 31 | public void setMembers(Collection value) { 32 | members.clear(); 33 | members.addAll(new HashSet<>(value)); 34 | } 35 | 36 | @Override 37 | public void writeTo(DataOutput out) throws IOException { 38 | int size=members.size(); 39 | out.writeInt(size); 40 | for (String member : members) { 41 | out.writeUTF(member); 42 | } 43 | } 44 | 45 | @Override 46 | public void readFrom(DataInput in) throws IOException { 47 | int size=in.readInt(); 48 | List tmp = new ArrayList<>(); 49 | 50 | for (int i = 0; i < size; i++) { 51 | tmp.add(in.readUTF()); 52 | } 53 | setMembers(tmp); 54 | } 55 | 56 | @Override 57 | public int serializedSize() { 58 | int size=Global.INT_SIZE; 59 | for (String member : members) { 60 | size += Util.size(member); 61 | } 62 | return size; 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return String.format("members=%s", members); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/RaftHeader.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | import org.jgroups.Header; 4 | import org.jgroups.util.Bits; 5 | 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | /** 11 | * @author Bela Ban 12 | * @since 0.1 13 | */ 14 | public abstract class RaftHeader extends Header { 15 | protected long curr_term; // the current term on the leader 16 | 17 | public RaftHeader() {} 18 | public RaftHeader(long curr_term) {this.curr_term=curr_term;} 19 | 20 | public long currTerm() {return curr_term;} 21 | public RaftHeader currTerm(long t) {curr_term=t; return this;} 22 | 23 | 24 | 
public int serializedSize() { 25 | return Bits.size(curr_term); 26 | } 27 | 28 | public void writeTo(DataOutput out) throws IOException { 29 | Bits.writeLongCompressed(curr_term, out); 30 | } 31 | 32 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 33 | curr_term=Bits.readLongCompressed(in); 34 | } 35 | 36 | public String toString() {return getClass().getSimpleName() + ": current_term=" + curr_term;} 37 | } 38 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/RaftLeaderException.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | public class RaftLeaderException extends Exception { 4 | public RaftLeaderException(String s) { 5 | super(s); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/Role.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft; 2 | 3 | /** 4 | * @author Bela Ban 5 | * @since 0.1 6 | */ 7 | public enum Role { 8 | Follower, 9 | Leader, 10 | Learner 11 | } 12 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/election/LeaderElected.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft.election; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Header; 5 | import org.jgroups.protocols.raft.RaftHeader; 6 | import org.jgroups.util.Util; 7 | 8 | import java.io.DataInput; 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | import java.util.function.Supplier; 12 | 13 | import static org.jgroups.protocols.raft.election.BaseElection.LEADER_ELECTED; 14 | 15 | /** 16 | * Sent by the freshly elected leader to all members (-self) 17 | * @author Bela Ban 18 | * @since 1.0.6 19 | 
*/ 20 | public class LeaderElected extends RaftHeader { 21 | protected Address leader; 22 | 23 | public LeaderElected() { 24 | } 25 | 26 | public LeaderElected(Address leader) {this.leader=leader;} 27 | 28 | public Address leader() {return leader;} 29 | public short getMagicId() { 30 | return LEADER_ELECTED; 31 | } 32 | 33 | public Supplier create() { 34 | return LeaderElected::new; 35 | } 36 | 37 | public int serializedSize() { 38 | return super.serializedSize() + Util.size(leader); 39 | } 40 | 41 | public void writeTo(DataOutput out) throws IOException { 42 | super.writeTo(out); 43 | Util.writeAddress(leader, out); 44 | } 45 | 46 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 47 | super.readFrom(in); 48 | leader=Util.readAddress(in); 49 | } 50 | 51 | public String toString() { 52 | return super.toString() + ", leader=" + leader; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/election/PreVoteRequest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft.election; 2 | 3 | import org.jgroups.Header; 4 | import org.jgroups.protocols.raft.ELECTION2; 5 | import org.jgroups.protocols.raft.RaftHeader; 6 | 7 | import java.util.function.Supplier; 8 | 9 | /** 10 | * Utilized during the pre-voting phase to ask nodes information about their leader. 
11 | * 12 | * @author José Bolina 13 | * @since 1.0.12 14 | */ 15 | public class PreVoteRequest extends RaftHeader { 16 | 17 | public PreVoteRequest() { } 18 | 19 | @Override 20 | public short getMagicId() { 21 | return ELECTION2.PRE_VOTE_REQ; 22 | } 23 | 24 | @Override 25 | public Supplier create() { 26 | return PreVoteRequest::new; 27 | } 28 | 29 | @Override 30 | public String toString() { 31 | return "PreVote: " + super.toString(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/election/PreVoteResponse.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft.election; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Header; 5 | import org.jgroups.protocols.raft.ELECTION2; 6 | import org.jgroups.protocols.raft.RaftHeader; 7 | import org.jgroups.util.Util; 8 | 9 | import java.io.DataInput; 10 | import java.io.DataOutput; 11 | import java.io.IOException; 12 | import java.util.function.Supplier; 13 | 14 | /** 15 | * Utilized during the pre-voting phase to return information about the current seen leader. 
16 | * 17 | * @author José Bolina 18 | * @since 1.0.12 19 | */ 20 | public class PreVoteResponse extends RaftHeader { 21 | 22 | protected Address leader; 23 | 24 | public PreVoteResponse() {} 25 | 26 | public PreVoteResponse(Address leader) { 27 | this.leader = leader; 28 | } 29 | 30 | public Address leader() { 31 | return leader; 32 | } 33 | 34 | @Override 35 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 36 | super.readFrom(in); 37 | leader = Util.readAddress(in); 38 | } 39 | 40 | @Override 41 | public void writeTo(DataOutput out) throws IOException { 42 | super.writeTo(out); 43 | Util.writeAddress(leader, out); 44 | } 45 | 46 | @Override 47 | public int serializedSize() { 48 | return super.serializedSize() + Util.size(leader); 49 | } 50 | 51 | @Override 52 | public short getMagicId() { 53 | return ELECTION2.PRE_VOTE_RSP; 54 | } 55 | 56 | @Override 57 | public Supplier create() { 58 | return PreVoteResponse::new; 59 | } 60 | 61 | @Override 62 | public String toString() { 63 | return getClass().getSimpleName() + ": leader=" + leader; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/election/VoteRequest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft.election; 2 | 3 | import org.jgroups.Header; 4 | import org.jgroups.protocols.raft.RaftHeader; 5 | 6 | import java.util.function.Supplier; 7 | 8 | import static org.jgroups.protocols.raft.election.BaseElection.VOTE_REQ; 9 | 10 | /** 11 | * @author Bela Ban 12 | * @since 0.1 13 | */ 14 | public class VoteRequest extends RaftHeader { 15 | 16 | public VoteRequest() {} 17 | public VoteRequest(long term) { 18 | super(term); 19 | } 20 | 21 | public short getMagicId() { 22 | return VOTE_REQ; 23 | } 24 | 25 | public Supplier create() { 26 | return VoteRequest::new; 27 | } 28 | 29 | } 30 | 
-------------------------------------------------------------------------------- /src/org/jgroups/protocols/raft/election/VoteResponse.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.protocols.raft.election; 2 | 3 | import org.jgroups.Header; 4 | import org.jgroups.protocols.raft.RaftHeader; 5 | import org.jgroups.util.Bits; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | import java.util.function.Supplier; 11 | 12 | import static org.jgroups.protocols.raft.election.BaseElection.VOTE_RSP; 13 | 14 | /** 15 | * @author Bela Ban 16 | * @since 0.1 17 | */ 18 | public class VoteResponse extends RaftHeader { 19 | protected long last_log_term; // term of the last log entry 20 | protected long last_log_index; // index of the last log entry 21 | 22 | public VoteResponse() {} 23 | public VoteResponse(long term, long last_log_term, long last_log_index) { 24 | super(term); 25 | this.last_log_term=last_log_term; 26 | this.last_log_index=last_log_index; 27 | } 28 | 29 | public short getMagicId() { 30 | return VOTE_RSP; 31 | } 32 | 33 | public Supplier create() { 34 | return VoteResponse::new; 35 | } 36 | 37 | 38 | public int serializedSize() { 39 | return super.serializedSize() + Bits.size(last_log_term) + Bits.size(last_log_index); 40 | } 41 | 42 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 43 | super.readFrom(in); 44 | last_log_term=Bits.readLongCompressed(in); 45 | last_log_index=Bits.readLongCompressed(in); 46 | } 47 | 48 | public void writeTo(DataOutput out) throws IOException { 49 | super.writeTo(out); 50 | Bits.writeLongCompressed(last_log_term, out); 51 | Bits.writeLongCompressed(last_log_index, out); 52 | } 53 | 54 | public String toString() { 55 | return super.toString() + ", last_log_term=" + last_log_term + ", last_log_index=" + last_log_index; 56 | } 57 | } 58 | 
-------------------------------------------------------------------------------- /src/org/jgroups/raft/Options.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.util.SizeStreamable; 5 | 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | /** 11 | * Options to pass to {@link Settable#setAsync(byte[], int, int)} call 12 | * @author Bela Ban 13 | * @since 1.0.9 14 | */ 15 | public class Options implements SizeStreamable { 16 | 17 | public static final Options DEFAULT_OPTIONS = new Options(); 18 | 19 | protected boolean ignore_return_value; 20 | 21 | public boolean ignoreReturnValue() {return ignore_return_value;} 22 | 23 | public Options ignoreReturnValue(boolean ignore) {this.ignore_return_value=ignore; return this;} 24 | 25 | public static Options create(boolean ignore_retval) { 26 | return new Options().ignoreReturnValue(ignore_retval); 27 | } 28 | 29 | public void writeTo(DataOutput out) throws IOException { 30 | out.writeBoolean(ignore_return_value); 31 | } 32 | 33 | public void readFrom(DataInput in) throws IOException, ClassNotFoundException { 34 | ignore_return_value=in.readBoolean(); 35 | } 36 | 37 | public int serializedSize() { 38 | return Global.BYTE_SIZE; 39 | } 40 | 41 | public String toString() { 42 | return String.format("%s", ignore_return_value? "[ignore-retval]" : ""); 43 | } 44 | 45 | @Override 46 | public boolean equals(Object o) { 47 | if (this == o) return true; 48 | if (!(o instanceof Options)) return false; 49 | 50 | Options options = (Options) o; 51 | 52 | return ignore_return_value == options.ignore_return_value; 53 | } 54 | 55 | @Override 56 | public int hashCode() { 57 | return (ignore_return_value ? 
1 : 0); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/Settable.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft; 2 | 3 | 4 | import java.util.concurrent.CompletableFuture; 5 | import java.util.concurrent.TimeUnit; 6 | 7 | /** 8 | * Interface to make changes to the Raft state machine. All changes are made through the leader, which appends the change 9 | * to its log and then sends it to all followers. When the majority has acked the change, it will be committed to the log. 10 | * @author Bela Ban 11 | * @since 0.1 12 | */ 13 | public interface Settable { 14 | /** 15 | * Synchronous set. Blocks until the change has been committed. 16 | * @param buf The buffer (usually a serialized command) which represents the change to be applied to all state machines 17 | * @param offset The offset into the buffer 18 | * @param length The number of bytes to be used in the buffer, starting at offset 19 | * @return Another buffer, representing the result of applying the change. E.g. for a put(k,v), this might be the 20 | * serialized result of the previous key in a hashmap 21 | * @throws Exception Thrown if the change could not be applied/committed, e.g. because there was no majority, or no elected leader 22 | */ 23 | default byte[] set(byte[] buf, int offset, int length) throws Exception { 24 | CompletableFuture future=setAsync(buf, offset, length); 25 | return future.get(); 26 | } 27 | 28 | /** 29 | * Synchronous set bounded by a timeout. 
Blocks until the change has been committed or a timeout occurred 30 | * @param buf The buffer (usually a serialized command) which represents the change to be applied to all state machines 31 | * @param offset The offset into the buffer 32 | * @param length The number of bytes to be used in the buffer, starting at offset 33 | * @param timeout The timeout, in unit (below) 34 | * @param unit The unit of the timeout 35 | * @return Another buffer, representing the result of applying the change. E.g. for a put(k,v), this might be the 36 | * serialized result of the previous key in a hashmap 37 | * @throws Exception Thrown if the change could not be applied/committed, e.g. because there was no majority, or no elected leader, or if the timeout elapsed first 38 | */ 39 | default byte[] set(byte[] buf, int offset, int length, long timeout, TimeUnit unit) throws Exception { 40 | CompletableFuture future=setAsync(buf, offset, length); 41 | return future.get(timeout, unit); 42 | } 43 | 44 | /** Asynchronous set without options; same as {@link #setAsync(byte[], int, int, Options)} with null options */ 45 | default CompletableFuture setAsync(byte[] buf, int offset, int length) throws Exception { 46 | return setAsync(buf, offset, length, null); 47 | } 48 | 49 | /** 50 | * Asynchronous set, returns immediately with a CompletableFuture. To wait for the result, 51 | * {@link CompletableFuture#get()} or {@link CompletableFuture#get(long, TimeUnit)} can be called. 52 | * @param buf The buffer (usually a serialized command) which represents the change to be applied to all state machines 53 | * @param offset The offset into the buffer 54 | * @param length The number of bytes to be used in the buffer, starting at offset 55 | * @param options Options to pass to the call, may be null 56 | * @return A CompletableFuture which can be used to fetch the result. 
57 | */ 58 | CompletableFuture setAsync(byte[] buf, int offset, int length, Options options) throws Exception; 59 | } 60 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/StateMachine.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | 6 | /** 7 | * Interface of a state machine which stores data in memory. Committed log entries are applied to the state machine. 8 | * @author Bela Ban 9 | * @since 0.1 10 | */ 11 | public interface StateMachine { 12 | /** 13 | * Applies a command to the state machine. The contents of the byte[] buffer are interpreted by the state machine. 14 | * The command could for example be a set(), remove() or clear() command. 15 | * @param data The byte[] buffer 16 | * @param offset The offset at which the data starts 17 | * @param length The length of the data 18 | * @param serialize_response If true, serialize and return the response, else return null 19 | * @return A serialized response value, or null (e.g. if the method returned void) 20 | * @throws Exception Thrown on deserialization or other failure 21 | */ 22 | // todo: use ByteBuffers? 23 | byte[] apply(byte[] data, int offset, int length, boolean serialize_response) throws Exception; 24 | 25 | /** 26 | * Reads the contents of the state machine from an input stream. This can be the case when an InstallSnapshot RPC 27 | * is used to bootstrap a new node, or a node that's lagging far behind.

28 | * The parsing depends on the concrete state machine implementation, but the idea is that the stream is a sequence 29 | * of commands, each of which can be passed to {@link #apply(byte[], int, int, boolean)}.

30 | * The state machine may need to block modifications until the contents have been set (unless e.g. copy-on-write 31 | * is used)

32 | * The state machine implementation may need to remove all contents before populating itself from the stream. 33 | * @param in The input stream 34 | */ 35 | void readContentFrom(DataInput in) throws Exception; 36 | 37 | /** 38 | * Writes the contents of the state machine to an output stream. This is typically called on the leader to 39 | * provide state to a new node, or a node that's lagging far behind.

40 | * Updates to the state machine may need to be put on hold while the state is written to the output stream. 41 | * @param out The output stream 42 | */ 43 | void writeContentTo(DataOutput out) throws Exception; 44 | } 45 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/blocks/RaftAsyncCounter.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.blocks; 2 | 3 | import org.jgroups.blocks.atomic.AsyncCounter; 4 | import org.jgroups.raft.Options; 5 | 6 | /** 7 | * Asynchronous flavor of {@link RaftCounter}: combines {@link AsyncCounter} with the {@link RaftCounter} operations; {@link #async()} returns this instance 8 | */ 9 | public interface RaftAsyncCounter extends AsyncCounter, RaftCounter { 10 | 11 | @Override 12 | default RaftAsyncCounter async() { 13 | return this; 14 | } 15 | 16 | @Override 17 | RaftAsyncCounter withOptions(Options opts); 18 | } 19 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/blocks/RaftCounter.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.blocks; 2 | 3 | import org.jgroups.blocks.atomic.BaseCounter; 4 | import org.jgroups.raft.Options; 5 | 6 | /** 7 | * Common base of the Raft counters: adds a purely local read ({@link #getLocal()}), {@link #withOptions(Options)} and 
Raft-typed {@link #sync()} / {@link #async()} conversions to {@link BaseCounter} 8 | */ 9 | public interface RaftCounter extends BaseCounter { 10 | 11 | /** 12 | * Gets the current local value of the counter; this is purely local and the value may be stale 13 | * 14 | * @return The current local value of the counter 15 | */ 16 | long getLocal(); 17 | 18 | /** 19 | * Returns an instance of a counter with the given options 20 | * 21 | * @param opts The options 22 | * @return The counter of the given type 23 | */ 24 | RaftCounter withOptions(Options opts); 25 | 26 | @Override 27 | RaftSyncCounter sync(); 28 | 29 | @Override 30 | RaftAsyncCounter async(); 31 | } 32 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/blocks/RaftSyncCounter.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.blocks; 2 | 3 | import org.jgroups.blocks.atomic.SyncCounter; 4 | import org.jgroups.raft.Options; 5 | 6 | /** 7 | * Synchronous flavor of {@link RaftCounter}: combines {@link SyncCounter} with the {@link RaftCounter} operations; {@link #sync()} returns this instance 8 | */ 9 | public interface RaftSyncCounter extends SyncCounter, RaftCounter { 10 | 11 | @Override 12 | default RaftSyncCounter sync() { 13 | return this; 14 | } 15 | 16 | @Override 17 | RaftSyncCounter withOptions(Options opts); 18 | } 19 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/client/Client.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.client; 2 | 3 | import org.jgroups.protocols.raft.CLIENT; 4 | import org.jgroups.util.Util; 5 | 6 | import java.net.InetAddress; 7 | import java.util.concurrent.CompletableFuture; 8 | 9 | /** 10 | * Client which accesses the {@link org.jgroups.protocols.raft.CLIENT} protocol through a socket. 
Currently used to 11 | * submit addServer and remove Server commands 12 | * @author Bela Ban 13 | * @since 0.2 14 | */ 15 | public class Client { 16 | 17 | /** Sends an add_server request (if add_server is non-null), else a remove_server request, to dest:port and prints the response; a Throwable response is rethrown */ 18 | protected static void start(InetAddress dest, int port, String add_server, String remove_server) throws Throwable { 19 | try(ClientStub stub=new ClientStub(dest, port).start()) { 20 | CLIENT.RequestType type=add_server != null? CLIENT.RequestType.add_server : CLIENT.RequestType.remove_server; 21 | byte[] buf=Util.stringToBytes(add_server != null? add_server : remove_server); 22 | CompletableFuture cf=stub.setAsync(type, buf, 0, buf.length); 23 | byte[] rsp=cf.join(); 24 | Object response=Util.objectFromByteBuffer(rsp); 25 | if(response instanceof Throwable) 26 | throw (Throwable)response; 27 | System.out.print(response); // not printf(): the response is data, not a format string (a '%' in it would throw) 28 | } 29 | } 30 | /** Parses -dest, -port, -add and -remove and submits the request; prints the stack trace and exits with status 1 if the request fails */ 31 | public static void main(String[] args) throws Throwable { 32 | InetAddress dest=InetAddress.getLocalHost(); 33 | int port=1965; 34 | String add_server=null, remove_server=null; 35 | 36 | for(int i=0; i < args.length; i++) { 37 | if(args[i].equals("-dest")) { 38 | dest=InetAddress.getByName(args[++i]); 39 | continue; 40 | } 41 | if(args[i].equals("-port")) { 42 | port=Integer.parseInt(args[++i]); 43 | continue; 44 | } 45 | if(args[i].equals("-add")) { 46 | add_server=args[++i]; 47 | continue; 48 | } 49 | if(args[i].equals("-remove")) { 50 | remove_server=args[++i]; 51 | continue; 52 | } 53 | help(); return; // unknown option: print the usage and stop instead of carrying on with the request 54 | } 55 | 56 | if(add_server == null && remove_server == null) { 57 | System.err.println("no server to be added or removed was given"); 58 | return; 59 | } 60 | if(add_server != null && remove_server != null) { 61 | System.err.println("only one server can be added or removed at a time"); 62 | return; 63 | } 64 | 65 | try { 66 | Client.start(dest, port, add_server, remove_server); 67 | } catch (Throwable t) { 68 | t.printStackTrace(); 69 | System.exit(1); 70 | } 71 | } 72 | /** Prints the usage message */ 73 | protected static void help() { 74 | System.out.println("Client [-dest ] 
[-port ] (-add | -remove )"); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/testfwk/PartitionedRaftCluster.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.testfwk; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Message; 5 | import org.jgroups.View; 6 | 7 | import java.util.Collection; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.concurrent.ArrayBlockingQueue; 11 | import java.util.concurrent.BlockingQueue; 12 | import java.util.concurrent.ConcurrentHashMap; 13 | import java.util.concurrent.atomic.AtomicBoolean; 14 | 15 | /** 16 | * Manipulate the cluster during tests. 17 | *

18 | * This class offers the possibility of creating partitions in the cluster. The partitions are created by view updates. 19 | * 20 | * @since 1.0.12 21 | * @author José Bolina 22 | */ 23 | public class PartitionedRaftCluster extends MockRaftCluster { 24 | protected final Map> partitions = new ConcurrentHashMap<>(); 25 | protected final Map nodes = new ConcurrentHashMap<>(); 26 | 27 | private final AtomicBoolean viewChanging = new AtomicBoolean(false); 28 | private final BlockingQueue pending = new ArrayBlockingQueue<>(16); 29 | 30 | @Override 31 | public T clear() { 32 | nodes.clear(); 33 | return self(); 34 | } 35 | 36 | @Override 37 | public T add(Address addr, RaftNode node) { 38 | nodes.put(addr, node); 39 | return self(); 40 | } 41 | 42 | @Override 43 | public void handleView(View view) { 44 | viewChanging.set(true); 45 | try { 46 | List

members = view.getMembers(); 47 | for (Address member : members) { 48 | partitions.put(member, members); 49 | } 50 | 51 | // Update the view in the inverse order. 52 | // The coordinator usually has additional work in our implementation, therefore, we install 53 | // the view in the reverse order to make sure all members have the same view. 54 | for (int i = members.size() - 1; i >= 0; i--) { 55 | Address member = members.get(i); 56 | RaftNode node = nodes.get(member); 57 | node.handleView(view); 58 | } 59 | } finally { 60 | viewChanging.set(false); 61 | sendPending(); 62 | } 63 | } 64 | 65 | @Override 66 | public void send(Message msg) { 67 | // Enqueue messages during a view change, to make sure everything is sent after the view is installed on all members. 68 | if (viewChanging.get()) { 69 | pending.add(msg); 70 | return; 71 | } 72 | 73 | Address dest=msg.dest(), src=msg.src(); 74 | boolean block = interceptor != null && interceptor.shouldBlock(msg); 75 | 76 | // Blocks the invoking thread. 77 | if (block) interceptor.blockMessage(msg); 78 | 79 | if(dest != null) { 80 | List
connected = partitions.get(src); 81 | if (connected.contains(dest)) { 82 | RaftNode node = nodes.get(dest); 83 | send(node, msg); 84 | } 85 | } else { 86 | Collection
targets = partitions.get(src); 87 | for (Address a : targets) { 88 | RaftNode node = nodes.get(a); 89 | send(node, msg); 90 | } 91 | 92 | if (!msg.isFlagSet(Message.TransientFlag.DONT_LOOPBACK)) { 93 | RaftNode node = nodes.get(src); 94 | send(node, msg); 95 | } 96 | } 97 | } 98 | 99 | @Override 100 | public int size() { 101 | return nodes.size(); 102 | } 103 | 104 | @Override 105 | public T remove(Address addr) { 106 | nodes.remove(addr); 107 | return self(); 108 | } 109 | 110 | private void send(RaftNode node, Message msg) { 111 | if (async) deliverAsync(node, msg); 112 | else node.up(msg); 113 | } 114 | 115 | private void sendPending() { 116 | Message msg; 117 | while ((msg = pending.poll()) != null) { 118 | send(msg); 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/util/CommitTable.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.util; 2 | 3 | import org.jgroups.Address; 4 | 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Set; 8 | import java.util.concurrent.ConcurrentHashMap; 9 | import java.util.concurrent.ConcurrentMap; 10 | import java.util.function.BiConsumer; 11 | import java.util.stream.Collectors; 12 | 13 | /** 14 | * Keeps track of next_index and match_index for each cluster member (excluding this leader). 15 | * Used to (1) compute the commit_index and (2) to resend log entries to members which haven't yet seen them.

16 | * Only created on the leader 17 | * @author Bela Ban 18 | * @since 0.1 19 | */ 20 | public class CommitTable { 21 | protected final ConcurrentMap map=new ConcurrentHashMap<>(); 22 | 23 | 24 | public CommitTable(List

members, long next_index) { 25 | adjust(members, next_index); 26 | } 27 | 28 | public Set
keys() {return map.keySet();} 29 | public Entry get(Address a) {return map.get(a);} 30 | 31 | public void adjust(List
members, long next_index) { 32 | map.keySet().retainAll(members); 33 | // entry is only created if mbr is not in map, reducing unneeded creations 34 | members.forEach(mbr -> map.computeIfAbsent(mbr, k -> new Entry(next_index))); 35 | } 36 | 37 | public CommitTable update(Address member, long match_index, long next_index, long commit_index, boolean single_resend) { 38 | return update(member, match_index, next_index, commit_index, single_resend, false); 39 | } 40 | 41 | public CommitTable update(Address member, long match_index, long next_index, long commit_index, 42 | boolean single_resend, boolean overwrite) { 43 | Entry e=map.get(member); 44 | if(e == null) 45 | return this; 46 | e.match_index=overwrite? match_index : Math.max(match_index, e.match_index); 47 | e.next_index=Math.max(1, next_index); 48 | e.commit_index=Math.max(e.commit_index, commit_index); 49 | e.send_single_msg=single_resend; 50 | e.assertInvariant(); 51 | return this; 52 | } 53 | 54 | 55 | /** Applies a function to all elements of the commit table */ 56 | public void forEach(BiConsumer function) { 57 | for(Map.Entry entry: map.entrySet()) { 58 | Entry val=entry.getValue(); 59 | function.accept(entry.getKey(), val); 60 | } 61 | } 62 | 63 | @Override 64 | public String toString() { 65 | return map.entrySet().stream().map(e -> String.format("%s: %s", e.getKey(), e.getValue())) 66 | .collect(Collectors.joining("\n")); 67 | } 68 | 69 | 70 | 71 | public static class Entry { 72 | protected long commit_index; // the commit index of the given member 73 | 74 | protected long match_index; // the index of the highest entry known to be replicated to the member 75 | 76 | protected long next_index; // the next index to send; initialized to last_appended +1 77 | 78 | // set to true when next_index was decremented, so we only send a single entry on the next resend interval; 79 | // set to false when we receive an AppendEntries(true) response 80 | protected boolean send_single_msg; 81 | 82 | public Entry(long 
next_index) {this.next_index=next_index;} 83 | 84 | public long commitIndex() {return commit_index;} 85 | public Entry commitIndex(long idx) {this.commit_index=idx; return this;} 86 | public long matchIndex() {return match_index;} 87 | public Entry matchIndex(long idx) {this.match_index=idx; return this;} 88 | public long nextIndex() {return next_index;} 89 | public Entry nextIndex(long idx) {next_index=idx; return this;} 90 | 91 | public boolean sendSingleMessage() {return send_single_msg;} 92 | public Entry sendSingleMessage(boolean flag) {this.send_single_msg=flag; return this;} 93 | 94 | 95 | public void assertInvariant() { 96 | assert commit_index <= match_index && match_index <= next_index : this; 97 | } 98 | 99 | @Override public String toString() { 100 | StringBuilder sb=new StringBuilder("commit-index=").append(commit_index) 101 | .append(", match-index=").append(match_index).append(", next-index=").append(next_index); 102 | if(send_single_msg) 103 | sb.append(" [send-single-msg]"); 104 | return sb.toString(); 105 | } 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /src/org/jgroups/raft/util/CounterStateMachine.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.util; 2 | 3 | import org.jgroups.protocols.raft.LogEntry; 4 | import org.jgroups.raft.StateMachine; 5 | import org.jgroups.util.Bits; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.util.concurrent.atomic.AtomicInteger; 10 | 11 | /** 12 | * Sample state machine accepting additions and subtractions 13 | * @author Bela Ban 14 | * @since 1.0.5 15 | */ 16 | public class CounterStateMachine implements StateMachine { 17 | protected final AtomicInteger counter=new AtomicInteger(); 18 | protected final AtomicInteger additions=new AtomicInteger(); 19 | protected final AtomicInteger subtractions=new AtomicInteger(); 20 | 21 | public int counter() {return 
counter.get();} 22 | public int additions() {return additions.get();} 23 | public int subtractions() {return subtractions.get();} 24 | 25 | public byte[] apply(byte[] data, int offset, int length, boolean serialize_response) throws Exception { 26 | int val=Bits.readInt(data, offset); 27 | if(val < 0) 28 | subtractions.incrementAndGet(); 29 | else 30 | additions.incrementAndGet(); 31 | int old_counter=counter.get(); 32 | counter.addAndGet(val); 33 | if(!serialize_response) 34 | return null; 35 | byte[] retval=new byte[Integer.BYTES]; 36 | Bits.writeInt(old_counter, retval, 0); 37 | return retval; 38 | } 39 | 40 | public static String readAndDumpSnapshot(DataInput in) { 41 | try { 42 | int num=in.readInt(); 43 | return String.valueOf(num); 44 | } 45 | catch(Exception ex) { 46 | return null; 47 | } 48 | } 49 | 50 | public static String reader(LogEntry le) { 51 | byte[] buf=le.command(); 52 | int offset=le.offset(); 53 | int val=Bits.readInt(buf, offset); 54 | return String.valueOf(val); 55 | } 56 | 57 | public void readContentFrom(DataInput in) throws Exception { 58 | int val=in.readInt(); 59 | counter.set(val); 60 | } 61 | 62 | public void writeContentTo(DataOutput out) throws Exception { 63 | out.writeInt(counter.get()); 64 | } 65 | 66 | public CounterStateMachine reset() { 67 | counter.set(0); additions.set(0); subtractions.set(0); 68 | return this; 69 | } 70 | 71 | public String toString() { 72 | return String.format("counter=%d (%d additions %d subtractions)", 73 | counter.get(), additions.get(), subtractions.get()); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/util/LongHelper.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.util; 2 | 3 | /** 4 | * @author Bela Ban 5 | * @since 1.0.9 6 | */ 7 | public class LongHelper { 8 | 9 | public static byte[] fromLongToByteArray(long value) { 10 | return new byte[] { 11 | 
(byte)(value >>> 56), 12 | (byte)(value >>> 48), 13 | (byte)(value >>> 40), 14 | (byte)(value >>> 32), 15 | (byte)(value >>> 24), 16 | (byte)(value >>> 16), 17 | (byte)(value >>> 8), 18 | (byte)value}; 19 | } 20 | 21 | public static long fromByteArrayToLong(byte[] b) { 22 | if((b == null) || (b.length != Long.BYTES)) 23 | return 0; 24 | return ((long)b[7] & 0xff) + 25 | (((long)b[6] & 0xff) << 8) + 26 | (((long)b[5] & 0xff) << 16) + 27 | (((long)b[4] & 0xff) << 24) + 28 | (((long)b[3] & 0xff) << 32) + 29 | (((long)b[2] & 0xff) << 40) + 30 | (((long)b[1] & 0xff) << 48) + 31 | ((long)b[0] << 56); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/util/ReplStateMachine.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.util; 2 | 3 | import org.jgroups.raft.StateMachine; 4 | import org.jgroups.util.Bits; 5 | import org.jgroups.util.ByteArrayDataInputStream; 6 | import org.jgroups.util.Util; 7 | 8 | import java.io.DataInput; 9 | import java.io.DataOutput; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | /** 14 | * Dummy replicated hashmap state machine 15 | * @author Bela Ban 16 | * @since 1.0.5 17 | */ 18 | public class ReplStateMachine implements StateMachine { 19 | protected final Map map=new HashMap<>(); 20 | public static final int PUT = 1; 21 | public static final int REMOVE = 2; 22 | 23 | @Override 24 | public byte[] apply(byte[] data, int offset, int length, boolean serialize_response) throws Exception { 25 | ByteArrayDataInputStream in=new ByteArrayDataInputStream(data, offset, length); 26 | byte command=in.readByte(); 27 | switch(command) { 28 | case PUT: 29 | K key=Util.objectFromStream(in); 30 | V val=Util.objectFromStream(in); 31 | V old_val; 32 | synchronized(map) { 33 | old_val=map.put(key, val); 34 | } 35 | return old_val == null? null : serialize_response? 
Util.objectToByteBuffer(old_val) : null; 36 | case REMOVE: 37 | key=Util.objectFromStream(in); 38 | synchronized(map) { 39 | old_val=map.remove(key); 40 | } 41 | return old_val == null? null : serialize_response? Util.objectToByteBuffer(old_val) : null; 42 | default: 43 | throw new IllegalArgumentException("command " + command + " is unknown"); 44 | } 45 | } 46 | 47 | @Override public void readContentFrom(DataInput in) throws Exception { 48 | int size=Bits.readIntCompressed(in); 49 | Map tmp=new HashMap<>(size); 50 | for(int i=0; i < size; i++) { 51 | K key=Util.objectFromStream(in); 52 | V val=Util.objectFromStream(in); 53 | tmp.put(key, val); 54 | } 55 | synchronized(map) { 56 | map.putAll(tmp); 57 | } 58 | } 59 | 60 | @Override public void writeContentTo(DataOutput out) throws Exception { 61 | synchronized(map) { 62 | int size=map.size(); 63 | Bits.writeIntCompressed(size, out); 64 | for(Map.Entry entry: map.entrySet()) { 65 | Util.objectToStream(entry.getKey(), out); 66 | Util.objectToStream(entry.getValue(), out); 67 | } 68 | } 69 | } 70 | } 71 | 72 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/util/pmem/FileProvider.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.util.pmem; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.nio.channels.FileChannel; 6 | import java.nio.file.StandardOpenOption; 7 | 8 | /** 9 | * Creates {@link FileChannel}. 10 | *

11 | * If a Persistence Memory drive is available, support is provided by https://github.com/jhalliday/mashona 12 | * 13 | * @author Pedro Ruivo 14 | * @since 0.5.4 15 | */ 16 | public class FileProvider { 17 | 18 | private static final boolean ATTEMPT_PMEM; 19 | 20 | static { 21 | boolean attemptPmem = false; 22 | try { 23 | Class.forName("io.mashona.logwriting.PmemUtil"); 24 | // use persistent memory if available, otherwise fallback to regular file. 25 | attemptPmem = true; 26 | } catch (ClassNotFoundException e) { 27 | //no op 28 | } 29 | ATTEMPT_PMEM = attemptPmem; 30 | } 31 | 32 | public static boolean isPMEMAvailable() { 33 | return ATTEMPT_PMEM; 34 | } 35 | 36 | public static FileChannel openPMEMChannel(File file, 37 | int length, 38 | boolean create, 39 | boolean readSharedMetadata) throws IOException { 40 | if (!isPMEMAvailable()) { 41 | return null; 42 | } 43 | return PmemUtilWrapper.pmemChannelFor(file, length, create, readSharedMetadata); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/org/jgroups/raft/util/pmem/PmemUtilWrapper.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.raft.util.pmem; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.nio.channels.FileChannel; 6 | 7 | import io.mashona.logwriting.PmemUtil; 8 | 9 | /** 10 | * This class is here solely for the purpose of encapsulating the {@link PmemUtil} class so we do not load it unless 11 | * necessary, allowing this to be an optional dependency. Any code that invokes a method in this class should first 12 | * check if the {@link PmemUtil} can be loaded via {@link Class#forName(String)} otherwise a {@link ClassNotFoundException} 13 | * may be thrown when loading this class. 14 | */ 15 | public class PmemUtilWrapper { 16 | /** 17 | * Same as {@link PmemUtil#pmemChannelFor(File, int, boolean, boolean)}. 
18 | */ 19 | static public FileChannel pmemChannelFor(File file, int length, boolean create, boolean readSharedMetadata) throws FileNotFoundException { 20 | return PmemUtil.pmemChannelFor(file, length, create, readSharedMetadata); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/benchmark/org/jgroups/perf/CommandLineOptions.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.perf; 2 | 3 | /** 4 | * Base command line arguments for Raft benchmarks. 5 | *

6 | * The first argument must be the FQN of the benchmark class. The remaining arguments can be provided in any order: 7 | *

    8 | *
  • -props: The XML file to configure the protocol stack;
  • 9 | *
  • -name: The raft-id of the current node;
  • 10 | *
  • -nohup: Disable the event loop;
  • 11 | *
  • -histogram: Path to write the HdrHistogram file with the collected metrics for this node.
  • 12 | *
13 | *

14 | */ 15 | public final class CommandLineOptions { 16 | 17 | private final String benchmark; 18 | private final String name; 19 | private final String props; 20 | private final String histogramPath; 21 | private final boolean runEventLoop; 22 | 23 | private CommandLineOptions(String benchmark, String name, String props, String histogramPath, boolean runEventLoop) { 24 | this.benchmark = benchmark; 25 | this.name = name; 26 | this.props = props; 27 | this.histogramPath = histogramPath; 28 | this.runEventLoop = runEventLoop; 29 | } 30 | 31 | public String getBenchmark() { 32 | return benchmark; 33 | } 34 | 35 | public String getName() { 36 | return name; 37 | } 38 | 39 | public String getProps() { 40 | return props; 41 | } 42 | 43 | public String getHistogramPath() { 44 | return histogramPath; 45 | } 46 | 47 | public boolean shouldRunEventLoop() { 48 | return runEventLoop; 49 | } 50 | 51 | public static CommandLineOptions parse(String[] args) { 52 | String props = null; 53 | String name = null; 54 | String histogramPath = null; 55 | boolean runEventLoop = true; 56 | 57 | if (args.length == 0) 58 | throw new IllegalArgumentException("Arguments not provided"); 59 | 60 | // The first position contains the benchmark class to run. 
61 | String benchmark = args[0]; 62 | 63 | for (int i = 1; i < args.length; i++) { 64 | switch (args[i]) { 65 | case "-props": 66 | props = args[++i]; 67 | break; 68 | 69 | case "-name": 70 | name = args[++i]; 71 | break; 72 | 73 | case "-nohup": 74 | runEventLoop = false; 75 | break; 76 | 77 | case "-histogram": 78 | histogramPath = args[++i]; 79 | break; 80 | 81 | default: 82 | System.out.printf("Unknown option: %s%n", args[i]); 83 | help(benchmark); 84 | break; 85 | } 86 | } 87 | 88 | return new CommandLineOptions(benchmark, name, props, histogramPath, runEventLoop); 89 | } 90 | 91 | private static void help(String benchmark) { 92 | System.out.printf("%s [-props ] [-name ] [-nohup] [-histogram /path/to/write]", benchmark); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /tests/benchmark/org/jgroups/perf/Main.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.perf; 2 | 3 | import org.jgroups.perf.harness.AbstractRaftBenchmark; 4 | import org.jgroups.util.Util; 5 | 6 | import java.lang.reflect.Constructor; 7 | import java.lang.reflect.InvocationTargetException; 8 | 9 | /** 10 | * Entry-point class to run Raft benchmarks. 11 | *

* The command line arguments are parsed and the benchmark class is instantiated. The arguments must be provided in the
 * correct order. The very first argument is the benchmark class to run, followed by the arguments.
 *
 */
public class Main {

    public static void main(String[] args) throws Throwable {
        CommandLineOptions cmd = CommandLineOptions.parse(args);
        AbstractRaftBenchmark benchmark = instantiate(cmd);

        // Initializes the benchmark.
        // Causes the nodes to retrieve the benchmark configuration from the coordinator.
        benchmark.init();

        if (cmd.shouldRunEventLoop()) {
            benchmark.eventLoop();
        } else {
            // No event loop: keep the node alive until the process is killed.
            for (;;) Util.sleep(60_000);
        }

        benchmark.stop();
    }

    /**
     * Loads the benchmark class named on the command line and invokes its only public constructor,
     * passing the parsed options.
     *
     * @throws IllegalStateException If the class does not declare exactly one public constructor.
     */
    @SuppressWarnings("unchecked")
    private static AbstractRaftBenchmark instantiate(CommandLineOptions cmd)
            throws InvocationTargetException, InstantiationException, IllegalAccessException, ClassNotFoundException {

        // NOTE(review): generic type arguments were stripped from this listing; reconstructed from usage
        Class<? extends AbstractRaftBenchmark> clazz = (Class<? extends AbstractRaftBenchmark>) Class.forName(cmd.getBenchmark());
        Constructor<?>[] constructors = clazz.getConstructors();

        // getConstructors() only returns public constructors and may be empty; fail with a clear
        // message rather than an ArrayIndexOutOfBoundsException on constructors[0]
        if (constructors.length == 0)
            throw new IllegalStateException("No public constructor declared in " + clazz.getName());

        if (constructors.length > 1)
            throw new IllegalStateException("Multiple constructors declared!");

        Constructor<? extends AbstractRaftBenchmark> c = (Constructor<? extends AbstractRaftBenchmark>) constructors[0];
        return c.newInstance(cmd);
    }
}

--------------------------------------------------------------------------------
/tests/benchmark/org/jgroups/perf/counter/AsyncCounterBenchmark.java:
--------------------------------------------------------------------------------
package org.jgroups.perf.counter;

import org.HdrHistogram.AbstractHistogram;
import org.HdrHistogram.AtomicHistogram;
import org.HdrHistogram.Histogram;
import org.jgroups.blocks.atomic.AsyncCounter;
import org.jgroups.raft.Options;
import org.jgroups.raft.blocks.RaftCounter;
import org.jgroups.util.CompletableFutures;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionStage;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.atomic.AtomicBoolean;
import
java.util.concurrent.atomic.LongAdder; 18 | import java.util.function.Function; 19 | import java.util.function.LongSupplier; 20 | 21 | /** 22 | * Basic {@link org.jgroups.blocks.atomic.AsyncCounter} benchmark 23 | */ 24 | public class AsyncCounterBenchmark implements CounterBenchmark { 25 | 26 | private List> requests; 27 | private LongSupplier deltaSupplier; 28 | private int concurrency; 29 | private AsyncCounter counter; 30 | private final AtomicBoolean stop = new AtomicBoolean(false); 31 | private final LongAdder updates = new LongAdder(); 32 | private final AtomicHistogram histogram = HistogramUtil.createAtomic(); 33 | 34 | @Override 35 | public void init(int concurrency, ThreadFactory threadFactory, LongSupplier deltaSupplier, RaftCounter counter) { 36 | this.concurrency = concurrency; 37 | this.deltaSupplier = deltaSupplier; 38 | this.counter = counter.async().withOptions(Options.create(true)); 39 | requests = new ArrayList<>(concurrency); 40 | } 41 | 42 | @Override 43 | public void start() { 44 | stop.set(false); 45 | final long currentTime = System.nanoTime(); 46 | for (int i = 0; i < concurrency; ++i) { 47 | requests.add(updateCounter()); 48 | } 49 | } 50 | 51 | @Override 52 | public void stop() { 53 | stop.set(true); 54 | } 55 | 56 | @Override 57 | public void join() throws InterruptedException { 58 | for (CompletionStage stage : requests) { 59 | stage.toCompletableFuture().join(); 60 | } 61 | } 62 | 63 | @Override 64 | public long getTotalUpdates() { 65 | return updates.sum(); 66 | } 67 | 68 | @Override 69 | public Histogram getResults(boolean printUpdaters, Function timePrinter) { 70 | return histogram; 71 | } 72 | 73 | @Override 74 | public void close() throws Exception { 75 | stop.set(true); 76 | requests.clear(); 77 | } 78 | 79 | private void updateTime(long timeNanos) { 80 | updates.increment(); 81 | histogram.recordValue(timeNanos); 82 | } 83 | 84 | private CompletionStage updateCounter(CompletableFuture cf, CompletionStage prev, long start) { 85 | 
if (stop.get()) { 86 | cf.complete(null); 87 | return prev; 88 | } 89 | 90 | return prev.whenComplete((ignoreV, ignoreT) -> { 91 | final long currentTime = System.nanoTime(); 92 | updateTime(currentTime - start); 93 | long delta = deltaSupplier.getAsLong(); 94 | updateCounter(cf, counter.addAndGet(delta), System.nanoTime()); 95 | }); 96 | } 97 | 98 | private CompletionStage updateCounter() { 99 | if (stop.get()) { 100 | // we don't check the return value 101 | return CompletableFutures.completedNull(); 102 | } 103 | 104 | CompletableFuture cf = new CompletableFuture<>(); 105 | updateCounter(cf, counter.addAndGet(deltaSupplier.getAsLong()), System.nanoTime()); 106 | return cf; 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /tests/benchmark/org/jgroups/perf/counter/CounterBenchmark.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.perf.counter; 2 | 3 | import org.HdrHistogram.AbstractHistogram; 4 | import org.HdrHistogram.Histogram; 5 | 6 | import org.jgroups.raft.blocks.RaftAsyncCounter; 7 | import org.jgroups.raft.blocks.RaftCounter; 8 | import org.jgroups.raft.blocks.RaftSyncCounter; 9 | 10 | import java.util.concurrent.ThreadFactory; 11 | import java.util.function.Function; 12 | import java.util.function.LongSupplier; 13 | 14 | /** 15 | * Benchmark implementation used by {@link CounterPerf}. 16 | *

* A new instance is created when "start benchmark" command is created.
 * After creation, {@link #init(int, ThreadFactory, LongSupplier, RaftCounter)} is invoked with the benchmark settings followed by {@link #start()}.
 * The benchmark runs for some time and then {@link #stop()} and {@link #join()} are invoked.
 */
public interface CounterBenchmark extends AutoCloseable {

    /**
     * Initializes with the benchmark settings.
     * @param concurrency   The number of concurrent updaters.
     * @param threadFactory The thread factory (if it needs to create threads).
     * @param deltaSupplier For each "add" operation, the delta from this {@link LongSupplier} must be used.
     * @param counter       The {@link RaftCounter} to benchmark. Note that the {@link RaftSyncCounter}
     *                      or {@link RaftAsyncCounter} instances can be gotten by calling
     *                      {@link RaftCounter#sync()} or {@link RaftCounter#async()}, respectively
     */
    void init(int concurrency, ThreadFactory threadFactory, LongSupplier deltaSupplier, RaftCounter counter);

    /**
     * Signals the test start.
     */
    void start();

    /**
     * Signals the test end.
     */
    void stop();

    /**
     * Wait until all updaters finish their work.
     *
     * @throws InterruptedException If interrupted.
     */
    void join() throws InterruptedException;

    /**
     * @return The total number of "add" operation invoked.
     */
    long getTotalUpdates();

    /**
     * Returns the results of the run.
     *
     * @param printUpdaters If supported and if {@code true}, print to {@link System#out} each updater result.
     * @param timePrinter   {@link Function} to use to print each updater {@link AbstractHistogram} result.
     * @return The {@link Histogram} with the results of all updaters.
     */
    // NOTE(review): the type arguments of Function were stripped from this listing; reconstructed from the Javadoc
    Histogram getResults(boolean printUpdaters, Function<AbstractHistogram, String> timePrinter);

}

--------------------------------------------------------------------------------
/tests/benchmark/org/jgroups/perf/counter/HistogramUtil.java:
--------------------------------------------------------------------------------
package org.jgroups.perf.counter;

import org.HdrHistogram.AbstractHistogram;
import org.HdrHistogram.AtomicHistogram;
import org.HdrHistogram.Histogram;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.concurrent.TimeUnit;

/**
 * Utility methods to create histograms and write the logs to file
 */
public enum HistogramUtil {
    ;

    // highest latency observable
    private static final long HIGHEST_TRACKABLE_VALUE = TimeUnit.MINUTES.toNanos(1);
    // precision between 0 and 5
    private static final int PRECISION = 3;

    /** Creates a histogram tracking values of up to one minute (in nanoseconds) */
    public static Histogram create() {
        return new Histogram(HIGHEST_TRACKABLE_VALUE, PRECISION);
    }

    /** Same as {@link #create()}, but returning an {@link AtomicHistogram} */
    public static AtomicHistogram createAtomic() {
        return new AtomicHistogram(HIGHEST_TRACKABLE_VALUE, PRECISION);
    }

    /** Writes the percentile distribution of {@code histogram} to {@code file}, scaled by 1000 */
    public static void writeTo(AbstractHistogram histogram, File file) throws IOException {
        try (FileOutputStream out = new FileOutputStream(file)) {
            PrintStream printer = new PrintStream(out);
            //change scale from nanos -> micros
            final double nanosPerMicro = 1000.0;
            histogram.outputPercentileDistribution(printer, nanosPerMicro);
        }
    }
}

--------------------------------------------------------------------------------
/tests/benchmark/org/jgroups/perf/harness/RaftBenchmark.java:
--------------------------------------------------------------------------------
package org.jgroups.perf.harness;

import java.util.function.Function;

import org.HdrHistogram.AbstractHistogram;
import org.HdrHistogram.Histogram;

public interface RaftBenchmark extends AutoCloseable {
    /**
     * Signals the test start.
     */
    void start();

    /**
     * Signals the test end.
     */
    void stop();

    /**
     * Wait until all updaters finish their work.
     *
     * @throws InterruptedException If interrupted.
     */
    void join() throws InterruptedException;

    /**
     * @return The total number of "add" operation invoked.
     */
    long getTotalUpdates();

    /**
     * Returns the results of the run.
     *
     * @param printUpdaters If supported and if {@code true}, print to {@link System#out} each updater result.
     * @param timePrinter   {@link Function} to use to print each updater {@link AbstractHistogram} result.
     * @return The {@link Histogram} with the results of all updaters.
     */
    // NOTE(review): the type arguments of Function were stripped from this listing; reconstructed from the Javadoc
    Histogram getResults(boolean printUpdaters, Function<AbstractHistogram, String> timePrinter);
}

--------------------------------------------------------------------------------
/tests/benchmark/org/jgroups/perf/jmh/LogJmhBenchmark.java:
--------------------------------------------------------------------------------
package org.jgroups.perf.jmh;

import org.jgroups.protocols.raft.*;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.concurrent.TimeUnit;

/**
 * @author Pedro Ruivo
 * @since 0.5.4
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 10, time = 5)
@Measurement(iterations = 10, time = 5)
@Fork(1)
public class LogJmhBenchmark {

    public static void main(String[] args) throws RunnerException {
        OptionsBuilder builder = new OptionsBuilder();
        Options opt = builder
                .include(LogJmhBenchmark.class.getCanonicalName())
                .forks(1)
                .build();

        Runner runner = new Runner(opt);
        runner.run();
    }

    /** Appends one batch at the current index, then advances the index by the batch size */
    @Benchmark
    public void append(ExecutionPlan plan) {
        plan.log.append(plan.index, plan.entries);
        plan.index+= plan.batchSize;
    }

    @State(Scope.Benchmark)
    public static class ExecutionPlan {

        @Param({"10", "100", "4096"})
        private int dataSize;
        @Param({"leveldb", "file"})
        private String logType;
        @Param({"/tmp/tmp_raft_bench", "./tmp_raft_bench"})
        private String baseDir;
        @Param({"1","3"})
        private int batchSize;
        private LogEntries entries;
        private int index;
        private Log log;

        /** Builds the batch that is appended on every invocation and opens the log under test */
        @Setup(Level.Trial)
        public void setUp() throws Exception {
            index = 1;
            byte[] data = new byte[dataSize];
            Arrays.fill(data, (byte) 1);
            entries = new LogEntries();
            int added = 0;
            while (added < batchSize) {
                entries.add(new LogEntry(1, data));
                added++;
            }
            log = newLog();
            new File(baseDir).mkdirs();
            log.init(baseDir + "/raft_" + logType, Collections.emptyMap());
        }

        // Creates the log implementation selected by the logType parameter
        private Log newLog() {
            if ("leveldb".equals(logType))
                return new LevelDBLog();
            if ("file".equals(logType))
                return new FileBasedLog();
            throw new IllegalArgumentException();
        }

        @TearDown
        public void stop() throws Exception {
            if (log == null)
                return;
            log.delete();
        }
    }
}
--------------------------------------------------------------------------------
/tests/benchmark/org/jgroups/perf/replication/AsyncReplicationBenchmark.java:
--------------------------------------------------------------------------------
package org.jgroups.perf.replication;

import org.jgroups.perf.harness.RaftBenchmark;
import org.jgroups.perf.counter.HistogramUtil;
import org.jgroups.raft.Settable;
import org.jgroups.util.CompletableFutures;

import java.util.ArrayList;
import java.util.List;
import
java.util.concurrent.CompletableFuture; 11 | import java.util.concurrent.CompletionStage; 12 | import java.util.concurrent.ThreadFactory; 13 | import java.util.concurrent.atomic.AtomicBoolean; 14 | import java.util.concurrent.atomic.LongAdder; 15 | import java.util.function.Function; 16 | 17 | import org.HdrHistogram.AbstractHistogram; 18 | import org.HdrHistogram.AtomicHistogram; 19 | import org.HdrHistogram.Histogram; 20 | 21 | /** 22 | * A benchmark for asynchronous APIs. 23 | *

24 | * Runs multiple asynchronous requests in parallel. A new request is issued as soon as a previous one completes. Uses the 25 | * {@link org.jgroups.raft.RaftHandle#setAsync(byte[], int, int)} API. 26 | *

27 | * 28 | */ 29 | class AsyncReplicationBenchmark implements RaftBenchmark { 30 | 31 | private final int concurrency; 32 | private final byte[] payload; 33 | private final Settable settable; 34 | private final ThreadFactory tf; 35 | private final List> requests; 36 | private final AtomicBoolean stop = new AtomicBoolean(false); 37 | private final LongAdder updates = new LongAdder(); 38 | private final AtomicHistogram histogram = HistogramUtil.createAtomic(); 39 | 40 | 41 | public AsyncReplicationBenchmark(int concurrency, Settable settable, byte[] payload, ThreadFactory tf) { 42 | this.concurrency = concurrency; 43 | this.payload = payload; 44 | this.settable = settable; 45 | this.tf = tf; 46 | this.requests = new ArrayList<>(concurrency); 47 | } 48 | 49 | @Override 50 | public void start() { 51 | stop.set(false); 52 | for (int i = 0; i < concurrency; i++) { 53 | requests.add(perform()); 54 | } 55 | } 56 | 57 | @Override 58 | public void stop() { 59 | stop.set(true); 60 | } 61 | 62 | @Override 63 | public void join() throws InterruptedException { 64 | for (CompletionStage request : requests) { 65 | request.toCompletableFuture().join(); 66 | } 67 | } 68 | 69 | @Override 70 | public long getTotalUpdates() { 71 | return updates.sum(); 72 | } 73 | 74 | @Override 75 | public Histogram getResults(boolean printUpdaters, Function timePrinter) { 76 | return histogram; 77 | } 78 | 79 | @Override 80 | public void close() throws Exception { 81 | stop.set(true); 82 | requests.clear(); 83 | } 84 | 85 | private void updateTime(long timeNanos) { 86 | updates.increment(); 87 | histogram.recordValue(timeNanos); 88 | } 89 | 90 | private CompletionStage perform() { 91 | if (stop.get()) 92 | return CompletableFutures.completedNull(); 93 | 94 | CompletableFuture cf = new CompletableFuture<>(); 95 | perform(cf, execute(), System.nanoTime()); 96 | return cf; 97 | } 98 | 99 | private void perform(CompletableFuture cf, CompletionStage prev, long start) { 100 | if (stop.get()) { 101 | 
cf.complete(null); 102 | return; 103 | } 104 | 105 | prev.whenComplete((ignoreV, ignoreT) -> { 106 | long currentTime = System.nanoTime(); 107 | updateTime(currentTime - start); 108 | perform(cf, execute(), System.nanoTime()); 109 | }); 110 | } 111 | 112 | private CompletableFuture execute() { 113 | try { 114 | return settable.setAsync(payload, 0, payload.length); 115 | } catch (Exception e) { 116 | if (!stop.get()) 117 | e.printStackTrace(System.err); 118 | } 119 | return CompletableFutures.completedNull(); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /tests/benchmark/org/jgroups/perf/replication/ReplicationPerf.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.perf.replication; 2 | 3 | import org.jgroups.annotations.Property; 4 | import org.jgroups.perf.harness.AbstractRaftBenchmark; 5 | import org.jgroups.perf.CommandLineOptions; 6 | import org.jgroups.perf.harness.RaftBenchmark; 7 | import org.jgroups.protocols.raft.RAFT; 8 | import org.jgroups.raft.RaftHandle; 9 | import org.jgroups.raft.StateMachine; 10 | import org.jgroups.raft.testfwk.RaftTestUtils; 11 | import org.jgroups.util.ThreadFactory; 12 | import org.jgroups.util.Util; 13 | 14 | import java.io.DataInput; 15 | import java.io.DataOutput; 16 | import java.lang.reflect.Field; 17 | import java.util.concurrent.ThreadLocalRandom; 18 | 19 | /** 20 | * Test performance of replicating data. 21 | *

22 | * This benchmark utilizes the base {@link RaftHandle} to verify the replication performance of a configurable-sized 23 | * byte array. The test verifies only the replication part, where the state machine does not interpret the bytes. 24 | * Since the {@link StateMachine} implementation is application-specific, we don't measure it in our tests. 25 | *

26 | *

27 | * The benchmark accepts configuration for the payload size and for whether to fsync the log. 28 | *

29 | * 30 | * @author José Bolina 31 | */ 32 | public class ReplicationPerf extends AbstractRaftBenchmark { 33 | 34 | private static final Field DATA_SIZE, USE_FSYNC; 35 | 36 | static { 37 | try { 38 | DATA_SIZE = Util.getField(ReplicationPerf.class, "data_size", true); 39 | USE_FSYNC = Util.getField(ReplicationPerf.class, "use_fsync", true); 40 | } catch (Exception e) { 41 | throw new RuntimeException(e); 42 | } 43 | } 44 | 45 | private final RaftHandle raft; 46 | private final CounterStateMachine csm; 47 | 48 | @Property 49 | protected int data_size = 526; 50 | 51 | @Property 52 | protected boolean use_fsync; 53 | 54 | public ReplicationPerf(CommandLineOptions cmd) throws Throwable { 55 | super(cmd); 56 | this.csm = new CounterStateMachine(); 57 | this.raft = new RaftHandle(channel, csm); 58 | } 59 | 60 | @Override 61 | public RaftBenchmark syncBenchmark(ThreadFactory tf) { 62 | visitRaftBeforeBenchmark(); 63 | return new SyncReplicationBenchmark(num_threads, raft, createTestPayload(), tf); 64 | } 65 | 66 | @Override 67 | public RaftBenchmark asyncBenchmark(ThreadFactory tf) { 68 | visitRaftBeforeBenchmark(); 69 | return new AsyncReplicationBenchmark(num_threads, raft, createTestPayload(), tf); 70 | } 71 | 72 | @Override 73 | public String extendedEventLoopHeader() { 74 | return String.format("[s] Data size (%d bytes) [f] Use fsync (%b)", data_size, use_fsync); 75 | } 76 | 77 | @Override 78 | public void extendedEventLoop(int c) throws Throwable { 79 | switch (c) { 80 | case 's': 81 | changeFieldAcrossCluster(DATA_SIZE, Util.readIntFromStdin("new data size: ")); 82 | break; 83 | 84 | case 'f': 85 | changeFieldAcrossCluster(USE_FSYNC, !use_fsync); 86 | break; 87 | 88 | default: 89 | System.out.printf("Unknown option: %c%n", c); 90 | break; 91 | } 92 | } 93 | 94 | @Override 95 | public void clear() { 96 | try { 97 | RaftTestUtils.deleteRaftLog(channel.getProtocolStack().findProtocol(RAFT.class)); 98 | } catch (Exception e) { 99 | System.err.printf("Failed deleting 
log file: %s", e); 100 | } 101 | } 102 | 103 | private void visitRaftBeforeBenchmark() { 104 | raft.raft().logUseFsync(use_fsync); 105 | } 106 | 107 | private byte[] createTestPayload() { 108 | byte[] payload = new byte[data_size]; 109 | ThreadLocalRandom.current().nextBytes(payload); 110 | return payload; 111 | } 112 | 113 | private static class CounterStateMachine implements StateMachine { 114 | private long updates; 115 | 116 | 117 | @Override 118 | public byte[] apply(byte[] data, int offset, int length, boolean serialize_response) throws Exception { 119 | updates++; 120 | return null; 121 | } 122 | 123 | @Override 124 | public void readContentFrom(DataInput in) throws Exception { } 125 | 126 | @Override 127 | public void writeContentTo(DataOutput out) throws Exception { } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/CommitTableTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Global; 5 | import org.jgroups.raft.util.CommitTable; 6 | import org.jgroups.util.Util; 7 | import org.testng.annotations.Test; 8 | 9 | import java.util.Arrays; 10 | import java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Set; 13 | import java.util.stream.Collectors; 14 | 15 | /** 16 | * @author Bela Ban 17 | * @since 0.1 18 | */ 19 | @Test(groups=Global.FUNCTIONAL) 20 | public class CommitTableTest { 21 | protected static final Address a=Util.createRandomAddress("A"), b=Util.createRandomAddress("B"), 22 | c=Util.createRandomAddress("C"), d=Util.createRandomAddress("D"), e=Util.createRandomAddress("E"); 23 | 24 | 25 | public void testAddition() { 26 | CommitTable table=new CommitTable(Arrays.asList(a,b,c), 5); 27 | System.out.println("table = " + table); 28 | assert table.keys().size() == 3; 29 | 30 | List
mbrs=Arrays.asList(b, c, d, e); 31 | table.adjust(mbrs, 5); 32 | System.out.println("table = " + table); 33 | assert table.keys().size() == 4; 34 | Set
keys=table.keys(); 35 | assert keys.equals(new HashSet<>(mbrs)); 36 | } 37 | 38 | 39 | protected static List
generate(String... members) { 40 | return Arrays.stream(members).map(Util::createRandomAddress).collect(Collectors.toList()); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/CompletableFutureTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.util.Util; 5 | import org.testng.annotations.BeforeMethod; 6 | import org.testng.annotations.Test; 7 | 8 | import java.util.concurrent.*; 9 | import java.util.function.BiConsumer; 10 | 11 | @Test(groups=Global.FUNCTIONAL,singleThreaded=true) 12 | public class CompletableFutureTest { 13 | protected CompletableFuture f; 14 | 15 | @BeforeMethod protected void setup() { 16 | f=new CompletableFuture<>();} 17 | 18 | public void testDone() { 19 | assert !f.isDone(); 20 | assert !f.isCancelled(); 21 | f.cancel(true); 22 | assert f.isCancelled(); 23 | assert f.isDone(); 24 | } 25 | 26 | public void testGet() throws Exception { 27 | boolean success=f.complete(1); 28 | assert success; 29 | success=f.complete(2); 30 | assert !success; 31 | int result=f.get(); 32 | assert result == 1; 33 | result=f.get(500, TimeUnit.MILLISECONDS); 34 | assert result == 1; 35 | } 36 | 37 | public void testGetWithException() throws Exception { 38 | f.completeExceptionally(new NullPointerException("booom")); 39 | try { 40 | f.get(); 41 | assert false : "should have thrown an exception"; 42 | } 43 | catch(ExecutionException ex) { 44 | System.out.println("received ExecutionException as expected: " + ex); 45 | assert ex.getCause() instanceof NullPointerException; 46 | } 47 | } 48 | 49 | public void testGetWithTimeout() throws Exception { 50 | try { 51 | f.get(50, TimeUnit.MILLISECONDS); 52 | assert false : "should have thrown a TimeoutException"; 53 | } 54 | catch(TimeoutException ex) { 55 | System.out.println("received TimeoutException 
as expected: " + ex); 56 | } 57 | } 58 | 59 | public void testDelayedGet() throws Exception { 60 | Completer completer=new Completer<>(f, 5, null, 500); 61 | completer.start(); 62 | int result=f.get(); 63 | System.out.println("result = " + result); 64 | assert result == 5; 65 | } 66 | 67 | public void testCancel() throws Exception { 68 | new Thread(() -> {Util.sleep(500); f.cancel(true);}).start(); 69 | 70 | try { 71 | f.get(); 72 | assert false : "should have thrown a CancellationException"; 73 | } 74 | catch(CancellationException cex) { 75 | System.out.println("received CancellationException as expected: " + cex); 76 | } 77 | assert f.isCancelled() && f.isDone(); 78 | } 79 | 80 | public void testCompletionHandler() throws Exception { 81 | CompletableFuture fut=new CompletableFuture<>(); 82 | new Completer<>(fut, 5, null, 500).start(); 83 | 84 | Util.waitUntil(10000, 100, fut::isDone); 85 | assert fut.get(2, TimeUnit.SECONDS) == 5; 86 | } 87 | 88 | public void testCompletionHandlerWithException() throws TimeoutException { 89 | MyCompletionHandler handler=new MyCompletionHandler<>(); 90 | f=new CompletableFuture<>(); 91 | f.whenComplete(handler); 92 | new Completer<>(f, 0, new NullPointerException("booom"), 50).start(); 93 | Util.waitUntil(10000, 500, () -> f.isDone()); 94 | Throwable ex=handler.getException(); 95 | assert ex instanceof NullPointerException; 96 | } 97 | 98 | 99 | protected static class Completer extends Thread { 100 | protected final CompletableFuture future; 101 | protected final R result; 102 | protected final Throwable t; 103 | protected final long sleep; 104 | 105 | public Completer(CompletableFuture future, R result, Throwable t, long sleep) { 106 | this.future=future; 107 | this.result=result; 108 | this.t=t; 109 | this.sleep=sleep; 110 | } 111 | 112 | public void run() { 113 | Util.sleep(sleep); 114 | if(t != null) 115 | future.completeExceptionally(t); 116 | else 117 | future.complete(result); 118 | } 119 | } 120 | 121 | 122 | protected 
static class MyCompletionHandler implements BiConsumer { 123 | protected T value; 124 | protected Throwable ex; 125 | 126 | public T getValue() {return value;} 127 | public Throwable getException() {return ex;} 128 | 129 | public void accept(T t, Throwable ex) { 130 | if(t != null) 131 | value=t; 132 | if(ex != null) 133 | this.ex=ex; 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/DummyStateMachine.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.raft.StateMachine; 4 | 5 | import java.io.DataInput; 6 | import java.io.DataOutput; 7 | 8 | /** 9 | * @author Bela Ban 10 | * @since 1.0.5 11 | */ 12 | public class DummyStateMachine implements StateMachine { 13 | public byte[] apply(byte[] data, int offset, int length, boolean serialize_response) throws Exception { 14 | return serialize_response? 
new byte[0] : null; 15 | } 16 | public void readContentFrom(DataInput in) throws Exception {} 17 | public void writeContentTo(DataOutput out) throws Exception {} 18 | } 19 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/LogEntriesTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.protocols.raft.LogEntries; 5 | import org.jgroups.protocols.raft.LogEntry; 6 | import org.jgroups.util.ByteArrayDataInputStream; 7 | import org.jgroups.util.ByteArrayDataOutputStream; 8 | import org.testng.annotations.BeforeMethod; 9 | import org.testng.annotations.Test; 10 | 11 | import java.io.IOException; 12 | import java.util.Iterator; 13 | 14 | /** 15 | * Tests {@link org.jgroups.protocols.raft.LogEntries} 16 | * @author Bela Ban 17 | * @since 1.0.8 18 | */ 19 | @Test(groups=Global.FUNCTIONAL,singleThreaded=true) 20 | public class LogEntriesTest { 21 | protected LogEntries entries; 22 | 23 | @BeforeMethod protected void init() { 24 | entries=new LogEntries(); 25 | } 26 | 27 | public void testEmpty() throws IOException, ClassNotFoundException { 28 | assert entries.size() == 0; 29 | LogEntries tmp=marshalAndUnmarshal(entries); 30 | assert tmp.size() == 0; 31 | } 32 | 33 | public void testAdd() throws IOException, ClassNotFoundException { 34 | LogEntry le1=new LogEntry(20, new byte[25]); 35 | LogEntry le2=new LogEntry(22, new byte[10], 0, 10, true); 36 | entries.add(le1); 37 | assert entries.size() == 1; 38 | entries.add(le2); 39 | assert entries.size() == 2; 40 | LogEntries e=marshalAndUnmarshal(entries); 41 | assert e.size() == 2; 42 | } 43 | 44 | public void testIterator() throws IOException, ClassNotFoundException { 45 | testAdd(); 46 | Iterator it=entries.iterator(); 47 | LogEntry le=it.next(); 48 | assert le.term() == 20; 49 | assert le.command().length == 25; 50 | le=it.next(); 
51 | assert le.term() == 22; 52 | assert le.command().length == 10; 53 | assert le.internal(); 54 | assert !it.hasNext(); 55 | entries.clear(); 56 | assert entries.size() == 0; 57 | } 58 | 59 | public void testToArray() throws IOException, ClassNotFoundException { 60 | testAdd(); 61 | LogEntry[] arr=entries.toArray(); 62 | assert arr.length == 2; 63 | assert arr[0].term() == 20; 64 | assert arr[0].command().length == 25; 65 | assert arr[1].term() == 22; 66 | assert arr[1].command().length == 10; 67 | assert arr[1].internal(); 68 | } 69 | 70 | public void testTotalSize() throws IOException, ClassNotFoundException { 71 | testAdd(); 72 | long total_size=entries.totalSize(); 73 | assert total_size == 35; 74 | } 75 | 76 | 77 | protected static LogEntries marshalAndUnmarshal(LogEntries le) throws IOException, ClassNotFoundException { 78 | ByteArrayDataOutputStream out=new ByteArrayDataOutputStream(64); 79 | int expected_size=le.serializedSize(); 80 | le.writeTo(out); 81 | assert out.position() == expected_size; 82 | ByteArrayDataInputStream in=new ByteArrayDataInputStream(out.buffer(), 0, out.position()); 83 | LogEntries ret=new LogEntries(); 84 | ret.readFrom(in); 85 | return ret; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/LongHelperTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.testng.Assert; 5 | import org.testng.annotations.Test; 6 | 7 | import static org.jgroups.raft.util.LongHelper.fromByteArrayToLong; 8 | import static org.jgroups.raft.util.LongHelper.fromLongToByteArray; 9 | 10 | 11 | @Test(groups= Global.FUNCTIONAL,singleThreaded=true) 12 | public class LongHelperTest { 13 | 14 | public void testNull() { 15 | Assert.assertEquals(0, fromByteArrayToLong(null)); 16 | } 17 | 18 | public void testZeroConversion() { 19 | Assert.assertEquals(0, 
convertToBytesAndBack(0)); 20 | } 21 | 22 | public void testPositiveConversion() { 23 | Assert.assertEquals(42, convertToBytesAndBack(42)); 24 | } 25 | 26 | public void testMaxConversion() { 27 | Assert.assertEquals(Long.MAX_VALUE, convertToBytesAndBack(Long.MAX_VALUE)); 28 | } 29 | 30 | public void testNegativeConversion() { 31 | Assert.assertEquals(-42, convertToBytesAndBack(-42)); 32 | } 33 | 34 | public void testMinConversion() { 35 | Assert.assertEquals(Long.MIN_VALUE, convertToBytesAndBack(Long.MIN_VALUE)); 36 | } 37 | 38 | 39 | private static long convertToBytesAndBack(long number) { 40 | byte[] b = fromLongToByteArray(number); 41 | return fromByteArrayToLong(b); 42 | } 43 | 44 | } -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/MaintenanceClusterTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.JChannel; 5 | import org.jgroups.protocols.raft.RAFT; 6 | import org.jgroups.tests.harness.BaseRaftChannelTest; 7 | import org.jgroups.util.Util; 8 | import org.testng.annotations.AfterMethod; 9 | import org.testng.annotations.Test; 10 | 11 | import java.util.concurrent.TimeUnit; 12 | 13 | import static org.assertj.core.api.Assertions.assertThat; 14 | import static org.jgroups.raft.testfwk.RaftTestUtils.eventually; 15 | import static org.jgroups.tests.harness.RaftAssertion.assertCommitIndex; 16 | import static org.jgroups.tests.harness.RaftAssertion.waitUntilAllRaftsHaveLeader; 17 | 18 | @Test(groups = Global.FUNCTIONAL, singleThreaded = true) 19 | public class MaintenanceClusterTest extends BaseRaftChannelTest { 20 | 21 | { 22 | createManually = true; 23 | recreatePerMethod = true; 24 | } 25 | 26 | @AfterMethod 27 | protected void destroy() throws Exception { 28 | destroyCluster(); 29 | } 30 | 31 | public void testMaintenanceWorkflowFollower() throws Exception { 
32 | runMaintenanceWorkflow(false); 33 | } 34 | 35 | public void testMaintenanceWorkflowLeader() throws Exception { 36 | runMaintenanceWorkflow(true); 37 | } 38 | 39 | private void runMaintenanceWorkflow(boolean removeLeader) throws Exception { 40 | // A cluster is created with all the three nodes. 41 | withClusterSize(3); 42 | createCluster(); 43 | 44 | waitUntilAllRaftsHaveLeader(channels(), this::raft); 45 | 46 | // The existing cluster operates just as usual. 47 | insertEntries(); 48 | 49 | // After a certain point, a node needs to be removed for maintenance. 50 | // The leader will remove it. 51 | RAFT leader = leader(); 52 | int removeIndex = removeIndex(removeLeader); 53 | RAFT node = raft(removeIndex); 54 | 55 | leader.removeServer(node.raftId()).get(10, TimeUnit.SECONDS); 56 | assertThat(eventually(() -> node.role().equals("Learner"), 10, TimeUnit.SECONDS)).isTrue(); 57 | 58 | // After the node is removed and become learner, the cluster continues to operate. 59 | if (removeLeader) waitUntilAllRaftsHaveLeader(channels(), this::raft); 60 | insertEntries(); 61 | 62 | // Until, the operator decides to stop the removed node. 63 | Util.close(channel(removeIndex)); 64 | channels()[removeIndex] = null; 65 | Util.waitUntilAllChannelsHaveSameView(10_000, 150, actualChannels()); 66 | 67 | // With the node removed for maintenance, the cluster still operates correctly. 68 | insertEntries(); 69 | 70 | // Maintenance is completed and the node is started again. 71 | // It should connect to the cluster and start as learner. 72 | createCluster(); 73 | RAFT restarted = raft(removeIndex); 74 | RAFT restartedLeader = leader(); 75 | assertThat(restarted.role()).isEqualTo("Learner"); 76 | assertThat(restartedLeader.members()).hasSize(2).doesNotContain(restarted.raftId()); 77 | 78 | // Eventually the learner will catch up with the cluster. 
79 | assertCommitIndex(10_000, restartedLeader.lastAppended(), restartedLeader.lastAppended(), this::raft, channel(removeIndex)); 80 | 81 | // The node is added again as a Raft member. 82 | restartedLeader.addServer(restarted.raftId()).get(10, TimeUnit.SECONDS); 83 | assertThat(eventually(() -> restarted.role().equals("Follower"), 10, TimeUnit.SECONDS)).isTrue(); 84 | 85 | // And the cluster continues to operate. 86 | insertEntries(); 87 | } 88 | 89 | private int removeIndex(boolean isLeader) { 90 | JChannel[] channels = channels(); 91 | for (int i = 0; i < channels.length; i++) { 92 | RAFT r = raft(channels[i]); 93 | if (r.isLeader() == isLeader) 94 | return i; 95 | } 96 | 97 | throw new AssertionError("Requested node not found"); 98 | } 99 | 100 | private void insertEntries() throws Exception { 101 | RAFT leader = leader(); 102 | 103 | byte[] payload = new byte[] { 1 }; 104 | for (int i = 0; i < 16; i++) { 105 | leader.set(payload, 0, 1, 10, TimeUnit.SECONDS); 106 | } 107 | 108 | assertCommitIndex(10_000, leader.lastAppended(), leader.lastAppended(), this::raft, actualChannels()); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/PartialConnectivityTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Global; 5 | import org.jgroups.View; 6 | import org.jgroups.protocols.raft.ELECTION; 7 | import org.jgroups.protocols.raft.ELECTION2; 8 | import org.jgroups.protocols.raft.RAFT; 9 | import org.jgroups.raft.testfwk.PartitionedRaftCluster; 10 | import org.jgroups.raft.testfwk.RaftNode; 11 | import org.jgroups.tests.harness.BaseRaftElectionTest; 12 | import org.jgroups.util.Util; 13 | 14 | import java.util.List; 15 | import java.util.Objects; 16 | import java.util.concurrent.TimeUnit; 17 | import java.util.function.BooleanSupplier; 18 | import 
java.util.stream.IntStream; 19 | import java.util.stream.Stream; 20 | 21 | import org.testng.annotations.Test; 22 | 23 | import static org.assertj.core.api.Assertions.assertThat; 24 | import static org.jgroups.raft.testfwk.RaftTestUtils.eventually; 25 | import static org.jgroups.tests.harness.BaseRaftElectionTest.ALL_ELECTION_CLASSES_PROVIDER; 26 | import static org.jgroups.tests.harness.BaseRaftElectionTest.waitUntilAllHaveLeaderElected; 27 | 28 | @Test(groups= Global.FUNCTIONAL, singleThreaded=true, dataProvider = ALL_ELECTION_CLASSES_PROVIDER) 29 | public class PartialConnectivityTest extends BaseRaftElectionTest.ClusterBased { 30 | 31 | { 32 | // Create nodes A, B, C, D, E. 33 | clusterSize = 5; 34 | 35 | // Since it uses a data provider, it needs to execute per method to inject the values. 36 | recreatePerMethod = true; 37 | } 38 | 39 | public void testQuorumLossAndRecovery(Class ignore) { 40 | int id=1; 41 | View initial=createView(id++, 0, 1, 2, 3, 4); 42 | cluster.handleView(initial); 43 | 44 | waitUntilLeaderElected(5_000, 0, 1, 2, 3, 4); 45 | waitUntilAllHaveLeaderElected(rafts(), 10_000); 46 | List
leaders = leaders(); 47 | assertThat(leaders).hasSize(1); 48 | Address leader = leaders.get(0); 49 | 50 | System.out.println("leader is " + leader); 51 | 52 | // Nodes D and E do not update their view. 53 | cluster.handleView(createView(id++, 0, 2)); 54 | cluster.handleView(createView(id++, 1, 2)); 55 | 56 | BooleanSupplier bs = () -> IntStream.of(0, 1, 2) 57 | .mapToObj(this::raft) 58 | .filter(Objects::nonNull) 59 | .allMatch(r -> r.leader() == null); 60 | assertThat(eventually(bs, 5, TimeUnit.SECONDS)) 61 | .as(this::dumpLeaderAndTerms) 62 | .isTrue(); 63 | assertThat(raft(3).leader()).as("leader should be " + leader + ", but found " + raft(3).leader()).isEqualTo(leader); 64 | assertThat(raft(4).leader()).as("leader should be " + leader + ", but found " + raft(4).leader()).isEqualTo(leader); 65 | 66 | for (RaftNode n : nodes()) { 67 | assertThat(n.election().isVotingThreadRunning()) 68 | .as("election thread should not be running in " + n) 69 | .isFalse(); 70 | } 71 | 72 | // Node `E` is the new view coordinator. 
73 | View after = createView(id++, 4, 3, 0, 1, 2); 74 | System.out.println("after restored network: " + after); 75 | cluster.handleView(after); 76 | 77 | boolean elected = Util.waitUntilTrue(3000, 200, () -> Stream.of(nodes()).allMatch(n -> n.raft().leader() != null)); 78 | if (election(0).getClass().equals(ELECTION2.class)) { 79 | assertThat(elected).as("leader was never elected again").isTrue(); 80 | leaders = leaders(); 81 | assertThat(leaders).hasSize(1); 82 | System.out.println("Leader after restored network: " + leaders.get(0)); 83 | } else { 84 | assertThat(elected).as("Leader was elected again").isFalse(); 85 | assertThat(election(0)).isInstanceOf(ELECTION.class); 86 | } 87 | } 88 | 89 | @Override 90 | protected PartitionedRaftCluster createNewMockCluster() { 91 | return new PartitionedRaftCluster(); 92 | } 93 | 94 | @Override 95 | protected void amendRAFTConfiguration(RAFT raft) { 96 | raft.synchronous(true); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/RaftHeaderTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.protocols.raft.*; 5 | import org.jgroups.protocols.raft.election.VoteRequest; 6 | import org.jgroups.protocols.raft.election.VoteResponse; 7 | import org.jgroups.raft.Options; 8 | import org.jgroups.util.ByteArrayDataOutputStream; 9 | import org.jgroups.util.Util; 10 | import org.testng.annotations.Test; 11 | 12 | import static org.jgroups.protocols.raft.AppendResult.Result.FAIL_ENTRY_NOT_FOUND; 13 | 14 | /** 15 | * @author Bela Ban 16 | * @since 0.1 17 | */ 18 | @Test(groups=Global.FUNCTIONAL) 19 | public class RaftHeaderTest { 20 | 21 | public void testVoteRequestHeader() throws Exception { 22 | VoteRequest hdr=new VoteRequest(22); 23 | _testSize(hdr, VoteRequest.class); 24 | } 25 | 26 | public void 
testVoteResponseHeader() throws Exception { 27 | VoteResponse rsp=new VoteResponse(22, 3, 7); 28 | _testSize(rsp, VoteResponse.class); 29 | } 30 | 31 | 32 | public void testAppendEntriesRequest() throws Exception { 33 | AppendEntriesRequest req=new AppendEntriesRequest(Util.createRandomAddress("A"), 22, 4, 21, 22, 18); 34 | _testSize(req, AppendEntriesRequest.class); 35 | } 36 | 37 | public void testAppendEntriesResponse() throws Exception { 38 | AppendEntriesResponse rsp=new AppendEntriesResponse(22, new AppendResult(FAIL_ENTRY_NOT_FOUND, 22, 5)); 39 | _testSize(rsp, AppendEntriesResponse.class); 40 | } 41 | 42 | public void testInstallSnapshotHeader() throws Exception { 43 | InstallSnapshotRequest hdr=new InstallSnapshotRequest(5); 44 | _testSize(hdr, InstallSnapshotRequest.class); 45 | 46 | hdr=new InstallSnapshotRequest(5, Util.createRandomAddress("A"), 5, 4); 47 | _testSize(hdr, InstallSnapshotRequest.class); 48 | } 49 | 50 | public void testRedirectHeader() throws Exception { 51 | REDIRECT.RedirectHeader hdr=new REDIRECT.RedirectHeader(REDIRECT.RequestType.REQ, 22, true); 52 | _testSize(hdr, REDIRECT.RedirectHeader.class); 53 | hdr.options(Options.create(true)); 54 | 55 | hdr=new REDIRECT.RedirectHeader(REDIRECT.RequestType.RSP, 322649, false) 56 | .options(Options.create(true)); 57 | _testSize(hdr, REDIRECT.RedirectHeader.class); 58 | } 59 | 60 | 61 | protected static void _testSize(T hdr, Class clazz) throws Exception { 62 | int size=hdr.serializedSize(); 63 | ByteArrayDataOutputStream out=new ByteArrayDataOutputStream(size); 64 | hdr.writeTo(out); 65 | System.out.println(clazz.getSimpleName() + ": size=" + size); 66 | assert out.position() == size; 67 | 68 | RaftHeader hdr2=Util.streamableFromByteBuffer(clazz, out.buffer(), 0, out.position()); 69 | assert hdr2 != null; 70 | assert hdr.currTerm() == hdr2.currTerm(); 71 | } 72 | 73 | 74 | protected static void _testSize(T hdr, Class clazz) throws Exception { 75 | int size=hdr.serializedSize(); 76 | 
ByteArrayDataOutputStream out=new ByteArrayDataOutputStream(size); 77 | hdr.writeTo(out); 78 | System.out.println(clazz.getSimpleName() + ": size=" + size); 79 | assert out.position() == size; 80 | 81 | REDIRECT.RedirectHeader hdr2=Util.streamableFromByteBuffer(clazz, out.buffer(), 0, out.position()); 82 | assert hdr2 != null; 83 | assert hdr.serializedSize() == hdr2.serializedSize(); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/ReplicatedStateMachineTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.JChannel; 5 | import org.jgroups.protocols.raft.ELECTION; 6 | import org.jgroups.protocols.raft.RAFT; 7 | import org.jgroups.protocols.raft.REDIRECT; 8 | import org.jgroups.raft.blocks.ReplicatedStateMachine; 9 | import org.jgroups.util.Util; 10 | import org.testng.annotations.Test; 11 | 12 | import java.util.Arrays; 13 | import java.util.List; 14 | 15 | @Test(groups=Global.FUNCTIONAL,singleThreaded=true) 16 | public class ReplicatedStateMachineTest { 17 | 18 | protected static final String CLUSTER=ReplicatedStateMachineTest.class.getSimpleName(); 19 | protected final List mbrs=Arrays.asList("A", "B", "C", "D"); 20 | 21 | public void testEquals() throws Exception { 22 | try(JChannel channelA = create("A"); 23 | JChannel channelB = create("B") ) { 24 | ReplicatedStateMachine one = new ReplicatedStateMachine<>(channelA); 25 | ReplicatedStateMachine other = new ReplicatedStateMachine<>(channelB); 26 | 27 | assert one.equals(one); 28 | assert one.equals(other); 29 | assert other.equals(one); 30 | assert !one.equals(null); 31 | assert !one.equals(new Object()); 32 | } 33 | } 34 | 35 | @SuppressWarnings("resource") 36 | protected JChannel create(String name) throws Exception { 37 | RAFT raft=new RAFT().members(mbrs).raftId(name).stateMachine(new 
DummyStateMachine()) 38 | .logClass("org.jgroups.protocols.raft.InMemoryLog").logPrefix(name + "-" + CLUSTER); 39 | JChannel ch=new JChannel(Util.getTestStack(new ELECTION(), raft, new REDIRECT())).name(name); 40 | ch.connect(CLUSTER); 41 | return ch; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/RequestTableTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.raft.util.RequestTable; 5 | import org.jgroups.util.Util; 6 | import org.testng.annotations.Test; 7 | 8 | import java.util.concurrent.CompletableFuture; 9 | import java.util.function.Supplier; 10 | import java.util.stream.IntStream; 11 | 12 | /** 13 | * @author Bela Ban 14 | * @since 0.1 15 | */ 16 | @Test(groups=Global.FUNCTIONAL,singleThreaded=true) // single-threaded because of the single CompletableFuture 17 | public class RequestTableTest { 18 | protected static final CompletableFuture future=new CompletableFuture<>(); 19 | 20 | /* With a majority of 3, index 1 is reported committed only after the third distinct vote (C) is added; the repeated vote from A does not count twice. */ public void testSimple() { 21 | Supplier majority=() -> 3; 22 | RequestTable table=new RequestTable<>(); 23 | table.create(1, "A", future, majority); 24 | table.add(1, "A", majority); 25 | assert !table.isCommitted(1); 26 | boolean done=table.add(1, "B", majority); 27 | assert !done; 28 | done=table.add(1, "C", majority); 29 | assert done; 30 | assert table.isCommitted(1); 31 | } 32 | 33 | /* With a majority of 1, create() alone commits the entry, so the later add() must return false. */ public void testSingleNode() { 34 | RequestTable table=new RequestTable<>(); 35 | table.create(1, "A", future, () -> 1); 36 | assert table.isCommitted(1); 37 | boolean added=table.add(1, "A", () -> 1); 38 | assert !added : "should only mark as committed once"; 39 | } 40 | 41 | /* Starts the table at index 3 with a majority of 2 and checks commit state and size as entries 4..10 are added. */ public void testAdd() { 42 | RequestTable table=new RequestTable<>(); 43 | table.create(3, "A", future, () -> 2); 44 | assert table.isCommitted(1); /* index 1 was never created, yet it is reported as committed */ 45 | assert !table.isCommitted(3); 46 | table.add(3, 
"A", () -> 2); 47 | assert !table.isCommitted(3); 48 | boolean commited=table.add(3, "B", () -> 2); 49 | assert commited && table.isCommitted(3); 50 | for(int i=4; i <= 10; i++) 51 | table.create(i, "A", future, () -> 2); 52 | commited=table.add(10, "B", () -> 2); 53 | assert commited; 54 | for(int i=4; i <= 10; i++) 55 | assert table.isCommitted(10); /* NOTE(review): the loop variable is unused, so index 10 is checked seven times; isCommitted(i) may have been intended, but entries 4..9 only hold the single vote from A — confirm against RequestTable before changing */ 56 | assert table.size() == 8; /* entries 3..10 */ 57 | } 58 | 59 | /* Creates entries 1..5, removes them through a parallel stream and expects the table to end up empty. */ public void testNotifyAndRemove() { 60 | RequestTable table=new RequestTable<>(); 61 | for(int i=1; i <= 5; i++) 62 | table.create(i, "A", future, () -> 1); 63 | IntStream.rangeClosed(1,5).parallel().forEach(i -> table.notifyAndRemove(i, "bb".getBytes())); 64 | Util.waitUntilTrue(5000, 200, () -> table.size() == 0); 65 | assert table.size() == 0 : String.format("table size should be %d but is %d", 0, table.size()); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/TimeoutTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.JChannel; 5 | import org.jgroups.protocols.raft.RAFT; 6 | import org.jgroups.raft.blocks.ReplicatedStateMachine; 7 | import org.jgroups.tests.harness.BaseStateMachineTest; 8 | 9 | import java.util.Arrays; 10 | import java.util.concurrent.TimeUnit; 11 | import java.util.function.BooleanSupplier; 12 | import java.util.stream.IntStream; 13 | 14 | import org.testng.annotations.AfterMethod; 15 | import org.testng.annotations.Test; 16 | 17 | import static org.assertj.core.api.Assertions.assertThat; 18 | import static org.jgroups.raft.testfwk.RaftTestUtils.eventually; 19 | 20 | @Test(groups=Global.FUNCTIONAL,singleThreaded=true) 21 | public class TimeoutTest extends BaseStateMachineTest> { 22 | protected static final int NUM=200; 23 | 24 | { 25 | createManually = true; 26 | recreatePerMethod = true; 27 | } 28 | 29 | @AfterMethod(alwaysRun = 
true) 30 | protected void destroy() throws Exception { 31 | destroyCluster(); 32 | } 33 | 34 | public void testAppendWithSingleNode() throws Exception { 35 | _test(1); 36 | } 37 | 38 | public void testAppendWith3Nodes() throws Exception { 39 | _test(3); 40 | } 41 | 42 | /* Creates a cluster of num nodes, waits for a leader, writes keys 1..NUM through the leader's state machine and then checks that all nodes hold the same contents. */ protected void _test(int num) throws Exception { 43 | withClusterSize(num); 44 | createCluster(); 45 | 46 | BooleanSupplier bs = () -> Arrays.stream(channels()) 47 | .map(this::raft) 48 | .anyMatch(RAFT::isLeader); 49 | assertThat(eventually(bs, 10, TimeUnit.SECONDS)) 50 | .as("Leader election") 51 | .isTrue(); 52 | 53 | ReplicatedStateMachine sm=null; 54 | System.out.println("-- waiting for leader"); 55 | for(int i=0; i < channels().length; i++) { 56 | RAFT raft=raft(i); 57 | assertThat(raft).isNotNull(); 58 | if(raft.isLeader()) { 59 | sm=stateMachine(i); 60 | System.out.printf("-- found leader: %s\n", raft.leader()); 61 | break; 62 | } 63 | } 64 | 65 | assert sm != null : "No leader found"; 66 | for(int i=1; i <= NUM; i++) { 67 | try { 68 | sm.put(i, i); 69 | } catch(Exception ex) { /* print the state machine's indexes for diagnosis, then let the failure propagate */ 70 | System.err.printf("put(%d): last-applied=%d, commit-index=%d\n", i, sm.lastApplied(), sm.commitIndex()); 71 | throw ex; 72 | } 73 | } 74 | 75 | long start=System.currentTimeMillis(); 76 | sm.allowDirtyReads(false); 77 | assert sm.get(NUM) == NUM; 78 | 79 | // After reading correctly from the leader with a quorum read, every node should have the same state. 80 | // We still have to use eventually so that the messages propagate to ALL nodes, not only to a majority. 
81 | assertStateMachineEventuallyMatch(IntStream.range(0, num).toArray()); 82 | long time=System.currentTimeMillis()-start; 83 | System.out.printf("-- it took %d member(s) %d ms to get consistent caches\n", clusterSize, time); 84 | 85 | System.out.print("-- verifying contents of state machines:\n"); 86 | for (int i = 0; i < clusterSize; i++) { 87 | ReplicatedStateMachine rsm = stateMachine(i); 88 | System.out.printf("%s: ", rsm.channel().getName()); 89 | for(int j=1; j <= NUM; j++) 90 | assert rsm.get(j) == j; /* every key written above must map to itself on every node */ 91 | System.out.println("OK"); 92 | } 93 | } 94 | 95 | @Override 96 | protected void amendRAFTConfiguration(RAFT raft) { 97 | raft.resendInterval(1_000); /* NOTE(review): presumably milliseconds — confirm against RAFT */ 98 | } 99 | 100 | @Override 101 | protected ReplicatedStateMachine createStateMachine(JChannel ch) { 102 | ReplicatedStateMachine rsm = new ReplicatedStateMachine<>(ch); 103 | rsm.timeout(2_000).allowDirtyReads(true); /* dirty reads start enabled; _test() switches them off before its final read */ 104 | return rsm; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/UtilsTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Global; 5 | import org.jgroups.View; 6 | import org.jgroups.raft.util.Utils; 7 | import org.jgroups.raft.util.Utils.Majority; 8 | import org.jgroups.util.Util; 9 | import org.testng.annotations.Test; 10 | 11 | import static org.jgroups.raft.util.Utils.Majority.*; 12 | 13 | /** 14 | * @author Bela Ban 15 | * @since 1.0.6 16 | */ 17 | @Test(groups=Global.FUNCTIONAL) 18 | public class UtilsTest { 19 | protected static final Address a=Util.createRandomAddress("A"), 20 | b=Util.createRandomAddress("B"),c=Util.createRandomAddress("C"), 21 | d=Util.createRandomAddress("D"),e=Util.createRandomAddress("E"); 22 | protected static final int MAJORITY=3; /* majority threshold handed to every Utils call in this test */ 23 | 24 | /* Walks through a sequence of view changes and checks Utils.majorityReached / Utils.majorityLost for each transition. */ public void testMajority() { 25 | View old=null, new_view=create(a); 26 | _test(old, new_view, false, 
false); 27 | 28 | old=new_view; 29 | new_view=create(a,b); 30 | _test(old, new_view, false, false); 31 | 32 | old=new_view; 33 | new_view=create(a,b,c); 34 | _test(old, new_view, true, false); /* 2 -> 3 members: MAJORITY (3) is reached */ 35 | 36 | old=new_view; 37 | new_view=create(a,b,c,d); 38 | _test(old, new_view, false, false); 39 | 40 | old=new_view; 41 | new_view=create(a,b,c,d,e); 42 | _test(old, new_view, false, false); 43 | 44 | old=new_view; 45 | new_view=create(a,b,c,d); 46 | _test(old, new_view, false, false); 47 | 48 | old=new_view; 49 | new_view=create(a,b,c); 50 | _test(old, new_view, false, false); 51 | 52 | old=new_view; 53 | new_view=create(a,b); 54 | _test(old, new_view, false, true); /* 3 -> 2 members: the majority is lost */ 55 | 56 | old=new_view; 57 | new_view=create(a); 58 | _test(old, new_view, false, false); 59 | 60 | old=new_view; 61 | new_view=create(a,b,c,d); 62 | _test(old, new_view, true, false); 63 | 64 | old=new_view; 65 | new_view=create(d); 66 | _test(old, new_view, false, true); 67 | } 68 | 69 | /* Walks through a similar sequence of view changes, this time checking the Majority value returned by Utils.computeMajority. */ public void testComputeMajority() { 70 | View old=null, new_view=create(a); 71 | _test(old, new_view, no_change); 72 | 73 | old=new_view; 74 | new_view=create(a,b); 75 | _test(old, new_view, no_change); 76 | 77 | old=new_view; 78 | new_view=create(a,b,c); 79 | _test(old, new_view, reached); 80 | 81 | old=new_view; 82 | new_view=create(a,b,c,d); 83 | _test(old, new_view, no_change); 84 | 85 | old=new_view; 86 | new_view=create(a,b,c,d,e); 87 | _test(old, new_view, no_change); 88 | 89 | old=new_view; 90 | new_view=create(a,b,c,d); 91 | _test(old, new_view, no_change); 92 | 93 | old=new_view; 94 | new_view=create(a,b,c); 95 | _test(old, new_view, no_change); 96 | 97 | old=new_view; 98 | new_view=create(a,b); 99 | _test(old, new_view, lost); 100 | 101 | old=new_view; 102 | new_view=create(a); 103 | _test(old, new_view, no_change); 104 | 105 | old=new_view; 106 | new_view=create(a,b,c,d); 107 | _test(old, new_view, reached); 108 | 109 | old=new_view; 110 | new_view=create(b,c,d); /* the old coordinator a is gone while 3 members (== MAJORITY) remain */ 111 | _test(old, new_view, leader_lost); 112 | 113 
| old=new_view; 114 | new_view=create(d); 115 | _test(old, new_view, lost); 116 | 117 | old=null; 118 | new_view=create(a,b,c,d); 119 | _test(old, new_view, reached); 120 | } 121 | 122 | /* Checks Utils.computeMajority for one view transition; the old view's coordinator (or null when there is no old view) is passed as the last argument. */ protected static void _test(View old, View new_view, Majority expected) { 123 | Majority result=Utils.computeMajority(old, new_view, MAJORITY, old != null? old.getCoord() : null); 124 | System.out.printf("old: %s, new: %s, result: %s\n", old, new_view, result); 125 | assert result == expected; 126 | } 127 | 128 | /* Checks Utils.majorityReached and Utils.majorityLost for one view transition against the expected flags. */ protected static void _test(View old, View new_view, boolean majority_reached, boolean majority_lost) { 129 | boolean maj_reached=Utils.majorityReached(old, new_view, MAJORITY); 130 | boolean maj_lost=Utils.majorityLost(old, new_view, MAJORITY); 131 | System.out.printf("old view: %s, new view: %s, majority reached: %b, majority lost: %b\n", 132 | old, new_view, maj_reached, maj_lost); 133 | assert maj_reached == majority_reached; 134 | assert maj_lost == majority_lost; 135 | } 136 | 137 | protected static View create(Address... 
mbrs) { /* the first member is passed as the view's coordinator; the view id is always 1; requires at least one member */ 138 | return View.create(mbrs[0], 1, mbrs); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/election/DelayedElectedLeaderMessageTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests.election; 2 | 3 | import org.jgroups.Global; 4 | import org.jgroups.Header; 5 | import org.jgroups.View; 6 | import org.jgroups.protocols.raft.election.LeaderElected; 7 | import org.jgroups.raft.testfwk.BlockingMessageInterceptor; 8 | import org.jgroups.raft.testfwk.RaftCluster; 9 | import org.jgroups.tests.harness.BaseRaftElectionTest; 10 | import org.jgroups.tests.harness.RaftAssertion; 11 | 12 | import java.util.Arrays; 13 | import java.util.Map; 14 | import java.util.concurrent.TimeUnit; 15 | import java.util.concurrent.atomic.AtomicBoolean; 16 | import java.util.function.BooleanSupplier; 17 | 18 | import org.testng.annotations.AfterMethod; 19 | import org.testng.annotations.Test; 20 | 21 | import static org.assertj.core.api.Assertions.assertThat; 22 | import static org.jgroups.raft.testfwk.RaftTestUtils.eventually; 23 | import static org.jgroups.tests.harness.BaseRaftElectionTest.ALL_ELECTION_CLASSES_PROVIDER; 24 | 25 | @Test(groups = Global.FUNCTIONAL, singleThreaded = true, dataProvider = ALL_ELECTION_CLASSES_PROVIDER) 26 | public class DelayedElectedLeaderMessageTest extends BaseRaftElectionTest.ClusterBased { 27 | 28 | /** 29 | * Use a blocking interceptor to capture the LeaderElected message. 30 | * Install a complete view so that the election succeeds. 31 | * While the message is blocked, install a view that no longer contains a majority of the nodes. 32 | * After the new view is installed, release the blocked message. 33 | * The nodes should not install the leader. 
34 | */ 35 | 36 | private static final byte[] BUF = {}; 37 | 38 | { 39 | createManually = true; 40 | } 41 | 42 | @AfterMethod(alwaysRun = true) /* alwaysRun so the manually created cluster is torn down even after a failed test or when only a TestNG group is selected; matches TimeoutTest.destroy() */ 43 | protected void destroy() throws Exception { 44 | destroyCluster(); 45 | } 46 | 47 | /* Blocks the first LeaderElected message, shrinks the view below a majority while it is in flight, releases it and expects that no node installs a leader. */ public void testQuorumLostAfterMessageSent(Class ignore) throws Exception { 48 | long viewId = 0; 49 | withClusterSize(5); 50 | createCluster(); 51 | 52 | // Create complete view to successfully elect a node. 53 | // Use a cluster of 5 nodes. 54 | View v1 = createView(viewId++, 0, 1, 2, 3, 4); 55 | 56 | // Run asynchronously to allow the voting thread to stop. 57 | cluster.async(true); 58 | 59 | // Intercept the first `LeaderElected` message. 60 | AtomicBoolean onlyOnce = new AtomicBoolean(true); 61 | BlockingMessageInterceptor interceptor = cluster.addCommandInterceptor(m -> { 62 | for (Map.Entry h : m.getHeaders().entrySet()) { 63 | if (h.getValue() instanceof LeaderElected && onlyOnce.getAndSet(false)) { 64 | // Assert the coordinator was elected. 65 | LeaderElected le = (LeaderElected) h.getValue(); 66 | assertThat(le.leader()).isEqualTo(address(0)); 67 | return true; 68 | } 69 | } 70 | return false; 71 | }); 72 | 73 | // Install view and elect the coordinator. 74 | cluster.handleView(v1); 75 | 76 | // Intercept the leader elected message. 77 | System.out.println("-- wait command intercept"); 78 | assertThat(eventually(() -> interceptor.numberOfBlockedMessages() > 0, 10, TimeUnit.SECONDS)).isTrue(); 79 | 80 | // Install the new view while the LeaderElected is in-flight. 81 | // The new view does not have a majority. 82 | System.out.println("-- install new view without majority"); 83 | View v2 = createView(viewId++, 0, 1); 84 | cluster.handleView(v2); 85 | 86 | // Release the leader elected message. 87 | // The node should not install the new leader. 88 | System.out.println("-- release leader elected message"); 89 | interceptor.releaseNext(); 90 | interceptor.assertNoBlockedMessages(); 91 | 92 | // Make sure the leader stays null for the whole time. 
93 | BooleanSupplier bs = () -> Arrays.stream(rafts()) 94 | .anyMatch(r -> r.leader() != null); 95 | assertThat(eventually(bs, 3, TimeUnit.SECONDS)) 96 | .withFailMessage(this::dumpLeaderAndTerms) 97 | .isFalse(); /* NOTE(review): presumably eventually() returns true as soon as bs holds within the timeout, so false means no node reported a leader for 3 seconds — confirm against RaftTestUtils */ 98 | 99 | assertThat(bs.getAsBoolean()).isFalse(); 100 | RaftAssertion.assertLeaderlessOperationThrows(() -> raft(0).set(BUF, 0, 0)); /* with no leader installed, a write through node 0 must fail */ 101 | } 102 | 103 | @Override 104 | protected RaftCluster createNewMockCluster() { 105 | return new RaftCluster(); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/election/NetworkPartitionElectionTest.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests.election; 2 | 3 | import org.jgroups.Address; 4 | import org.jgroups.Global; 5 | import org.jgroups.Header; 6 | import org.jgroups.View; 7 | import org.jgroups.protocols.raft.RAFT; 8 | import org.jgroups.protocols.raft.election.LeaderElected; 9 | import org.jgroups.raft.testfwk.BlockingMessageInterceptor; 10 | import org.jgroups.raft.testfwk.PartitionedRaftCluster; 11 | import org.jgroups.raft.testfwk.RaftTestUtils; 12 | import org.jgroups.tests.harness.BaseRaftElectionTest; 13 | 14 | import java.util.Map; 15 | import java.util.Objects; 16 | import java.util.concurrent.TimeUnit; 17 | import java.util.concurrent.atomic.AtomicBoolean; 18 | 19 | import org.testng.annotations.Test; 20 | 21 | import static org.assertj.core.api.Assertions.assertThat; 22 | import static org.jgroups.tests.harness.BaseRaftElectionTest.ALL_ELECTION_CLASSES_PROVIDER; 23 | 24 | @Test(groups = Global.FUNCTIONAL, singleThreaded = true, dataProvider = ALL_ELECTION_CLASSES_PROVIDER) 25 | public class NetworkPartitionElectionTest extends BaseRaftElectionTest.ClusterBased { 26 | 27 | { 28 | clusterSize = 5; 29 | 30 | // Since it uses a data provider, it needs to execute per method to inject the values. 
31 | recreatePerMethod = true; 32 | } 33 | 34 | /* Blocks the first LeaderElected message, splits the 5-node cluster into {0,1} and {2,3,4} while it is in flight, then checks that only the majority side elects a leader and that all nodes learn a leader after the merge. */ public void testNetworkPartitionDuringElection(Class ignore) throws Exception { 35 | withClusterSize(5); 36 | createCluster(); 37 | long id = 0; 38 | 39 | View view = createView(id++, 0, 1, 2, 3, 4); 40 | 41 | // We intercept the first `LeaderElected` message. 42 | AtomicBoolean onlyOnce = new AtomicBoolean(true); 43 | BlockingMessageInterceptor interceptor = cluster.addCommandInterceptor(m -> { 44 | for (Map.Entry h : m.getHeaders().entrySet()) { 45 | if (h.getValue() instanceof LeaderElected && onlyOnce.getAndSet(false)) { 46 | // Assert that node A was elected. 47 | LeaderElected le = (LeaderElected) h.getValue(); 48 | assertThat(le.leader()).isEqualTo(address(0)); 49 | return true; 50 | } 51 | } 52 | return false; 53 | }); 54 | 55 | cluster.handleView(view); 56 | 57 | System.out.println("-- wait command intercept"); 58 | assertThat(RaftTestUtils.eventually(() -> interceptor.numberOfBlockedMessages() > 0, 10, TimeUnit.SECONDS)).isTrue(); 59 | 60 | // While the message is in-flight, the cluster splits. 61 | // The previous coordinator does not have the majority to proceed. 62 | cluster.handleView(createView(id++, 0, 1)); 63 | cluster.handleView(createView(id++, 2, 3, 4)); 64 | 65 | // We can release the elected message. 66 | interceptor.releaseNext(); 67 | interceptor.assertNoBlockedMessages(); 68 | 69 | // Wait until a new leader is elected in the majority partition (nodes 2, 3 and 4). 70 | System.out.println("-- waiting for leader in majority partition"); 71 | BaseRaftElectionTest.waitUntilLeaderElected(rafts(), 10_000); 72 | 73 | // Assert that A and B do not have a leader. 74 | assertThat(raft(0).leader()).isNull(); 75 | assertThat(raft(1).leader()).isNull(); 76 | 77 | System.out.printf("-- elected during the split\n%s%n", dumpLeaderAndTerms()); 78 | // Store who's the leader before merging. 
79 | assertThat(leaders()).hasSize(1); 80 | RAFT leader = raft(leaders().get(0)); 81 | 82 | System.out.printf("-- merge partition, leader=%s%n", leader); 83 | // Join the partitions. 84 | // Note that the coordinator is different. 85 | cluster.handleView(createView(id++, 0, 1, 2, 3, 4)); 86 | 87 | // Wait until A and B receive the leader information. 88 | BaseRaftElectionTest.waitUntilAllHaveLeaderElected(rafts(), 10_000); 89 | System.out.printf("-- state after merge\n%s%n", dumpLeaderAndTerms()); 90 | } 91 | 92 | /* Returns the RAFT whose address equals the given one; throws IllegalArgumentException when no node matches. */ private RAFT raft(Address address) { 93 | for (RAFT raft : rafts()) { 94 | if (Objects.equals(address, raft.getAddress())) 95 | return raft; 96 | } 97 | 98 | throw new IllegalArgumentException(String.format("Node with address '%s' not present", address)); 99 | } 100 | 101 | @Override 102 | protected PartitionedRaftCluster createNewMockCluster() { 103 | return new PartitionedRaftCluster(); 104 | } 105 | 106 | @Override 107 | protected void amendRAFTConfiguration(RAFT raft) { 108 | raft.synchronous(true); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /tests/junit-functional/org/jgroups/tests/harness/RaftAssertion.java: -------------------------------------------------------------------------------- 1 | package org.jgroups.tests.harness; 2 | 3 | import java.util.Arrays; 4 | import java.util.Collection; 5 | import java.util.List; 6 | import java.util.Objects; 7 | import java.util.concurrent.ExecutionException; 8 | import java.util.concurrent.TimeUnit; 9 | import java.util.concurrent.TimeoutException; 10 | import java.util.function.BooleanSupplier; 11 | import java.util.function.Function; 12 | import java.util.function.Supplier; 13 | 14 | import org.assertj.core.api.Assertions; 15 | import org.jgroups.JChannel; 16 | import org.jgroups.protocols.raft.RAFT; 17 | 18 | import static org.assertj.core.api.Assertions.assertThat; 19 | import static org.jgroups.raft.testfwk.RaftTestUtils.eventually; 20 | 21 | 
public final class RaftAssertion { /* static assertion helpers for the Raft tests; the private constructor prevents instantiation */ 22 | 23 | private RaftAssertion() { } 24 | 25 | /* Same as the two-argument overload, with a default description. */ public static void assertLeaderlessOperationThrows(ThrowingRunnable operation) { 26 | assertLeaderlessOperationThrows(operation, "Running operation without a leader."); 27 | } 28 | 29 | /* Same as the Supplier overload, with a fixed description. */ public static void assertLeaderlessOperationThrows(ThrowingRunnable operation, String message) { 30 | assertLeaderlessOperationThrows(operation, () -> message); 31 | } 32 | 33 | /* Asserts that running the operation throws one of three things: an IllegalStateException mentioning "I'm not the leader ", a TimeoutException, or an ExecutionException caused by such an IllegalStateException. */ public static void assertLeaderlessOperationThrows(ThrowingRunnable operation, Supplier message) { 34 | Assertions.assertThatThrownBy(operation::run) 35 | .as(message) 36 | .satisfiesAnyOf( 37 | // In case the leader already received the view update and stepped down. 38 | tc -> assertThat(tc) 39 | .isInstanceOf(IllegalStateException.class) 40 | .hasMessageContaining("I'm not the leader "), 41 | 42 | // In case the request is sent before the leader steps down. 43 | // We could update this so that when the leader steps down it cancels requests. 44 | tc -> assertThat(tc).isInstanceOf(TimeoutException.class), 45 | 46 | // The request was sent but failed. 
47 | tc -> assertThat(tc).isInstanceOf(ExecutionException.class) 48 | .cause() 49 | .isInstanceOf(IllegalStateException.class) 50 | .hasMessageContaining("I'm not the leader ") 51 | ); 52 | } 53 | 54 | /* Waits up to timeout milliseconds until every channel's RAFT reports expected_commit as its commit index and expected_applied as its last-appended index, then prints and asserts both values per channel. */ public static void assertCommitIndex(long timeout, long expected_commit, long expected_applied, Function converter, Collection channels) { 55 | BooleanSupplier bs = () -> { 56 | boolean all_ok = true; 57 | for (JChannel ch : channels) { 58 | RAFT raft = converter.apply(ch); 59 | if (expected_commit != raft.commitIndex() || expected_applied != raft.lastAppended()) 60 | all_ok = false; 61 | } 62 | return all_ok; 63 | }; 64 | assertThat(eventually(bs, timeout, TimeUnit.MILLISECONDS)) 65 | .as("Commit indexes never matched") 66 | .isTrue(); 67 | 68 | for (JChannel ch : channels) { 69 | RAFT raft = converter.apply(ch); 70 | String check = String.format("%s: last-appended=%d, commit-index=%d\n", ch.getAddress(), raft.lastAppended(), raft.commitIndex()); /* the value comes from lastAppended(), so it is labelled as such */ 71 | System.out.print(check); /* print, not printf: check is already formatted and must not be parsed again as a format string */ 72 | assertThat(raft) 73 | .as(check) 74 | .returns(expected_commit, RAFT::commitIndex) 75 | .returns(expected_applied, RAFT::lastAppended); 76 | } 77 | } 78 | 79 | public static void assertCommitIndex(long timeout, long expected_commit, long expected_applied, Function converter, JChannel... 
channels) { /* varargs convenience overload: wraps the channels in a List and delegates to the Collection overload */ 80 | assertCommitIndex(timeout, expected_commit, expected_applied, converter, List.of(channels)); 81 | } 82 | 83 | /* Maps the non-null channels to their RAFT instances and waits with a limit of 10_000 (presumably milliseconds — confirm) for a leader. NOTE(review): despite the name, this delegates to waitUntilLeaderElected, not waitUntilAllHaveLeaderElected; NetworkPartitionElectionTest uses both with different expectations — confirm which semantics callers need. */ public static void waitUntilAllRaftsHaveLeader(JChannel[] channels, Function converter) { 84 | RAFT[] rafts = Arrays.stream(channels) 85 | .filter(Objects::nonNull) 86 | .map(converter) 87 | .toArray(RAFT[]::new); 88 | BaseRaftElectionTest.waitUntilLeaderElected(rafts, 10_000); 89 | } 90 | 91 | /* Like Runnable, but run() may throw any Throwable, so operations with checked exceptions can be passed to the assertions above. */ @FunctionalInterface 92 | public interface ThrowingRunnable { 93 | 94 | void run() throws Throwable; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /tests/resources/raft-benchmark.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 11 | 29 | 30 | 31 | 33 | 34 | 35 | 36 | 42 | 47 | 49 | 50 | 51 | 53 | 55 | 57 | 58 | 59 | 63 | 64 | 65 | --------------------------------------------------------------------------------