├── .flake8 ├── .gitignore ├── .pylintrc ├── .python-version ├── .ruby-version ├── Gemfile ├── Gemfile.lock ├── Makefile ├── Procfile ├── README.md ├── bin ├── bench │ ├── bench_db.py │ ├── bench_membership.py │ ├── data │ │ ├── membership │ │ │ ├── 1 transparent ping req │ │ │ │ └── fdi:0.5|fds:3|gi:0.2|gs:5 │ │ │ │ │ ├── 0.json │ │ │ │ │ ├── 1.json │ │ │ │ │ ├── 2.json │ │ │ │ │ ├── 3.json │ │ │ │ │ └── 4.json │ │ │ ├── 2 remove on failed ping req │ │ │ │ └── fdi:0.5|fds:3|gi:0.2|gs:5 │ │ │ │ │ ├── 0.json │ │ │ │ │ ├── 1.json │ │ │ │ │ ├── 2.json │ │ │ │ │ ├── 3.json │ │ │ │ │ └── 4.json │ │ │ ├── 3 add suspect on failed ping req │ │ │ │ └── fdi:0.5|fds:3|gi:0.2|gs:5 │ │ │ │ │ ├── 0.json │ │ │ │ │ ├── 1.json │ │ │ │ │ ├── 2.json │ │ │ │ │ ├── 3.json │ │ │ │ │ └── 4.json │ │ │ └── 4 improved random target selection │ │ │ │ └── fdi:0.5|fds:3|gi:0.2|gs:5 │ │ │ │ ├── 0.json │ │ │ │ ├── 1.json │ │ │ │ └── 2.json │ │ └── storage │ │ │ ├── 1.prof │ │ │ ├── 2.prof │ │ │ └── compare.png │ ├── plot_db.py │ └── plot_membership.py └── prof.py ├── docs ├── bench.md ├── img │ └── journal │ │ ├── 01_membership │ │ ├── membership.png │ │ ├── mermaid-diagram-20200607085911.png │ │ └── mermaid-diagram-20200607090040.png │ │ └── 02_storage │ │ ├── entry.png │ │ ├── mermaid-diagram-20201012084711.png │ │ └── mermaid-diagram-20201012094531.png ├── journal │ ├── 01_membership.md │ └── 02_storage.md └── mermaid │ ├── membership-fdloop.mmd │ ├── membership-investigation.mmd │ ├── storage-avl.mmd │ └── storage-ssi.mmd ├── jdb ├── __init__.py ├── cli.py ├── const.py ├── crdt.py ├── errors.py ├── hlc.py ├── jql.py ├── maglev.py ├── membership.py ├── node.py ├── pb │ ├── __init__.py │ ├── peer_server.proto │ ├── peer_server_pb2.py │ └── peer_server_pb2_grpc.py ├── peer.py ├── routing.py ├── server │ ├── __init__.py │ ├── client_server.py │ ├── peer_server.py │ └── server.py ├── storage │ ├── __init__.py │ ├── avltree.py │ ├── compression.py │ ├── db.py │ ├── entry.py │ ├── memtable.py │ ├── oracle.py │ └── transaction.py ├── types.py └── util.py ├── pytest.ini ├── requirements.txt ├── setup.py └── tests ├── __init__.py └── test_unit.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203,W503 3 | max-line-length = 88 4 | exclude = pb -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # static files generated from Django application using `collectstatic` 142 | media 143 | static 144 | 145 | .vscode 146 | tmp -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # Specify a configuration file. 4 | #rcfile= 5 | 6 | # Python code to execute, usually for sys.path manipulation such as 7 | # pygtk.require(). 8 | #init-hook= 9 | 10 | # Add files or directories to the blacklist. They should be base names, not 11 | # paths. 12 | ignore=CVS,peer_server_pb2.py,peer_server_pb2_grpc.py 13 | 14 | # Pickle collected data for later comparisons. 15 | persistent=yes 16 | 17 | # List of plugins (as comma separated values of python modules names) to load, 18 | # usually to register additional checkers. 19 | load-plugins= 20 | 21 | # Use multiple processes to speed up Pylint. 22 | jobs=1 23 | 24 | # Allow loading of arbitrary C extensions. Extensions are imported into the 25 | # active Python interpreter and may run arbitrary code. 26 | unsafe-load-any-extension=no 27 | 28 | # A comma-separated list of package or module names from where C extensions may 29 | # be loaded. Extensions are loading into the active Python interpreter and may 30 | # run arbitrary code 31 | extension-pkg-whitelist= 32 | 33 | # Allow optimization of some AST trees. 
This will activate a peephole AST 34 | # optimizer, which will apply various small optimizations. For instance, it can 35 | # be used to obtain the result of joining multiple strings with the addition 36 | # operator. Joining a lot of strings can lead to a maximum recursion error in 37 | # Pylint and this flag can prevent that. It has one side effect, the resulting 38 | # AST will be different than the one from reality. 39 | optimize-ast=no 40 | 41 | 42 | [MESSAGES CONTROL] 43 | 44 | # Only show warnings with the listed confidence levels. Leave empty to show 45 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 46 | confidence= 47 | 48 | # Enable the message, report, category or checker with the given id(s). You can 49 | # either give multiple identifier separated by comma (,) or put this option 50 | # multiple time (only on the command line, not in the configuration file where 51 | # it should appear only once). See also the "--disable" option for examples. 52 | #enable= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once).You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use"--disable=all --enable=classes 62 | # --disable=W" 63 | disable= 64 | # disabled by me, 65 | locally-disabled, 66 | missing-module-docstring, 67 | bad-continuation, 68 | fixme, 69 | # disabled by default, 70 | import-star-module-level, 71 | old-octal-literal, 72 | oct-method, 73 | print-statement, 74 | unpacking-in-except, 75 | parameter-unpacking, 76 | backtick, 77 | old-raise-syntax, 78 | old-ne-operator, 79 | long-suffix, 80 | dict-view-method, 81 | dict-iter-method, 82 | metaclass-assignment, 83 | next-method-called, 84 | raising-string, 85 | indexing-exception, 86 | raw_input-builtin, 87 | long-builtin, 88 | file-builtin, 89 | execfile-builtin, 90 | coerce-builtin, 91 | cmp-builtin, 92 | buffer-builtin, 93 | basestring-builtin, 94 | apply-builtin, 95 | filter-builtin-not-iterating, 96 | using-cmp-argument, 97 | useless-suppression, 98 | range-builtin-not-iterating, 99 | suppressed-message, 100 | no-absolute-import, 101 | old-division, 102 | cmp-method, 103 | reload-builtin, 104 | zip-builtin-not-iterating, 105 | intern-builtin, 106 | unichr-builtin, 107 | reduce-builtin, 108 | standarderror-builtin, 109 | unicode-builtin, 110 | xrange-builtin, 111 | coerce-method, 112 | delslice-method, 113 | getslice-method, 114 | setslice-method, 115 | input-builtin, 116 | round-builtin, 117 | hex-method, 118 | nonzero-method, 119 | map-builtin-not-iterating, 120 | 121 | 122 | [REPORTS] 123 | 124 | # Set the output format. Available formats are text, parseable, colorized, msvs 125 | # (visual studio) and html. You can also give a reporter class, eg 126 | # mypackage.mymodule.MyReporterClass. 127 | output-format=text 128 | 129 | # Put messages in a separate file for each module / package specified on the 130 | # command line instead of printing them on stdout. Reports (if any) will be 131 | # written in a file name "pylint_global.[txt|html]". 
132 | files-output=no 133 | 134 | # Tells whether to display a full report or only the messages 135 | reports=yes 136 | 137 | # Python expression which should return a note less than 10 (10 is the highest 138 | # note). You have access to the variables errors warning, statement which 139 | # respectively contain the number of errors / warnings messages and the total 140 | # number of statements analyzed. This is used by the global evaluation report 141 | # (RP0004). 142 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 143 | 144 | # Template used to display messages. This is a python new-style format string 145 | # used to format the message information. See doc for all details 146 | #msg-template= 147 | 148 | 149 | [FORMAT] 150 | 151 | # Maximum number of characters on a single line. 152 | max-line-length=100 153 | 154 | # Regexp for a line that is allowed to be longer than the limit. 155 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 156 | 157 | # Allow the body of an if to be on the same line as the test if there is no 158 | # else. 159 | single-line-if-stmt=no 160 | 161 | # List of optional constructs for which whitespace checking is disabled. `dict- 162 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 163 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 164 | # `empty-line` allows space-only lines. 165 | no-space-check=trailing-comma,dict-separator 166 | 167 | # Maximum number of lines in a module 168 | max-module-lines=1000 169 | 170 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 171 | # tab). 172 | indent-string=' ' 173 | 174 | # Number of spaces of indent required inside a hanging or continued line. 175 | indent-after-paren=4 176 | 177 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 178 | expected-line-ending-format= 179 | 180 | 181 | [SPELLING] 182 | 183 | # Spelling dictionary name. Available dictionaries: none. To make it working 184 | # install python-enchant package. 185 | spelling-dict= 186 | 187 | # List of comma separated words that should not be checked. 188 | spelling-ignore-words= 189 | 190 | # A path to a file that contains private dictionary; one word per line. 191 | spelling-private-dict-file= 192 | 193 | # Tells whether to store unknown words to indicated private dictionary in 194 | # --spelling-private-dict-file option instead of raising a message. 195 | spelling-store-unknown-words=no 196 | 197 | 198 | [LOGGING] 199 | 200 | # Logging modules to check that the string format arguments are in logging 201 | # function parameter format 202 | logging-modules=logging 203 | 204 | 205 | [BASIC] 206 | 207 | # List of builtins function names that should not be used, separated by a comma 208 | bad-functions=map,filter,input 209 | 210 | # Good variable names which should always be accepted, separated by a comma 211 | good-names=i,e,s,_,fd,fp,db,ts,j,k,m,n,v,T,k,c,op,re 212 | 213 | # Bad variable names which should always be refused, separated by a comma 214 | bad-names=foo,bar,baz,toto,tutu,tata 215 | 216 | # Colon-delimited sets of names that determine each other's naming style when 217 | # the name regexes allow several styles.
218 | name-group= 219 | 220 | # Include a hint for the correct naming format with invalid-name 221 | include-naming-hint=no 222 | 223 | # Regular expression matching correct function names 224 | # original: 225 | #function-rgx=[a-z_][a-z0-9_]{2,30}$ 226 | function-rgx=[a-zA-Z_][a-zA-Z0-9_]{2,40}$ 227 | 228 | # Naming hint for function names 229 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 230 | 231 | # Regular expression matching correct variable names 232 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 233 | 234 | # Naming hint for variable names 235 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 236 | 237 | # Regular expression matching correct constant names 238 | # original: 239 | #const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 240 | const-rgx=(([a-zA-Z_][a-zA-Z0-9_]*)|(__.*__))$ 241 | 242 | # Naming hint for constant names 243 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 244 | 245 | # Regular expression matching correct attribute names 246 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 247 | 248 | # Naming hint for attribute names 249 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 250 | 251 | # Regular expression matching correct argument names 252 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 253 | 254 | # Naming hint for argument names 255 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 256 | 257 | # Regular expression matching correct class attribute names 258 | # original: 259 | #class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 260 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,40}|(__.*__))$ 261 | 262 | # Naming hint for class attribute names 263 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 264 | 265 | # Regular expression matching correct inline iteration names 266 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 267 | 268 | # Naming hint for inline iteration names 269 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 270 | 271 | # Regular expression matching correct class names 272 | # original: 273 | #class-rgx=[A-Z_][a-zA-Z0-9]+$ 274 | class-rgx=[a-zA-Z_][a-zA-Z0-9]+$ 275 | 276 | # Naming hint for class names 277 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 278 | 279 | # Regular expression matching correct module names 280 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 281 | 282 | # Naming hint for module names 283 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 284 | 285 | # Regular expression matching correct method names 286 | # original: 287 | #method-rgx=[a-z_][a-z0-9_]{2,30}$ 288 | method-rgx=[a-zA-Z_][a-zA-Z0-9_]{2,40}$ 289 | 290 | # Naming hint for method names 291 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 292 | 293 | # Regular expression which should only match function or class names that do 294 | # not require a docstring. 295 | no-docstring-rgx=^test 296 | 297 | # Minimum line length for functions/classes that require docstrings, shorter 298 | # ones are exempt. 299 | docstring-min-length=-1 300 | 301 | 302 | [ELIF] 303 | 304 | # Maximum number of nested blocks for function / method body 305 | max-nested-blocks=5 306 | 307 | 308 | [SIMILARITIES] 309 | 310 | # Minimum lines number of a similarity. 311 | min-similarity-lines=4 312 | 313 | # Ignore comments when computing similarities. 314 | ignore-comments=yes 315 | 316 | # Ignore docstrings when computing similarities. 317 | ignore-docstrings=yes 318 | 319 | # Ignore imports when computing similarities. 320 | ignore-imports=no 321 | 322 | 323 | [TYPECHECK] 324 | 325 | # Tells whether missing members accessed in mixin class should be ignored. A 326 | # mixin class is detected if its name ends with "mixin" (case insensitive). 
327 | ignore-mixin-members=yes 328 | 329 | # List of module names for which member attributes should not be checked 330 | # (useful for modules/projects where namespaces are manipulated during runtime 331 | # and thus existing member attributes cannot be deduced by static analysis. It 332 | # supports qualified module names, as well as Unix pattern matching. 333 | ignored-modules= 334 | 335 | # List of classes names for which member attributes should not be checked 336 | # (useful for classes with attributes dynamically set). This supports can work 337 | # with qualified names. 338 | ignored-classes= 339 | 340 | # List of members which are set dynamically and missed by pylint inference 341 | # system, and so shouldn't trigger E1101 when accessed. Python regular 342 | # expressions are accepted. 343 | generated-members= 344 | 345 | 346 | [MISCELLANEOUS] 347 | 348 | # List of note tags to take in consideration, separated by a comma. 349 | notes=FIXME,XXX,TODO 350 | 351 | 352 | [VARIABLES] 353 | 354 | # Tells whether we should check for unused import in __init__ files. 355 | init-import=no 356 | 357 | # A regular expression matching the name of dummy variables (i.e. expectedly 358 | # not used). 359 | dummy-variables-rgx=_$|dummy 360 | 361 | # List of additional names supposed to be defined in builtins. Remember that 362 | # you should avoid to define new builtins when possible. 363 | additional-builtins= 364 | 365 | # List of strings which can identify a callback function by name. A callback 366 | # name must start or end with one of those strings. 367 | callbacks=cb_,_cb 368 | 369 | 370 | [CLASSES] 371 | 372 | # List of method names used to declare (i.e. assign) instance attributes. 373 | defining-attr-methods=__init__,__new__,setUp 374 | 375 | # List of valid names for the first argument in a class method. 376 | valid-classmethod-first-arg=cls 377 | 378 | # List of valid names for the first argument in a metaclass class method. 379 | valid-metaclass-classmethod-first-arg=mcs 380 | 381 | # List of member names, which should be excluded from the protected access 382 | # warning. 383 | exclude-protected=_asdict,_fields,_replace,_source,_make 384 | 385 | 386 | [DESIGN] 387 | 388 | # Maximum number of arguments for function / method 389 | max-args=5 390 | 391 | # Argument names that match this expression will be ignored. Default to name 392 | # with leading underscore 393 | ignored-argument-names=_.* 394 | 395 | # Maximum number of locals for function / method body 396 | max-locals=15 397 | 398 | # Maximum number of return / yield for function / method body 399 | max-returns=6 400 | 401 | # Maximum number of branch for function / method body 402 | max-branches=12 403 | 404 | # Maximum number of statements in function / method body 405 | max-statements=50 406 | 407 | # Maximum number of parents for a class (see R0901). 408 | max-parents=7 409 | 410 | # Maximum number of attributes for a class (see R0902). 411 | max-attributes=7 412 | 413 | # Minimum number of public methods for a class (see R0903). 414 | min-public-methods=2 415 | 416 | # Maximum number of public methods for a class (see R0904). 417 | max-public-methods=20 418 | 419 | # Maximum number of boolean expressions in a if statement 420 | max-bool-expr=5 421 | 422 | 423 | [IMPORTS] 424 | 425 | # Deprecated modules which should not be used, separated by a comma 426 | deprecated-modules=regsub,TERMIOS,Bastion,rexec 427 | 428 | # Create a graph of every (i.e. 
internal and external) dependencies in the 429 | # (report RP0402 must not be disabled) 430 | import-graph= 431 | 432 | # Create a graph of external dependencies in the given file (report RP0402 must 433 | # not be disabled) 434 | ext-import-graph= 435 | 436 | # Create a graph of internal dependencies in the given file (report RP0402 must 437 | # not be disabled) 438 | int-import-graph= 439 | 440 | 441 | [EXCEPTIONS] 442 | 443 | # Exceptions that will emit a warning when being caught. Defaults to 444 | # "Exception" 445 | overgeneral-exceptions=Exception -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.8.4 2 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.7.1 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org" 4 | 5 | git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } 6 | 7 | gem "foreman" -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | foreman (0.87.1) 5 | 6 | PLATFORMS 7 | ruby 8 | 9 | DEPENDENCIES 10 | foreman 11 | 12 | BUNDLED WITH 13 | 2.1.4 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/bash -e -o pipefail 2 | 3 | format: 4 | @black . 5 | 6 | lint: 7 | @flake8 jdb tests 8 | @pylint jdb tests 9 | 10 | install: 11 | @export CPPFLAGS="-I$$(brew --cellar snappy)/1.1.8/include -L$$(brew --cellar snappy)/1.1.8/lib" && \ 12 | pip install -r requirements.txt 13 | 14 | typecheck: 15 | @mypy .
16 | 17 | test: 18 | @pytest tests 19 | 20 | cli: 21 | @python jdb/cli.py 22 | 23 | server: server1 24 | 25 | server1: 26 | @python jdb/server/server.py -n 1 -p 1337 -r 1338 27 | 28 | server2: 29 | @python jdb/server/server.py -n 2 -p 2337 -r 2338 -j 1=127.0.0.1:1338 30 | 31 | cluster: 32 | @foreman start 33 | 34 | query: 35 | @python jdb/cli.py -q "${q}" 36 | 37 | codegen: 38 | @python -m grpc_tools.protoc -Ijdb/pb --python_out=jdb/pb --grpc_python_out=jdb/pb peer_server.proto 39 | 40 | bench.db: 41 | @python bin/bench/bench_db.py -s jdb 42 | @python bin/bench/bench_db.py -s redis 43 | @python bin/bench/bench_db.py -s lmdb 44 | 45 | bench.membership: 46 | @python bin/bench/bench_membership.py 47 | 48 | prof: 49 | @python bin/prof.py -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | node1: python3 jdb/server/server.py -n 1 -p 1337 -r 1338 2 | node2: python3 jdb/server/server.py -n 2 -p 2337 -r 2338 -j 1=127.0.0.1:1338 3 | node3: python3 jdb/server/server.py -n 3 -p 3337 -r 3338 -j 1=127.0.0.1:1338 4 | node4: python3 jdb/server/server.py -n 4 -p 4337 -r 4338 -j 1=127.0.0.1:1338 5 | node5: python3 jdb/server/server.py -n 5 -p 5337 -r 5338 -j 1=127.0.0.1:1338 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jdb 2 | 3 | #### journal 4 | 5 | - [01_membership](https://github.com/thejchap/jdb/blob/master/docs/journal/01_membership.md) 6 | - [02_storage](https://github.com/thejchap/jdb/blob/master/docs/journal/02_storage.md) 7 | 8 | ## resources 9 | 10 | ### inspo 11 | 12 | - https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf 13 | - https://www.cs.princeton.edu/courses/archive/spring13/cos461/docs/lec16-dynamo.pdf 14 | - https://www.cockroachlabs.com/docs/stable/architecture/distribution-layer.html 15 | - http://www.cs.cornell.edu/Projects/ladis2009/papers/Lakshman-ladis2009.PDF 16 | 17 | ### transactions 18 | 19 | - https://dgraph.io/blog/post/badger-txn/ 20 | - https://en.wikipedia.org/wiki/Multiversion_concurrency_control 21 | - https://dl.acm.org/doi/pdf/10.1145/356842.356846 22 | - https://dl.acm.org/doi/10.1145/2168836.2168853 23 | - https://wiki.postgresql.org/wiki/SSI 24 | - https://drkp.net/papers/ssi-vldb12.pdf 25 | - https://courses.cs.washington.edu/courses/cse444/08au/544M/READING-LIST/fekete-sigmod2008.pdf 26 | 27 | ### consensus 28 | 29 | - https://raft.github.io/raft.pdf 30 | - https://github.com/ongardie/dissertation/blob/master/stanford.pdf 31 | 32 | ### storage 33 | 34 | - https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf 35 | - http://www.vldb.org/pvldb/vol12/p2183-zhang.pdf 36 | - https://www.memsql.com/blog/what-is-skiplist-why-skiplist-index-for-memsql/ 37 | - https://www.cl.cam.ac.uk/teaching/2005/Algorithms/skiplists.pdf 38 | - https://github.com/dgraph-io/badger 39 | - https://github.com/facebook/rocksdb 40 | - https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks 41 | - https://blogs.kolabnow.com/2018/06/07/a-short-guide-to-lmdb 42 | 43 | ### lang 44 | 45 | - https://github.com/antirez/redis/blob/96a54866ab4694cf338af0441f28aa69e9643376/src/server.c 46 | - https://ply.readthedocs.io/en/latest/ply.html#parsing-basics 47 | - https://redis.io/topics/protocol 48 | 49 | ### routing 50 | 51 | - https://people.math.gatech.edu/~yu/Papers/p2p.pdf
52 | - https://blog.yugabyte.com/four-data-sharding-strategies-we-analyzed-in-building-a-distributed-sql-database/ 53 | - https://blog.memcachier.com/2017/09/01/maglev-our-new-consistent-hashing-scheme/ 54 | - https://blog.acolyer.org/2016/03/21/maglev-a-fast-and-reliable-software-network-load-balancer/ 55 | 56 | ### data integrity 57 | 58 | - https://en.wikipedia.org/wiki/Cyclic_redundancy_check 59 | 60 | ### time 61 | 62 | - https://cse.buffalo.edu/tech-reports/2014-04.pdf 63 | - https://jaredforsyth.com/posts/hybrid-logical-clocks/ 64 | - http://muratbuffalo.blogspot.com/2014/07/hybrid-logical-clocks.html 65 | - https://medium.com/@Alibaba_Cloud/in-depth-analysis-on-hlc-based-distributed-transaction-processing-e75dad5f2af8 66 | 67 | ### crdt 68 | 69 | - https://github.com/soundcloud/roshi#crdt 70 | - https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type#LWW-Element-Set_(Last-Write-Wins-Element-Set) 71 | 72 | ### general 73 | 74 | - http://book.mixu.net/distsys/eventual.html 75 | - https://hal.inria.fr/file/index/docid/555588/filename/techreport.pdf 76 | 77 | ### cluster membership 78 | 79 | - https://en.wikipedia.org/wiki/Gossip_protocol 80 | - https://asafdav2.github.io/2017/swim-protocol/ 81 | - https://research.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf 82 | - https://blog.kevingomez.fr/2019/01/29/clusters-and-membership-discovering-the-swim-protocol/ 83 | - https://www.serf.io/docs/internals/gossip.html 84 | -------------------------------------------------------------------------------- /bin/bench/bench_db.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from typing import List 3 | from argparse import ArgumentParser 4 | from threading import Thread 5 | from timeit import timeit 6 | from random import getrandbits 7 | from structlog import get_logger 8 | from redis import Redis 9 | import lmdb 10 | from jdb.storage import db 11 | 12 | KEY_SIZE = 24 13 | VAL_SIZE = 8 14 | LOGGER = get_logger() 15 | DIRNAME = path.dirname(__file__) 16 | 17 | 18 | def _exec_threads(arr: List[Thread]): 19 | """helper to run a bunch of threads and wait for them to finish""" 20 | 21 | for thread in arr: 22 | thread.start() 23 | 24 | for thread in arr: 25 | thread.join() 26 | 27 | 28 | def main(): 29 | """fire it up""" 30 | 31 | parser = ArgumentParser() 32 | parser.add_argument( 33 | "-s", 34 | "--store", 35 | type=str, 36 | help="which store to use", 37 | choices=["redis", "jdb", "lmdb"], 38 | required=True, 39 | ) 40 | 41 | parser.add_argument( 42 | "-z", "--set-size", type=int, help="number of keys to insert", default=1000000 43 | ) 44 | 45 | parser.add_argument( 46 | "-t", "--threads", type=int, help="number of clients", default=32 47 | ) 48 | 49 | args = parser.parse_args() 50 | redis = Redis(host="localhost", port=6379, db=0) 51 | jdb = db.DB(compression=None) 52 | lmdbenv = lmdb.open( 53 | path=path.join(DIRNAME, "../tmp"), map_size=jdb.memtable.max_size, lock=True 54 | ) 55 | builder_threads = [] 56 | writer_threads = [] 57 | thread_count = args.threads 58 | n = 1000 59 | batches = [] 60 | val = bytes(bytearray([1] * VAL_SIZE)) 61 | 62 | def lmdb_txn(batch): 63 | with lmdbenv.begin(write=True) as txn: 64 | for k, v in batch: 65 | txn.put(k, v) 66 | 67 | def redis_txn(batch): 68 | pipe = redis.pipeline() 69 | for k, v in batch: 70 | pipe.set(k, v) 71 | pipe.execute() 72 | 73 | def jdb_txn(batch): 74 | with jdb.transaction() as txn: 75 | for k, v in batch: 76 | txn.write(k, v) 77 | 78 | funcs = {"redis": 
redis_txn, "jdb": jdb_txn, "lmdb": lmdb_txn} 79 | func = funcs[args.store] 80 | 81 | LOGGER.info( 82 | "config", 83 | thread_count=thread_count, 84 | set_size=args.set_size, 85 | val_size=VAL_SIZE, 86 | key_size=KEY_SIZE, 87 | store=args.store, 88 | ) 89 | 90 | batch_size = int(args.set_size / thread_count) 91 | 92 | def build(i: int): 93 | for _ in range(0, batch_size): 94 | key = bytes(getrandbits(8) for _ in range(KEY_SIZE)) 95 | batches[i].append([key, val]) 96 | 97 | def populate(i: int): 98 | batch = batches[i] 99 | for j in range(0, len(batch), n): 100 | func(batch[j : j + n]) 101 | 102 | for i in range(0, thread_count): 103 | builder_threads.append(Thread(target=build, args=(i,))) 104 | writer_threads.append(Thread(target=populate, args=(i,))) 105 | batches.append([]) 106 | 107 | LOGGER.info("setup") 108 | _exec_threads(builder_threads) 109 | LOGGER.info("running") 110 | elapsed = timeit(lambda: _exec_threads(writer_threads), number=1) 111 | LOGGER.info("done", elapsed=elapsed) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /bin/bench/bench_membership.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import json 3 | from os import path, makedirs 4 | from time import sleep 5 | from threading import Thread 6 | from structlog import get_logger 7 | from jdb import server as srv, membership as mbr 8 | 9 | _DIRNAME = path.dirname(__file__) 10 | _LOGGER = get_logger() 11 | _VERSION = "4 improved random target selection" 12 | 13 | 14 | def _main(sample: int): 15 | """ 16 | fire up a bunch of servers, kill one, see how long it takes for them to all find out 17 | """ 18 | 19 | n = 20 20 | processes = [] 21 | servers = [] 22 | target_key = "node0000=127.0.0.1:2337" 23 | 24 | for i in range(0, n): 25 | port = 1337 + i 26 | p2p_port = 2337 + i 27 | name = f"node{i:04d}" 28 | join = None 29 | 30 | if i > 0: 31 | join = "node0000=127.0.0.1:2337" 32 | 33 | server = srv.Server(node_name=name, port=port, p2p_port=p2p_port, join=join) 34 | thread = Thread(target=server.start, name=name, daemon=True) 35 | processes.append(thread) 36 | servers.append(server) 37 | 38 | for process in processes: 39 | process.start() 40 | 41 | n = len(servers) 42 | 43 | while True: 44 | sizes = {len(list(s.node.membership.cluster_state)) for s in servers} 45 | 46 | if len(sizes) == 1 and n in sizes: 47 | break 48 | 49 | sleep(0.5) 50 | 51 | _LOGGER.info("all bootstrapped") 52 | sleep(mbr.STARTUP_GRACE_PERIOD * n) 53 | _LOGGER.info(f"killing {target_key}") 54 | target = servers.pop(0) 55 | target.stop() 56 | _LOGGER.info(f"killed {target_key}") 57 | 58 | results: Dict = {} 59 | poll_interval = 0.05 60 | j = 0 61 | 62 | while True: 63 | states = { 64 | s.node_name: set( 65 | map(lambda k: k.decode(), dict(s.node.membership.cluster_state).keys()) 66 | ) 67 | for s in servers 68 | } 69 | 70 | i = 0 71 | 72 | for nstate in states.values(): 73 | if target_key in nstate: 74 | i += 1 75 | 76 | results[j * poll_interval] = i 77 | _LOGGER.info("bench.state_poll", states=states) 78 | j += 1 79 | 80 | if i == 0: 81 | break 82 | 83 | sleep(poll_interval) 84 | 85 | dname = "|".join( 86 | map( 87 | str, 88 | [ 89 | f"fdi:{mbr.FD_INTERVAL}", 90 | f"fds:{mbr.FD_SUBGROUP_SIZE}", 91 | f"gi:{mbr.GOSSIP_INTERVAL}", 92 | f"gs:{mbr.GOSSIP_SUBGROUP_SIZE}", 93 | ], 94 | ) 95 | ) 96 | 97 | datapath = path.join(_DIRNAME, "data", "membership", str(_VERSION), dname) 98 | 99 | if not 
path.exists(datapath): 100 | makedirs(datapath) 101 | 102 | with open(path.join(datapath, f"{sample}.json"), "w") as file: 103 | file.write(json.dumps(results)) 104 | 105 | for server in servers: 106 | server.node.membership.stop() 107 | 108 | for server in servers: 109 | server.stop() 110 | 111 | for process in processes: 112 | process.join() 113 | 114 | 115 | if __name__ == "__main__": 116 | samp_start = 2 117 | samp_finish = 3 118 | 119 | for samp in range(samp_start, samp_finish): 120 | _main(samp) 121 | -------------------------------------------------------------------------------- /bin/bench/data/membership/1 transparent ping req/fdi:0.5|fds:3|gi:0.2|gs:5/0.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 19, "1.7000000000000002": 19, "1.75": 19, "1.8": 19, "1.85": 19, "1.9000000000000001": 19, "1.9500000000000002": 19, "2.0": 19, "2.0500000000000003": 19, "2.1": 19, "2.15": 19, "2.2": 19, "2.25": 19, "2.3000000000000003": 19, "2.35": 18, "2.4000000000000004": 18, "2.45": 18, "2.5": 18, "2.5500000000000003": 17, "2.6": 17, "2.6500000000000004": 16, "2.7": 16, "2.75": 15, "2.8000000000000003": 14, "2.85": 14, "2.9000000000000004": 13, "2.95": 13, "3.0": 10, "3.0500000000000003": 8, "3.1": 6, "3.1500000000000004": 6, "3.2": 5, "3.25": 4, "3.3000000000000003": 3, "3.35": 3, "3.4000000000000004": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/1 transparent ping req/fdi:0.5|fds:3|gi:0.2|gs:5/1.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 18, "1.7000000000000002": 17, "1.75": 17, "1.8": 17, "1.85": 16, "1.9000000000000001": 16, "1.9500000000000002": 16, "2.0": 14, "2.0500000000000003": 13, "2.1": 13, "2.15": 11, "2.2": 9, "2.25": 7, "2.3000000000000003": 4, "2.35": 2, "2.4000000000000004": 1, "2.45": 1, "2.5": 1, "2.5500000000000003": 1, "2.6": 2, "2.6500000000000004": 3, "2.7": 5, "2.75": 7, "2.8000000000000003": 10, "2.85": 13, "2.9000000000000004": 16, "2.95": 16, "3.0": 16, "3.0500000000000003": 15, "3.1": 16, "3.1500000000000004": 14, "3.2": 13, "3.25": 12, "3.3000000000000003": 12, "3.35": 12, "3.4000000000000004": 9, "3.45": 6, "3.5": 7, "3.5500000000000003": 6, "3.6": 5, "3.6500000000000004": 5, "3.7": 3, "3.75": 2, "3.8000000000000003": 1, "3.85": 1, "3.9000000000000004": 0} 
-------------------------------------------------------------------------------- /bin/bench/data/membership/1 transparent ping req/fdi:0.5|fds:3|gi:0.2|gs:5/2.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 19, "1.7000000000000002": 19, "1.75": 19, "1.8": 19, "1.85": 19, "1.9000000000000001": 19, "1.9500000000000002": 19, "2.0": 19, "2.0500000000000003": 19, "2.1": 19, "2.15": 19, "2.2": 19, "2.25": 19, "2.3000000000000003": 19, "2.35": 19, "2.4000000000000004": 19, "2.45": 19, "2.5": 19, "2.5500000000000003": 19, "2.6": 19, "2.6500000000000004": 19, "2.7": 19, "2.75": 17, "2.8000000000000003": 16, "2.85": 14, "2.9000000000000004": 13, "2.95": 12, "3.0": 11, "3.0500000000000003": 9, "3.1": 7, "3.1500000000000004": 5, "3.2": 4, "3.25": 3, "3.3000000000000003": 1, "3.35": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/1 transparent ping req/fdi:0.5|fds:3|gi:0.2|gs:5/3.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 19, "1.7000000000000002": 18, "1.75": 18, "1.8": 16, "1.85": 16, "1.9000000000000001": 16, "1.9500000000000002": 13, "2.0": 12, "2.0500000000000003": 11, "2.1": 10, "2.15": 7, "2.2": 5, "2.25": 3, "2.3000000000000003": 2, "2.35": 2, "2.4000000000000004": 2, "2.45": 1, "2.5": 1, "2.5500000000000003": 1, "2.6": 1, "2.6500000000000004": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/1 transparent ping req/fdi:0.5|fds:3|gi:0.2|gs:5/4.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 17, "1.5": 17, "1.55": 16, "1.6": 15, "1.6500000000000001": 13, "1.7000000000000002": 10, "1.75": 7, "1.8": 4, "1.85": 3, "1.9000000000000001": 3, "1.9500000000000002": 1, "2.0": 2, "2.0500000000000003": 2, "2.1": 2, "2.15": 4, "2.2": 3, "2.25": 3, 
"2.3000000000000003": 3, "2.35": 3, "2.4000000000000004": 4, "2.45": 5, "2.5": 4, "2.5500000000000003": 4, "2.6": 3, "2.6500000000000004": 3, "2.7": 2, "2.75": 2, "2.8000000000000003": 1, "2.85": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/2 remove on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/0.json: -------------------------------------------------------------------------------- 1 | {"0.0": 15, "0.05": 14, "0.1": 13, "0.15000000000000002": 11, "0.2": 9, "0.25": 7, "0.30000000000000004": 7, "0.35000000000000003": 4, "0.4": 4, "0.45": 4, "0.5": 4, "0.55": 3, "0.6000000000000001": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/2 remove on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/1.json: -------------------------------------------------------------------------------- 1 | {"0.0": 17, "0.05": 16, "0.1": 16, "0.15000000000000002": 15, "0.2": 15, "0.25": 12, "0.30000000000000004": 12, "0.35000000000000003": 8, "0.4": 6, "0.45": 3, "0.5": 2, "0.55": 1, "0.6000000000000001": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/2 remove on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/2.json: -------------------------------------------------------------------------------- 1 | {"0.0": 4, "0.05": 3, "0.1": 4, "0.15000000000000002": 3, "0.2": 3, "0.25": 2, "0.30000000000000004": 3, "0.35000000000000003": 4, "0.4": 3, "0.45": 1, "0.5": 1, "0.55": 1, "0.6000000000000001": 1, "0.65": 1, "0.7000000000000001": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/2 remove on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/3.json: -------------------------------------------------------------------------------- 1 | {"0.0": 16, "0.05": 14, "0.1": 13, "0.15000000000000002": 11, "0.2": 9, "0.25": 7, "0.30000000000000004": 4, "0.35000000000000003": 3, "0.4": 2, "0.45": 2, "0.5": 1, "0.55": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/2 remove on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/4.json: -------------------------------------------------------------------------------- 1 | {"0.0": 11, "0.05": 10, "0.1": 8, "0.15000000000000002": 5, "0.2": 4, "0.25": 2, "0.30000000000000004": 2, "0.35000000000000003": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/3 add suspect on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/0.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 19, "1.7000000000000002": 19, "1.75": 18, "1.8": 18, "1.85": 18, "1.9000000000000001": 18, "1.9500000000000002": 16, "2.0": 15, "2.0500000000000003": 12, "2.1": 12, "2.15": 7, "2.2": 7, "2.25": 4, "2.3000000000000003": 3, "2.35": 1, "2.4000000000000004": 0} 
-------------------------------------------------------------------------------- /bin/bench/data/membership/3 add suspect on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/1.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 19, "1.7000000000000002": 19, "1.75": 19, "1.8": 19, "1.85": 19, "1.9000000000000001": 19, "1.9500000000000002": 19, "2.0": 19, "2.0500000000000003": 19, "2.1": 17, "2.15": 16, "2.2": 12, "2.25": 8, "2.3000000000000003": 8, "2.35": 5, "2.4000000000000004": 4, "2.45": 1, "2.5": 1, "2.5500000000000003": 1, "2.6": 1, "2.6500000000000004": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/3 add suspect on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/2.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 18, "1.6": 18, "1.6500000000000001": 18, "1.7000000000000002": 18, "1.75": 18, "1.8": 17, "1.85": 17, "1.9000000000000001": 16, "1.9500000000000002": 16, "2.0": 13, "2.0500000000000003": 11, "2.1": 9, "2.15": 9, "2.2": 7, "2.25": 5, "2.3000000000000003": 3, "2.35": 2, "2.4000000000000004": 2, "2.45": 2, "2.5": 2, "2.5500000000000003": 1, "2.6": 1, "2.6500000000000004": 1, "2.7": 1, "2.75": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/3 add suspect on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/3.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 19, "1.7000000000000002": 19, "1.75": 18, "1.8": 18, "1.85": 18, "1.9000000000000001": 17, "1.9500000000000002": 17, "2.0": 17, "2.0500000000000003": 17, "2.1": 17, "2.15": 17, "2.2": 16, "2.25": 15, "2.3000000000000003": 15, "2.35": 15, "2.4000000000000004": 17, "2.45": 16, "2.5": 14, "2.5500000000000003": 11, "2.6": 8, "2.6500000000000004": 3, "2.7": 3, "2.75": 3, 
"2.8000000000000003": 2, "2.85": 2, "2.9000000000000004": 1, "2.95": 1, "3.0": 1, "3.0500000000000003": 1, "3.1": 1, "3.1500000000000004": 1, "3.2": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/3 add suspect on failed ping req/fdi:0.5|fds:3|gi:0.2|gs:5/4.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 19, "1.6": 19, "1.6500000000000001": 19, "1.7000000000000002": 19, "1.75": 19, "1.8": 19, "1.85": 19, "1.9000000000000001": 18, "1.9500000000000002": 18, "2.0": 17, "2.0500000000000003": 17, "2.1": 16, "2.15": 16, "2.2": 16, "2.25": 14, "2.3000000000000003": 13, "2.35": 11, "2.4000000000000004": 8, "2.45": 6, "2.5": 4, "2.5500000000000003": 1, "2.6": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/4 improved random target selection/fdi:0.5|fds:3|gi:0.2|gs:5/0.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 18, "1.35": 18, "1.4000000000000001": 17, "1.4500000000000002": 17, "1.5": 16, "1.55": 14, "1.6": 11, "1.6500000000000001": 8, "1.7000000000000002": 6, "1.75": 5, "1.8": 2, "1.85": 2, "1.9000000000000001": 1, "1.9500000000000002": 1, "2.0": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/4 improved random target selection/fdi:0.5|fds:3|gi:0.2|gs:5/1.json: -------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 19, "0.2": 19, "0.25": 19, "0.30000000000000004": 19, "0.35000000000000003": 19, "0.4": 19, "0.45": 19, "0.5": 19, "0.55": 19, "0.6000000000000001": 19, "0.65": 19, "0.7000000000000001": 19, "0.75": 19, "0.8": 19, "0.8500000000000001": 19, "0.9": 19, "0.9500000000000001": 19, "1.0": 19, "1.05": 19, "1.1": 19, "1.1500000000000001": 19, "1.2000000000000002": 19, "1.25": 19, "1.3": 19, "1.35": 19, "1.4000000000000001": 19, "1.4500000000000002": 19, "1.5": 19, "1.55": 18, "1.6": 18, "1.6500000000000001": 17, "1.7000000000000002": 16, "1.75": 16, "1.8": 15, "1.85": 13, "1.9000000000000001": 10, "1.9500000000000002": 9, "2.0": 7, "2.0500000000000003": 6, "2.1": 4, "2.15": 4, "2.2": 4, "2.25": 3, "2.3000000000000003": 2, "2.35": 2, "2.4000000000000004": 2, "2.45": 1, "2.5": 1, "2.5500000000000003": 0} -------------------------------------------------------------------------------- /bin/bench/data/membership/4 improved random target selection/fdi:0.5|fds:3|gi:0.2|gs:5/2.json: 
-------------------------------------------------------------------------------- 1 | {"0.0": 19, "0.05": 19, "0.1": 19, "0.15000000000000002": 15, "0.2": 0} -------------------------------------------------------------------------------- /bin/bench/data/storage/1.prof: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/bin/bench/data/storage/1.prof -------------------------------------------------------------------------------- /bin/bench/data/storage/2.prof: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/bin/bench/data/storage/2.prof -------------------------------------------------------------------------------- /bin/bench/data/storage/compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/bin/bench/data/storage/compare.png -------------------------------------------------------------------------------- /bin/bench/plot_db.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | MEMORY = { 4 | "jdb": [51.841734918, 39.98755088], 5 | "redis": [12.029713754, 10.795880972], 6 | } 7 | 8 | 9 | def _main(): 10 | """plot results""" 11 | 12 | width = 0.2 13 | 14 | for i, (store, vals) in enumerate(MEMORY.items()): 15 | plt.bar( 16 | [j + i * width for j in range(0, len(vals))], vals, width=width, label=store 17 | ) 18 | 19 | plt.title("in-memory data loading performance") 20 | plt.legend(loc="best") 21 | plt.xticks([0, 1], ("v1", "v2")) 22 | plt.xlabel("store") 23 | plt.ylabel("seconds") 24 | plt.show() 25 | 26 | 27 | if __name__ == "__main__": 28 | _main() 29 | -------------------------------------------------------------------------------- /bin/bench/plot_membership.py: -------------------------------------------------------------------------------- 1 | from os import path, listdir 2 | import json 3 | from statistics import mean 4 | import matplotlib.pyplot as plt 5 | from matplotlib.ticker import StrMethodFormatter 6 | 7 | _DIRNAME = path.dirname(__file__) 8 | _PATH = path.join(_DIRNAME, "data", "membership") 9 | 10 | 11 | def _main(): 12 | """plot results""" 13 | 14 | result = {} 15 | dicts = {} 16 | 17 | for version in listdir(_PATH): 18 | dicts[version] = {} 19 | 20 | for folder in listdir(path.join(_PATH, version)): 21 | if path.isfile(path.join(_PATH, version, folder)): 22 | continue 23 | 24 | dicts[version][folder] = [] 25 | 26 | for file in listdir(path.join(_PATH, version, folder)): 27 | if not path.isfile(path.join(_PATH, version, folder, file)): 28 | continue 29 | 30 | with open(path.join(_PATH, version, folder, file), "r") as jsonfile: 31 | dicts[version][folder].append(json.loads(jsonfile.read())) 32 | 33 | for version, vals in dicts.items(): 34 | result[version] = {} 35 | for grp, dicts in vals.items(): 36 | res = {} 37 | for dic in dicts: 38 | for key in dic.keys(): 39 | if key not in res: 40 | res[key] = [] 41 | res[key].append(dic[key]) 42 | result[version][grp] = res 43 | 44 | for version, vals in result.items(): 45 | for grp, res in vals.items(): 46 | for key in res: 47 | result[version][grp][key] = mean(result[version][grp][key]) 48 | 49 | for version, datas in sorted(result.items()): 50 | for key, data in datas.items(): 51 | plt.plot( 52 | 
list(map(lambda x: round(float(x), 2), data.keys())), 53 | list(data.values()), 54 | label=f"{version} ({key})", 55 | ) 56 | 57 | plt.gca().yaxis.set_major_formatter(StrMethodFormatter("{x:,.0f}")) 58 | plt.gca().xaxis.set_major_locator(plt.MultipleLocator(0.5)) 59 | plt.legend(loc="upper right") 60 | plt.ylabel("infected nodes") 61 | plt.xlabel("time in seconds") 62 | plt.show() 63 | 64 | 65 | if __name__ == "__main__": 66 | _main() 67 | -------------------------------------------------------------------------------- /bin/prof.py: -------------------------------------------------------------------------------- 1 | from cProfile import run 2 | from jdb.storage import db 3 | 4 | jdb = db.DB(compression=None) 5 | 6 | run( 7 | '[jdb.put(f"key{i}".encode(), b"val") for i in range(0, 1000)]', 8 | filename="tmp/jdb.prof", 9 | ) 10 | -------------------------------------------------------------------------------- /docs/bench.md: -------------------------------------------------------------------------------- 1 | # bench 2 | ## overview 3 | just keeping track of results/evolution 4 | 5 | ### setup 6 | ```bash 7 | redis-server --save "" --appendonly no 8 | ``` 9 | 10 | ## results 11 | ### 2020-05-07 12 | ```bash 13 | key_size=24 set_size=1000000 store=jdb thread_count=32 val_size=8 14 | elapsed=56.308610462000004 15 | 16 | key_size=24 set_size=1000000 store=redis thread_count=32 val_size=8 17 | elapsed=10.670338552 18 | ``` 19 | 20 | ### 2020-05-09 21 | added lmdb 22 | ```bash 23 | key_size=24 set_size=1000000 store=jdb thread_count=32 val_size=8 24 | elapsed=51.27223068 25 | 26 | key_size=24 set_size=1000000 store=redis thread_count=32 val_size=8 27 | elapsed=11.052644085999999 28 | 29 | key_size=24 set_size=1000000 store=lmdb thread_count=32 val_size=8 30 | elapsed=53.547044742 31 | ``` -------------------------------------------------------------------------------- /docs/img/journal/01_membership/membership.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/docs/img/journal/01_membership/membership.png -------------------------------------------------------------------------------- /docs/img/journal/01_membership/mermaid-diagram-20200607085911.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/docs/img/journal/01_membership/mermaid-diagram-20200607085911.png -------------------------------------------------------------------------------- /docs/img/journal/01_membership/mermaid-diagram-20200607090040.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/docs/img/journal/01_membership/mermaid-diagram-20200607090040.png -------------------------------------------------------------------------------- /docs/img/journal/02_storage/entry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/docs/img/journal/02_storage/entry.png -------------------------------------------------------------------------------- /docs/img/journal/02_storage/mermaid-diagram-20201012084711.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/docs/img/journal/02_storage/mermaid-diagram-20201012084711.png -------------------------------------------------------------------------------- /docs/img/journal/02_storage/mermaid-diagram-20201012094531.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/docs/img/journal/02_storage/mermaid-diagram-20201012094531.png -------------------------------------------------------------------------------- /docs/journal/01_membership.md: -------------------------------------------------------------------------------- 1 | # cluster membership using the SWIM gossip protocol and CRDTs 2 | 3 | ## overview 4 | 5 | jdb is entirely decentralized, meaning each node has its own picture of the state of the cluster and who its peers are. it is important for a node to have up-to-date knowledge of the state of its peers so it is able to route requests to the proper node. 6 | 7 | there has been a lot of research in this area that has led to some interesting protocols, including [Chord](https://en.wikipedia.org/wiki/Chord_%28peer-to-peer%29) and [Kademlia](https://en.wikipedia.org/wiki/Kademlia). this past year, my friend and I implemented Chord from the paper (or two versions of the paper), so I was pretty familiar with the concepts and goals around these protocols. generally, each node keeps track of just enough of its peers to allow for `O(log(N))` lookups. the idea behind this is to facilitate large peer-to-peer clusters with each node only having to maintain minimal routing information. 8 | 9 | since jdb is designed to handle massive web-scale traffic with tight SLA requirements from my evening side projects, latency was a concern. because of this, i decided that each node would maintain a routing table of the whole cluster so that requests could be routed in `O(1)`. this routing table would be kept up to date by a gossip protocol based on [SWIM](http://www.cs.cornell.edu/Info/Projects/Spinglass/public_pdfs/SWIM.pdf). 10 | 11 | ## cluster state 12 | 13 | each node maintains its knowledge of cluster state in the form of a type of [CRDT](https://hal.inria.fr/inria-00609399v1/document) (conflict-free replicated data type) called an [LWW register](https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type). using this type of data structure seemed to be a good fit for this use case because it allows nodes with divergent ideas of cluster state to merge their two states efficiently and accurately. 14 | 15 | the LWW register maintains 2 dictionaries, an add set and a remove set. these dictionaries contain k/v pairs where the key is node id/address and the value is an [HLC](https://cse.buffalo.edu/tech-reports/2014-04.pdf) (hybrid logical clock) timestamp. HLCs are a clock algorithm that facilitates maintaining a total order of events in a distributed system without a centralized clock. 16 | 17 | **LWW register** 18 | 19 | ```json 20 | { 21 | "add_set": { 22 | "node1=127.0.0.1:1338": 158973080536001, 23 | "node2=127.0.0.1:2338": 158973080740502 24 | }, 25 | "remove_set": { 26 | "node1=127.0.0.1:1338": 158973080738601, 27 | "node2=127.0.0.1:2338": 158792457695005 28 | } 29 | } 30 | ``` 31 | 32 | a node can change its view of cluster state either throughout the course of its own failure detection loop, or during gossip with another node. when a node becomes aware of a peer joining the cluster, it adds an entry to the add set. when the node becomes aware of a peer leaving the cluster, it adds the node to the remove set.
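in code, the core of the register is tiny. here's a minimal sketch using plain integer timestamps (the real implementation in `jdb/crdt.py` adds locking and packs HLC timestamps into these ints):

```python
class LWWRegisterSketch:
    """minimal LWW register - add/remove just stamp an element with a timestamp"""

    def __init__(self):
        self.add_set: dict = {}
        self.remove_set: dict = {}

    def add(self, element: bytes, ts: int):
        self.add_set[element] = ts

    def remove(self, element: bytes, ts: int):
        self.remove_set[element] = ts

    def merge(self, other: "LWWRegisterSketch"):
        # per element, per set: the highest timestamp wins
        for name in ("add_set", "remove_set"):
            mine, theirs = getattr(self, name), getattr(other, name)
            for element, ts in theirs.items():
                mine[element] = max(mine.get(element, 0), ts)

    def elements(self) -> dict:
        # an element is "in" the cluster unless its remove beats its add
        return {
            e: ts
            for e, ts in self.add_set.items()
            if self.remove_set.get(e, -1) <= ts
        }
```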
cluster state is the union of the add set and the remove set, with the highest timestamp in either set winning for that particular node. this data structure can then be merged with other nodes' versions of the same data structure, creating a merged version that has the most up-to-date info from both. 33 | 34 | **example merge** 35 | 36 | **_1. node a cluster state_** 37 | 38 | node a has awareness of nodes c and b joining the cluster at timestamps 1 and 2 respectively 39 | 40 | ```json 41 | { 42 | "add_set": { 43 | "c": 1, 44 | "b": 2 45 | }, 46 | "remove_set": {} 47 | } 48 | ``` 49 | 50 | **_2. node b cluster state_** 51 | 52 | node b has awareness of nodes c and a joining the cluster at timestamps 2 and 1 respectively, then subsequently detected a failure when trying to communicate with node c 53 | 54 | ```json 55 | { 56 | "add_set": { 57 | "c": 2, 58 | "a": 1 59 | }, 60 | "remove_set": { 61 | "c": 3 62 | } 63 | } 64 | ``` 65 | 66 | **_3. merged cluster state_** 67 | 68 | nodes a and b communicate with each other and merge their lists 69 | 70 | ```json 71 | { 72 | "add_set": { 73 | "c": 2, 74 | "a": 1, 75 | "b": 2 76 | }, 77 | "remove_set": { 78 | "c": 3 79 | } 80 | } 81 | ``` 82 | 83 | **_4. result_** 84 | 85 | nodes a and b are only in the add set. node c is in the remove set with a greater timestamp than its value in the add set. now both nodes have an up-to-date view of the cluster containing the latest information from both nodes. 86 | 87 | ```python 88 | {"a", "b"} 89 | ``` 90 | 91 | ## gossip protocol 92 | 93 | the protocol i implemented is inspired by SWIM but is slightly simplified. the purpose of this protocol is to efficiently disseminate the above data structure throughout the cluster. 94 | 95 | ### bootstrap 96 | 97 | when a node joins the cluster, it must know the address of at least one other node to bootstrap itself. it attempts to connect to its known peer. when it does, it syncs its peer list with that of the other node, and starts the protocol. a short grace period is allowed for each node to start up before we add it into our list of available peers to communicate with. also, a small amount of jitter is added to the timing of these cycles to even out request loads a bit. 98 | 99 | ### gossip 100 | 101 | gossip occurs on a regular cadence (defaults to 1s), and also when a peer failure has been detected by a node and verified by k of its peers. during gossip, the node selects k peers at random and syncs its updated state with them, containing any new node removals/additions 102 | 103 | ### failure detection 104 | 105 | every 0.5s, a random peer is chosen from our peers list and probed. if an ack is returned, the loop just moves on. if we can't contact the peer, we push it into a queue to be investigated. if the peer is verified as faulty, it is removed from our nodes list, then a gossip cycle is initiated to disseminate the information to the rest of the cluster 106 | 107 | ![](https://github.com/thejchap/jdb/blob/master/docs/img/journal/01_membership/mermaid-diagram-20200607085911.png?raw=true) 108 | 109 | ### investigation 110 | 111 | as per the SWIM protocol, when a node is detected as faulty by the probing node, it enlists the help of k other nodes to verify that this node is actually down and it isn't just a transient network partition. if any of those nodes are able to contact the target node, we remove it from our suspect list and move on.
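condensed into plain python, one probe/investigate cycle looks roughly like this (a sketch for illustration - the real version in `jdb/membership.py` runs probing and investigation as separate daemon threads with a suspect queue in between):

```python
from typing import List

def probe(target, investigators: List) -> bool:
    """one failure detection cycle - returns True if target is confirmed faulty.

    target and each investigator are assumed to be Peer-like objects exposing
    ping()/ping_req() rpc wrappers, simplified here.
    """
    try:
        target.ping()                # direct probe
        return False                 # ack came back - healthy, move on
    except Exception:
        pass                         # unreachable - enlist help
    for peer in investigators:       # k randomly chosen peers
        if peer.ping_req(target):    # any successful indirect ping vetoes
            return False
    return True                      # nobody could reach it - remove + gossip
```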
112 | 113 | ![](https://github.com/thejchap/jdb/blob/master/docs/img/journal/01_membership/mermaid-diagram-20200607090040.png?raw=true) 114 | 115 | ### benchmarking 116 | 117 | i wanted to get an idea of how efficient the protocol is in disseminating information throughout the cluster. to do this, i spun up a 20 node cluster, each node in its own thread. once the cluster started, i gave a grace period to allow the whole cluster to become aware of all the peers. once the cluster was "stable", i picked a node and killed it, then started a timer and measured how long it took for the dead node to be removed from every node's peer list. 118 | 119 | the results were pretty interesting. in tweaking some of the parameters including the gossip interval, failure detection interval, and subgroup sizes, i noticed most of these didn't have a huge impact on performance. the exception: increasing the failure detection subgroup size slowed everything down pretty substantially. 120 | 121 | with this in mind, i decided to find a happy medium for the input parameters, then tweak other parts of the protocol. here is a graph of the timing of node failure dissemination throughout the cluster. 122 | 123 | ![](https://github.com/thejchap/jdb/blob/master/docs/img/journal/01_membership/membership.png?raw=true) 124 | 125 | 1. in this initial implementation, when a node sent a ping request to k of its peers to verify that a node is down, those peers simply did as they were asked, then returned the result of the request to the requesting node. this is the closest to the SWIM protocol. 126 | 2. out of curiosity, i decided to try an implementation where, whenever a node sent a ping request, all the peers that received this request immediately condemned the suspect node and removed it from their own lists. this resulted in significantly faster propagation throughout the cluster, but also meant a lot of false positives because it mostly bypasses the failure detection protocol. i did not set up a way to measure false positives. 127 | 3. this implementation is slightly less trigger-happy than #2 but speeds up propagation slightly. here, when a node receives a ping request (to verify a failed node), it simply adds that node to its own suspect list to investigate, rather than ignoring it. 128 | 4. this implementation builds on the last one, and improves random peer selection by cycling through a list and choosing a random peer from the remaining list, rather than picking a random node from the whole cluster on each loop. 129 | 130 | one interesting edge case that surfaced after node failure was other nodes starting to become aware of the failure, then encountering information that indicated the node had been added back into the cluster, then that information propagating and overruling the prior awareness of the node leaving. this can be seen in implementation #1 above, but also happened periodically in all the other implementations. in some cases, the conflicting information would take a pretty substantial amount of time to resolve (8-10s). 131 | 132 | ## closing thoughts 133 | 134 | - verifying the protocol was time consuming. setting up tooling to be able to visualize and understand how data is flowing throughout the system was invaluable, but took just as much time as (if not more than) the implementation of the protocol itself, and felt a bit hacky. i know there are [libraries](https://jepsen.io/) and systems to help with things like this, and i look forward to becoming more familiar with the ecosystem.
135 | - the SWIM paper analyzes the efficiency of its dissemination component using epidemiology research as a foundation. this felt eerie and topical to be reading about during COVID-19. 136 | - for the purposes of this system, just using the system clock would have been fine. but learning new things is more fun. 137 | - LWW registers are one of the simpler types of CRDTs. implementing one was really helpful for me to conceptualize how they function a bit better, and was a great introduction to the data structures in general. 138 | - there is definitely still a lot of room for improvement, but i am happy with where it is for now and it is time to move on to other things! 139 | -------------------------------------------------------------------------------- /docs/journal/02_storage.md: -------------------------------------------------------------------------------- 1 | # implementing MVCC and SSI in an embedded key-value store 2 | 3 | ## intro 4 | 5 | this is the second post documenting my adventures writing [jdb](https://github.com/thejchap/jdb). jdb is a distributed key-value store written in python. i took on this project to a) learn more about distributed databases and b) get better at writing python. the first post on cluster membership and implementing a gossip protocol can be found [here](https://medium.com/@chap/peer-to-peer-cluster-membership-using-the-swim-gossip-protocol-and-crdts-13f9386fe9b4). 6 | 7 | ## overview 8 | 9 | working on jdb has provided me with an opportunity to deepen my understanding of distributed programming not only in a multi-node environment, but also in a local, multi-threaded one. i initially considered using an existing solution (for example [RocksDB](https://rocksdb.org/) or [LevelDB](https://en.wikipedia.org/wiki/LevelDB)) for embedded per-node storage in jdb, but decided it would be way more fun to write one myself. 10 | 11 | embedded data stores generally have a fairly simple API (put, get, delete) and are intended to be high-performance storage engines. the storage engine on each node in jdb is where data ends up being stored and read from after getting routed around the cluster to the correct node. i wanted to write one that supported multiple connections, [ACID transactions](https://en.wikipedia.org/wiki/ACID) (without the D for now - everything is just in memory), and [MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control). 12 | 13 | ## design 14 | 15 | the data store is implemented in a `DB` class that gets opened per-process when jdb starts up. each `DB` instance gets instantiated with an `Oracle` and a `Memtable`. the `Oracle` is the point of entry for transactions, and maintains read/write timestamps for transactions and also tracks dependent keys to support [SSI](https://wiki.postgresql.org/wiki/Serializable) as the transaction isolation level. the `Memtable` maintains the actual data structures that store the data transactions write. the index is maintained in the form of an [AVL tree](https://en.wikipedia.org/wiki/AVL_tree) where the nodes are pointers to the actual raw bytes in an `arena`, which is just a byte array of encoded data entries. i chose an AVL tree for the index because it is self-balancing and guarantees an upper bound of O(log N) time complexity for all its operations. 16 | 17 | ## entries 18 | 19 | an instance of the `Entry` class is the most granular level of storage, and represents a key, its value, and metadata.
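conceptually an entry is just something like this (a hedged sketch - the real class lives in `jdb/storage/entry.py`; the tombstone flag is the same bit defined in `jdb/const.py`, and deletes are simply writes with that bit set):

```python
from dataclasses import dataclass

BIT_TOMBSTONE = 1 << 0  # same flag jdb/const.py uses to mark deletes

@dataclass
class EntrySketch:
    key: bytes
    value: bytes = b""
    meta: int = 0  # metadata bitfield

    @property
    def is_deleted(self) -> bool:
        # a delete is just a write flagged as a tombstone
        return bool(self.meta & BIT_TOMBSTONE)
```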
when a transaction commits, the entries included in the transaction get encoded into byte arrays that get appended onto the `arena` (one long byte array that keeps growing). entries can vary in length, and the memtable's index contains pointers to the offsets where each chunk of memory lives in the `arena`. 20 | 21 | encoded entries are laid out in memory as follows: 22 | 23 | ![](https://github.com/thejchap/jdb/blob/master/docs/img/journal/02_storage/entry.png?raw=true) 24 | 25 | ## serializable snapshot isolation (SSI) 26 | 27 | when implementing transactions, i had to choose a [transaction isolation level](https://en.wikipedia.org/wiki/Isolation_%28database_systems%29) (or levels) to provide to the database user. the isolation level determines, during the execution of a transaction, what data the operations in that transaction are allowed to see. i decided to implement [serializable snapshot isolation](https://wiki.postgresql.org/wiki/SSI) (SSI), which is the strictest isolation level, and a relatively new development in databases. 28 | 29 | serializability in database systems is a property which ensures that the outcome of a set of transactions is the same as if all the transactions had been executed serially (one after the other). this is an extremely important property in areas such as finance, where race conditions during debit and credit operations could cause money to disappear or appear out of thin air. 30 | 31 | when executing transactions one after the other in a single-threaded environment, this is a very easy property to uphold. as with most concepts in programming, the second we add in any sort of concurrent processing, the problem gets a lot more interesting. 32 | 33 | [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation) (SI) is a widely used isolation level in which, at the beginning of a transaction, the transaction gets assigned a start timestamp and only sees data that is a result of transactions which have committed prior to that start timestamp. this prevents dirty reads of data that is being modified by other in-flight transactions. SSI builds on top of SI with some "bookkeeping" of transaction dependencies: a transaction is not allowed to commit if keys it has read were modified by another transaction that committed after its start timestamp. 34 | 35 | in jdb, the `Oracle` class does the bookkeeping, and is the only logic in the transaction commit code path that is not threadsafe. `Oracle` maintains a map of keys to their last commit timestamp and provides 2 public methods: 36 | 37 | - `read_ts` - called by transactions when they are instantiated to obtain a start/read timestamp that determines what snapshot of the database they are getting 38 | - `commit_request` - ensures no keys in the list of read operations in this transaction have been modified by other transactions since this transaction started 39 | 40 | ![](https://github.com/thejchap/jdb/blob/master/docs/img/journal/02_storage/mermaid-diagram-20201012084711.png?raw=true) 41 | 42 | ## multi-version concurrency control (MVCC) 43 | 44 | [multi-version concurrency control](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) (MVCC) provides a consistent point-in-time snapshot of the database to transactions that are reading data. this allows concurrent operations to happen so that reads are not blocked by writes, while also ensuring that in-progress transactions don't see half-committed data.
this is achieved by assigning a monotonically increasing timestamp to transactions (and therefore to all writes that occur within that transaction) which gets encoded as part of the `Entry`'s key when it is committed in the `Memtable`. 45 | 46 | as stated earlier, the database index is maintained as an AVL tree in which the nodes are pointers to data in the `arena`. the keys in this tree are a concatenation of key and timestamp. during a lookup, we traverse the tree and find the key matching the query key which has the latest timestamp prior to the read timestamp of the transaction. 47 | 48 | ## API 49 | 50 | the finished product looks a little something like this in code: 51 | 52 | ```python 53 | from jdb.storage.db import DB 54 | 55 | db = DB() 56 | 57 | with db.transaction() as txn: 58 | txn.read(b'foo') 59 | txn.write(b'bar', b'baz') 60 | ``` 61 | 62 | ## code 63 | 64 | [https://github.com/thejchap/jdb](https://github.com/thejchap/jdb) 65 | -------------------------------------------------------------------------------- /docs/mermaid/membership-fdloop.mmd: -------------------------------------------------------------------------------- 1 | sequenceDiagram 2 | participant node1 3 | participant node2 4 | participant node3 5 | autonumber 6 | node2->>node1: state sync 7 | node1-->>node2: cluster state 8 | node3->>node2: state sync 9 | node2-->>node3: cluster state 10 | loop failure detection loop (0.5s) 11 | node1->>node2: ping 12 | node2-->>node1: ack 13 | node2->>node1: ping 14 | node1-->>node2: ack 15 | node2->>node3: ping 16 | node3-->>node2: ack 17 | node3->>node2: ping 18 | node2-->>node3: ack 19 | node3->>node1: ping 20 | node1-->>node3: ack 21 | node1->>node3: ping 22 | node3-->>node1: ack 23 | end -------------------------------------------------------------------------------- /docs/mermaid/membership-investigation.mmd: -------------------------------------------------------------------------------- 1 | sequenceDiagram 2 | participant node1 3 | participant node2 4 | participant node3 5 | participant node4 6 | participant node5 7 | autonumber 8 | rect rgba(255,0,0) 9 | node1->>node2: ping 10 | end 11 | par investigate node2 12 | node1->>node3: ping req (node2) 13 | node1->>node4: ping req (node2) 14 | node1->>node5: ping req (node2) 15 | end 16 | par investigating node2 17 | rect rgba(255,0,0) 18 | node3->>node2: ping 19 | node4->>node2: ping 20 | node5->>node2: ping 21 | end 22 | end 23 | par node2 failure confirmed 24 | node3-->>node1: failure confirmed (node2) 25 | node4-->>node1: failure confirmed (node2) 26 | node5-->>node1: failure confirmed (node2) 27 | end 28 | node1->>node4: gossip (random peer) -------------------------------------------------------------------------------- /docs/mermaid/storage-avl.mmd: -------------------------------------------------------------------------------- 1 | classDiagram 2 | Node1 --> Node2 3 | Node1 --> Node3 4 | class Node1{ 5 | key: (b'baz\xff\xff\xff\xff\xff\xff\xff\xfc', 54) 6 | } 7 | class Node2{ 8 | key: (b'bar\xff\xff\xff\xff\xff\xff\xff\xfd', 27) 9 | } 10 | class Node3{ 11 | key: (b'foo\xff\xff\xff\xff\xff\xff\xff\xfe', 0) 12 | } 13 | -------------------------------------------------------------------------------- /docs/mermaid/storage-ssi.mmd: -------------------------------------------------------------------------------- 1 | sequenceDiagram 2 | participant txn a 3 | participant oracle 4 | participant txn b 5 | txn a->>+oracle: begin (read ts: 1) 6 | Note right of txn a: bar = get('bar'), set('foo', bar + 1) 7 | txn b->>+oracle: begin (read
ts: 1) 8 | Note left of txn b: set('bar', -42) 9 | oracle-->>-txn b: commit (commit ts: 2) 10 | rect rgba(255,0,0) 11 | oracle-->>-txn a: commit (commit ts: 3) 12 | end 13 | Note right of txn a: txn a aborted - bar was modified by txn b since read ts 1 -------------------------------------------------------------------------------- /jdb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/jdb/__init__.py -------------------------------------------------------------------------------- /jdb/cli.py: -------------------------------------------------------------------------------- 1 | from asyncio import open_connection, run 2 | from asyncio.streams import StreamReader, StreamWriter 3 | from argparse import ArgumentParser 4 | from jdb import const 5 | 6 | 7 | def _prompt(prompt: str): 8 | while True: 9 | statement = "" 10 | 11 | while True: 12 | txt = input(prompt) 13 | if const.TERMINATOR in txt: 14 | statement += f"{txt.split(const.TERMINATOR)[0]};\n" 15 | break 16 | statement += f"{txt}\n" 17 | yield statement 18 | 19 | 20 | async def _async_main(): 21 | parser = ArgumentParser(description="jdb client") 22 | parser.add_argument("-p", "--port", help="port", default=1337, type=int) 23 | parser.add_argument("-o", "--host", help="host", default="127.0.0.1", type=str) 24 | parser.add_argument("-q", "--query", help="query", type=str) 25 | args = parser.parse_args() 26 | prompt = f"{args.host}:{args.port}> " 27 | reader: StreamReader 28 | writer: StreamWriter 29 | reader, writer = await open_connection(args.host, args.port) 30 | 31 | if args.query: 32 | writer.write(f"{args.query}\n".encode()) 33 | await writer.drain() 34 | res = await reader.readline() 35 | print(res.decode().rstrip()) 36 | return 37 | 38 | try: 39 | for statement in _prompt(prompt): 40 | try: 41 | writer.write(statement.encode()) 42 | await writer.drain() 43 | res = await reader.readline() 44 | print(res.decode().rstrip()) 45 | except ConnectionResetError as err: 46 | print(err) 47 | except KeyboardInterrupt: 48 | pass 49 | 50 | try: 51 | writer.write(b"") 52 | await writer.drain() 53 | except ConnectionResetError: 54 | pass 55 | finally: 56 | writer.close() 57 | 58 | 59 | if __name__ == "__main__": 60 | run(_async_main()) 61 | -------------------------------------------------------------------------------- /jdb/const.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | BEGIN = "BEGIN" 4 | END = "END" 5 | PUT = "PUT" 6 | DELETE = "DELETE" 7 | GET = "GET" 8 | INFO = "INFO" 9 | TERMINATOR = ";" 10 | KEY = "key" 11 | VALUE = "value" 12 | TXN = "txn" 13 | OK = "OK" 14 | SYNTAX_ERR = "SYNTAX ERR" 15 | ABORTED = "ABORTED" 16 | COMMITTED = "COMMITTED" 17 | PENDING = "PENDING" 18 | MAX_UINT_64 = 2 ** 64 - 1 19 | MAX_UINT_32 = 2 ** 32 - 1 20 | BIT_TOMBSTONE = 1 << 0 21 | MAGLEV_OFFSET_SEED = 2 << 30 22 | MAGLEV_SKIP_SEED = 2 << 31 23 | 24 | # /table/pkey 25 | REQ_KEY_REGEX = re.compile(r"^\/([A-Za-z0-9]+)\/([A-Za-z0-9]+)$") 26 | -------------------------------------------------------------------------------- /jdb/crdt.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from threading import Lock 3 | from collections import OrderedDict 4 | from jdb import hlc 5 | 6 | 7 | class LWWRegister: 8 | """ 9 | lww register. 2 ops are add/remove. each op adds a ts to its respective set. 
10 | if an element is in remove and has a ts > its counterpart in add, the element has 11 | been "deleted" from the register 12 | """ 13 | 14 | def __init__(self, replica_id: str): 15 | self.replica_id = replica_id 16 | self.clock = hlc.HLC() 17 | self.add_set: OrderedDict = OrderedDict() 18 | self.remove_set: OrderedDict = OrderedDict() 19 | self.lock = Lock() 20 | 21 | def __iter__(self): 22 | """actual representation of state""" 23 | 24 | for elem, ts in self.add_set.items(): 25 | if elem in self.remove_set and self.remove_set[elem] > ts: 26 | continue 27 | yield elem, ts 28 | 29 | def add(self, element: bytes): 30 | """add element to add set""" 31 | 32 | with self.lock: 33 | self.add_set[element] = int(self.clock.incr()) 34 | 35 | def remove(self, element: bytes): 36 | """add element to remove set""" 37 | 38 | with self.lock: 39 | self.remove_set[element] = int(self.clock.incr()) 40 | 41 | def merge(self, incoming: LWWRegister) -> LWWRegister: 42 | """threadsafe wrapper""" 43 | 44 | with self.lock: 45 | return self._merge(incoming) 46 | 47 | def _merge(self, incoming: LWWRegister) -> LWWRegister: 48 | """merge registers""" 49 | 50 | sets = ["add_set", "remove_set"] 51 | 52 | for key in sets: 53 | incoming_set = getattr(incoming, key).items() 54 | 55 | for elem, ts in incoming_set: 56 | ts = int(ts) 57 | existing = getattr(self, key).get(elem) 58 | 59 | if not existing: 60 | getattr(self, key)[elem] = ts 61 | continue 62 | 63 | incoming_ts = hlc.HLCTimestamp.from_int(ts) 64 | my_ts = hlc.HLCTimestamp.from_int(existing) 65 | 66 | self.clock.recv(incoming_ts) 67 | 68 | if incoming_ts.compare(my_ts) > 0: 69 | getattr(self, key)[elem] = ts 70 | 71 | return self 72 | -------------------------------------------------------------------------------- /jdb/errors.py: -------------------------------------------------------------------------------- 1 | class ChecksumMismatch(Exception): 2 | """unable to verify crc32""" 3 | 4 | 5 | class TableOverflow(Exception): 6 | """max reached""" 7 | 8 | 9 | class Abort(Exception): 10 | """transaction aborted""" 11 | 12 | 13 | class InvalidRequest(Exception): 14 | """invalid request""" 15 | -------------------------------------------------------------------------------- /jdb/hlc.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from threading import Lock 3 | from dataclasses import dataclass 4 | from jdb import util 5 | 6 | 7 | @dataclass 8 | class HLCTimestamp: 9 | """hybrid logical clock timestamp""" 10 | 11 | ts: int 12 | count: int 13 | 14 | @classmethod 15 | def from_int(cls, packed: int) -> HLCTimestamp: 16 | """unpack""" 17 | 18 | string = str(packed) 19 | count = int(string[-2:]) 20 | ts = int(string[:-2] or "0") 21 | 22 | return cls(ts=ts, count=count) 23 | 24 | def compare(self, other: HLCTimestamp) -> int: 25 | """compare ts""" 26 | 27 | if self.ts == other.ts: 28 | if self.count == other.count: 29 | return 0 30 | return self.count - other.count 31 | return self.ts - other.ts 32 | 33 | def __int__(self): 34 | """pack. v naive implementation. 
redo for real sometime""" 35 | 36 | return int("".join([str(self.ts).zfill(16), str(self.count).zfill(2)])) 37 | 38 | 39 | class HLC: 40 | """hybrid logical clock""" 41 | 42 | def __init__(self): 43 | self.ts = util.now_ms() 44 | self.count = 0 45 | self.lock = Lock() 46 | 47 | def recv(self, incoming: HLCTimestamp): 48 | """process incoming ts""" 49 | 50 | with self.lock: 51 | now = util.now_ms() 52 | 53 | if now > self.ts and now > incoming.ts: 54 | self.ts = now 55 | self.count = 0 56 | elif self.ts == incoming.ts: 57 | self.count = max(self.count, incoming.count) 58 | elif self.ts > incoming.ts: 59 | self.count += 1 60 | else: 61 | self.ts = incoming.ts 62 | self.count = incoming.count + 1 63 | 64 | def incr(self) -> HLCTimestamp: 65 | """get new ts""" 66 | 67 | with self.lock: 68 | now = util.now_ms() 69 | 70 | if now > self.ts: 71 | self.ts = now 72 | else: 73 | self.count += 1 74 | 75 | return HLCTimestamp(ts=self.ts, count=self.count) 76 | -------------------------------------------------------------------------------- /jdb/jql.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Tuple, Optional 2 | import json 3 | from pyparsing import ( 4 | CaselessKeyword, 5 | Word, 6 | alphanums, 7 | ParseResults, 8 | OneOrMore, 9 | Literal, 10 | Combine, 11 | ) 12 | import jdb.routing as rte 13 | import jdb.const as k 14 | import jdb.node as nde 15 | 16 | Result = Tuple[Optional[str], Optional[bool]] 17 | 18 | 19 | def _do_statement(node: nde.Node, tokens: ParseResults) -> Result: 20 | """entrypoint""" 21 | 22 | if len(tokens) == 1 and isinstance(tokens[0], str) and tokens[0] == k.INFO: 23 | return json.dumps(dict(node)), None 24 | 25 | if "txn" in tokens: 26 | return tokens.txn(node) 27 | 28 | return _do_batch_request(tokens)(node) 29 | 30 | 31 | def _do_batch_request(tokens: ParseResults) -> Callable[[nde.Node], Result]: 32 | """return a fn to execute a transaction""" 33 | 34 | def wrapper(node: nde.Node): 35 | req = rte.BatchRequest(requests=tokens) 36 | ret = node.router.request(req) 37 | 38 | return None, ret 39 | 40 | return wrapper 41 | 42 | 43 | def _do_put(tokens: ParseResults) -> rte.PutRequest: 44 | """build a txn entry from tokens""" 45 | 46 | return rte.PutRequest(key=tokens.key.encode(), value=tokens.value.encode()) 47 | 48 | 49 | def _do_get(tokens: ParseResults) -> rte.GetRequest: 50 | """just return the key""" 51 | 52 | return rte.GetRequest(key=tokens.key.encode()) 53 | 54 | 55 | def _do_delete(tokens: ParseResults) -> rte.DeleteRequest: 56 | """build a txn entry from tokens""" 57 | 58 | return rte.DeleteRequest(key=tokens.key.encode()) 59 | 60 | 61 | class JQL: 62 | """this whole thing is a little wack. 
but yea - simple parser for cli commands""" 63 | 64 | _key = Combine(Literal("/") + Word(alphanums) + Literal("/") + Word(alphanums)) 65 | _put = ( 66 | CaselessKeyword(k.PUT).suppress() 67 | + _key.setResultsName(k.KEY) 68 | + Word(alphanums).setResultsName(k.VALUE) 69 | ).addParseAction(_do_put) 70 | _get = ( 71 | CaselessKeyword(k.GET).suppress() + _key.setResultsName(k.KEY) 72 | ).addParseAction(_do_get) 73 | _delete = ( 74 | CaselessKeyword(k.DELETE).suppress() + _key.setResultsName(k.KEY) 75 | ).addParseAction(_do_delete) 76 | _info = CaselessKeyword(k.INFO) 77 | _operation = _put | _delete | _get 78 | _transaction = ( 79 | ( 80 | CaselessKeyword(k.BEGIN).suppress() 81 | + OneOrMore(_operation) 82 | + CaselessKeyword(k.END).suppress() 83 | ) 84 | .addParseAction(_do_batch_request) 85 | .setResultsName(k.TXN) 86 | ) 87 | 88 | _statement = (_operation | _transaction | _info) + Literal(k.TERMINATOR).suppress() 89 | 90 | def __init__(self, node: nde.Node): 91 | self._node = node 92 | self._statement.setParseAction(self._with_db(_do_statement)) 93 | 94 | def call(self, statement: str) -> Result: 95 | """main entrypoint""" 96 | 97 | return self._statement.parseString(statement, parseAll=True)[0] 98 | 99 | def _with_db(self, func: Callable) -> Callable: 100 | """pass node into actions""" 101 | 102 | def wrapped(tokens: ParseResults): 103 | return func(self._node, tokens) 104 | 105 | return wrapped 106 | -------------------------------------------------------------------------------- /jdb/maglev.py: -------------------------------------------------------------------------------- 1 | from typing import Set, List 2 | from sympy import nextprime 3 | from xxhash import xxh32_intdigest 4 | import jdb.const as const 5 | 6 | 7 | class Maglev: 8 | """implement Maglev hashing""" 9 | 10 | def __init__(self, nodes: Set[str]): 11 | self.nodes = list(nodes) 12 | self.n = len(nodes) 13 | self.m = nextprime(self.n * 100) 14 | self._permutation = self._gen_permutation() 15 | self.table = self._populate() 16 | 17 | def _gen_permutation(self) -> List[List[int]]: 18 | """generate permutations""" 19 | 20 | m = self.m 21 | permutation: List[List[int]] = [] 22 | 23 | for i, name in enumerate(list(self.nodes)): 24 | offset = xxh32_intdigest(name, seed=const.MAGLEV_OFFSET_SEED) % m 25 | skip = xxh32_intdigest(name, seed=const.MAGLEV_SKIP_SEED) % (m - 1) + 1 26 | permutation.append([]) 27 | 28 | for j in range(0, m): 29 | permutation[i].append((offset + j * skip) % m) 30 | 31 | return permutation 32 | 33 | def lookup(self, key: str) -> str: 34 | """lookup node for key""" 35 | 36 | hashed = xxh32_intdigest(key) 37 | return self.nodes[self.table[hashed % self.m]] 38 | 39 | def _populate(self): 40 | """generate lookup table""" 41 | 42 | if not self.nodes: 43 | return 44 | 45 | n, m, perm = self.n, self.m, self._permutation 46 | next_ = [0] * n 47 | entry = [-1] * m 48 | k = 0 49 | 50 | while True: 51 | for i in range(0, n): 52 | c = perm[i][next_[i]] 53 | 54 | while entry[c] >= 0: 55 | next_[i] += 1 56 | c = perm[i][next_[i]] 57 | 58 | entry[c] = i 59 | next_[i] += 1 60 | k += 1 61 | 62 | if k == m: 63 | return entry 64 | -------------------------------------------------------------------------------- /jdb/membership.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Dict, Optional, Set, List 3 | from contextlib import contextmanager 4 | from threading import Thread, Lock 5 | from concurrent.futures import ThreadPoolExecutor 
6 | from queue import Queue 7 | from random import uniform, choice, sample 8 | from time import sleep 9 | from tenacity import retry, wait_fixed 10 | from structlog import get_logger 11 | import jdb.crdt as crdt 12 | import jdb.util as util 13 | import jdb.maglev as mag 14 | import jdb.peer as pr 15 | 16 | _LOGGER = get_logger() 17 | _JITTER = 0.01 18 | STARTUP_GRACE_PERIOD = 2 19 | FD_INTERVAL = 0.5 20 | FD_SUBGROUP_SIZE = 3 21 | GOSSIP_SUBGROUP_SIZE = 5 22 | GOSSIP_INTERVAL = 0.2 23 | 24 | 25 | class Membership: 26 | """modified implementation of SWIM protocol""" 27 | 28 | def __init__( 29 | self, 30 | node_name: str, 31 | node_addr: str, 32 | failure_detection_interval: float = FD_INTERVAL, 33 | failure_detection_subgroup_size: int = FD_SUBGROUP_SIZE, 34 | gossip_subgroup_size: int = GOSSIP_SUBGROUP_SIZE, 35 | gossip_interval: float = GOSSIP_INTERVAL, 36 | ): 37 | self.failure_detection_subgroup_size = failure_detection_subgroup_size 38 | self.gossip_subgroup_size = gossip_subgroup_size 39 | self.failure_detection_interval = failure_detection_interval 40 | self.suspects: Set[str] = set() 41 | self._choices: Set[str] = set() 42 | self.suspect_queue: Queue = Queue() 43 | self.node_name = node_name 44 | self.node_addr = node_addr 45 | self.node_key = f"{node_name}={node_addr}" 46 | self.gossip_interval = gossip_interval 47 | self.cluster_state = crdt.LWWRegister(replica_id=node_name) 48 | self.cluster_state.add(self.node_key.encode()) 49 | self.peers: Dict[str, pr.Peer] = {} 50 | self._build_route_table() 51 | self.logger = _LOGGER.bind(node=self.node_key) 52 | self.stopped = False 53 | self.lock = Lock() 54 | self.threads = [ 55 | Thread( 56 | target=self._failure_detection_loop, 57 | daemon=True, 58 | name="MembershipFailureDetectionThread", 59 | ), 60 | Thread( 61 | target=self._gossip_loop, daemon=True, name="MembershipGossipThread" 62 | ), 63 | Thread( 64 | target=self._investigation_loop, 65 | daemon=True, 66 | name="MembershipInvestigationThread", 67 | ), 68 | ] 69 | 70 | @retry(wait=wait_fixed(1)) 71 | def bootstrap(self, join: str): 72 | """initial state sync""" 73 | 74 | peer_name, addr = join.split("=") 75 | peer = self.add_peer(peer_name, addr) 76 | self._sync_with(peer) 77 | 78 | def _sync_with(self, peer: pr.Peer) -> crdt.LWWRegister: 79 | """some sugar""" 80 | 81 | merged = peer.membership_state_sync( 82 | self.cluster_state, from_addr=self.node_addr 83 | ) 84 | 85 | return self.state_sync(merged, peer_addr=peer.addr) 86 | 87 | def add_peer(self, name: str, addr: str) -> pr.Peer: 88 | """add peer""" 89 | 90 | with self.lock: 91 | peer = pr.Peer(addr=addr, name=name, logger=self.logger) 92 | self.peers[name] = peer 93 | self.cluster_state.add(peer.node_key.encode()) 94 | self._build_route_table() 95 | return peer 96 | 97 | def get_peer(self, name: str, addr: Optional[str]) -> pr.Peer: 98 | """get or add""" 99 | 100 | if name in self.peers: 101 | return self.peers[name] 102 | if addr: 103 | return self.add_peer(name, addr) 104 | 105 | raise Exception("unable to get or add peer") 106 | 107 | def remove_peer(self, peer: pr.Peer): 108 | """remove from map and cluster state""" 109 | 110 | with self.lock: 111 | self.cluster_state.remove(peer.node_key.encode()) 112 | if peer.name in self.peers: 113 | del self.peers[peer.name] 114 | self._build_route_table() 115 | 116 | def _gossip_loop(self): 117 | """run forever""" 118 | 119 | while not self.stopped: 120 | self._gossip() 121 | 122 | sleep( 123 | uniform(self.gossip_interval - _JITTER, self.gossip_interval + _JITTER) 124 | ) 125 | 
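    # NOTE: the methods below form the SWIM-ish core of the protocol - each
    # failure detection cycle probes one peer (chosen round-robin-ish via
    # _random_peer), and an unreachable peer is queued as a suspect for
    # _investigation_loop to verify indirectly via its peers rather than
    # being condemned outright.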
126 | def _probe_random_peer(self): 127 | """pick a rando from the group and probe it""" 128 | 129 | target = self._random_peer() 130 | 131 | if not target: 132 | return 133 | 134 | self.logger.info("membership.probe", peer=target.node_key) 135 | 136 | with self._failure_detection(target): 137 | target.membership_ping() 138 | 139 | def _gossip(self): 140 | """pick k randos from the group and sync with them""" 141 | 142 | keys = self._gossip_subgroup() 143 | 144 | if not keys: 145 | return 146 | 147 | peers: List[pr.Peer] = [] 148 | 149 | for key in keys: 150 | peer_name, addr = key.split("=") 151 | peer = self.get_peer(peer_name, addr=addr) 152 | peers.append(peer) 153 | 154 | self.logger.info("membership.gossip", peers=[p.node_key for p in peers]) 155 | 156 | with ThreadPoolExecutor(max_workers=5) as e: 157 | for peer in peers: 158 | task = e.submit(self._sync_with, peer) 159 | 160 | with self._failure_detection(peer): 161 | task.result() 162 | 163 | def _failure_detection_loop(self): 164 | """SWIM fd sort of""" 165 | 166 | while not self.stopped: 167 | self._probe_random_peer() 168 | 169 | sleep( 170 | uniform( 171 | self.failure_detection_interval - _JITTER, 172 | self.failure_detection_interval + _JITTER, 173 | ) 174 | ) 175 | 176 | def _investigation_loop(self): 177 | """process suspects off suspect queue""" 178 | 179 | while not self.stopped: 180 | suspect = self.suspect_queue.get() 181 | 182 | if suspect is None: 183 | break 184 | 185 | self._investigate(suspect) 186 | self.suspect_queue.task_done() 187 | 188 | def _investigate(self, suspect: pr.Peer): 189 | """indirectly probe suspect. todo: make async""" 190 | 191 | keys = self._failure_detection_subgroup() 192 | investigators: List[pr.Peer] = [] 193 | 194 | for key in keys: 195 | name, addr = key.split("=") 196 | peer = self.get_peer(name, addr=addr) 197 | investigators.append(peer) 198 | 199 | self.logger.info( 200 | "membership.investigating", 201 | peer=suspect.node_key, 202 | investigators=[i.node_key for i in investigators], 203 | ) 204 | 205 | results = {} 206 | 207 | with ThreadPoolExecutor(max_workers=5) as e: 208 | for peer in investigators: 209 | task = e.submit(peer.membership_ping_req, suspect) 210 | results[peer] = task.result() 211 | 212 | for peer, ack in results.items(): 213 | if not ack: 214 | continue 215 | self._failure_vetoed(suspect, by_peer=peer) 216 | return 217 | 218 | self._failure_confirmed(suspect, by_peers=investigators) 219 | 220 | def _failure_vetoed(self, suspect: pr.Peer, by_peer: pr.Peer): 221 | """another node was able to contact the suspect""" 222 | 223 | self.logger.info( 224 | "membership.failure_vetoed", peer=suspect.node_key, by=by_peer.node_key 225 | ) 226 | 227 | if suspect.node_key in self.suspects: 228 | self.suspects.remove(suspect.node_key) 229 | 230 | def _failure_confirmed(self, suspect: pr.Peer, by_peers: List[pr.Peer]): 231 | """its actually faulty, update and disseminate""" 232 | 233 | self.logger.info( 234 | "membership.failure_confirmed", 235 | peer=suspect.node_key, 236 | by=list(map(lambda p: p.node_key, by_peers)), 237 | ) 238 | 239 | self.remove_peer(suspect) 240 | 241 | if suspect.node_key in self.suspects: 242 | self.suspects.remove(suspect.node_key) 243 | 244 | self._gossip() 245 | 246 | def _gossip_subgroup(self) -> List[str]: 247 | """grab k non-faulty peers for gossip""" 248 | 249 | peers = self._eligible_peers() 250 | k = min(self.gossip_subgroup_size, len(peers)) 251 | return sample(self._eligible_peers(), k=k,) 252 | 253 | def _failure_detection_subgroup(self) -> 
List[str]: 254 | """grab k non-faulty peers for failure verification""" 255 | 256 | peers = self._eligible_peers() 257 | k = min(self.failure_detection_subgroup_size, len(peers)) 258 | return sample(self._eligible_peers(), k=k) 259 | 260 | @contextmanager 261 | def _failure_detection(self, peer: pr.Peer): 262 | """meant to wrap a rpc call, if it fails, investigate peer""" 263 | 264 | try: 265 | yield 266 | except Exception: # pylint: disable=broad-except 267 | self._add_suspect(peer) 268 | 269 | def _add_suspect(self, peer: pr.Peer): 270 | """mark as suspect, publish to queue""" 271 | 272 | self.suspects.add(peer.node_key) 273 | self.suspect_queue.put(peer) 274 | self.logger.info("membership.add_suspect", peer=peer.node_key) 275 | 276 | def _random_peer(self) -> Optional[pr.Peer]: 277 | """pick a random peer from cluster state""" 278 | 279 | peers = self._eligible_peers() 280 | 281 | if len(self._choices) >= len(peers): 282 | self._choices = set() 283 | 284 | filtered = [i for i in peers if i not in self._choices] 285 | 286 | if not filtered: 287 | return None 288 | 289 | key = choice(filtered) 290 | self._choices.add(key) 291 | name, addr = key.split("=") 292 | return self.get_peer(name=name, addr=addr) 293 | 294 | def _eligible_peers(self) -> List[str]: 295 | """ 296 | list of peers eligible for comms, filtering out this node and ones 297 | that we are still giving some time to start up 298 | """ 299 | 300 | all_peers = dict(self.cluster_state).items() 301 | my_key = self.node_key.encode() 302 | counter_pad = 100 303 | now = util.now_ms() * counter_pad 304 | i = STARTUP_GRACE_PERIOD * 1000 * counter_pad 305 | 306 | return [ 307 | k.decode() 308 | for k, v in all_peers 309 | if k != my_key and (v + i) < now and k.decode() not in self.suspects 310 | ] 311 | 312 | def ping(self, peer_name: str, peer_addr: str) -> bool: 313 | """ping a given peer""" 314 | 315 | peer = self.get_peer(peer_name, addr=peer_addr) 316 | return peer.membership_ping() 317 | 318 | def ping_req(self, peer_name: str, peer_addr: str) -> bool: 319 | """handle a ping request from a peer""" 320 | 321 | peer = self.get_peer(peer_name, addr=peer_addr) 322 | 323 | with self._failure_detection(peer): 324 | return peer.membership_ping() 325 | 326 | def state_sync( 327 | self, incoming: crdt.LWWRegister, peer_addr: str 328 | ) -> crdt.LWWRegister: 329 | """take another nodes cluster state and merge with our own""" 330 | 331 | self.get_peer(incoming.replica_id, addr=peer_addr) 332 | 333 | with self.lock: 334 | res = self.cluster_state.merge(incoming) 335 | self._build_route_table() 336 | 337 | return res 338 | 339 | def _build_route_table(self): 340 | """rebuild rt""" 341 | 342 | nodekeys = map(lambda k: k[0].decode().split("=")[0], self.cluster_state) 343 | self.maglev = mag.Maglev(nodes=set(nodekeys)) 344 | 345 | def lookup_leaseholder(self, key: str) -> Optional[pr.Peer]: 346 | """find whos responsible for a key. 
if self, return None""" 347 | 348 | name = self.maglev.lookup(key) 349 | return self.peers.get(name) 350 | 351 | def stop(self): 352 | """shut down""" 353 | 354 | self.suspect_queue.put(None) 355 | self.stopped = True 356 | for thread in self.threads: 357 | thread.join() 358 | self.logger.info("membership.stop") 359 | 360 | def start(self): 361 | """fire up all components""" 362 | 363 | for thread in self.threads: 364 | thread.start() 365 | 366 | self.logger.info("membership.start") 367 | 368 | for thread in self.threads: 369 | thread.join() 370 | -------------------------------------------------------------------------------- /jdb/node.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | from uuid import uuid4 as uuid 3 | from dataclasses import dataclass, field 4 | from structlog import get_logger 5 | from jdb import ( 6 | storage as db, 7 | membership as mbr, 8 | routing as rte, 9 | util, 10 | const as k, 11 | ) 12 | 13 | _LOGGER = get_logger() 14 | 15 | 16 | @dataclass 17 | class Node: 18 | """represents the running node""" 19 | 20 | logger: Any = field(init=False) 21 | p2p_addr: str = "" 22 | client_addr: str = "" 23 | store: db.DB = field(init=False) 24 | name: Optional[str] = field(default_factory=lambda: str(uuid()))  # factory, so each Node gets its own uuid 25 | membership: mbr.Membership = field(init=False) 26 | router: "rte.Router" = field(init=False) 27 | 28 | def __iter__(self): 29 | """return meta""" 30 | 31 | for key in ["name", "p2p_addr", "client_addr"]: 32 | yield key, getattr(self, key) 33 | 34 | yield "membership", util.stringify_keys(dict(self.membership.cluster_state)) 35 | 36 | def __post_init__(self): 37 | """override""" 38 | 39 | self.logger = _LOGGER.bind(name=self.name) 40 | self.store = db.DB() 41 | membership = mbr.Membership(node_name=self.name, node_addr=self.p2p_addr) 42 | self.membership = membership 43 | self.router = rte.Router(membership=membership, node=self) 44 | 45 | def coordinate(self, req: "rte.BatchRequest") -> db.Transaction: 46 | """handle a request i am responsible for""" 47 | 48 | self.logger.info( 49 | "node.coordinate.start", table=req.table, requests=req.requests 50 | ) 51 | 52 | with self.store.transaction() as txn: 53 | for op in req.requests: 54 | if isinstance(op, rte.GetRequest): 55 | txn.read(op.key) 56 | elif isinstance(op, rte.PutRequest): 57 | txn.write(op.key, op.value) 58 | elif isinstance(op, rte.DeleteRequest): 59 | txn.write(op.key, meta=k.BIT_TOMBSTONE) 60 | 61 | self.logger.info( 62 | "node.coordinate.done", table=req.table, returning=txn.returning 63 | ) 64 | 65 | return txn 66 | 67 | def bootstrap(self, join: str): 68 | """contact peer, merge cluster states""" 69 | 70 | self.membership.bootstrap(join) 71 | self.logger.info("node.bootstrap", join=join) 72 | -------------------------------------------------------------------------------- /jdb/pb/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | sys.path.append(str(Path(__file__).parent)) 5 | -------------------------------------------------------------------------------- /jdb/pb/peer_server.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | service PeerServer { 4 | rpc MembershipStateSync (MembershipState) returns (MembershipState) {} 5 | rpc MembershipPing (Empty) returns (Ack) {} 6 | rpc MembershipPingReq (MembershipPingRequest) returns (Ack) {} 7 | rpc Coordinate (BatchRequest) returns (BatchResponse) {} 8 | }
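// how these rpcs are used (see jdb/membership.py and jdb/node.py):
//   MembershipStateSync - exchange and merge LWW register state (bootstrap/gossip)
//   MembershipPing      - direct failure detection probe
//   MembershipPingReq   - indirect probe of a suspect on another node's behalf
//   Coordinate          - forward a batch of ops to the node that owns the key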
9 | 10 | message MembershipState { 11 | string replica_id = 1; 12 | string peer_addr = 2; 13 | map<string, uint64> add_set = 3; 14 | map<string, uint64> remove_set = 4; 15 | } 16 | 17 | message MembershipPingRequest { 18 | string peer_name = 1; 19 | string peer_addr = 2; 20 | } 21 | 22 | message GetRequest { 23 | bytes key = 1; 24 | } 25 | 26 | message PutRequest { 27 | bytes key = 1; 28 | bytes value = 2; 29 | } 30 | 31 | message DeleteRequest { 32 | bytes key = 1; 33 | } 34 | 35 | message RequestUnion { 36 | oneof value { 37 | GetRequest get = 1; 38 | PutRequest put = 2; 39 | DeleteRequest delete = 3; 40 | } 41 | } 42 | 43 | message BatchRequest { 44 | string table = 1; 45 | repeated RequestUnion requests = 2; 46 | } 47 | 48 | message HLC { 49 | uint64 ts = 1; 50 | uint32 count = 2; 51 | } 52 | 53 | enum TransactionStatus { 54 | PENDING = 0; 55 | COMMITTED = 1; 56 | ABORTED = 2; 57 | } 58 | 59 | message Transaction { 60 | string txnid = 1; 61 | TransactionStatus status = 2; 62 | map<string, string> returning = 3; 63 | uint64 read_ts = 4; 64 | uint64 commit_ts = 5; 65 | } 66 | 67 | message BatchResponse { 68 | string table = 1; 69 | Transaction txn = 2; 70 | } 71 | 72 | message Empty {} 73 | 74 | message Ack { 75 | bool ack = 1; 76 | } -------------------------------------------------------------------------------- /jdb/pb/peer_server_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 3 | # source: peer_server.proto 4 | 5 | from google.protobuf.internal import enum_type_wrapper 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # @@protoc_insertion_point(imports) 11 | 12 | _sym_db = _symbol_database.Default() 13 | 14 | 15 | 16 | 17 | DESCRIPTOR = _descriptor.FileDescriptor( 18 | name='peer_server.proto', 19 | package='', 20 | syntax='proto3', 21 | serialized_options=None, 22 | serialized_pb=b'\n\x11peer_server.proto\"\xfd\x01\n\x0fMembershipState\x12\x12\n\nreplica_id\x18\x01 \x01(\t\x12\x11\n\tpeer_addr\x18\x02 \x01(\t\x12-\n\x07\x61\x64\x64_set\x18\x03 \x03(\x0b\x32\x1c.MembershipState.AddSetEntry\x12\x33\n\nremove_set\x18\x04 \x03(\x0b\x32\x1f.MembershipState.RemoveSetEntry\x1a-\n\x0b\x41\x64\x64SetEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x04:\x02\x38\x01\x1a\x30\n\x0eRemoveSetEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x04:\x02\x38\x01\"=\n\x15MembershipPingRequest\x12\x11\n\tpeer_name\x18\x01 \x01(\t\x12\x11\n\tpeer_addr\x18\x02 \x01(\t\"\x19\n\nGetRequest\x12\x0b\n\x03key\x18\x01 \x01(\x0c\"(\n\nPutRequest\x12\x0b\n\x03key\x18\x01 \x01(\x0c\x12\r\n\x05value\x18\x02 \x01(\x0c\"\x1c\n\rDeleteRequest\x12\x0b\n\x03key\x18\x01 \x01(\x0c\"q\n\x0cRequestUnion\x12\x1a\n\x03get\x18\x01 \x01(\x0b\x32\x0b.GetRequestH\x00\x12\x1a\n\x03put\x18\x02 \x01(\x0b\x32\x0b.PutRequestH\x00\x12 \n\x06\x64\x65lete\x18\x03 \x01(\x0b\x32\x0e.DeleteRequestH\x00\x42\x07\n\x05value\">\n\x0c\x42\x61tchRequest\x12\r\n\x05table\x18\x01 \x01(\t\x12\x1f\n\x08requests\x18\x02 \x03(\x0b\x32\r.RequestUnion\" \n\x03HLC\x12\n\n\x02ts\x18\x01 \x01(\x04\x12\r\n\x05\x63ount\x18\x02 \x01(\r\"\xc6\x01\n\x0bTransaction\x12\r\n\x05txnid\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 \x01(\x0e\x32\x12.TransactionStatus\x12.\n\treturning\x18\x03 
\x03(\x0b\x32\x1b.Transaction.ReturningEntry\x12\x0f\n\x07read_ts\x18\x04 \x01(\x04\x12\x11\n\tcommit_ts\x18\x05 \x01(\x04\x1a\x30\n\x0eReturningEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"9\n\rBatchResponse\x12\r\n\x05table\x18\x01 \x01(\t\x12\x19\n\x03txn\x18\x02 \x01(\x0b\x32\x0c.Transaction\"\x07\n\x05\x45mpty\"\x12\n\x03\x41\x63k\x12\x0b\n\x03\x61\x63k\x18\x01 \x01(\x08*<\n\x11TransactionStatus\x12\x0b\n\x07PENDING\x10\x00\x12\r\n\tCOMMITTED\x10\x01\x12\x0b\n\x07\x41\x42ORTED\x10\x02\x32\xcf\x01\n\nPeerServer\x12;\n\x13MembershipStateSync\x12\x10.MembershipState\x1a\x10.MembershipState\"\x00\x12 \n\x0eMembershipPing\x12\x06.Empty\x1a\x04.Ack\"\x00\x12\x33\n\x11MembershipPingReq\x12\x16.MembershipPingRequest\x1a\x04.Ack\"\x00\x12-\n\nCoordinate\x12\r.BatchRequest\x1a\x0e.BatchResponse\"\x00\x62\x06proto3' 23 | ) 24 | 25 | _TRANSACTIONSTATUS = _descriptor.EnumDescriptor( 26 | name='TransactionStatus', 27 | full_name='TransactionStatus', 28 | filename=None, 29 | file=DESCRIPTOR, 30 | values=[ 31 | _descriptor.EnumValueDescriptor( 32 | name='PENDING', index=0, number=0, 33 | serialized_options=None, 34 | type=None), 35 | _descriptor.EnumValueDescriptor( 36 | name='COMMITTED', index=1, number=1, 37 | serialized_options=None, 38 | type=None), 39 | _descriptor.EnumValueDescriptor( 40 | name='ABORTED', index=2, number=2, 41 | serialized_options=None, 42 | type=None), 43 | ], 44 | containing_type=None, 45 | serialized_options=None, 46 | serialized_start=941, 47 | serialized_end=1001, 48 | ) 49 | _sym_db.RegisterEnumDescriptor(_TRANSACTIONSTATUS) 50 | 51 | TransactionStatus = enum_type_wrapper.EnumTypeWrapper(_TRANSACTIONSTATUS) 52 | PENDING = 0 53 | COMMITTED = 1 54 | ABORTED = 2 55 | 56 | 57 | 58 | _MEMBERSHIPSTATE_ADDSETENTRY = _descriptor.Descriptor( 59 | name='AddSetEntry', 60 | full_name='MembershipState.AddSetEntry', 61 | filename=None, 62 | file=DESCRIPTOR, 63 | containing_type=None, 64 | fields=[ 65 | _descriptor.FieldDescriptor( 66 | name='key', full_name='MembershipState.AddSetEntry.key', index=0, 67 | number=1, type=9, cpp_type=9, label=1, 68 | has_default_value=False, default_value=b"".decode('utf-8'), 69 | message_type=None, enum_type=None, containing_type=None, 70 | is_extension=False, extension_scope=None, 71 | serialized_options=None, file=DESCRIPTOR), 72 | _descriptor.FieldDescriptor( 73 | name='value', full_name='MembershipState.AddSetEntry.value', index=1, 74 | number=2, type=4, cpp_type=4, label=1, 75 | has_default_value=False, default_value=0, 76 | message_type=None, enum_type=None, containing_type=None, 77 | is_extension=False, extension_scope=None, 78 | serialized_options=None, file=DESCRIPTOR), 79 | ], 80 | extensions=[ 81 | ], 82 | nested_types=[], 83 | enum_types=[ 84 | ], 85 | serialized_options=b'8\001', 86 | is_extendable=False, 87 | syntax='proto3', 88 | extension_ranges=[], 89 | oneofs=[ 90 | ], 91 | serialized_start=180, 92 | serialized_end=225, 93 | ) 94 | 95 | _MEMBERSHIPSTATE_REMOVESETENTRY = _descriptor.Descriptor( 96 | name='RemoveSetEntry', 97 | full_name='MembershipState.RemoveSetEntry', 98 | filename=None, 99 | file=DESCRIPTOR, 100 | containing_type=None, 101 | fields=[ 102 | _descriptor.FieldDescriptor( 103 | name='key', full_name='MembershipState.RemoveSetEntry.key', index=0, 104 | number=1, type=9, cpp_type=9, label=1, 105 | has_default_value=False, default_value=b"".decode('utf-8'), 106 | message_type=None, enum_type=None, containing_type=None, 107 | is_extension=False, extension_scope=None, 108 | 
serialized_options=None, file=DESCRIPTOR), 109 | _descriptor.FieldDescriptor( 110 | name='value', full_name='MembershipState.RemoveSetEntry.value', index=1, 111 | number=2, type=4, cpp_type=4, label=1, 112 | has_default_value=False, default_value=0, 113 | message_type=None, enum_type=None, containing_type=None, 114 | is_extension=False, extension_scope=None, 115 | serialized_options=None, file=DESCRIPTOR), 116 | ], 117 | extensions=[ 118 | ], 119 | nested_types=[], 120 | enum_types=[ 121 | ], 122 | serialized_options=b'8\001', 123 | is_extendable=False, 124 | syntax='proto3', 125 | extension_ranges=[], 126 | oneofs=[ 127 | ], 128 | serialized_start=227, 129 | serialized_end=275, 130 | ) 131 | 132 | _MEMBERSHIPSTATE = _descriptor.Descriptor( 133 | name='MembershipState', 134 | full_name='MembershipState', 135 | filename=None, 136 | file=DESCRIPTOR, 137 | containing_type=None, 138 | fields=[ 139 | _descriptor.FieldDescriptor( 140 | name='replica_id', full_name='MembershipState.replica_id', index=0, 141 | number=1, type=9, cpp_type=9, label=1, 142 | has_default_value=False, default_value=b"".decode('utf-8'), 143 | message_type=None, enum_type=None, containing_type=None, 144 | is_extension=False, extension_scope=None, 145 | serialized_options=None, file=DESCRIPTOR), 146 | _descriptor.FieldDescriptor( 147 | name='peer_addr', full_name='MembershipState.peer_addr', index=1, 148 | number=2, type=9, cpp_type=9, label=1, 149 | has_default_value=False, default_value=b"".decode('utf-8'), 150 | message_type=None, enum_type=None, containing_type=None, 151 | is_extension=False, extension_scope=None, 152 | serialized_options=None, file=DESCRIPTOR), 153 | _descriptor.FieldDescriptor( 154 | name='add_set', full_name='MembershipState.add_set', index=2, 155 | number=3, type=11, cpp_type=10, label=3, 156 | has_default_value=False, default_value=[], 157 | message_type=None, enum_type=None, containing_type=None, 158 | is_extension=False, extension_scope=None, 159 | serialized_options=None, file=DESCRIPTOR), 160 | _descriptor.FieldDescriptor( 161 | name='remove_set', full_name='MembershipState.remove_set', index=3, 162 | number=4, type=11, cpp_type=10, label=3, 163 | has_default_value=False, default_value=[], 164 | message_type=None, enum_type=None, containing_type=None, 165 | is_extension=False, extension_scope=None, 166 | serialized_options=None, file=DESCRIPTOR), 167 | ], 168 | extensions=[ 169 | ], 170 | nested_types=[_MEMBERSHIPSTATE_ADDSETENTRY, _MEMBERSHIPSTATE_REMOVESETENTRY, ], 171 | enum_types=[ 172 | ], 173 | serialized_options=None, 174 | is_extendable=False, 175 | syntax='proto3', 176 | extension_ranges=[], 177 | oneofs=[ 178 | ], 179 | serialized_start=22, 180 | serialized_end=275, 181 | ) 182 | 183 | 184 | _MEMBERSHIPPINGREQUEST = _descriptor.Descriptor( 185 | name='MembershipPingRequest', 186 | full_name='MembershipPingRequest', 187 | filename=None, 188 | file=DESCRIPTOR, 189 | containing_type=None, 190 | fields=[ 191 | _descriptor.FieldDescriptor( 192 | name='peer_name', full_name='MembershipPingRequest.peer_name', index=0, 193 | number=1, type=9, cpp_type=9, label=1, 194 | has_default_value=False, default_value=b"".decode('utf-8'), 195 | message_type=None, enum_type=None, containing_type=None, 196 | is_extension=False, extension_scope=None, 197 | serialized_options=None, file=DESCRIPTOR), 198 | _descriptor.FieldDescriptor( 199 | name='peer_addr', full_name='MembershipPingRequest.peer_addr', index=1, 200 | number=2, type=9, cpp_type=9, label=1, 201 | has_default_value=False, 
default_value=b"".decode('utf-8'), 202 | message_type=None, enum_type=None, containing_type=None, 203 | is_extension=False, extension_scope=None, 204 | serialized_options=None, file=DESCRIPTOR), 205 | ], 206 | extensions=[ 207 | ], 208 | nested_types=[], 209 | enum_types=[ 210 | ], 211 | serialized_options=None, 212 | is_extendable=False, 213 | syntax='proto3', 214 | extension_ranges=[], 215 | oneofs=[ 216 | ], 217 | serialized_start=277, 218 | serialized_end=338, 219 | ) 220 | 221 | 222 | _GETREQUEST = _descriptor.Descriptor( 223 | name='GetRequest', 224 | full_name='GetRequest', 225 | filename=None, 226 | file=DESCRIPTOR, 227 | containing_type=None, 228 | fields=[ 229 | _descriptor.FieldDescriptor( 230 | name='key', full_name='GetRequest.key', index=0, 231 | number=1, type=12, cpp_type=9, label=1, 232 | has_default_value=False, default_value=b"", 233 | message_type=None, enum_type=None, containing_type=None, 234 | is_extension=False, extension_scope=None, 235 | serialized_options=None, file=DESCRIPTOR), 236 | ], 237 | extensions=[ 238 | ], 239 | nested_types=[], 240 | enum_types=[ 241 | ], 242 | serialized_options=None, 243 | is_extendable=False, 244 | syntax='proto3', 245 | extension_ranges=[], 246 | oneofs=[ 247 | ], 248 | serialized_start=340, 249 | serialized_end=365, 250 | ) 251 | 252 | 253 | _PUTREQUEST = _descriptor.Descriptor( 254 | name='PutRequest', 255 | full_name='PutRequest', 256 | filename=None, 257 | file=DESCRIPTOR, 258 | containing_type=None, 259 | fields=[ 260 | _descriptor.FieldDescriptor( 261 | name='key', full_name='PutRequest.key', index=0, 262 | number=1, type=12, cpp_type=9, label=1, 263 | has_default_value=False, default_value=b"", 264 | message_type=None, enum_type=None, containing_type=None, 265 | is_extension=False, extension_scope=None, 266 | serialized_options=None, file=DESCRIPTOR), 267 | _descriptor.FieldDescriptor( 268 | name='value', full_name='PutRequest.value', index=1, 269 | number=2, type=12, cpp_type=9, label=1, 270 | has_default_value=False, default_value=b"", 271 | message_type=None, enum_type=None, containing_type=None, 272 | is_extension=False, extension_scope=None, 273 | serialized_options=None, file=DESCRIPTOR), 274 | ], 275 | extensions=[ 276 | ], 277 | nested_types=[], 278 | enum_types=[ 279 | ], 280 | serialized_options=None, 281 | is_extendable=False, 282 | syntax='proto3', 283 | extension_ranges=[], 284 | oneofs=[ 285 | ], 286 | serialized_start=367, 287 | serialized_end=407, 288 | ) 289 | 290 | 291 | _DELETEREQUEST = _descriptor.Descriptor( 292 | name='DeleteRequest', 293 | full_name='DeleteRequest', 294 | filename=None, 295 | file=DESCRIPTOR, 296 | containing_type=None, 297 | fields=[ 298 | _descriptor.FieldDescriptor( 299 | name='key', full_name='DeleteRequest.key', index=0, 300 | number=1, type=12, cpp_type=9, label=1, 301 | has_default_value=False, default_value=b"", 302 | message_type=None, enum_type=None, containing_type=None, 303 | is_extension=False, extension_scope=None, 304 | serialized_options=None, file=DESCRIPTOR), 305 | ], 306 | extensions=[ 307 | ], 308 | nested_types=[], 309 | enum_types=[ 310 | ], 311 | serialized_options=None, 312 | is_extendable=False, 313 | syntax='proto3', 314 | extension_ranges=[], 315 | oneofs=[ 316 | ], 317 | serialized_start=409, 318 | serialized_end=437, 319 | ) 320 | 321 | 322 | _REQUESTUNION = _descriptor.Descriptor( 323 | name='RequestUnion', 324 | full_name='RequestUnion', 325 | filename=None, 326 | file=DESCRIPTOR, 327 | containing_type=None, 328 | fields=[ 329 | 
_descriptor.FieldDescriptor( 330 | name='get', full_name='RequestUnion.get', index=0, 331 | number=1, type=11, cpp_type=10, label=1, 332 | has_default_value=False, default_value=None, 333 | message_type=None, enum_type=None, containing_type=None, 334 | is_extension=False, extension_scope=None, 335 | serialized_options=None, file=DESCRIPTOR), 336 | _descriptor.FieldDescriptor( 337 | name='put', full_name='RequestUnion.put', index=1, 338 | number=2, type=11, cpp_type=10, label=1, 339 | has_default_value=False, default_value=None, 340 | message_type=None, enum_type=None, containing_type=None, 341 | is_extension=False, extension_scope=None, 342 | serialized_options=None, file=DESCRIPTOR), 343 | _descriptor.FieldDescriptor( 344 | name='delete', full_name='RequestUnion.delete', index=2, 345 | number=3, type=11, cpp_type=10, label=1, 346 | has_default_value=False, default_value=None, 347 | message_type=None, enum_type=None, containing_type=None, 348 | is_extension=False, extension_scope=None, 349 | serialized_options=None, file=DESCRIPTOR), 350 | ], 351 | extensions=[ 352 | ], 353 | nested_types=[], 354 | enum_types=[ 355 | ], 356 | serialized_options=None, 357 | is_extendable=False, 358 | syntax='proto3', 359 | extension_ranges=[], 360 | oneofs=[ 361 | _descriptor.OneofDescriptor( 362 | name='value', full_name='RequestUnion.value', 363 | index=0, containing_type=None, fields=[]), 364 | ], 365 | serialized_start=439, 366 | serialized_end=552, 367 | ) 368 | 369 | 370 | _BATCHREQUEST = _descriptor.Descriptor( 371 | name='BatchRequest', 372 | full_name='BatchRequest', 373 | filename=None, 374 | file=DESCRIPTOR, 375 | containing_type=None, 376 | fields=[ 377 | _descriptor.FieldDescriptor( 378 | name='table', full_name='BatchRequest.table', index=0, 379 | number=1, type=9, cpp_type=9, label=1, 380 | has_default_value=False, default_value=b"".decode('utf-8'), 381 | message_type=None, enum_type=None, containing_type=None, 382 | is_extension=False, extension_scope=None, 383 | serialized_options=None, file=DESCRIPTOR), 384 | _descriptor.FieldDescriptor( 385 | name='requests', full_name='BatchRequest.requests', index=1, 386 | number=2, type=11, cpp_type=10, label=3, 387 | has_default_value=False, default_value=[], 388 | message_type=None, enum_type=None, containing_type=None, 389 | is_extension=False, extension_scope=None, 390 | serialized_options=None, file=DESCRIPTOR), 391 | ], 392 | extensions=[ 393 | ], 394 | nested_types=[], 395 | enum_types=[ 396 | ], 397 | serialized_options=None, 398 | is_extendable=False, 399 | syntax='proto3', 400 | extension_ranges=[], 401 | oneofs=[ 402 | ], 403 | serialized_start=554, 404 | serialized_end=616, 405 | ) 406 | 407 | 408 | _HLC = _descriptor.Descriptor( 409 | name='HLC', 410 | full_name='HLC', 411 | filename=None, 412 | file=DESCRIPTOR, 413 | containing_type=None, 414 | fields=[ 415 | _descriptor.FieldDescriptor( 416 | name='ts', full_name='HLC.ts', index=0, 417 | number=1, type=4, cpp_type=4, label=1, 418 | has_default_value=False, default_value=0, 419 | message_type=None, enum_type=None, containing_type=None, 420 | is_extension=False, extension_scope=None, 421 | serialized_options=None, file=DESCRIPTOR), 422 | _descriptor.FieldDescriptor( 423 | name='count', full_name='HLC.count', index=1, 424 | number=2, type=13, cpp_type=3, label=1, 425 | has_default_value=False, default_value=0, 426 | message_type=None, enum_type=None, containing_type=None, 427 | is_extension=False, extension_scope=None, 428 | serialized_options=None, file=DESCRIPTOR), 429 | ], 430 | 
extensions=[ 431 | ], 432 | nested_types=[], 433 | enum_types=[ 434 | ], 435 | serialized_options=None, 436 | is_extendable=False, 437 | syntax='proto3', 438 | extension_ranges=[], 439 | oneofs=[ 440 | ], 441 | serialized_start=618, 442 | serialized_end=650, 443 | ) 444 | 445 | 446 | _TRANSACTION_RETURNINGENTRY = _descriptor.Descriptor( 447 | name='ReturningEntry', 448 | full_name='Transaction.ReturningEntry', 449 | filename=None, 450 | file=DESCRIPTOR, 451 | containing_type=None, 452 | fields=[ 453 | _descriptor.FieldDescriptor( 454 | name='key', full_name='Transaction.ReturningEntry.key', index=0, 455 | number=1, type=9, cpp_type=9, label=1, 456 | has_default_value=False, default_value=b"".decode('utf-8'), 457 | message_type=None, enum_type=None, containing_type=None, 458 | is_extension=False, extension_scope=None, 459 | serialized_options=None, file=DESCRIPTOR), 460 | _descriptor.FieldDescriptor( 461 | name='value', full_name='Transaction.ReturningEntry.value', index=1, 462 | number=2, type=9, cpp_type=9, label=1, 463 | has_default_value=False, default_value=b"".decode('utf-8'), 464 | message_type=None, enum_type=None, containing_type=None, 465 | is_extension=False, extension_scope=None, 466 | serialized_options=None, file=DESCRIPTOR), 467 | ], 468 | extensions=[ 469 | ], 470 | nested_types=[], 471 | enum_types=[ 472 | ], 473 | serialized_options=b'8\001', 474 | is_extendable=False, 475 | syntax='proto3', 476 | extension_ranges=[], 477 | oneofs=[ 478 | ], 479 | serialized_start=803, 480 | serialized_end=851, 481 | ) 482 | 483 | _TRANSACTION = _descriptor.Descriptor( 484 | name='Transaction', 485 | full_name='Transaction', 486 | filename=None, 487 | file=DESCRIPTOR, 488 | containing_type=None, 489 | fields=[ 490 | _descriptor.FieldDescriptor( 491 | name='txnid', full_name='Transaction.txnid', index=0, 492 | number=1, type=9, cpp_type=9, label=1, 493 | has_default_value=False, default_value=b"".decode('utf-8'), 494 | message_type=None, enum_type=None, containing_type=None, 495 | is_extension=False, extension_scope=None, 496 | serialized_options=None, file=DESCRIPTOR), 497 | _descriptor.FieldDescriptor( 498 | name='status', full_name='Transaction.status', index=1, 499 | number=2, type=14, cpp_type=8, label=1, 500 | has_default_value=False, default_value=0, 501 | message_type=None, enum_type=None, containing_type=None, 502 | is_extension=False, extension_scope=None, 503 | serialized_options=None, file=DESCRIPTOR), 504 | _descriptor.FieldDescriptor( 505 | name='returning', full_name='Transaction.returning', index=2, 506 | number=3, type=11, cpp_type=10, label=3, 507 | has_default_value=False, default_value=[], 508 | message_type=None, enum_type=None, containing_type=None, 509 | is_extension=False, extension_scope=None, 510 | serialized_options=None, file=DESCRIPTOR), 511 | _descriptor.FieldDescriptor( 512 | name='read_ts', full_name='Transaction.read_ts', index=3, 513 | number=4, type=4, cpp_type=4, label=1, 514 | has_default_value=False, default_value=0, 515 | message_type=None, enum_type=None, containing_type=None, 516 | is_extension=False, extension_scope=None, 517 | serialized_options=None, file=DESCRIPTOR), 518 | _descriptor.FieldDescriptor( 519 | name='commit_ts', full_name='Transaction.commit_ts', index=4, 520 | number=5, type=4, cpp_type=4, label=1, 521 | has_default_value=False, default_value=0, 522 | message_type=None, enum_type=None, containing_type=None, 523 | is_extension=False, extension_scope=None, 524 | serialized_options=None, file=DESCRIPTOR), 525 | ], 526 | extensions=[ 
527 | ], 528 | nested_types=[_TRANSACTION_RETURNINGENTRY, ], 529 | enum_types=[ 530 | ], 531 | serialized_options=None, 532 | is_extendable=False, 533 | syntax='proto3', 534 | extension_ranges=[], 535 | oneofs=[ 536 | ], 537 | serialized_start=653, 538 | serialized_end=851, 539 | ) 540 | 541 | 542 | _BATCHRESPONSE = _descriptor.Descriptor( 543 | name='BatchResponse', 544 | full_name='BatchResponse', 545 | filename=None, 546 | file=DESCRIPTOR, 547 | containing_type=None, 548 | fields=[ 549 | _descriptor.FieldDescriptor( 550 | name='table', full_name='BatchResponse.table', index=0, 551 | number=1, type=9, cpp_type=9, label=1, 552 | has_default_value=False, default_value=b"".decode('utf-8'), 553 | message_type=None, enum_type=None, containing_type=None, 554 | is_extension=False, extension_scope=None, 555 | serialized_options=None, file=DESCRIPTOR), 556 | _descriptor.FieldDescriptor( 557 | name='txn', full_name='BatchResponse.txn', index=1, 558 | number=2, type=11, cpp_type=10, label=1, 559 | has_default_value=False, default_value=None, 560 | message_type=None, enum_type=None, containing_type=None, 561 | is_extension=False, extension_scope=None, 562 | serialized_options=None, file=DESCRIPTOR), 563 | ], 564 | extensions=[ 565 | ], 566 | nested_types=[], 567 | enum_types=[ 568 | ], 569 | serialized_options=None, 570 | is_extendable=False, 571 | syntax='proto3', 572 | extension_ranges=[], 573 | oneofs=[ 574 | ], 575 | serialized_start=853, 576 | serialized_end=910, 577 | ) 578 | 579 | 580 | _EMPTY = _descriptor.Descriptor( 581 | name='Empty', 582 | full_name='Empty', 583 | filename=None, 584 | file=DESCRIPTOR, 585 | containing_type=None, 586 | fields=[ 587 | ], 588 | extensions=[ 589 | ], 590 | nested_types=[], 591 | enum_types=[ 592 | ], 593 | serialized_options=None, 594 | is_extendable=False, 595 | syntax='proto3', 596 | extension_ranges=[], 597 | oneofs=[ 598 | ], 599 | serialized_start=912, 600 | serialized_end=919, 601 | ) 602 | 603 | 604 | _ACK = _descriptor.Descriptor( 605 | name='Ack', 606 | full_name='Ack', 607 | filename=None, 608 | file=DESCRIPTOR, 609 | containing_type=None, 610 | fields=[ 611 | _descriptor.FieldDescriptor( 612 | name='ack', full_name='Ack.ack', index=0, 613 | number=1, type=8, cpp_type=7, label=1, 614 | has_default_value=False, default_value=False, 615 | message_type=None, enum_type=None, containing_type=None, 616 | is_extension=False, extension_scope=None, 617 | serialized_options=None, file=DESCRIPTOR), 618 | ], 619 | extensions=[ 620 | ], 621 | nested_types=[], 622 | enum_types=[ 623 | ], 624 | serialized_options=None, 625 | is_extendable=False, 626 | syntax='proto3', 627 | extension_ranges=[], 628 | oneofs=[ 629 | ], 630 | serialized_start=921, 631 | serialized_end=939, 632 | ) 633 | 634 | _MEMBERSHIPSTATE_ADDSETENTRY.containing_type = _MEMBERSHIPSTATE 635 | _MEMBERSHIPSTATE_REMOVESETENTRY.containing_type = _MEMBERSHIPSTATE 636 | _MEMBERSHIPSTATE.fields_by_name['add_set'].message_type = _MEMBERSHIPSTATE_ADDSETENTRY 637 | _MEMBERSHIPSTATE.fields_by_name['remove_set'].message_type = _MEMBERSHIPSTATE_REMOVESETENTRY 638 | _REQUESTUNION.fields_by_name['get'].message_type = _GETREQUEST 639 | _REQUESTUNION.fields_by_name['put'].message_type = _PUTREQUEST 640 | _REQUESTUNION.fields_by_name['delete'].message_type = _DELETEREQUEST 641 | _REQUESTUNION.oneofs_by_name['value'].fields.append( 642 | _REQUESTUNION.fields_by_name['get']) 643 | _REQUESTUNION.fields_by_name['get'].containing_oneof = _REQUESTUNION.oneofs_by_name['value'] 644 | 
_REQUESTUNION.oneofs_by_name['value'].fields.append( 645 | _REQUESTUNION.fields_by_name['put']) 646 | _REQUESTUNION.fields_by_name['put'].containing_oneof = _REQUESTUNION.oneofs_by_name['value'] 647 | _REQUESTUNION.oneofs_by_name['value'].fields.append( 648 | _REQUESTUNION.fields_by_name['delete']) 649 | _REQUESTUNION.fields_by_name['delete'].containing_oneof = _REQUESTUNION.oneofs_by_name['value'] 650 | _BATCHREQUEST.fields_by_name['requests'].message_type = _REQUESTUNION 651 | _TRANSACTION_RETURNINGENTRY.containing_type = _TRANSACTION 652 | _TRANSACTION.fields_by_name['status'].enum_type = _TRANSACTIONSTATUS 653 | _TRANSACTION.fields_by_name['returning'].message_type = _TRANSACTION_RETURNINGENTRY 654 | _BATCHRESPONSE.fields_by_name['txn'].message_type = _TRANSACTION 655 | DESCRIPTOR.message_types_by_name['MembershipState'] = _MEMBERSHIPSTATE 656 | DESCRIPTOR.message_types_by_name['MembershipPingRequest'] = _MEMBERSHIPPINGREQUEST 657 | DESCRIPTOR.message_types_by_name['GetRequest'] = _GETREQUEST 658 | DESCRIPTOR.message_types_by_name['PutRequest'] = _PUTREQUEST 659 | DESCRIPTOR.message_types_by_name['DeleteRequest'] = _DELETEREQUEST 660 | DESCRIPTOR.message_types_by_name['RequestUnion'] = _REQUESTUNION 661 | DESCRIPTOR.message_types_by_name['BatchRequest'] = _BATCHREQUEST 662 | DESCRIPTOR.message_types_by_name['HLC'] = _HLC 663 | DESCRIPTOR.message_types_by_name['Transaction'] = _TRANSACTION 664 | DESCRIPTOR.message_types_by_name['BatchResponse'] = _BATCHRESPONSE 665 | DESCRIPTOR.message_types_by_name['Empty'] = _EMPTY 666 | DESCRIPTOR.message_types_by_name['Ack'] = _ACK 667 | DESCRIPTOR.enum_types_by_name['TransactionStatus'] = _TRANSACTIONSTATUS 668 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 669 | 670 | MembershipState = _reflection.GeneratedProtocolMessageType('MembershipState', (_message.Message,), { 671 | 672 | 'AddSetEntry' : _reflection.GeneratedProtocolMessageType('AddSetEntry', (_message.Message,), { 673 | 'DESCRIPTOR' : _MEMBERSHIPSTATE_ADDSETENTRY, 674 | '__module__' : 'peer_server_pb2' 675 | # @@protoc_insertion_point(class_scope:MembershipState.AddSetEntry) 676 | }) 677 | , 678 | 679 | 'RemoveSetEntry' : _reflection.GeneratedProtocolMessageType('RemoveSetEntry', (_message.Message,), { 680 | 'DESCRIPTOR' : _MEMBERSHIPSTATE_REMOVESETENTRY, 681 | '__module__' : 'peer_server_pb2' 682 | # @@protoc_insertion_point(class_scope:MembershipState.RemoveSetEntry) 683 | }) 684 | , 685 | 'DESCRIPTOR' : _MEMBERSHIPSTATE, 686 | '__module__' : 'peer_server_pb2' 687 | # @@protoc_insertion_point(class_scope:MembershipState) 688 | }) 689 | _sym_db.RegisterMessage(MembershipState) 690 | _sym_db.RegisterMessage(MembershipState.AddSetEntry) 691 | _sym_db.RegisterMessage(MembershipState.RemoveSetEntry) 692 | 693 | MembershipPingRequest = _reflection.GeneratedProtocolMessageType('MembershipPingRequest', (_message.Message,), { 694 | 'DESCRIPTOR' : _MEMBERSHIPPINGREQUEST, 695 | '__module__' : 'peer_server_pb2' 696 | # @@protoc_insertion_point(class_scope:MembershipPingRequest) 697 | }) 698 | _sym_db.RegisterMessage(MembershipPingRequest) 699 | 700 | GetRequest = _reflection.GeneratedProtocolMessageType('GetRequest', (_message.Message,), { 701 | 'DESCRIPTOR' : _GETREQUEST, 702 | '__module__' : 'peer_server_pb2' 703 | # @@protoc_insertion_point(class_scope:GetRequest) 704 | }) 705 | _sym_db.RegisterMessage(GetRequest) 706 | 707 | PutRequest = _reflection.GeneratedProtocolMessageType('PutRequest', (_message.Message,), { 708 | 'DESCRIPTOR' : _PUTREQUEST, 709 | '__module__' : 'peer_server_pb2' 710 | # 
@@protoc_insertion_point(class_scope:PutRequest) 711 | }) 712 | _sym_db.RegisterMessage(PutRequest) 713 | 714 | DeleteRequest = _reflection.GeneratedProtocolMessageType('DeleteRequest', (_message.Message,), { 715 | 'DESCRIPTOR' : _DELETEREQUEST, 716 | '__module__' : 'peer_server_pb2' 717 | # @@protoc_insertion_point(class_scope:DeleteRequest) 718 | }) 719 | _sym_db.RegisterMessage(DeleteRequest) 720 | 721 | RequestUnion = _reflection.GeneratedProtocolMessageType('RequestUnion', (_message.Message,), { 722 | 'DESCRIPTOR' : _REQUESTUNION, 723 | '__module__' : 'peer_server_pb2' 724 | # @@protoc_insertion_point(class_scope:RequestUnion) 725 | }) 726 | _sym_db.RegisterMessage(RequestUnion) 727 | 728 | BatchRequest = _reflection.GeneratedProtocolMessageType('BatchRequest', (_message.Message,), { 729 | 'DESCRIPTOR' : _BATCHREQUEST, 730 | '__module__' : 'peer_server_pb2' 731 | # @@protoc_insertion_point(class_scope:BatchRequest) 732 | }) 733 | _sym_db.RegisterMessage(BatchRequest) 734 | 735 | HLC = _reflection.GeneratedProtocolMessageType('HLC', (_message.Message,), { 736 | 'DESCRIPTOR' : _HLC, 737 | '__module__' : 'peer_server_pb2' 738 | # @@protoc_insertion_point(class_scope:HLC) 739 | }) 740 | _sym_db.RegisterMessage(HLC) 741 | 742 | Transaction = _reflection.GeneratedProtocolMessageType('Transaction', (_message.Message,), { 743 | 744 | 'ReturningEntry' : _reflection.GeneratedProtocolMessageType('ReturningEntry', (_message.Message,), { 745 | 'DESCRIPTOR' : _TRANSACTION_RETURNINGENTRY, 746 | '__module__' : 'peer_server_pb2' 747 | # @@protoc_insertion_point(class_scope:Transaction.ReturningEntry) 748 | }) 749 | , 750 | 'DESCRIPTOR' : _TRANSACTION, 751 | '__module__' : 'peer_server_pb2' 752 | # @@protoc_insertion_point(class_scope:Transaction) 753 | }) 754 | _sym_db.RegisterMessage(Transaction) 755 | _sym_db.RegisterMessage(Transaction.ReturningEntry) 756 | 757 | BatchResponse = _reflection.GeneratedProtocolMessageType('BatchResponse', (_message.Message,), { 758 | 'DESCRIPTOR' : _BATCHRESPONSE, 759 | '__module__' : 'peer_server_pb2' 760 | # @@protoc_insertion_point(class_scope:BatchResponse) 761 | }) 762 | _sym_db.RegisterMessage(BatchResponse) 763 | 764 | Empty = _reflection.GeneratedProtocolMessageType('Empty', (_message.Message,), { 765 | 'DESCRIPTOR' : _EMPTY, 766 | '__module__' : 'peer_server_pb2' 767 | # @@protoc_insertion_point(class_scope:Empty) 768 | }) 769 | _sym_db.RegisterMessage(Empty) 770 | 771 | Ack = _reflection.GeneratedProtocolMessageType('Ack', (_message.Message,), { 772 | 'DESCRIPTOR' : _ACK, 773 | '__module__' : 'peer_server_pb2' 774 | # @@protoc_insertion_point(class_scope:Ack) 775 | }) 776 | _sym_db.RegisterMessage(Ack) 777 | 778 | 779 | _MEMBERSHIPSTATE_ADDSETENTRY._options = None 780 | _MEMBERSHIPSTATE_REMOVESETENTRY._options = None 781 | _TRANSACTION_RETURNINGENTRY._options = None 782 | 783 | _PEERSERVER = _descriptor.ServiceDescriptor( 784 | name='PeerServer', 785 | full_name='PeerServer', 786 | file=DESCRIPTOR, 787 | index=0, 788 | serialized_options=None, 789 | serialized_start=1004, 790 | serialized_end=1211, 791 | methods=[ 792 | _descriptor.MethodDescriptor( 793 | name='MembershipStateSync', 794 | full_name='PeerServer.MembershipStateSync', 795 | index=0, 796 | containing_service=None, 797 | input_type=_MEMBERSHIPSTATE, 798 | output_type=_MEMBERSHIPSTATE, 799 | serialized_options=None, 800 | ), 801 | _descriptor.MethodDescriptor( 802 | name='MembershipPing', 803 | full_name='PeerServer.MembershipPing', 804 | index=1, 805 | containing_service=None, 806 | 
input_type=_EMPTY, 807 | output_type=_ACK, 808 | serialized_options=None, 809 | ), 810 | _descriptor.MethodDescriptor( 811 | name='MembershipPingReq', 812 | full_name='PeerServer.MembershipPingReq', 813 | index=2, 814 | containing_service=None, 815 | input_type=_MEMBERSHIPPINGREQUEST, 816 | output_type=_ACK, 817 | serialized_options=None, 818 | ), 819 | _descriptor.MethodDescriptor( 820 | name='Coordinate', 821 | full_name='PeerServer.Coordinate', 822 | index=3, 823 | containing_service=None, 824 | input_type=_BATCHREQUEST, 825 | output_type=_BATCHRESPONSE, 826 | serialized_options=None, 827 | ), 828 | ]) 829 | _sym_db.RegisterServiceDescriptor(_PEERSERVER) 830 | 831 | DESCRIPTOR.services_by_name['PeerServer'] = _PEERSERVER 832 | 833 | # @@protoc_insertion_point(module_scope) 834 | -------------------------------------------------------------------------------- /jdb/pb/peer_server_pb2_grpc.py: -------------------------------------------------------------------------------- 1 | # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 2 | import grpc 3 | 4 | import peer_server_pb2 as peer__server__pb2 5 | 6 | 7 | class PeerServerStub(object): 8 | """Missing associated documentation comment in .proto file""" 9 | 10 | def __init__(self, channel): 11 | """Constructor. 12 | 13 | Args: 14 | channel: A grpc.Channel. 15 | """ 16 | self.MembershipStateSync = channel.unary_unary( 17 | '/PeerServer/MembershipStateSync', 18 | request_serializer=peer__server__pb2.MembershipState.SerializeToString, 19 | response_deserializer=peer__server__pb2.MembershipState.FromString, 20 | ) 21 | self.MembershipPing = channel.unary_unary( 22 | '/PeerServer/MembershipPing', 23 | request_serializer=peer__server__pb2.Empty.SerializeToString, 24 | response_deserializer=peer__server__pb2.Ack.FromString, 25 | ) 26 | self.MembershipPingReq = channel.unary_unary( 27 | '/PeerServer/MembershipPingReq', 28 | request_serializer=peer__server__pb2.MembershipPingRequest.SerializeToString, 29 | response_deserializer=peer__server__pb2.Ack.FromString, 30 | ) 31 | self.Coordinate = channel.unary_unary( 32 | '/PeerServer/Coordinate', 33 | request_serializer=peer__server__pb2.BatchRequest.SerializeToString, 34 | response_deserializer=peer__server__pb2.BatchResponse.FromString, 35 | ) 36 | 37 | 38 | class PeerServerServicer(object): 39 | """Missing associated documentation comment in .proto file""" 40 | 41 | def MembershipStateSync(self, request, context): 42 | """Missing associated documentation comment in .proto file""" 43 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 44 | context.set_details('Method not implemented!') 45 | raise NotImplementedError('Method not implemented!') 46 | 47 | def MembershipPing(self, request, context): 48 | """Missing associated documentation comment in .proto file""" 49 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 50 | context.set_details('Method not implemented!') 51 | raise NotImplementedError('Method not implemented!') 52 | 53 | def MembershipPingReq(self, request, context): 54 | """Missing associated documentation comment in .proto file""" 55 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 56 | context.set_details('Method not implemented!') 57 | raise NotImplementedError('Method not implemented!') 58 | 59 | def Coordinate(self, request, context): 60 | """Missing associated documentation comment in .proto file""" 61 | context.set_code(grpc.StatusCode.UNIMPLEMENTED) 62 | context.set_details('Method not implemented!') 63 | raise NotImplementedError('Method not implemented!') 64 | 65 | 
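# --- editorial sketch, not part of the generated file -----------------------
# Driving these RPCs from the client side through the stub defined above. The
# helper name, peer address, and timeout are assumptions for illustration
# only; `grpc` and `peer__server__pb2` are already imported at the top of
# this module, so the snippet is self-contained here.
def _example_membership_ping(addr: str = "localhost:1338") -> bool:
    """one-shot liveness probe against a peer; returns the ack bit"""
    with grpc.insecure_channel(addr) as channel:  # hypothetical peer address
        stub = PeerServerStub(channel)
        ack = stub.MembershipPing(peer__server__pb2.Empty(), timeout=1.0)
        return ack.ack
# -----------------------------------------------------------------------------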
66 | def add_PeerServerServicer_to_server(servicer, server): 67 | rpc_method_handlers = { 68 | 'MembershipStateSync': grpc.unary_unary_rpc_method_handler( 69 | servicer.MembershipStateSync, 70 | request_deserializer=peer__server__pb2.MembershipState.FromString, 71 | response_serializer=peer__server__pb2.MembershipState.SerializeToString, 72 | ), 73 | 'MembershipPing': grpc.unary_unary_rpc_method_handler( 74 | servicer.MembershipPing, 75 | request_deserializer=peer__server__pb2.Empty.FromString, 76 | response_serializer=peer__server__pb2.Ack.SerializeToString, 77 | ), 78 | 'MembershipPingReq': grpc.unary_unary_rpc_method_handler( 79 | servicer.MembershipPingReq, 80 | request_deserializer=peer__server__pb2.MembershipPingRequest.FromString, 81 | response_serializer=peer__server__pb2.Ack.SerializeToString, 82 | ), 83 | 'Coordinate': grpc.unary_unary_rpc_method_handler( 84 | servicer.Coordinate, 85 | request_deserializer=peer__server__pb2.BatchRequest.FromString, 86 | response_serializer=peer__server__pb2.BatchResponse.SerializeToString, 87 | ), 88 | } 89 | generic_handler = grpc.method_handlers_generic_handler( 90 | 'PeerServer', rpc_method_handlers) 91 | server.add_generic_rpc_handlers((generic_handler,)) 92 | 93 | 94 | # This class is part of an EXPERIMENTAL API. 95 | class PeerServer(object): 96 | """Missing associated documentation comment in .proto file""" 97 | 98 | @staticmethod 99 | def MembershipStateSync(request, 100 | target, 101 | options=(), 102 | channel_credentials=None, 103 | call_credentials=None, 104 | compression=None, 105 | wait_for_ready=None, 106 | timeout=None, 107 | metadata=None): 108 | return grpc.experimental.unary_unary(request, target, '/PeerServer/MembershipStateSync', 109 | peer__server__pb2.MembershipState.SerializeToString, 110 | peer__server__pb2.MembershipState.FromString, 111 | options, channel_credentials, 112 | call_credentials, compression, wait_for_ready, timeout, metadata) 113 | 114 | @staticmethod 115 | def MembershipPing(request, 116 | target, 117 | options=(), 118 | channel_credentials=None, 119 | call_credentials=None, 120 | compression=None, 121 | wait_for_ready=None, 122 | timeout=None, 123 | metadata=None): 124 | return grpc.experimental.unary_unary(request, target, '/PeerServer/MembershipPing', 125 | peer__server__pb2.Empty.SerializeToString, 126 | peer__server__pb2.Ack.FromString, 127 | options, channel_credentials, 128 | call_credentials, compression, wait_for_ready, timeout, metadata) 129 | 130 | @staticmethod 131 | def MembershipPingReq(request, 132 | target, 133 | options=(), 134 | channel_credentials=None, 135 | call_credentials=None, 136 | compression=None, 137 | wait_for_ready=None, 138 | timeout=None, 139 | metadata=None): 140 | return grpc.experimental.unary_unary(request, target, '/PeerServer/MembershipPingReq', 141 | peer__server__pb2.MembershipPingRequest.SerializeToString, 142 | peer__server__pb2.Ack.FromString, 143 | options, channel_credentials, 144 | call_credentials, compression, wait_for_ready, timeout, metadata) 145 | 146 | @staticmethod 147 | def Coordinate(request, 148 | target, 149 | options=(), 150 | channel_credentials=None, 151 | call_credentials=None, 152 | compression=None, 153 | wait_for_ready=None, 154 | timeout=None, 155 | metadata=None): 156 | return grpc.experimental.unary_unary(request, target, '/PeerServer/Coordinate', 157 | peer__server__pb2.BatchRequest.SerializeToString, 158 | peer__server__pb2.BatchResponse.FromString, 159 | options, channel_credentials, 160 | call_credentials, compression, wait_for_ready, 
timeout, metadata) 161 | -------------------------------------------------------------------------------- /jdb/peer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Any 3 | import grpc 4 | from jdb.pb import peer_server_pb2_grpc as pgrpc, peer_server_pb2 as pb 5 | from jdb import crdt, util, routing as rte, storage as db 6 | 7 | 8 | class Peer: 9 | """represents remote peer""" 10 | 11 | def __init__(self, addr: str, name: str, logger: Any): 12 | self.addr = addr 13 | self.name = name 14 | self.logger = logger.bind(name=name, addr=addr) 15 | self.channel = grpc.insecure_channel(self.addr) 16 | self.transport = pgrpc.PeerServerStub(self.channel) 17 | 18 | @property 19 | def node_key(self) -> str: 20 | """concatenation, pretty much all the data we need for a peer""" 21 | 22 | return f"{self.name}={self.addr}" 23 | 24 | def coordinate(self, req: rte.BatchRequest) -> rte.BatchResponse: 25 | """coordinate""" 26 | 27 | requests = [] 28 | 29 | for re in req.requests: 30 | if isinstance(re, rte.PutRequest): 31 | val = pb.PutRequest(key=re.key, value=re.value) 32 | requests.append(pb.RequestUnion(put=val)) 33 | elif isinstance(re, rte.GetRequest): 34 | val = pb.GetRequest(key=re.key) 35 | requests.append(pb.RequestUnion(get=val)) 36 | elif isinstance(re, rte.DeleteRequest): 37 | val = pb.DeleteRequest(key=re.key) 38 | requests.append(pb.RequestUnion(delete=val)) 39 | 40 | msg = pb.BatchRequest(table=req.table, requests=requests) 41 | res = self.transport.Coordinate(msg) 42 | txn = res.txn 43 | transaction = db.TransactionMeta( 44 | status=db.TransactionStatus(txn.status), 45 | read_ts=txn.read_ts, 46 | commit_ts=txn.commit_ts, 47 | returning={k.encode(): v.encode() for k, v in txn.returning.items()}, 48 | txnid=txn.txnid, 49 | ) 50 | 51 | return rte.BatchResponse(txn=transaction, table=res.table) 52 | 53 | def membership_ping(self) -> bool: 54 | """ping""" 55 | 56 | msg = pb.Empty() 57 | 58 | try: 59 | ack = self.transport.MembershipPing(msg) 60 | return ack.ack 61 | except Exception: # pylint: disable=broad-except 62 | return False 63 | 64 | def membership_ping_req(self, other: Peer) -> bool: 65 | """ping""" 66 | 67 | msg = pb.MembershipPingRequest(peer_name=other.name, peer_addr=other.addr) 68 | 69 | try: 70 | res = self.transport.MembershipPingReq(msg) 71 | return res.ack 72 | except Exception: # pylint: disable=broad-except 73 | return False 74 | 75 | def membership_state_sync( 76 | self, state: crdt.LWWRegister, from_addr: str 77 | ) -> crdt.LWWRegister: 78 | """rpc call wrapper""" 79 | 80 | req = pb.MembershipState( 81 | add_set=state.add_set, 82 | remove_set=state.remove_set, 83 | replica_id=state.replica_id, 84 | peer_addr=from_addr, 85 | ) 86 | 87 | res = self.transport.MembershipStateSync(req) 88 | merged = crdt.LWWRegister(replica_id=res.replica_id) 89 | merged.add_set = util.byteify_keys(res.add_set) 90 | merged.remove_set = util.byteify_keys(res.remove_set) 91 | return merged 92 | -------------------------------------------------------------------------------- /jdb/routing.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | from dataclasses import dataclass, field 3 | from structlog import get_logger 4 | 5 | # pylint: disable=unused-import 6 | from jdb import ( 7 | membership as mbr, 8 | node as nde, 9 | ) 10 | from jdb import errors as err, types as t, const as k, storage as db 11 | 12 | LOGGER = get_logger() 13 | 14 | 
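# --- editorial sketch, not part of the original module ----------------------
# How the Peer wrapper in jdb/peer.py (dumped above) is typically exercised.
# The address and node name are assumptions for illustration; the imports are
# kept local to this helper to sidestep the peer <-> routing import cycle.
def _example_peer_ping() -> bool:
    """ping a hypothetical remote peer; returns False when nothing listens"""
    from structlog import get_logger
    from jdb import peer as pr

    remote = pr.Peer(addr="127.0.0.1:1338", name="node-a", logger=get_logger())
    # node_key concatenates name and address, e.g. "node-a=127.0.0.1:1338"
    assert remote.node_key == "node-a=127.0.0.1:1338"
    # membership_ping swallows transport errors and reports them as False
    return remote.membership_ping()
# -----------------------------------------------------------------------------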
15 | @dataclass 16 | class DeleteRequest: 17 | """request type""" 18 | 19 | key: t.Key 20 | 21 | 22 | @dataclass 23 | class PutRequest: 24 | """request type""" 25 | 26 | key: t.Key 27 | value: t.Value 28 | 29 | 30 | @dataclass 31 | class GetRequest: 32 | """request type""" 33 | 34 | key: t.Key 35 | 36 | 37 | RequestUnion = Union[PutRequest, GetRequest, DeleteRequest] 38 | 39 | 40 | @dataclass 41 | class BatchResponse: 42 | """wrap response""" 43 | 44 | txn: db.TransactionMeta 45 | table: str 46 | 47 | 48 | @dataclass 49 | class BatchRequest: 50 | """represents a client request to get routed""" 51 | 52 | requests: List[RequestUnion] = field(default_factory=list) 53 | 54 | @property 55 | def table(self) -> str: 56 | """key that is used to route the request""" 57 | 58 | if not self.requests: 59 | raise err.InvalidRequest("no requests") 60 | 61 | match = k.REQ_KEY_REGEX.match(self.requests[0].key.decode()) 62 | 63 | if not match: 64 | raise err.InvalidRequest("invalid key") 65 | 66 | table, _ = match.groups() 67 | return table 68 | 69 | 70 | class Router: 71 | """handle request routing""" 72 | 73 | def __init__(self, membership: "mbr.Membership", node: "nde.Node"): 74 | self._membership = membership 75 | self._node = node 76 | 77 | def request(self, req: BatchRequest) -> BatchResponse: 78 | """send a request""" 79 | 80 | peer = self._membership.lookup_leaseholder(req.table) 81 | 82 | if not peer: 83 | LOGGER.info("routing.request.local", table=req.table) 84 | txn = self._node.coordinate(req) 85 | txnmeta = db.TransactionMeta( 86 | txnid=txn.txnid, 87 | read_ts=txn.read_ts, 88 | commit_ts=txn.commit_ts, 89 | returning=txn.returning, 90 | status=txn.status, 91 | ) 92 | return BatchResponse(txn=txnmeta, table=req.table) 93 | 94 | LOGGER.info( 95 | "routing.request.remote", 96 | peer_name=peer.name, 97 | peer_addr=peer.addr, 98 | table=req.table, 99 | ) 100 | 101 | return peer.coordinate(req) 102 | -------------------------------------------------------------------------------- /jdb/server/__init__.py: -------------------------------------------------------------------------------- 1 | from .peer_server import PeerServer 2 | from .client_server import ClientServer 3 | from .server import Server 4 | 5 | __all__ = ["PeerServer", "ClientServer", "Server"] 6 | -------------------------------------------------------------------------------- /jdb/server/client_server.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Tuple, Optional 2 | from threading import Thread 3 | from collections import OrderedDict 4 | from uuid import uuid4 as uuid 5 | from socketserver import StreamRequestHandler, ThreadingTCPServer 6 | from structlog import get_logger 7 | from pyparsing import ParseException 8 | from jdb import jql, const, node as nde 9 | 10 | _LOGGER = get_logger() 11 | 12 | 13 | class Client(StreamRequestHandler): 14 | """client connection""" 15 | 16 | logger: Any 17 | 18 | def setup(self): 19 | """override""" 20 | 21 | super().setup() 22 | 23 | addr = self.client_address 24 | 25 | self.client_id = str(uuid()) 26 | self.jql = jql.JQL(node=self.server.node) 27 | self.logger = self.server.logger.bind( 28 | client_id=self.client_id, client_addr=f"{addr[0]}:{addr[1]}", 29 | ) 30 | self.server.client_connected(self) 31 | 32 | def finish(self): 33 | super().finish() 34 | 35 | self.server.client_disconnected(self) 36 | 37 | def handle(self): 38 | """override""" 39 | 40 | super().handle() 41 | 42 | statement = "" 43 | 44 | for data in self.rfile: 45 | raw 
= data.decode().rstrip() 46 | 47 | if not raw: 48 | break 49 | 50 | statement += f"{raw}\n" 51 | stripped = statement.rstrip() 52 | 53 | if stripped[-1:] == const.TERMINATOR: 54 | self._call(statement) 55 | statement = "" 56 | 57 | def _call(self, statement: str): 58 | """send statement to parser for execution""" 59 | 60 | self.logger.debug("statement", statement=f"{statement!r}") 61 | 62 | try: 63 | result, response = self.jql.call(statement=statement) 64 | 65 | self.logger.debug("result", result=result, response=response) 66 | 67 | if result: 68 | self.wfile.write(f"{result}\n".encode()) 69 | elif response: 70 | txn = response.txn 71 | 72 | if txn.returning: 73 | for _, v in txn.returning.items(): 74 | self.wfile.write(f"{v.decode() if v else ''}\n".encode()) 75 | else: 76 | self.wfile.write(f"{txn.txnid} {const.COMMITTED}\n".encode()) 77 | elif txn.isaborted: 78 | self.wfile.write(f"{txn.txnid} {const.ABORTED}\n".encode()) 79 | elif txn.ispending: 80 | self.wfile.write(f"{txn.txnid} {const.PENDING}\n".encode()) 81 | except ParseException as err: 82 | self.logger.err(err) 83 | 84 | self.wfile.write( 85 | f"{const.SYNTAX_ERR}: ln {err.lineno}, col {err.col}\n".encode() 86 | ) 87 | 88 | 89 | class ClientServer(ThreadingTCPServer): 90 | """server for client communication""" 91 | 92 | allow_reuse_address = True 93 | daemon_threads = True 94 | clients: OrderedDict 95 | 96 | def __init__( 97 | self, 98 | addr: Tuple[str, int], 99 | node: nde.Node, 100 | max_connections: Optional[int] = 100, 101 | ): 102 | self.max_connections = max_connections 103 | self.clients = OrderedDict() 104 | self.node = node 105 | self.logger = _LOGGER.bind(addr=f"{addr[0]}:{addr[1]}") 106 | 107 | super().__init__(addr, Client) 108 | 109 | def process_request(self, request, client_address): 110 | """override""" 111 | 112 | thread = Thread( 113 | target=self.process_request_thread, 114 | args=(request, client_address), 115 | daemon=self.daemon_threads, 116 | name=f"ClientRequestThread-{':'.join(map(str, client_address))}", 117 | ) 118 | 119 | if not thread.daemon and self.block_on_close: 120 | if self._threads is None: 121 | self._threads = [] 122 | self._threads.append(thread) 123 | 124 | thread.start() 125 | 126 | def client_connected(self, client: Client): 127 | """add client""" 128 | 129 | self.clients[client.client_id] = client 130 | addr = client.client_address 131 | self.logger.msg("client.connected", client_address=f"{addr[0]}:{addr[1]}") 132 | 133 | def client_disconnected(self, client: Client): 134 | """remove client""" 135 | 136 | del self.clients[client.client_id] 137 | addr = client.client_address 138 | self.logger.msg("client.disconnected", client_address=f"{addr[0]}:{addr[1]}") 139 | 140 | def server_activate(self): 141 | """override""" 142 | 143 | super().server_activate() 144 | self.logger.msg("client_server.listening") 145 | 146 | def shutdown(self): 147 | "shut it down" 148 | 149 | super().shutdown() 150 | self.logger.msg("client_server.shutdown") 151 | -------------------------------------------------------------------------------- /jdb/server/peer_server.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from time import sleep 3 | from concurrent.futures import ThreadPoolExecutor 4 | from structlog import get_logger 5 | import grpc 6 | from jdb import node as nde, crdt, util, routing as rte 7 | from jdb.pb import peer_server_pb2_grpc as pgrpc, peer_server_pb2 as pb 8 | 9 | _LOGGER = get_logger() 10 | 11 | 12 | class 
PeerServer(pgrpc.PeerServerServicer): 13 | """server for p2p communication""" 14 | 15 | def Coordinate(self, request, context): 16 | req = rte.BatchRequest() 17 | 18 | for re in request.requests: 19 | which = re.WhichOneof("value") 20 | 21 | if which == "put": 22 | req.requests.append(rte.PutRequest(re.put.key, re.put.value)) 23 | elif which == "get": 24 | req.requests.append(rte.GetRequest(re.get.key)) 25 | elif which == "delete": 26 | req.requests.append(rte.DeleteRequest(re.delete.key)) 27 | 28 | txn = self.node.coordinate(req) 29 | 30 | transaction = pb.Transaction( 31 | txnid=txn.txnid, 32 | status=txn.status.value, 33 | read_ts=txn.read_ts, 34 | commit_ts=txn.commit_ts, 35 | returning={k: v if v else b"" for k, v in txn.returning.items()}, 36 | ) 37 | 38 | return pb.BatchResponse(table=request.table, txn=transaction) 39 | 40 | def MembershipPing(self, request, context): 41 | return pb.Ack(ack=True) 42 | 43 | def MembershipPingReq(self, request, context): 44 | try: 45 | ack = self.node.membership.ping_req(request.peer_name, request.peer_addr) 46 | except Exception: # pylint: disable=broad-except 47 | ack = False 48 | 49 | return pb.Ack(ack=ack) 50 | 51 | def MembershipStateSync(self, request, context): 52 | incoming = crdt.LWWRegister(replica_id=request.replica_id) 53 | incoming.add_set = util.byteify_keys(request.add_set) 54 | incoming.remove_set = util.byteify_keys(request.remove_set) 55 | state = self.node.membership.state_sync(incoming, peer_addr=request.peer_addr) 56 | 57 | return pb.MembershipState( 58 | replica_id=self.node.name, 59 | peer_addr=self.node.p2p_addr, 60 | remove_set=state.remove_set, 61 | add_set=state.add_set, 62 | ) 63 | 64 | def __init__(self, addr: Tuple[str, int], node: nde.Node): 65 | super().__init__() 66 | 67 | addr_str = ":".join(map(str, addr)) 68 | self.node = node 69 | self.logger = _LOGGER.bind(addr=addr_str) 70 | self.addr = addr 71 | self.stopped = False 72 | 73 | server = grpc.server( 74 | ThreadPoolExecutor(10, thread_name_prefix="PeerServerThreadPool") 75 | ) 76 | 77 | pgrpc.add_PeerServerServicer_to_server(self, server) 78 | server.add_insecure_port(addr_str) 79 | 80 | self.server = server 81 | 82 | def serve_forever(self): 83 | """start it up""" 84 | 85 | self.server.start() 86 | self.logger.msg("peer_server.listening") 87 | 88 | while not self.stopped: 89 | sleep(1) 90 | 91 | def shutdown(self): 92 | """shut it down""" 93 | 94 | self.server.stop(1) 95 | self.server.wait_for_termination(10) 96 | self.stopped = True 97 | self.logger.msg("peer_server.shutdown") 98 | -------------------------------------------------------------------------------- /jdb/server/server.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | from uuid import uuid4 as uuid 3 | from typing import Optional, List 4 | from dataclasses import dataclass, field 5 | from argparse import ArgumentParser 6 | from jdb import server, node as nde 7 | 8 | 9 | @dataclass 10 | class Server: 11 | """top-level server. 
starts client and peer servers in threads""" 12 | 13 | port: Optional[int] = 1337 14 | p2p_port: Optional[int] = 1338 15 | max_connections: Optional[int] = 100 16 | host: Optional[str] = "127.0.0.1" 17 | p2p_host: Optional[str] = "127.0.0.1" 18 | join: Optional[str] = None 19 | node_name: Optional[str] = str(uuid()) 20 | _client_server: server.ClientServer = field(init=False) 21 | _peer_server: server.PeerServer = field(init=False) 22 | _threads: List[Thread] = field(default_factory=list) 23 | node: nde.Node = field(init=False) 24 | p2p_addr: str = field(init=False) 25 | 26 | def __post_init__(self): 27 | """override""" 28 | 29 | p2p_addr = f"{self.p2p_host}:{self.p2p_port}" 30 | self.p2p_addr = p2p_addr 31 | client_addr = f"{self.host}:{self.port}" 32 | 33 | self.node = nde.Node( 34 | p2p_addr=p2p_addr, client_addr=client_addr, name=self.node_name 35 | ) 36 | 37 | self._client_server = server.ClientServer( 38 | addr=(self.host, self.port), 39 | node=self.node, 40 | max_connections=self.max_connections, 41 | ) 42 | 43 | self._peer_server = server.PeerServer( 44 | addr=(self.p2p_host, self.p2p_port), node=self.node 45 | ) 46 | 47 | self._threads = [ 48 | Thread( 49 | target=self._start_client_server, daemon=True, name="ClientServerThread" 50 | ), 51 | Thread( 52 | target=self._start_peer_server, daemon=True, name="PeerServerThread" 53 | ), 54 | Thread(target=self._start_membership, daemon=True, name="MembershipThread"), 55 | ] 56 | 57 | def start(self): 58 | """fire up server for client comms and p2p comms""" 59 | 60 | if self.join: 61 | self.node.bootstrap(self.join) 62 | 63 | for thread in self._threads: 64 | thread.start() 65 | 66 | try: 67 | for thread in self._threads: 68 | thread.join() 69 | except (KeyboardInterrupt, SystemExit): 70 | self.stop() 71 | 72 | def stop(self): 73 | """shut it down""" 74 | 75 | self.node.membership.stop() 76 | self._client_server.shutdown() 77 | self._peer_server.shutdown() 78 | 79 | for thread in self._threads: 80 | thread.join() 81 | 82 | def _start_peer_server(self): 83 | """start up peer grpc server""" 84 | 85 | self._peer_server.serve_forever() 86 | 87 | def _start_membership(self): 88 | """start up peer grpc server""" 89 | 90 | self.node.membership.start() 91 | 92 | def _start_client_server(self): 93 | """start up server for client requests""" 94 | 95 | with self._client_server as cserver: 96 | cserver.serve_forever() 97 | 98 | 99 | def _main(): 100 | """main entry point""" 101 | 102 | parser = ArgumentParser(description="jdb server") 103 | 104 | parser.add_argument( 105 | "-p", "--port", help="port for client connections", default=1337, type=int 106 | ) 107 | parser.add_argument( 108 | "-o", 109 | "--host", 110 | help="host for client connections", 111 | default="127.0.0.1", 112 | type=str, 113 | ) 114 | parser.add_argument( 115 | "-n", "--node-name", help="node name", type=str, default=str(uuid()) 116 | ) 117 | parser.add_argument("-j", "--join", help="node address to join", type=str) 118 | parser.add_argument( 119 | "-r", "--p2p-port", help="port for p2p communication", default=1338, type=int, 120 | ) 121 | parser.add_argument( 122 | "-s", 123 | "--p2p-host", 124 | help="host for p2p communication", 125 | default="127.0.0.1", 126 | type=str, 127 | ) 128 | parser.add_argument( 129 | "-c", "--max-connections", help="max connections", default=100, type=int 130 | ) 131 | args = parser.parse_args() 132 | 133 | srv = Server( 134 | host=args.host, 135 | port=int(args.port), 136 | join=args.join, 137 | max_connections=args.max_connections, 138 | 
p2p_host=args.p2p_host, 139 | p2p_port=int(args.p2p_port), 140 | node_name=args.node_name, 141 | ) 142 | 143 | srv.start() 144 | 145 | 146 | if __name__ == "__main__": 147 | _main() 148 | -------------------------------------------------------------------------------- /jdb/storage/__init__.py: -------------------------------------------------------------------------------- 1 | from .db import DB 2 | from .transaction import Transaction, TransactionMeta, TransactionStatus 3 | from .avltree import AVLTree 4 | from .entry import Entry 5 | 6 | __all__ = [ 7 | "DB", 8 | "Transaction", 9 | "AVLTree", 10 | "Entry", 11 | "TransactionMeta", 12 | "TransactionStatus", 13 | ] 14 | -------------------------------------------------------------------------------- /jdb/storage/avltree.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from dataclasses import dataclass 3 | from typing import Optional 4 | from jdb import types 5 | 6 | 7 | @dataclass 8 | class Node: 9 | """tree nodes""" 10 | 11 | key: types.IndexEntry 12 | left: Optional[Node] = None 13 | right: Optional[Node] = None 14 | maximum: Optional[Node] = None 15 | height: int = 1 16 | 17 | def __post_init__(self): 18 | """override""" 19 | 20 | self.maximum = self 21 | 22 | 23 | class AVLTree: 24 | """avl tree implementation""" 25 | 26 | root: Optional[Node] = None 27 | 28 | def search( 29 | self, key: types.IndexEntry, gte: Optional[bool] = False 30 | ) -> Optional[types.IndexEntry]: 31 | """proxy to root node""" 32 | 33 | return self._search(self.root, key=key, gte=gte) 34 | 35 | def _search( 36 | self, root: Optional[Node], key: types.IndexEntry, gte: Optional[bool] = False 37 | ) -> Optional[types.IndexEntry]: 38 | """ 39 | bst search. 
if gte is true, find exact match or closest node gte search key 40 | """ 41 | 42 | if not root: 43 | return None 44 | 45 | cmp = self._compare(key, root.key) 46 | 47 | if cmp < 0: 48 | if gte and ( 49 | not root.left 50 | or ( 51 | self._compare(key, root.left.key) > 0 52 | and root.left.maximum 53 | and self._compare(key, root.left.maximum.key) > 0 54 | ) 55 | ): 56 | return root.key 57 | 58 | return self._search(root.left, key, gte=gte) 59 | if cmp > 0: 60 | return self._search(root.right, key, gte=gte) 61 | 62 | return root.key 63 | 64 | def _compare(self, one: types.IndexEntry, other: types.IndexEntry) -> int: 65 | """simple comparator""" 66 | 67 | if one[0] == other[0]: 68 | return 0 69 | if one[0] < other[0]: 70 | return -1 71 | 72 | return 1 73 | 74 | def insert(self, key: types.IndexEntry) -> None: 75 | """proxy to root node""" 76 | 77 | node = Node(key=key) 78 | self.root = self._insert(self.root, node) 79 | 80 | def _insert(self, root: Optional[Node], node: Node) -> Node: 81 | """bst insert then rebalance if balance factor +/- 2""" 82 | 83 | if not root: 84 | return node 85 | 86 | cmp = self._compare(node.key, root.key) 87 | 88 | if cmp == 0: 89 | root.key = node.key 90 | elif cmp < 0: 91 | root.left = self._insert(root.left, node) 92 | elif cmp > 0: 93 | root.maximum = node 94 | root.right = self._insert(root.right, node) 95 | 96 | lheight = self._getheight(root.left) 97 | rheight = self._getheight(root.right) 98 | root.height = 1 + max(lheight, rheight) 99 | balance = lheight - rheight 100 | result = root 101 | 102 | if balance > 1 and root.left and self._compare(node.key, root.left.key) < 0: 103 | result = self._right_rotate(root) 104 | elif ( 105 | balance < -1 and root.right and self._compare(node.key, root.right.key) > 0 106 | ): 107 | result = self._left_rotate(root) 108 | elif balance > 1 and root.left and self._compare(node.key, root.left.key) > 0: 109 | root.left = self._left_rotate(root.left) 110 | result = self._right_rotate(root) 111 | elif ( 112 | balance < -1 and root.right and self._compare(node.key, root.right.key) < 0 113 | ): 114 | root.right = self._right_rotate(root.right) 115 | result = self._left_rotate(root) 116 | 117 | return result 118 | 119 | def _left_rotate(self, node: Node): 120 | """l rotate""" 121 | 122 | right = node.right 123 | if not right: 124 | return node 125 | rleft = right.left 126 | right.left = node 127 | node.right = rleft 128 | node.height = 1 + max(self._getheight(node.left), self._getheight(node.right)) 129 | right.height = 1 + max( 130 | self._getheight(right.left), self._getheight(right.right) 131 | ) 132 | return right 133 | 134 | def _right_rotate(self, node: Node): 135 | """r rotate""" 136 | 137 | left = node.left 138 | if not left: 139 | return node 140 | lright = left.right 141 | left.right = node 142 | node.left = lright 143 | node.height = 1 + max(self._getheight(node.left), self._getheight(node.right)) 144 | left.height = 1 + max(self._getheight(left.left), self._getheight(left.right)) 145 | return left 146 | 147 | def _getheight(self, node: Optional[Node]) -> int: 148 | """helper""" 149 | 150 | if not node: 151 | return 0 152 | 153 | return node.height 154 | -------------------------------------------------------------------------------- /jdb/storage/compression.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from enum import Enum 3 | from snappy import compress, decompress 4 | 5 | 6 | class CompressionType(Enum): 7 | """only snappy supported for 
now""" 8 | 9 | SNAPPY = 1 10 | 11 | 12 | class Compression: 13 | """wrapper for compression. lz4 was being weird so just snappy for now""" 14 | 15 | def __init__(self, compression_type: Optional[CompressionType]): 16 | self._compression_type = compression_type 17 | 18 | @property 19 | def isenabled(self) -> bool: 20 | """did we set one""" 21 | 22 | return bool(self._compression_type) 23 | 24 | def compress(self, raw: bytes) -> bytes: 25 | """only one type for now""" 26 | 27 | compressed = raw 28 | 29 | if self._compression_type == CompressionType.SNAPPY: 30 | compressed = compress(raw) 31 | 32 | return compressed 33 | 34 | def decompress(self, compressed: bytes) -> bytes: 35 | """only one type for now""" 36 | 37 | raw = compressed 38 | 39 | if self._compression_type == CompressionType.SNAPPY: 40 | raw = decompress(compressed) 41 | 42 | return raw 43 | -------------------------------------------------------------------------------- /jdb/storage/db.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | from contextlib import contextmanager 3 | from jdb.storage import ( 4 | oracle as orc, 5 | entry as ent, 6 | memtable as mem, 7 | compression as cmp, 8 | transaction as txn, 9 | ) 10 | from jdb import ( 11 | const, 12 | types, 13 | ) 14 | 15 | 16 | class DB: 17 | """main db/storage entry point""" 18 | 19 | def __init__( 20 | self, 21 | max_table_size: int = 1024 << 20, 22 | compression: Optional[cmp.CompressionType] = cmp.CompressionType.SNAPPY, 23 | ): 24 | self.oracle = orc.Oracle() 25 | self.memtable = mem.Memtable( 26 | max_size=max_table_size, compression=cmp.Compression(compression) 27 | ) 28 | 29 | def get(self, key: bytes) -> bytes: 30 | """main get API if interfacing with db class directly""" 31 | 32 | with self.transaction() as transaction: 33 | return transaction.read(key=key) 34 | 35 | def put(self, key: types.Key, value: types.Value): 36 | """main put API if interfacing with db class directly""" 37 | 38 | with self.transaction() as transaction: 39 | transaction.write(key=key, value=value) 40 | 41 | def delete(self, key: bytes): 42 | """main delete API if interfacing with db class directly""" 43 | 44 | with self.transaction() as transaction: 45 | transaction.write(key=key, meta=const.BIT_TOMBSTONE) 46 | 47 | def write(self, entries: List[ent.Entry]): 48 | """called by transactions to submit their writes""" 49 | 50 | for entry in entries: 51 | self.memtable.put(entry) 52 | 53 | def read(self, key: types.Key) -> Optional[ent.Entry]: 54 | """called by transactions to read from the db""" 55 | 56 | return self.memtable.get(key) 57 | 58 | @contextmanager 59 | def transaction(self): 60 | """create/yield/commit transaction""" 61 | 62 | transaction = txn.Transaction(db=self) 63 | yield transaction 64 | transaction.commit() 65 | -------------------------------------------------------------------------------- /jdb/storage/entry.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | from binascii import crc32 4 | from dataclasses import dataclass 5 | import uvarint 6 | from jdb.storage import compression as cmp 7 | from jdb import errors as err, const, types 8 | 9 | 10 | @dataclass 11 | class Entry: 12 | """represents a unit of storage in the db/log""" 13 | 14 | key: bytes 15 | value: bytes = bytes() 16 | meta: int = 0 17 | 18 | @classmethod 19 | def decode(cls, buf: bytes, compression: Optional[cmp.Compression] = None) 
-> Entry: 20 | """ 21 | 1. decode header 22 | 2. use header metadata to decode body 23 | 3. verify checksum, raise if mismatch 24 | 4. return object 25 | """ 26 | 27 | decoded = uvarint.cut(4, buf) 28 | body = decoded.rest 29 | _, meta, keylen, valuelen = decoded.integers 30 | key = bytes(body[0:keylen]) 31 | value = bytes(body[keylen : keylen + valuelen]) 32 | 33 | checksum_bytes = body[keylen + valuelen :] 34 | checksum = uvarint.decode(checksum_bytes).integer 35 | header = cls._encode_header(key=key, value=value, meta=meta) 36 | check = crc32(header) 37 | check = crc32(key, check) 38 | check = crc32(value, check) 39 | 40 | if checksum != check: 41 | raise err.ChecksumMismatch() 42 | 43 | if compression and compression.isenabled: 44 | key = compression.decompress(key) 45 | value = compression.decompress(value) 46 | 47 | return Entry(key=key, value=value, meta=meta) 48 | 49 | @property 50 | def isdeleted(self) -> bool: 51 | """return true if tombstone bit is set""" 52 | 53 | return self.meta & const.BIT_TOMBSTONE == 1 54 | 55 | def encode(self, compression: Optional[cmp.Compression] = None) -> bytes: 56 | """ 57 | byte array representation of log entry. 58 | append CRC32 checksum of header and k/v 59 | ----------------------------------------------------------------------- 60 | | block size | meta | key length | value length | key | value | crc32 | 61 | ----------------------------------------------------------------------- 62 | """ 63 | 64 | key, value, meta = self.key, self.value, self.meta 65 | 66 | if compression and compression.isenabled: 67 | key = compression.compress(key) 68 | value = compression.compress(value) 69 | 70 | header = self._encode_header(key=key, value=value, meta=meta) 71 | checksum = crc32(header) 72 | encoded = bytearray(header) 73 | checksum = crc32(key, checksum) 74 | encoded += key 75 | checksum = crc32(value, checksum) 76 | encoded += value 77 | encoded += uvarint.encode(checksum) 78 | block_size = uvarint.encode(len(encoded)) 79 | 80 | return bytes([*block_size, *encoded]) 81 | 82 | @classmethod 83 | def _encode_header(cls, key: types.Key, value: types.Value, meta: int) -> bytes: 84 | """ 85 | byte array representation of header fields/metadata 86 | ------------------------------------ 87 | | meta | key length | value length | 88 | ------------------------------------ 89 | """ 90 | 91 | header_fields = [len(key), len(value)] 92 | header = bytearray([meta]) 93 | 94 | for val in header_fields: 95 | header += uvarint.encode(val) 96 | 97 | return header 98 | -------------------------------------------------------------------------------- /jdb/storage/memtable.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, Tuple, Optional 2 | from math import ceil 3 | import uvarint 4 | from jdb.storage import entry as ent, avltree as avl, compression as cmp 5 | from jdb import errors as err, types 6 | 7 | 8 | class Memtable: 9 | """in memory representation of db""" 10 | 11 | def __init__(self, max_size: int, compression: cmp.Compression): 12 | self.max_size = max_size 13 | self._compression = compression 14 | self._arena = bytearray() 15 | self._entries_count = 0 16 | self._offset = 0 17 | self._index = avl.AVLTree() 18 | 19 | def put(self, entry: ent.Entry) -> None: 20 | """append an entry to the log""" 21 | 22 | encoded = entry.encode(compression=self._compression) 23 | size = len(encoded) 24 | 25 | if self.size() + size > self.max_size: 26 | raise err.TableOverflow() 27 | 28 | self._index.insert((entry.key, 
29 | self._arena += encoded 30 | self._entries_count += 1 31 | self._offset += size 32 | 33 | def get(self, key: types.Key) -> Optional[ent.Entry]: 34 | """find key and pointer in index, lookup value""" 35 | 36 | val = self._find_near(key) 37 | 38 | if not val: 39 | return None 40 | 41 | offset = val[1] 42 | entry, _ = self._decode_at_offset(offset) 43 | return entry 44 | 45 | def size(self) -> int: 46 | """byte length of storage""" 47 | 48 | return len(self._arena) 49 | 50 | def entries_count(self) -> int: 51 | """number of entries in db""" 52 | 53 | return self._entries_count 54 | 55 | def scan(self) -> Generator[ent.Entry, None, None]: 56 | """scan through log""" 57 | 58 | offset = 0 59 | 60 | while offset < len(self._arena): 61 | entry, bytes_read = self._decode_at_offset(offset) 62 | yield entry 63 | offset = offset + bytes_read 64 | 65 | def _find_near(self, key: types.Key) -> Optional[types.IndexEntry]: 66 | """find the closest version of this key""" 67 | 68 | return self._index.search((key, 0), gte=True) 69 | 70 | def _decode_at_offset(self, offset: types.Offset) -> Tuple[ent.Entry, int]: 71 | """ 72 | given an offset, return the entry starting there 73 | and the byte length of the entry 74 | """ 75 | 76 | block_size = uvarint.cut(1, self._arena[offset:]).integers[0] 77 | block_end = offset + block_size + ceil(block_size.bit_length() / 7)  # a uvarint carries 7 payload bits per byte, so this term is the byte width of the size prefix 78 | bytes_read = block_end - offset 79 | chunk = self._arena[offset:block_end] 80 | decoded = ent.Entry.decode(chunk, compression=self._compression) 81 | 82 | return (decoded, bytes_read) 83 | -------------------------------------------------------------------------------- /jdb/storage/oracle.py: -------------------------------------------------------------------------------- 1 | from threading import Lock 2 | from jdb import errors as err, const 3 | 4 | 5 | class Oracle: 6 | """ 7 | transaction status oracle. 8 | enforce isolation levels and maintain ordering of transactions. 9 | transactions aren't threadsafe but the operations in this class must be 10 | """ 11 | 12 | def __init__(self): 13 | self._next_ts = 1 14 | self._commits = {} 15 | self._lock = Lock() 16 | self.write_lock = Lock() 17 | 18 | def read_ts(self) -> int: 19 | """ 20 | the read timestamp is one less than the next commit timestamp, so a 21 | new transaction's snapshot includes everything committed before it began 22 | """ 23 | 24 | with self._lock: 25 | return self._next_ts - 1 26 | 27 | def commit_request(self, txn) -> int: 28 | """ 29 | per ssi - abort the transaction if any write committed since this 30 | transaction started touches a key this transaction read, then record 31 | this transaction's writes so other transactions can do the same.
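32 | e.g. txn A reads key x with read_ts 1; txn B then commits a write to x at ts 2; 33 | when A later requests a commit, last_commit(x) = 2 > A.read_ts, so A aborts.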
34 | threadsafe 35 | """ 36 | 37 | with self._lock: 38 | return self._commit_request(txn) 39 | 40 | def _commit_request(self, txn) -> int: 41 | """not threadsafe""" 42 | 43 | for key in txn.reads: 44 | last_commit = self._commits.get(key) 45 | 46 | if last_commit and last_commit > txn.read_ts: 47 | raise err.Abort() 48 | 49 | if self._next_ts == const.MAX_UINT_64:  # check before allocating so the counter never runs past the max 50 | raise OverflowError() 51 | 52 | ts = self._next_ts 53 | self._next_ts += 1 54 | 55 | for key in txn.writes.keys(): 56 | self._commits[key] = ts 57 | 58 | return ts 59 | -------------------------------------------------------------------------------- /jdb/storage/transaction.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from dataclasses import dataclass 3 | from typing import Optional, MutableSet, Dict 4 | from uuid import uuid4 as uuid 5 | from enum import Enum 6 | from collections import OrderedDict 7 | from jdb import util, types as t, storage, errors as err 8 | 9 | 10 | class TransactionStatus(Enum): 11 | """what state the txn is in""" 12 | 13 | PENDING = 0 14 | COMMITTED = 1 15 | ABORTED = 2 16 | NOOP = 3 17 | 18 | 19 | @dataclass 20 | class TransactionMeta: 21 | """data only. TODO refactor""" 22 | 23 | txnid: str 24 | read_ts: t.Timestamp 25 | commit_ts: Optional[t.Timestamp] 26 | status: TransactionStatus 27 | returning: t.Returning 28 | 29 | @property 30 | def ispending(self) -> bool: 31 | """helper""" 32 | 33 | return self.status == TransactionStatus.PENDING 34 | 35 | @property 36 | def isaborted(self) -> bool: 37 | """helper""" 38 | 39 | return self.status == TransactionStatus.ABORTED 40 | 41 | @property 42 | def iscommitted(self) -> bool: 43 | """helper""" 44 | 45 | return self.status == TransactionStatus.COMMITTED 46 | 47 | 48 | class Transaction: 49 | """represents a db transaction""" 50 | 51 | db: storage.DB 52 | writes: OrderedDict 53 | reads: MutableSet[t.Key] 54 | txnid: str 55 | read_ts: t.Timestamp 56 | commit_ts: Optional[t.Timestamp] 57 | status: TransactionStatus 58 | 59 | def __init__(self, db: storage.DB): 60 | self.db = db 61 | self.writes = OrderedDict() 62 | self.reads = set() 63 | self.returning: Dict[t.Key, Optional[t.Value]] = {} 64 | self.txnid = str(uuid()) 65 | self.read_ts = db.oracle.read_ts() 66 | self.commit_ts = None 67 | self.status = TransactionStatus.PENDING 68 | 69 | def read(self, key: t.Key) -> Optional[t.Value]: 70 | """ 71 | if this transaction has any writes for this key, fulfill from there. 72 | else, load latest version from its snapshot of the db and track the 73 | read key 74 | """ 75 | 76 | if key in self.writes: 77 | return self.writes[key].value 78 | 79 | self.reads.add(key) 80 | 81 | seek = util.encode_key_with_ts(key, self.read_ts) 82 | version = self.db.read(seek) 83 | 84 | if not version: 85 | self.returning[key] = None 86 | return None 87 | 88 | versionkey, _ = util.decode_key_with_ts(version.key) 89 | 90 | if versionkey != key or version.isdeleted: 91 | self.returning[key] = None 92 | return None 93 | 94 | self.returning[key] = version.value 95 | return version.value 96 | 97 | def write(self, key: t.Key, value: t.Value = bytes(), meta: int = 0): 98 | """add a pending write""" 99 | 100 | self.writes[key] = storage.Entry(key=key, value=value, meta=meta) 101 | 102 | def isreadonly(self) -> bool: 103 | """helper""" 104 | 105 | return not self.writes 106 | 107 | def commit(self) -> Transaction: 108 | """ 109 | don't incur any oracle overhead if there are no writes to process;
110 | else, get a commit ts from oracle and apply to all writes then ship 111 | over to db to persist 112 | """ 113 | 114 | if not self.writes: 115 | self.status = TransactionStatus.NOOP 116 | return self 117 | 118 | with self.db.oracle.write_lock: 119 | return self._commit() 120 | 121 | def _commit(self) -> Transaction: 122 | """we have writes, commit transaction""" 123 | 124 | try: 125 | commit_ts = self.db.oracle.commit_request(self) 126 | except err.Abort as exc: 127 | self.status = TransactionStatus.ABORTED 128 | raise exc 129 | 130 | self.commit_ts = commit_ts 131 | writes = [] 132 | 133 | for key, write in self.writes.items(): 134 | key_with_ts = util.encode_key_with_ts(key=key, ts=commit_ts) 135 | new_entry = storage.Entry( 136 | key=key_with_ts, value=write.value, meta=write.meta 137 | ) 138 | writes.append(new_entry) 139 | 140 | self.db.write(writes) 141 | self.status = TransactionStatus.COMMITTED 142 | return self 143 | -------------------------------------------------------------------------------- /jdb/types.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Dict, Optional 2 | 3 | ID = int 4 | Key = bytes 5 | Value = bytes 6 | Offset = int 7 | Timestamp = int 8 | IndexEntry = Tuple[Key, Offset] 9 | Returning = Dict[Key, Optional[Value]] 10 | -------------------------------------------------------------------------------- /jdb/util.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from typing import Tuple 3 | from collections import OrderedDict 4 | from jdb.types import Key, Timestamp 5 | from jdb.const import MAX_UINT_64 6 | 7 | 8 | def encode_key_with_ts(key: Key, ts: Timestamp) -> Key: 9 | """append ts as last 8 bytes of key""" 10 | 11 | encoded_ts = (MAX_UINT_64 - ts).to_bytes(8, byteorder="big") 12 | return key + encoded_ts 13 | 14 | 15 | def decode_key_with_ts(key_with_ts: Key) -> Tuple[Key, Timestamp]: 16 | """parse out ts""" 17 | 18 | key = key_with_ts[:-8] 19 | ts = MAX_UINT_64 - int.from_bytes(key_with_ts[-8:], byteorder="big") 20 | return (key, ts) 21 | 22 | 23 | def now_ms() -> int: 24 | """ms since epoch""" 25 | 26 | return int(datetime.now(tz=timezone.utc).timestamp() * 1000) 27 | 28 | 29 | def byteify_keys(obj: OrderedDict) -> OrderedDict: 30 | """hacky""" 31 | 32 | return OrderedDict({k.encode(): v for k, v in obj.items()}) 33 | 34 | 35 | def stringify_keys(obj: OrderedDict) -> OrderedDict: 36 | """hacky""" 37 | 38 | return OrderedDict({k.decode(): v for k, v in obj.items()}) 39 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | junit_family=xunit1 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 
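2 | # editable install: runtime and dev dependencies are declared in install_requires in setup.py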
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="jdb", 5 | version="0.0.1", 6 | description="jdb", 7 | author="thejchap", 8 | packages=find_packages(exclude=["tests"]),  # pick up jdb.storage, jdb.server, jdb.pb subpackages, not just the top-level package 9 | install_requires=[ 10 | "black", 11 | "pylint", 12 | "flake8", 13 | "mypy", 14 | "pytest", 15 | "uvarint", 16 | "python-snappy", 17 | "xxhash", 18 | "structlog", 19 | "colorama", 20 | "freezegun", 21 | "grpcio", 22 | "grpcio-tools", 23 | "tenacity", 24 | "redis", 25 | "lmdb", 26 | "sympy", 27 | "matplotlib", 28 | "snakeviz", 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thejchap/jdb/d1652b46069e3d515215bb6681a8d026c31b81c2/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_unit.py: -------------------------------------------------------------------------------- 1 | # pylint:disable=redefined-outer-name 2 | 3 | from pytest import fixture, mark, raises 4 | from freezegun import freeze_time 5 | import jdb.storage as db 6 | import jdb.errors as err 7 | import jdb.jql as jql 8 | import jdb.util as util 9 | import jdb.node as nde 10 | import jdb.hlc as hlc 11 | import jdb.crdt as crdt 12 | import jdb.routing as rte 13 | import jdb.membership as mbr 14 | import jdb.maglev as mag 15 | 16 | 17 | @fixture 18 | def parser(node: nde.Node): 19 | """parser test subject""" 20 | 21 | return jql.JQL(node=node) 22 | 23 | 24 | @fixture 25 | def tree(): 26 | """index test subject""" 27 | 28 | return db.AVLTree() 29 | 30 | 31 | def test_basic(): 32 | database = db.DB() 33 | database.put(b"a", b"hello") 34 | database.put(b"b", b"world") 35 | 36 | val1 = database.get(b"a") 37 | val2 = database.get(b"b") 38 | 39 | assert val1 == b"hello" 40 | assert val2 == b"world" 41 | 42 | 43 | def test_basic_2(): 44 | database = db.DB() 45 | database.put(b"hello", b"world") 46 | database.put(b"hello1", b"world1") 47 | database.put(b"hello2", b"world2") 48 | 49 | val = database.get(b"hello2") 50 | 51 | assert val == b"world2" 52 | 53 | 54 | def test_correct_key(): 55 | database = db.DB() 56 | database.put(b"/world/1", b"hello") 57 | 58 | val = database.get(b"/hello/world") 59 | 60 | assert not val 61 | 62 | 63 | @mark.parametrize("key,value,meta", [(b"foo", b"world", 0), (b"hello", b"bar", 1)]) 64 | def test_encode_decode(key, value, meta): 65 | entry = db.Entry(key=key, value=value, meta=meta) 66 | encoded = entry.encode() 67 | assert db.Entry.decode(encoded) == entry 68 | 69 | 70 | def test_overflow(): 71 | smalldb = db.DB(max_table_size=256, compression=None) 72 | key = bytes(bytearray(128)) 73 | value = b"" 74 | smalldb.put(key, value) 75 | 76 | with raises(err.TableOverflow): 77 | smalldb.put(key, value) 78 | 79 | 80 | def test_compression(): 81 | database = db.DB() 82 | value = ("hello " * 1000 + "world " * 1000).encode("utf-8") 83 | key = b"hello" 84 | database.put(key, value) 85 | 86 | assert database.get(key) == value 87 | 88 | 89 | def test_ssi(): 90 | database = db.DB() 91 | database.put(b"a", b"b") 92 | 93 | txn1 = db.Transaction(database) 94 | txn2 = db.Transaction(database) 95 | txn3 = db.Transaction(database) 96 | 97 | txn1.write(b"a", b"z") 98 | assert txn2.read(b"a") == b"b" 99 | txn2.write(b"a", b"y") 100 | 101 | txn3.write(b"c", b"d") 102
| txn3.read(b"c") 103 | 104 | txn1.commit() 105 | 106 | with raises(err.Abort): 107 | txn2.commit() 108 | 109 | txn3.commit() 110 | 111 | assert txn1.read_ts == 1 112 | assert txn2.read_ts == 1 113 | assert txn3.read_ts == 1 114 | assert txn1.commit_ts == 2 115 | assert txn3.commit_ts == 3 116 | 117 | 118 | def test_avl(tree: db.AVLTree): 119 | tree.insert((bytes([10]), 0)) 120 | tree.insert((bytes([20]), 0)) 121 | tree.insert((bytes([30]), 0)) 122 | tree.insert((bytes([40]), 0)) 123 | tree.insert((bytes([50]), 0)) 124 | tree.insert((bytes([25]), 0)) 125 | 126 | assert tree.root 127 | assert int.from_bytes(tree.root.key[0], byteorder="big") == 30 128 | assert tree.root.left 129 | assert int.from_bytes(tree.root.left.key[0], byteorder="big") == 20 130 | assert tree.root.left.left 131 | assert int.from_bytes(tree.root.left.left.key[0], byteorder="big") == 10 132 | assert tree.root.left.right 133 | assert int.from_bytes(tree.root.left.right.key[0], byteorder="big") == 25 134 | assert tree.root.right 135 | assert int.from_bytes(tree.root.right.key[0], byteorder="big") == 40 136 | assert tree.root.right.right 137 | assert int.from_bytes(tree.root.right.right.key[0], byteorder="big") == 50 138 | assert not tree.search((bytes([70]), 0)) 139 | 140 | 141 | def test_avl_near(tree: db.AVLTree): 142 | tree.insert((bytes([3]), 0)) 143 | tree.insert((bytes([4]), 0)) 144 | tree.insert((bytes([1]), 0)) 145 | 146 | assert tree.search((bytes([2]), 0), gte=True) == (bytes([3]), 0) 147 | 148 | 149 | def test_avl_near_2(tree: db.AVLTree): 150 | tree.insert((bytes([2]), 0)) 151 | tree.insert((bytes([1]), 0)) 152 | tree.insert((bytes([5]), 0)) 153 | 154 | assert tree.search((bytes([4]), 0), gte=True) == (bytes([5]), 0) 155 | 156 | 157 | def test_avl_near_3(tree: db.AVLTree): 158 | tree.insert((bytes([5]), 0)) 159 | tree.insert((bytes([2]), 0)) 160 | tree.insert((bytes([1]), 0)) 161 | tree.insert((bytes([3]), 0)) 162 | 163 | assert tree.search((bytes([3]), 0), gte=True) == (bytes([3]), 0) 164 | 165 | 166 | @mark.skip 167 | def test_parse_put(): 168 | parser = jql.JQL(node=nde.Node()) 169 | statement = "put hello world;" 170 | _, txn = parser.call(statement) 171 | 172 | assert txn 173 | assert txn.writes[b"hello"].value == b"world" 174 | 175 | 176 | def test_parse_get(): 177 | node = nde.Node() 178 | parser = jql.JQL(node) 179 | database = node.store 180 | database.put(b"hello", b"world") 181 | statement = "get hello;" 182 | val, txn1 = parser.call(statement) 183 | 184 | assert not txn1 185 | assert val == "world" 186 | 187 | 188 | @mark.skip 189 | def test_parse_transaction(): 190 | parser = jql.JQL(node=nde.Node()) 191 | statement = "begin\nput a b\nput c d\nend;" 192 | _, txn1 = parser.call(statement) 193 | 194 | assert txn1 195 | assert txn1.writes[b"a"].value == b"b" 196 | assert txn1.writes[b"c"].value == b"d" 197 | 198 | 199 | @mark.skip 200 | def test_parse_transaction_with_read(): 201 | parser = jql.JQL(node=nde.Node()) 202 | statement = "begin\nput a b\nget a\nend;" 203 | _, txn1 = parser.call(statement) 204 | 205 | assert txn1 206 | assert txn1.writes[b"a"].value == b"b" 207 | assert b"a" not in txn1.reads 208 | 209 | 210 | def test_key_with_ts(): 211 | key_with_ts = util.encode_key_with_ts(b"hello", 100) 212 | key, ts = util.decode_key_with_ts(key_with_ts) 213 | 214 | assert key == b"hello" 215 | assert ts == 100 216 | 217 | 218 | def test_peer_merge_basic(): 219 | global_clock = hlc.HLC() 220 | cs1 = crdt.LWWRegister(replica_id=1) 221 | cs2 = crdt.LWWRegister(replica_id=2) 222 | cs1.clock = 
global_clock 223 | cs2.clock = global_clock 224 | cs1.add(b"a") 225 | cs2.remove(b"a") 226 | cs1.add(b"b") 227 | cs1.add(b"d") 228 | cs2.add(b"c") 229 | cs2.remove(b"d") 230 | merged = dict(cs1.merge(cs2)) 231 | 232 | assert b"b" in merged 233 | assert b"c" in merged 234 | assert b"a" not in merged 235 | assert b"d" not in merged 236 | 237 | 238 | @freeze_time("1970-01-01") 239 | def test_peer_merge_concurrent(): 240 | cs1 = crdt.LWWRegister(replica_id=1) 241 | cs2 = crdt.LWWRegister(replica_id=2) 242 | cs1.remove(b"a") 243 | cs2.add(b"a") 244 | merged = cs1.merge(cs2) 245 | merge_dict = dict(merged) 246 | 247 | assert b"a" in merge_dict 248 | 249 | 250 | def test_hlc(): 251 | clock = hlc.HLC() 252 | clock.incr() 253 | clock.incr() 254 | clock.incr() 255 | ts1 = clock.incr() 256 | ts1int = int(ts1) 257 | ts2 = hlc.HLCTimestamp.from_int(ts1int) 258 | 259 | assert ts2.ts == ts1.ts 260 | assert ts2.count == ts1.count 261 | 262 | 263 | def test_routing(): 264 | name = "3" 265 | addr = "0.0.0.3" 266 | node = nde.Node(name=name, p2p_addr=addr) 267 | membership = mbr.Membership(node_name=name, node_addr=addr) 268 | membership.add_peer("1", "0.0.0.1") 269 | router = rte.Router(membership=membership, node=node) 270 | req = rte.BatchRequest() 271 | req.requests.append(rte.PutRequest(key=b"/2/a", value=b"1")) 272 | 273 | router.request(req) 274 | 275 | 276 | def test_maglev(): 277 | maglev = mag.Maglev({"a", "b", "c"}) 278 | assert maglev.m == 307 279 | 280 | for entry in maglev.table: 281 | assert entry != -1 282 | --------------------------------------------------------------------------------