├── .lint ├── .gitignore ├── lint.sh ├── remark-lint-config.js └── package.json ├── etherdog.png ├── discv5 ├── img │ ├── ticket-validity.png │ ├── message-packet-layout.png │ ├── topic-queue-diagram.png │ ├── topic-radius-diagram.png │ ├── handshake-packet-layout.png │ └── whoareyou-packet-layout.png ├── discv5.md ├── discv5-wire-test-vectors.md ├── discv5-wire.md ├── discv5-rationale.md └── discv5-theory.md ├── .circleci └── config.yml ├── enr-entries ├── eth.md └── les.md ├── README.md ├── caps ├── wit.md ├── pip.md ├── snap.md └── les.md ├── enr.md ├── dnsdisc.md ├── discv4.md └── rlpx.md /.lint/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | -------------------------------------------------------------------------------- /etherdog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/etherdog.png -------------------------------------------------------------------------------- /discv5/img/ticket-validity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/ticket-validity.png -------------------------------------------------------------------------------- /discv5/img/message-packet-layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/message-packet-layout.png -------------------------------------------------------------------------------- /discv5/img/topic-queue-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/topic-queue-diagram.png -------------------------------------------------------------------------------- /discv5/img/topic-radius-diagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/topic-radius-diagram.png -------------------------------------------------------------------------------- /discv5/img/handshake-packet-layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/handshake-packet-layout.png -------------------------------------------------------------------------------- /discv5/img/whoareyou-packet-layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/whoareyou-packet-layout.png -------------------------------------------------------------------------------- /.lint/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | d=$(dirname $0) 4 | $d/node_modules/remark-cli/cli.js --no-stdout --frail --rc-path $d/remark-lint-config.js $* 5 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | jobs: 3 | build: 4 | docker: 5 | - image: cimg/node:lts 6 | steps: 7 | - checkout 8 | - restore_cache: 9 | key: lint-node-modules-{{ checksum ".lint/package-lock.json" }} 10 | - run: 11 | name: Install the Markdown Linter 12 | command: "cd .lint && npm install" 13 | - run: 14 | name: Run the Markdown Linter 15 | command: ".lint/lint.sh ." 
16 | - save_cache: 17 | key: lint-node-modules-{{ checksum ".lint/package-lock.json" }} 18 | paths: 19 | - .lint/node_modules 20 | -------------------------------------------------------------------------------- /enr-entries/eth.md: -------------------------------------------------------------------------------- 1 | # The "eth" ENR entry 2 | 3 | This specification defines the "eth" ENR entry, which provides information 4 | about the [eth capability] on a certain node. 5 | 6 | ## Entry Format 7 | 8 | entry-key = "eth" 9 | entry-value = [[ forkHash, forkNext ]] 10 | 11 | At this time, the "eth" entry is a single element list containing an [EIP-2124] fork ID 12 | value. Please see the EIP for definitions of `forkHash` and `forkNext`. 13 | 14 | In order to be compatible with future versions of this specifications, implementations 15 | should ignore any additional list elements in `entry-value`. 16 | 17 | ## Change Log 18 | 19 | ### EIP-2124 (May 2019) 20 | 21 | The initial version of the "eth" entry was proposed in [EIP-2124]. 22 | 23 | [eth capability]: ../caps/eth.md 24 | [EIP-2124]: https://eips.ethereum.org/EIPS/eip-2124 25 | -------------------------------------------------------------------------------- /.lint/remark-lint-config.js: -------------------------------------------------------------------------------- 1 | // Get list of plugins from package.json. 
2 | var fs = require('fs'), path = require('path'); 3 | var packageFile = path.resolve(__dirname, 'package.json'); 4 | var deps = Object.keys(JSON.parse(fs.readFileSync(packageFile)).dependencies); 5 | 6 | var pluginOptions = { 7 | 'remark-lint-code-block-style': 'indented', 8 | 'remark-lint-emphasis-marker': '*', 9 | 'remark-lint-strong-marker': '*', 10 | 'remark-lint-heading-style': 'atx', 11 | 'remark-lint-list-item-indent': 'space', 12 | 'remark-lint-no-heading-punctuation': '.,;:!', 13 | 'remark-lint-unordered-list-marker-style': '-', 14 | 'remark-lint-no-dead-urls': { skipOffline: true }, 15 | 'remark-lint-no-missing-blank-lines': { exceptTightLists: true }, 16 | }; 17 | 18 | exports.plugins = []; 19 | deps.forEach(function (d) { 20 | if (d.match(/^remark-(lint|validate)/)) { 21 | var option = pluginOptions[d]; 22 | exports.plugins.push(option ? [d, option] : d); 23 | } 24 | }); 25 | -------------------------------------------------------------------------------- /enr-entries/les.md: -------------------------------------------------------------------------------- 1 | # The "les" ENR entry 2 | 3 | This specification defines the "les" ENR entry, which provides information about the [les 4 | capability] provided by a node. The presence of this entry in a node's ENR indicates that 5 | the node is acting as a light client server. ENRs containing the "les" entry must also 6 | contain an [eth entry], which provides information about the specific Ethereum blockchain 7 | served by LES. 8 | 9 | ## Entry Format 10 | 11 | entry-key = "les" 12 | entry-value = [ vflux-version ] 13 | 14 | At this time, the "les" entry is a single element list containing the version number of 15 | the 'vflux' payment protocol. 16 | 17 | In order to be compatible with future versions of this specifications, implementations 18 | should ignore any additional list elements in `entry-value`. 
19 | 20 | ## Change Log 21 | 22 | ### vflux-version (March 2021) 23 | 24 | In March 2021, the les entry was updated to include the vflux version number. 25 | 26 | ### Initial Version (October 2019) 27 | 28 | The initial version of the les entry was an empty list with the sole purpose of 29 | signaling LES server support. 30 | 31 | [les capability]: ../caps/les.md 32 | [eth entry]: ./eth.md 33 | -------------------------------------------------------------------------------- /.lint/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "devp2p-specs-lint-setup", 3 | "description": "markdown linter setup for devp2p specs", 4 | "version": "0.0.0", 5 | "keywords": [], 6 | "repository": "https://github.com/ethereum/devp2p", 7 | "bugs": "https://github.com/ethereum/devp2p/issues", 8 | "dependencies": { 9 | "remark-cli": "^10.0.0", 10 | "remark-lint": "^6.0.6", 11 | "remark-lint-blockquote-indentation": "^1.0.4", 12 | "remark-lint-code-block-style": "^1.0.4", 13 | "remark-lint-definition-spacing": "^1.0.5", 14 | "remark-lint-emphasis-marker": "^1.0.4", 15 | "remark-lint-fenced-code-flag": "^1.0.4", 16 | "remark-lint-fenced-code-marker": "^1.0.4", 17 | "remark-lint-final-definition": "^1.0.4", 18 | "remark-lint-final-newline": "^1.0.5", 19 | "remark-lint-hard-break-spaces": "^1.0.5", 20 | "remark-lint-heading-style": "^1.0.4", 21 | "remark-lint-linebreak-style": "^1.0.4", 22 | "remark-lint-link-title-style": "^1.0.5", 23 | "remark-lint-list-item-bullet-indent": "^1.0.4", 24 | "remark-lint-list-item-indent": "^1.0.5", 25 | "remark-lint-no-auto-link-without-protocol": "^1.0.4", 26 | "remark-lint-no-blockquote-without-marker": "^2.0.4", 27 | "remark-lint-no-consecutive-blank-lines": "^1.0.4", 28 | "remark-lint-no-dead-urls": "^0.4.1", 29 | "remark-lint-no-duplicate-definitions": "^1.0.6", 30 | "remark-lint-no-duplicate-headings-in-section": "^1.0.5", 31 | "remark-lint-no-empty-sections": "^3.0.0", 32 | 
"remark-lint-no-empty-url": "^1.0.6", 33 | "remark-lint-no-heading-content-indent": "^1.0.4", 34 | "remark-lint-no-heading-indent": "^1.0.4", 35 | "remark-lint-no-heading-like-paragraph": "^1.0.4", 36 | "remark-lint-no-heading-punctuation": "^1.0.4", 37 | "remark-lint-no-inline-padding": "^1.0.5", 38 | "remark-lint-no-literal-urls": "^1.0.4", 39 | "remark-lint-no-missing-blank-lines": "^1.0.4", 40 | "remark-lint-no-paragraph-content-indent": "^1.0.7", 41 | "remark-lint-no-reference-like-url": "^1.0.5", 42 | "remark-lint-no-shortcut-reference-image": "^1.0.4", 43 | "remark-lint-no-table-indentation": "^1.0.5", 44 | "remark-lint-no-tabs": "^1.0.4", 45 | "remark-lint-no-undefined-references": "^1.1.2", 46 | "remark-lint-no-unneeded-full-reference-image": "^1.0.1", 47 | "remark-lint-no-unneeded-full-reference-link": "^1.0.1", 48 | "remark-lint-no-unused-definitions": "^1.0.6", 49 | "remark-lint-ordered-list-marker-style": "^1.0.4", 50 | "remark-lint-ordered-list-marker-value": "^1.0.5", 51 | "remark-lint-rule-style": "^1.0.4", 52 | "remark-lint-strong-marker": "^1.0.4", 53 | "remark-lint-table-cell-padding": "^1.0.5", 54 | "remark-lint-table-pipe-alignment": "^1.0.4", 55 | "remark-lint-table-pipes": "^1.0.4", 56 | "remark-lint-unordered-list-marker-style": "^1.0.4", 57 | "remark-validate-links": "^8.0.0" 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /discv5/discv5.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 2 | 3 | **Protocol version v5.1** 4 | 5 | Welcome to the Node Discovery Protocol v5 specification! 6 | 7 | Note that this specification is a work in progress and may change incompatibly without 8 | prior notice. 9 | 10 | Node Discovery is a system for finding other participants in a peer-to-peer network. 
The 11 | system can be used by any node, for any purpose, at no cost other than running the network 12 | protocol and storing a limited number of other nodes' records. Any node can be used as an 13 | entry point into the network. 14 | 15 | The system's design is loosely inspired by the Kademlia DHT, but unlike most DHTs no 16 | arbitrary keys and values are stored. Instead, the DHT stores and relays 'node records', 17 | which are signed documents providing information about nodes in the network. Node 18 | Discovery acts as a database of all live nodes in the network and performs three basic 19 | functions: 20 | 21 | - Sampling the set of all live participants: by walking the DHT, the network can be 22 | enumerated. 23 | - Searching for participants providing a certain service: Node Discovery v5 includes a 24 | scalable facility for registering 'topic advertisements'. These advertisements can be 25 | queried and nodes advertising a topic found. 26 | - Authoritative resolution of node records: if a node's ID is known, the most recent 27 | version of its record can be retrieved. 28 | 29 | ## Specification Overview 30 | 31 | The specification has three parts: 32 | 33 | - [discv5-wire.md] defines the wire protocol. 34 | - [discv5-theory.md] describes the algorithms and data structures. 35 | - [discv5-rationale.md] contains the design rationale. 36 | 37 | ## Comparison With Other Discovery Mechanisms 38 | 39 | Systems such as MDNS/Bonjour allow finding hosts in a local-area network. The Node 40 | Discovery Protocol is designed to work on the Internet and is most useful for applications 41 | with a large number of participants spread across the Internet. 42 | 43 | Systems using a rendezvous server: these systems are commonly used by desktop applications 44 | or cloud services to connect participants to each other. While undoubtedly efficient, this 45 | requires trust in the operator of the rendezvous server and these systems are prone to 46 | censorship. 
Compared to a rendezvous server, The Node Discovery Protocol doesn't rely on a 47 | single operator and places a small amount of trust in every participant. It becomes more 48 | resistant to censorship as the size of the network increases and participants of multiple 49 | distinct peer-to-peer networks can share the discovery network to further increase its 50 | resilience. 51 | 52 | The Achilles heel of the Node Discovery Protocol is the process of joining the network: 53 | while any other node may be used as an entry point, such a node must first be located 54 | through some other mechanism. Several approaches including scalable listing of initial 55 | entry points in DNS or discovery of participants in the local network can be used for 56 | reasonable secure entry into the network. 57 | 58 | ## Comparison With Node Discovery v4 59 | 60 | - Topic advertisement was added. 61 | - Arbitrary node metadata can be stored/relayed. 62 | - Node identity crypto is extensible, use of secp256k1 keys isn't strictly required. 63 | - The protocol no longer relies on the system clock. 64 | - Communication is encrypted, protecting topic searches and record lookups against passive 65 | observers. 66 | 67 | [discv5-wire.md]: ./discv5-wire.md 68 | [discv5-theory.md]: ./discv5-theory.md 69 | [discv5-rationale.md]: ./discv5-rationale.md 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | This repository contains specifications for the peer-to-peer networking protocols used by 4 | Ethereum. The issue tracker here is for discussions of protocol changes. It's also OK to 5 | open an issue if you just have a question. 6 | 7 | Protocol level security issues are valuable! Please report serious issues responsibly 8 | through the [Ethereum Foundation Bounty Program]. 9 | 10 | We have several specifications for low-level protocols: 11 | 12 | - [Ethereum Node Records] 13 | - [DNS Node Lists] 14 | - [Node Discovery Protocol v4] 15 | - [Node Discovery Protocol v5] 16 | - [RLPx protocol] 17 | 18 | The repository also contains specifications of many RLPx-based application-level protocols: 19 | 20 | - [Ethereum Wire Protocol] (eth/68) 21 | - [Ethereum Snapshot Protocol] (snap/1) 22 | - [Light Ethereum Subprotocol] (les/4) 23 | - [Parity Light Protocol] (pip/1) 24 | - [Ethereum Witness Protocol] (wit/0) 25 | 26 | ### The Mission 27 | 28 | devp2p is a set of network protocols which form the Ethereum peer-to-peer network. 29 | 'Ethereum network' is meant in a broad sense, i.e. devp2p isn't specific to a particular 30 | blockchain, but should serve the needs of any networked application associated with the 31 | Ethereum umbrella. 32 | 33 | We aim for an integrated system of orthogonal parts, implemented in multiple programming 34 | environments. The system provides discovery of other participants throughout the Internet 35 | as well as secure communication with those participants. 36 | 37 | The network protocols in devp2p should be easy to implement from scratch given only the 38 | specification, and must work within the limits of a consumer-grade Internet connection. We 39 | usually design protocols in a 'specification first' approach, but any specification 40 | proposed must be accompanied by a working prototype or implementable within reasonable 41 | time. 
42 | 43 | ### Relationship with libp2p 44 | 45 | The [libp2p] project was started at about the same time as devp2p and seeks to be a 46 | collection of modules for assembling a peer-to-peer network from modular components. 47 | Questions about the relationship between devp2p and libp2p come up rather often. 48 | 49 | It's hard to compare the two projects because they have different scope and are designed 50 | with different goals in mind. devp2p is an integrated system definition that wants to 51 | serve Ethereum's needs well (although it may be a good fit for other applications, too) 52 | while libp2p is a collection of programming library parts serving no single application in 53 | particular. 54 | 55 | That said, both projects are very similar in spirit and devp2p is slowly adopting parts of 56 | libp2p as they mature. 57 | 58 | ### Implementations 59 | 60 | devp2p is part of most Ethereum clients. Implementations include: 61 | 62 | - C#: Nethermind 63 | - C++: Aleth 64 | - C: Breadwallet 65 | - Elixir: Exthereum 66 | - Go: go-ethereum/geth 67 | - Java: Tuweni RLPx library 68 | - Java: Besu 69 | - JavaScript: EthereumJS 70 | - Kotlin: Tuweni Discovery library 71 | - Nim: Nimbus nim-eth 72 | - Python: Trinity 73 | - Ruby: Ciri 74 | - Ruby: ruby-devp2p 75 | - Rust: rust-devp2p 76 | - Rust: openethereum 77 | - Rust: reth 78 | 79 | WireShark dissectors are available here: 80 | 81 | [Ethereum Foundation Bounty Program]: https://bounty.ethereum.org 82 | [Ethereum Wire Protocol]: ./caps/eth.md 83 | [Ethereum Snapshot Protocol]: ./caps/snap.md 84 | [Light Ethereum Subprotocol]: ./caps/les.md 85 | [Ethereum Witness Protocol]: ./caps/wit.md 86 | [Ethereum Node Records]: ./enr.md 87 | [DNS Node Lists]: ./dnsdisc.md 88 | [Node Discovery Protocol v4]: ./discv4.md 89 | [Node Discovery Protocol v5]: ./discv5/discv5.md 90 | [Parity Light Protocol]: ./caps/pip.md 91 | [RLPx protocol]: ./rlpx.md 92 | [libp2p]: https://libp2p.io 93 | 
-------------------------------------------------------------------------------- /caps/wit.md: -------------------------------------------------------------------------------- 1 | # Ethereum Witness Protocol (wit) 2 | 3 | The `wit` protocol runs on top of [RLPx], facilitating the exchange of Ethereum state 4 | witnesses between peers. The protocol is an optional extension for peers supporting (or 5 | caring about) the state witnesses for Ethereum blocks. 6 | 7 | The current version is `wit/0`. 8 | 9 | ### Overview 10 | 11 | The `wit` protocol is designed to assist clients in syncing up to the tip of the chain. 12 | Eventually, it also aspires to assist in stateless client operation. The `wit` protocol 13 | does not take part in chain maintenance (block and transaction propagation); and it is 14 | **meant to be run side-by-side with the `eth` protocol**, not standalone (e.g. chain 15 | progression is announced via `eth`). (like the `snap` protocol) 16 | 17 | Despite the name, version 0 will not provide actual witnesses. It will provide meta-data 18 | about the witness, which can be used to download the witness over the `eth` protocol. 19 | 20 | For now, the known use case is to assist Beam Syncing peers. By requesting witness 21 | metadata, these peers will keep up with the tip of the network and become fully-synced 22 | nodes faster. 23 | 24 | Using the `wit` protocol, peers ask each other for the list of trie node hashes read 25 | during the execution of a particular block. This includes the following data: 26 | 27 | - Storage nodes 28 | - Bytecodes 29 | - Account nodes 30 | - Read during EVM execution 31 | - Read during transaction validation 32 | - Read during block reward calculation 33 | - Nodes read when generating the final state root (i.e. sometimes deleting data requires a 34 | trie refactor that reads nearby trie nodes) 35 | 36 | The trie node hashes which are generated at the end of the block from existing data are 37 | *not* included. 
For example, the final state root hash is not included. 38 | 39 | ### Relation to `eth` 40 | 41 | The `wit` protocol follows the same pattern as `snap`. It is a *dependent satellite* of 42 | `eth` (i.e. to run `wit`, you need to run `eth` too), not a fully standalone protocol. 43 | This is a deliberate design decision: 44 | 45 | - `wit` is meant to be a bootstrap aid for newly joining full nodes. By enforcing all 46 | `wit` peers to also speak `eth`, we can avoid non-full nodes from lingering attached to 47 | `wit` indefinitely. 48 | - `eth` already contains well established chain and fork negotiation mechanisms, as well 49 | as remote peer staleness detection during sync. By running both protocols side-by-side, 50 | `wit` can benefit of all these mechanisms without having to duplicate them. 51 | 52 | This *satellite* status may be changed later, but it's better to launch with a more 53 | restricted protocol first and then expand if need be vs. trying to withdraw depended-upon 54 | features. 55 | 56 | In order to follow the `wit` protocol, clients must generate witness metadata when 57 | executing blocks. For now, its primary purpose is also one specific sync method that might 58 | not be suitable for all clients. Keeping `wit` as a separate protocol permits every client 59 | to decide to pursue it or not, without hindering their capacity to participate in the 60 | `eth` protocol. 61 | 62 | ### Accelerating Beam Sync 63 | 64 | At its most naive, Beam Sync needs to download any missing state one trie node at a time. 65 | According to a recent test, after Beam Syncing for 22 hours, the median block still 66 | required more than 300 new trie nodes. At an optimistic 100ms round-trip time, that means 67 | 30 seconds per block of data download. This is where witness metadata can help 68 | tremendously. 69 | 70 | If a client can request the trie node hashes used by a block up front, those 300 trie 71 | nodes can likely be accessed in a fraction of a second. 
That's easily enough to keep 72 | synced with mainnet. 73 | 74 | Unfortunately, the list of trie node hashes cannot be verified before the block is 75 | imported. This would be a huge problem for a stateless client, which would be permanently 76 | at risk to a DoS attack where peers feed it a long list of incorrect hashes. But Beam 77 | Syncing clients are only vulnerable until they've finished downloading the full network 78 | state, so the payoff for such an attack is smaller. 79 | 80 | ## Protocol Messages 81 | 82 | ### RESERVED (0x00) 83 | 84 | This command is undefined, held in place for a possible future Status message. 85 | 86 | ### GetBlockWitnessHashes (0x01) 87 | 88 | `[reqID: P, blockHash: B_32]` 89 | 90 | Requests a list of trie node hashes used by a given block. 91 | 92 | - `reqID`: Request ID to match up responses with 93 | - `blockHash`: Hash of the header to request the witness hashes for 94 | 95 | Notes: 96 | 97 | - Nodes **must** always respond to the query. 98 | - If the node does **not** have the trie hashes requested block, it **must** return an 99 | empty reply. 100 | 101 | ### BlockWitnessHashes (0x02) 102 | 103 | `[reqID: P, witnessHashes: [trieNodeHash: B_32, ...]]` 104 | 105 | Returns a list of the trie node hashes that were read during execution and validation of 106 | the given block. 107 | 108 | - `reqID`: ID of the request this is a response for 109 | - `witnessHashes`: List of trie node hashes 110 | 111 | ## Change Log 112 | 113 | ### wit/0 (October 2020) 114 | 115 | Version 0 was the introduction of the witness protocol. 116 | 117 | [RLPx]: ../rlpx.md 118 | -------------------------------------------------------------------------------- /enr.md: -------------------------------------------------------------------------------- 1 | # Ethereum Node Records 2 | 3 | This specification defines Ethereum Node Records (ENR), an open format for p2p 4 | connectivity information. 
A node record usually contains the network endpoints of a node, 5 | i.e. the node's IP addresses and ports. It also holds information about the node's purpose 6 | on the network so others can decide whether to connect to the node. 7 | 8 | Ethereum Node Records were originally proposed in [EIP-778]. 9 | 10 | ## Record Structure 11 | 12 | The components of a node record are: 13 | 14 | - `signature`: cryptographic signature of record contents 15 | - `seq`: The sequence number, a 64-bit unsigned integer. Nodes should increase the number 16 | whenever the record changes and republish the record. 17 | - The remainder of the record consists of arbitrary key/value pairs 18 | 19 | A record's signature is made and validated according to an *identity scheme*. The identity 20 | scheme is also responsible for deriving a node's address in the DHT. 21 | 22 | The key/value pairs must be sorted by key and must be unique, i.e. any key may be present 23 | only once. The keys can technically be any byte sequence, but ASCII text is preferred. Key 24 | names in the table below have pre-defined meaning. 25 | 26 | | Key | Value | 27 | |:------------|:-------------------------------------------| 28 | | `id` | name of identity scheme, e.g. "v4" | 29 | | `secp256k1` | compressed secp256k1 public key, 33 bytes | 30 | | `ip` | IPv4 address, 4 bytes | 31 | | `tcp` | TCP port, big endian integer | 32 | | `udp` | UDP port, big endian integer | 33 | | `ip6` | IPv6 address, 16 bytes | 34 | | `tcp6` | IPv6-specific TCP port, big endian integer | 35 | | `udp6` | IPv6-specific UDP port, big endian integer | 36 | 37 | All keys except `id` are optional, including IP addresses and ports. A record without 38 | endpoint information is still valid as long as its signature is valid. If no `tcp6` / 39 | `udp6` port is provided, the `tcp` / `udp` port applies to both IP addresses. 
Declaring 40 | the same port number in both `tcp`, `tcp6` or `udp`, `udp6` should be avoided but doesn't 41 | render the record invalid. 42 | 43 | ### RLP Encoding 44 | 45 | The canonical encoding of a node record is an RLP list of `[signature, seq, k, v, ...]`. 46 | The maximum encoded size of a node record is 300 bytes. Implementations should reject 47 | records larger than this size. 48 | 49 | Records are signed and encoded as follows: 50 | 51 | content = [seq, k, v, ...] 52 | signature = sign(content) 53 | record = [signature, seq, k, v, ...] 54 | 55 | ### Text Encoding 56 | 57 | The textual form of a node record is the base64 encoding of its RLP representation, 58 | prefixed by `enr:`. Implementations should use the [URL-safe base64 alphabet] 59 | and omit padding characters. 60 | 61 | ### "v4" Identity Scheme 62 | 63 | This specification defines a single identity scheme to be used as the default until other 64 | schemes are defined by further EIPs. The "v4" scheme is backwards-compatible with the 65 | cryptosystem used by Node Discovery v4. 66 | 67 | - To sign record `content` with this scheme, apply the keccak256 hash function (as used by 68 | the EVM) to `content`, then create a signature of the hash. The resulting 64-byte 69 | signature is encoded as the concatenation of the `r` and `s` signature values (the 70 | recovery ID `v` is omitted). 71 | 72 | - To verify a record, check that the signature was made by the public key in the 73 | "secp256k1" key/value pair of the record. 74 | 75 | - To derive a node address, take the keccak256 hash of the uncompressed public key, i.e. 76 | `keccak256(x || y)`. Note that `x` and `y` must be zero-padded up to length 32. 77 | 78 | ## Rationale 79 | 80 | The format is meant to suit future needs in two ways: 81 | 82 | - Adding new key/value pairs: This is always possible and doesn't require implementation 83 | consensus. 
Existing clients will accept any key/value pairs regardless of whether they 84 | can interpret their content. 85 | - Adding identity schemes: these need implementation consensus because the network won't 86 | accept the signature otherwise. To introduce a new identity scheme, propose an EIP and 87 | get it implemented. The scheme can be used as soon as most clients accept it. 88 | 89 | The size of a record is limited because records are relayed frequently and may be included 90 | in size-constrained protocols such as DNS. A record containing a IPv4 address, when signed 91 | using the "v4" scheme occupies roughly 120 bytes, leaving plenty of room for additional 92 | metadata. 93 | 94 | You might wonder about the need for so many pre-defined keys related to IP addresses and 95 | ports. This need arises because residential and mobile network setups often put IPv4 96 | behind NAT while IPv6 traffic—if supported—is directly routed to the same host. Declaring 97 | both address types ensures a node is reachable from IPv4-only locations and those 98 | supporting both protocols. 99 | 100 | ## Test Vectors 101 | 102 | This is an example record containing the IPv4 address `127.0.0.1` and UDP port `30303`. 103 | The node ID is `a448f24c6d18e575453db13171562b71999873db5b286df957af199ec94617f7`. 
104 | 105 | enr:-IS4QHCYrYZbAKWCBRlAy5zzaDZXJBGkcnh4MHcBFZntXNFrdvJjX04jRzjzCBOonrkTfj499SZuOh8R33Ls8RRcy5wBgmlkgnY0gmlwhH8AAAGJc2VjcDI1NmsxoQPKY0yuDUmstAHYpMa2_oxVtw0RW_QAdpzBQA8yWM0xOIN1ZHCCdl8 106 | 107 | The record is signed using the "v4" identity scheme using sequence number `1` and this 108 | private key: 109 | 110 | b71c71a67e1177ad4e901695e1b4b9ee17ae16c6668d313eac2f96dbcda3f291 111 | 112 | The RLP structure of the record is: 113 | 114 | [ 115 | 7098ad865b00a582051940cb9cf36836572411a47278783077011599ed5cd16b76f2635f4e234738f30813a89eb9137e3e3df5266e3a1f11df72ecf1145ccb9c, 116 | 01, 117 | "id", 118 | "v4", 119 | "ip", 120 | 7f000001, 121 | "secp256k1", 122 | 03ca634cae0d49acb401d8a4c6b6fe8c55b70d115bf400769cc1400f3258cd3138, 123 | "udp", 124 | 765f, 125 | ] 126 | 127 | [EIP-778]: https://eips.ethereum.org/EIPS/eip-778 128 | [URL-safe base64 alphabet]: https://tools.ietf.org/html/rfc4648#section-5 129 | -------------------------------------------------------------------------------- /discv5/discv5-wire-test-vectors.md: -------------------------------------------------------------------------------- 1 | # Test Vectors 2 | 3 | This document provides a collection of test vectors for the Discovery v5 wire protocol 4 | aimed to aid new implementations conform to the specification. 5 | 6 | ## Packet Encodings 7 | 8 | This section provides test vectors for the different packet types. Your implementation 9 | should load the `node-b-key` and then be able to decrypt and authenticate these as-is. 
10 | 11 | The secp256k1 private keys used here are: 12 | 13 | node-a-key = 0xeef77acb6c6a6eebc5b363a475ac583ec7eccdb42b6481424c60f59aa326547f 14 | node-b-key = 0x66fb62bfbd66b9177a138c1e5cddbe4f7c30c343e94e68df8769459cb1cde628 15 | 16 | Ping message packet (flag 0): 17 | 18 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 19 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 20 | # nonce = 0xffffffffffffffffffffffff 21 | # read-key = 0x00000000000000000000000000000000 22 | # ping.req-id = 0x00000001 23 | # ping.enr-seq = 2 24 | 25 | 00000000000000000000000000000000088b3d4342774649325f313964a39e55 26 | ea96c005ad52be8c7560413a7008f16c9e6d2f43bbea8814a546b7409ce783d3 27 | 4c4f53245d08dab84102ed931f66d1492acb308fa1c6715b9d139b81acbdcc 28 | 29 | WHOAREYOU packet (flag 1): 30 | 31 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 32 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 33 | # whoareyou.challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 34 | # whoareyou.request-nonce = 0x0102030405060708090a0b0c 35 | # whoareyou.id-nonce = 0x0102030405060708090a0b0c0d0e0f10 36 | # whoareyou.enr-seq = 0 37 | 38 | 00000000000000000000000000000000088b3d434277464933a1ccc59f5967ad 39 | 1d6035f15e528627dde75cd68292f9e6c27d6b66c8100a873fcbaed4e16b8d 40 | 41 | Ping handshake packet (flag 2): 42 | 43 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 44 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 45 | # nonce = 0xffffffffffffffffffffffff 46 | # read-key = 0x4f9fac6de7567d1e3b1241dffe90f662 47 | # ping.req-id = 0x00000001 48 | # ping.enr-seq = 1 49 | # 50 | # handshake inputs: 51 | # 52 | # whoareyou.challenge-data = 
0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000001 53 | # whoareyou.request-nonce = 0x0102030405060708090a0b0c 54 | # whoareyou.id-nonce = 0x0102030405060708090a0b0c0d0e0f10 55 | # whoareyou.enr-seq = 1 56 | # ephemeral-key = 0x0288ef00023598499cb6c940146d050d2b1fb914198c327f76aad590bead68b6 57 | # ephemeral-pubkey = 0x039a003ba6517b473fa0cd74aefe99dadfdb34627f90fec6362df85803908f53a5 58 | 59 | 00000000000000000000000000000000088b3d4342774649305f313964a39e55 60 | ea96c005ad521d8c7560413a7008f16c9e6d2f43bbea8814a546b7409ce783d3 61 | 4c4f53245d08da4bb252012b2cba3f4f374a90a75cff91f142fa9be3e0a5f3ef 62 | 268ccb9065aeecfd67a999e7fdc137e062b2ec4a0eb92947f0d9a74bfbf44dfb 63 | a776b21301f8b65efd5796706adff216ab862a9186875f9494150c4ae06fa4d1 64 | f0396c93f215fa4ef524f1eadf5f0f4126b79336671cbcf7a885b1f8bd2a5d83 65 | 9cf8 66 | 67 | Ping handshake message packet (flag 2, with ENR): 68 | 69 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 70 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 71 | # nonce = 0xffffffffffffffffffffffff 72 | # read-key = 0x53b1c075f41876423154e157470c2f48 73 | # ping.req-id = 0x00000001 74 | # ping.enr-seq = 1 75 | # 76 | # handshake inputs: 77 | # 78 | # whoareyou.challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 79 | # whoareyou.request-nonce = 0x0102030405060708090a0b0c 80 | # whoareyou.id-nonce = 0x0102030405060708090a0b0c0d0e0f10 81 | # whoareyou.enr-seq = 0 82 | # ephemeral-key = 0x0288ef00023598499cb6c940146d050d2b1fb914198c327f76aad590bead68b6 83 | # ephemeral-pubkey = 0x039a003ba6517b473fa0cd74aefe99dadfdb34627f90fec6362df85803908f53a5 84 | 85 | 00000000000000000000000000000000088b3d4342774649305f313964a39e55 86 | ea96c005ad539c8c7560413a7008f16c9e6d2f43bbea8814a546b7409ce783d3 87 | 
4c4f53245d08da4bb23698868350aaad22e3ab8dd034f548a1c43cd246be9856 88 | 2fafa0a1fa86d8e7a3b95ae78cc2b988ded6a5b59eb83ad58097252188b902b2 89 | 1481e30e5e285f19735796706adff216ab862a9186875f9494150c4ae06fa4d1 90 | f0396c93f215fa4ef524e0ed04c3c21e39b1868e1ca8105e585ec17315e755e6 91 | cfc4dd6cb7fd8e1a1f55e49b4b5eb024221482105346f3c82b15fdaae36a3bb1 92 | 2a494683b4a3c7f2ae41306252fed84785e2bbff3b022812d0882f06978df84a 93 | 80d443972213342d04b9048fc3b1d5fcb1df0f822152eced6da4d3f6df27e70e 94 | 4539717307a0208cd208d65093ccab5aa596a34d7511401987662d8cf62b1394 95 | 71 96 | 97 | ## Cryptographic Primitives 98 | 99 | This section provides test vectors for the currently supported "v4" identity scheme. 100 | 101 | ### ECDH 102 | 103 | The ECDH function takes the elliptic-curve scalar multiplication of a public key and a 104 | private key. The wire protocol describes this process. 105 | 106 | public-key = 0x039961e4c2356d61bedb83052c115d311acb3a96f5777296dcf297351130266231 107 | secret-key = 0xfb757dc581730490a1d7a00deea65e9b1936924caaea8f44d476014856b68736 108 | 109 | This output is the result of the ECDH function which will be used by the KDF. 110 | 111 | shared-secret = 0x033b11a2a1f214567e1537ce5e509ffd9b21373247f2a3ff6841f4976f53165e7e 112 | 113 | ### Key Derivation 114 | 115 | This test vector checks the complete key derivation as used by the handshake. 116 | 117 | ephemeral-key = 0xfb757dc581730490a1d7a00deea65e9b1936924caaea8f44d476014856b68736 118 | dest-pubkey = 0x0317931e6e0840220642f230037d285d122bc59063221ef3226b1f403ddc69ca91 119 | node-id-a = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 120 | node-id-b = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 121 | challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 122 | 123 | The expected outputs, resulting from the HKDF-EXPAND function. 
124 | 125 | initiator-key = 0xdccc82d81bd610f4f76d3ebe97a40571 126 | recipient-key = 0xac74bb8773749920b0d3a8881c173ec5 127 | 128 | ### ID Nonce Signing 129 | 130 | This test vector checks the ID signature as used by the handshake. 131 | The `static-key` is the secp256k1 private key used for signing. 132 | 133 | static-key = 0xfb757dc581730490a1d7a00deea65e9b1936924caaea8f44d476014856b68736 134 | challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 135 | ephemeral-pubkey = 0x039961e4c2356d61bedb83052c115d311acb3a96f5777296dcf297351130266231 136 | node-id-B = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 137 | 138 | The expected output is the `id-signature`. You can also apply this test vector in reverse 139 | by verifying the signature against the inputs above. 140 | 141 | id-signature = 0x94852a1e2318c4e5e9d422c98eaf19d1d90d876b29cd06ca7cb7546d0fff7b484fe86c09a064fe72bdbef73ba8e9c34df0cd2b53e9d65528c2c7f336d5dfc6e6 142 | 143 | ### Encryption/Decryption 144 | 145 | This test vector demonstrates the `AES_GCM` encryption/decryption used in the wire 146 | protocol. 147 | 148 | encryption-key: 0x9f2d77db7004bf8a1a85107ac686990b 149 | nonce: 0x27b5af763c446acd2749fe8e 150 | pt: 0x01c20101 151 | ad: 0x93a7400fa0d6a694ebc24d5cf570f65d04215b6ac00757875e3f3a5f42107903 152 | 153 | Note that the 16 byte MAC is appended to the ciphertext. 154 | 155 | message-ciphertext: 0xa5d12a2d94b8ccb3ba55558229867dc13bfa3648 156 | -------------------------------------------------------------------------------- /dnsdisc.md: -------------------------------------------------------------------------------- 1 | # DNS Node Lists 2 | 3 | Peer-to-peer node software often contains hard-coded bootstrap node lists. Updating those 4 | lists requires a software update and, effort is required from the software maintainers to 5 | ensure the list is up-to-date. 
As a result, the provided lists are usually small, giving 6 | the software little choice of initial entry point into the network. 7 | 8 | This specification describes a scheme for authenticated, updateable node lists retrievable 9 | via DNS. In order to use such a list, the client only requires information about the DNS 10 | name and the public key that signs the list. 11 | 12 | DNS-based discovery was initially proposed in [EIP-1459]. 13 | 14 | ## Node Lists 15 | 16 | A 'node list' is a list of ['node records' (ENRs)](./enr.md) of arbitrary length. Lists 17 | may refer to other lists using links. The entire list is signed using a secp256k1 private 18 | key. The corresponding public key must be known to the client in order to verify the list. 19 | 20 | ## URL Scheme 21 | 22 | To refer to a DNS node list, clients use a URL with 'enrtree' scheme. The URL contains the 23 | DNS name on which the list can be found as well as the public key that signed the list. 24 | The public key is contained in the username part of the URL and is the base32 encoding of 25 | the compressed 32-byte binary public key. 26 | 27 | Example: 28 | 29 | enrtree://AM5FCQLWIZX2QFPNJAP7VUERCCRNGRHWZG3YYHIUV7BVDQ5FDPRT2@nodes.example.org 30 | 31 | This URL refers to a node list at the DNS name 'nodes.example.org' and is signed by the 32 | public key 33 | 34 | 0x049f88229042fef9200246f49f94d9b77c4e954721442714e85850cb6d9e5daf2d880ea0e53cb3ac1a75f9923c2726a4f941f7d326781baa6380754a360de5c2b6 35 | 36 | ## DNS Record Structure 37 | 38 | The nodes in a list are encoded as a merkle tree for distribution via the DNS protocol. 39 | Entries of the merkle tree are contained in DNS TXT records. The root of the tree is a TXT 40 | record with the following content: 41 | 42 | enrtree-root:v1 e= l= seq= sig= 43 | 44 | where 45 | 46 | - `enr-root` and `link-root` refer to the root hashes of subtrees containing nodes and 47 | links subtrees. 
48 | - `sequence-number` is the tree's update sequence number, a decimal integer. 49 | - `signature` is a 65-byte secp256k1 EC signature over the keccak256 hash of the record 50 | content, excluding the `sig=` part, encoded as URL-safe base64. 51 | 52 | Further TXT records on subdomains map hashes to one of three entry types. The subdomain 53 | name of any entry is the base32 encoding of the (abbreviated) keccak256 hash of its text 54 | content. 55 | 56 | - `enrtree-branch:,,...,` is an intermediate tree entry containing hashes of 57 | subtree entries. 58 | - `enrtree://@` is a leaf pointing to a different list located at another fully 59 | qualified domain name. Note that this format matches the URL encoding. This type of 60 | entry may only appear in the subtree pointed to by `link-root`. 61 | - `enr:` is a leaf containing a node record. The node record is encoded as a 62 | URL-safe base64 string. Note that this type of entry matches the canonical ENR text 63 | encoding. It may only appear in the `enr-root` subtree. 64 | 65 | No particular ordering or structure is defined for the tree. Whenever the tree is updated, 66 | its sequence number should increase. The content of any TXT record should be small enough 67 | to fit into the 512 byte limit imposed on UDP DNS packets. This limits the number of 68 | hashes that can be placed into an `enrtree-branch` entry. 
69 | 70 | Example in zone file format: 71 | 72 | ; name ttl class type content 73 | @ 60 IN TXT enrtree-root:v1 e=JWXYDBPXYWG6FX3GMDIBFA6CJ4 l=C7HRFPF3BLGF3YR4DY5KX3SMBE seq=1 sig=o908WmNp7LibOfPsr4btQwatZJ5URBr2ZAuxvK4UWHlsB9sUOTJQaGAlLPVAhM__XJesCHxLISo94z5Z2a463gA 74 | C7HRFPF3BLGF3YR4DY5KX3SMBE 86900 IN TXT enrtree://AM5FCQLWIZX2QFPNJAP7VUERCCRNGRHWZG3YYHIUV7BVDQ5FDPRT2@morenodes.example.org 75 | JWXYDBPXYWG6FX3GMDIBFA6CJ4 86900 IN TXT enrtree-branch:2XS2367YHAXJFGLZHVAWLQD4ZY,H4FHT4B454P6UXFD7JCYQ5PWDY,MHTDO6TMUBRIA2XWG5LUDACK24 76 | 2XS2367YHAXJFGLZHVAWLQD4ZY 86900 IN TXT enr:-HW4QOFzoVLaFJnNhbgMoDXPnOvcdVuj7pDpqRvh6BRDO68aVi5ZcjB3vzQRZH2IcLBGHzo8uUN3snqmgTiE56CH3AMBgmlkgnY0iXNlY3AyNTZrMaECC2_24YYkYHEgdzxlSNKQEnHhuNAbNlMlWJxrJxbAFvA 77 | H4FHT4B454P6UXFD7JCYQ5PWDY 86900 IN TXT enr:-HW4QAggRauloj2SDLtIHN1XBkvhFZ1vtf1raYQp9TBW2RD5EEawDzbtSmlXUfnaHcvwOizhVYLtr7e6vw7NAf6mTuoCgmlkgnY0iXNlY3AyNTZrMaECjrXI8TLNXU0f8cthpAMxEshUyQlK-AM0PW2wfrnacNI 78 | MHTDO6TMUBRIA2XWG5LUDACK24 86900 IN TXT enr:-HW4QLAYqmrwllBEnzWWs7I5Ev2IAs7x_dZlbYdRdMUx5EyKHDXp7AV5CkuPGUPdvbv1_Ms1CPfhcGCvSElSosZmyoqAgmlkgnY0iXNlY3AyNTZrMaECriawHKWdDRk2xeZkrOXBQ0dfMFLHY4eENZwdufn1S1o 79 | 80 | ## Client Protocol 81 | 82 | To find nodes at a given DNS name, say "mynodes.org": 83 | 84 | 1. Resolve the TXT record of the name and check whether it contains a valid 85 | "enrtree-root=v1" entry. Let's say the `enr-root` hash contained in the entry is 86 | "CFZUWDU7JNQR4VTCZVOJZ5ROV4". 87 | 2. Verify the signature on the root against the known public key and check whether the 88 | sequence number is larger than or equal to any previous number seen for that name. 89 | 3. Resolve the TXT record of the hash subdomain, e.g. 90 | "CFZUWDU7JNQR4VTCZVOJZ5ROV4.mynodes.org" and verify whether the content matches the 91 | hash. 92 | 4. The next step depends on the entry type found: 93 | - for `enrtree-branch`: parse the list of hashes and continue resolving them (step 3). 
94 | - for `enr`: decode, verify the node record and import it to local node storage. 95 | 96 | During traversal, the client must track hashes and domains which are already resolved to 97 | avoid going into an infinite loop. It's in the client's best interest to traverse the tree 98 | in random order. 99 | 100 | Client implementations should avoid downloading the entire tree at once during normal 101 | operation. It's much better to request entries via DNS when-needed, i.e. at the time when 102 | the client is looking for peers. 103 | 104 | ## Rationale 105 | 106 | DNS is used because it is a low-latency protocol that is pretty much guaranteed to be 107 | available. 108 | 109 | Being a merkle tree, any node list can be authenticated by a single signature on the root. 110 | Hash subdomains protect the integrity of the list. At worst intermediate resolvers can 111 | block access to the list or disallow updates to it, but cannot corrupt its content. The 112 | sequence number prevents replacing the root with an older version. 113 | 114 | Synchronizing updates on the client side can be done incrementally, which matters for 115 | large lists. Individual entries of the tree are small enough to fit into a single UDP 116 | packet, ensuring compatibility with environments where only basic UDP DNS can be used. The 117 | tree format also works well with caching resolvers: only the root of the tree needs a 118 | short TTL. Intermediate entries and leaves can be cached for days. 119 | 120 | ### Why does the link subtree exist? 121 | 122 | Links between lists enable federation and web-of-trust functionality. The operator of a 123 | large list can delegate maintenance to other list providers. If two node lists link to 124 | each other, users can use either list and get nodes from both. 125 | 126 | The link subtree is separate from the tree containing ENRs. This is done to enable client 127 | implementations to sync these trees independently. 
A client wanting to get as many nodes 128 | as possible will sync the link tree first and add all linked names to the sync horizon. 129 | 130 | ## References 131 | 132 | 1. The base64 and base32 encodings used to represent binary data are defined in [RFC 133 | 4648]. No padding is used for base64 and base32 data. 134 | 135 | [EIP-1459]: https://eips.ethereum.org/EIPS/eip-1459 136 | [RFC 4648]: https://tools.ietf.org/html/rfc4648 137 | -------------------------------------------------------------------------------- /discv4.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol 2 | 3 | This specification defines the Node Discovery protocol version 4, a Kademlia-like DHT that 4 | stores information about Ethereum nodes. The Kademlia structure was chosen because it is 5 | an efficient way to organize a distributed index of nodes and yields a topology of low 6 | diameter. 7 | 8 | The current protocol version is **4**. You can find a list of changes in past protocol 9 | versions at the end of this document. 10 | 11 | ## Node Identities 12 | 13 | Every node has a cryptographic identity, a key on the secp256k1 elliptic curve. The public 14 | key of the node serves as its identifier or 'node ID'. 15 | 16 | The 'distance' between two node keys is the bitwise exclusive or on the hashes of the 17 | public keys, taken as the number. 18 | 19 | distance(n₁, n₂) = keccak256(n₁) XOR keccak256(n₂) 20 | 21 | ## Node Records 22 | 23 | Participants in the Discovery Protocol are expected to maintain a [node record] \(ENR\) 24 | containing up-to-date information. All records must use the "v4" identity scheme. Other 25 | nodes may request the local record at any time by sending an [ENRRequest] packet. 26 | 27 | To resolve the current record of any node public key, perform a Kademlia lookup using 28 | [FindNode] packets. When the node is found, send ENRRequest to it and return the record 29 | from the response. 
30 | 31 | ## Kademlia Table 32 | 33 | Nodes in the Discovery Protocol keep information about other nodes in their neighborhood. 34 | Neighbor nodes are stored in a routing table consisting of 'k-buckets'. For each `i` in 35 | `0 ≤ i < 256`, every node keeps a k-bucket of neighbors with distance 36 | `2^i ≤ distance < 2^(i+1)` from itself. 37 | 38 | The Node Discovery Protocol uses `k = 16`, i.e. every k-bucket contains up to 16 node 39 | entries. The entries are sorted by time last seen — least-recently seen node at the head, 40 | most-recently seen at the tail. 41 | 42 | Whenever a new node N₁ is encountered, it can be inserted into the corresponding bucket. 43 | If the bucket contains less than `k` entries N₁ can simply be added as the last entry. If 44 | the bucket already contains `k` entries, the least recently seen node in the bucket, N₂, 45 | needs to be revalidated by sending a [Ping] packet. If no reply is received from N₂ it is 46 | considered dead, removed and N₁ added to the tail of the bucket. 47 | 48 | ## Endpoint Proof 49 | 50 | To prevent traffic amplification attacks, implementations must verify that the sender of a 51 | query participates in the discovery protocol. The sender of a packet is considered 52 | verified if it has sent a valid [Pong] response with matching ping hash within the last 12 53 | hours. 54 | 55 | ## Recursive Lookup 56 | 57 | A 'lookup' locates the `k` closest nodes to a node ID. 58 | 59 | The lookup initiator starts by picking `α` closest nodes to the target it knows of. The 60 | initiator then sends concurrent [FindNode] packets to those nodes. `α` is a system-wide 61 | concurrency parameter, such as 3. In the recursive step, the initiator resends FindNode to 62 | nodes it has learned about from previous queries. Of the `k` nodes the initiator has heard 63 | of closest to the target, it picks `α` that it has not yet queried and resends [FindNode] 64 | to them. 
Nodes that fail to respond quickly are removed from consideration until and 65 | unless they do respond. 66 | 67 | If a round of FindNode queries fails to return a node any closer than the closest already 68 | seen, the initiator resends the find node to all of the `k` closest nodes it has not 69 | already queried. The lookup terminates when the initiator has queried and gotten responses 70 | from the `k` closest nodes it has seen. 71 | 72 | ## Wire Protocol 73 | 74 | Node discovery messages are sent as UDP datagrams. The maximum size of any packet is 1280 75 | bytes. 76 | 77 | packet = packet-header || packet-data 78 | 79 | Every packet starts with a header: 80 | 81 | packet-header = hash || signature || packet-type 82 | hash = keccak256(signature || packet-type || packet-data) 83 | signature = sign(packet-type || packet-data) 84 | 85 | The `hash` exists to make the packet format recognizable when running multiple protocols 86 | on the same UDP port. It serves no other purpose. 87 | 88 | Every packet is signed by the node's identity key. The `signature` is encoded as a byte 89 | array of length 65 as the concatenation of the signature values `r`, `s` and the 'recovery 90 | id' `v`. 91 | 92 | The `packet-type` is a single byte defining the type of message. Valid packet types are 93 | listed below. Data after the header is specific to the packet type and is encoded as an 94 | RLP list. Implementations should ignore any additional elements in the `packet-data` list 95 | as well as any extra data after the list. 96 | 97 | ### Ping Packet (0x01) 98 | 99 | packet-data = [version, from, to, expiration, enr-seq, ...] 100 | version = 4 101 | from = [sender-ip, sender-udp-port, sender-tcp-port] 102 | to = [recipient-ip, recipient-udp-port, 0] 103 | 104 | The `expiration` field is an absolute UNIX time stamp. Packets containing a time stamp 105 | that lies in the past are expired and may not be processed.
106 | 107 | The `enr-seq` field is the current ENR sequence number of the sender. This field is 108 | optional. 109 | 110 | When a ping packet is received, the recipient should reply with a [Pong] packet. It may 111 | also consider the sender for addition into the local table. Implementations should ignore 112 | any mismatches in version. 113 | 114 | If no communication with the sender has occurred within the last 12h, a ping should be 115 | sent in addition to pong in order to receive an endpoint proof. 116 | 117 | ### Pong Packet (0x02) 118 | 119 | packet-data = [to, ping-hash, expiration, enr-seq, ...] 120 | 121 | Pong is the reply to ping. 122 | 123 | `ping-hash` should be equal to `hash` of the corresponding ping packet. Implementations 124 | should ignore unsolicited pong packets that do not contain the hash of the most recent 125 | ping packet. 126 | 127 | The `enr-seq` field is the current ENR sequence number of the sender. This field is 128 | optional. 129 | 130 | ### FindNode Packet (0x03) 131 | 132 | packet-data = [target, expiration, ...] 133 | 134 | A FindNode packet requests information about nodes close to `target`. The `target` is a 135 | 64-byte secp256k1 public key. When FindNode is received, the recipient should reply with 136 | [Neighbors] packets containing the closest 16 nodes to target found in its local table. 137 | 138 | To guard against traffic amplification attacks, Neighbors replies should only be sent if 139 | the sender of FindNode has been verified by the endpoint proof procedure. 140 | 141 | ### Neighbors Packet (0x04) 142 | 143 | packet-data = [nodes, expiration, ...] 144 | nodes = [[ip, udp-port, tcp-port, node-id], ...] 145 | 146 | Neighbors is the reply to [FindNode]. 147 | 148 | ### ENRRequest Packet (0x05) 149 | 150 | packet-data = [expiration] 151 | 152 | When a packet of this type is received, the node should reply with an ENRResponse packet 153 | containing the current version of its [node record]. 
154 | 155 | To guard against amplification attacks, the sender of ENRRequest should have replied to a 156 | ping packet recently (just like for FindNode). The `expiration` field, a UNIX timestamp, 157 | should be handled as for all other existing packets i.e. no reply should be sent if it 158 | refers to a time in the past. 159 | 160 | ### ENRResponse Packet (0x06) 161 | 162 | packet-data = [request-hash, ENR] 163 | 164 | This packet is the response to ENRRequest. 165 | 166 | - `request-hash` is the hash of the entire ENRRequest packet being replied to. 167 | - `ENR` is the node record. 168 | 169 | The recipient of the packet should verify that the node record is signed by the public key 170 | which signed the response packet. 171 | 172 | # Change Log 173 | 174 | ## Known Issues in the Current Version 175 | 176 | The `expiration` field present in all packets is supposed to prevent packet replay. Since 177 | it is an absolute time stamp, the node's clock must be accurate to verify it correctly. 178 | Since the protocol's launch in 2016 we have received countless reports about connectivity 179 | issues related to the user's clock being wrong. 180 | 181 | The endpoint proof is imprecise because the sender of FindNode can never be sure whether 182 | the recipient has seen a recent enough pong. Geth handles it as follows: If no 183 | communication with the recipient has occurred within the last 12h, initiate the procedure 184 | by sending a ping. Wait for a ping from the other side, reply to it and then send 185 | FindNode. 186 | 187 | ## EIP-868 (October 2019) 188 | 189 | [EIP-868] adds the [ENRRequest] and [ENRResponse] packets. It also modifies [Ping] and 190 | [Pong] to include the local ENR sequence number. 191 | 192 | ## EIP-8 (December 2017) 193 | 194 | [EIP-8] mandated that implementations ignore mismatches in Ping version and any additional 195 | list elements in `packet-data`. 
196 | 197 | [Ping]: #ping-packet-0x01 198 | [Pong]: #pong-packet-0x02 199 | [FindNode]: #findnode-packet-0x03 200 | [Neighbors]: #neighbors-packet-0x04 201 | [ENRRequest]: #enrrequest-packet-0x05 202 | [ENRResponse]: #enrresponse-packet-0x06 203 | [EIP-8]: https://eips.ethereum.org/EIPS/eip-8 204 | [EIP-868]: https://eips.ethereum.org/EIPS/eip-868 205 | [node record]: ./enr.md 206 | -------------------------------------------------------------------------------- /caps/pip.md: -------------------------------------------------------------------------------- 1 | # Parity Light Protocol (PIP) 2 | 3 | The Parity Light Protocol is a variation of LES designed and implemented by Parity Tech 4 | for the Parity Ethereum client. Please refer to the [LES specification] for information on 5 | the purpose of the light client protocol. 6 | 7 | Like LES, PIP adopts a flow-control mechanism closely analogous to a [token-bucket rate 8 | limiter] where the client is expected to mirror the server token-bucket state (as 9 | exceeding the 'burstiness' depth is a violation that results in disconnection). PIP 10 | utilises [Canonical Hash Tries] \(CHTs), which are also described in the LES documentation. 11 | Unlike LES, a PIP CHT is generated once every 2048 blocks. One 32-byte trie root is stored 12 | for every range of 2048 blocks. 13 | 14 | The current version is **pip/1**. This specification was derived from the official 15 | specification at `https://wiki.parity.io`. However, the official specification has since 16 | been deleted. 17 | 18 | ## Notation 19 | 20 | Throughout this document, and in accordance with other devp2p documents, when referring to 21 | wire message formats the following symbols apply: 22 | 23 | `[ .. , .. , .. 
]` means an RLP list 24 | 25 | `a || b` means concatenation of `a` and `b` 26 | 27 | `...` means additional list elements 28 | 29 | ## Handshake 30 | 31 | After the initial RLPx handshake, the first message that must be communicated is from the 32 | server to the light peer and is a status message. Updates to information in the status 33 | message are supplied with announcements. 34 | 35 | ### Status (0x00) 36 | 37 | `[[key0, value0], [key1, value1], ...]` 38 | 39 | Keys are strings. Mandatory keys and values are as follows: 40 | 41 | - `"protocol_version"` 1 for this PIP/1 protocol version. 42 | - `"network_id"` 0 for testnet, 1 for mainnet 43 | - `"total_difficulty"` integer total difficulty of the best chain as found in the block header. 44 | - `"head_blockhash"` the hash of the best (i.e. highest total difficulty) known block. 45 | - `"head_blocknum"` the number of the best (i.e. highest total difficulty) known block. 46 | - `"genesisHash"` the hash of the genesis block. 47 | 48 | Optional keys and values are as follows: 49 | 50 | - `"serve_headers"` any value and key-pair present if the peer can serve header chain 51 | downloads. 52 | - `"serve_chain_since"` present if the peer can serve Body/Receipts ODR requests starting 53 | from the given block number. 54 | - `"serve_state_since"` present if the peer can serve Proof/Code ODR requests starting 55 | from the given block number. 56 | - `"tx_relay"` present if the peer can relay transactions to the network. 57 | - `"flow_control_bl"` max credits (positive integer describing the burst-depth of the 58 | token bucket), 59 | - `"flow_control_mrc"` the initial cost table (see below) 60 | - `"flow_control_mrr"` rate of recharge (positive integer of credits recharged per second) 61 | 62 | #### Cost Table 63 | 64 | The cost table includes a mapping of individual [PIP Request/Response Messages] to costs, 65 | which are applied in the token-bucket rate limiter.
The [Headers] and [Execution] request 66 | messages are special cases where the cost is multiplied by the maximum number of requested 67 | header or gas requested, respectively. The table also includes a base cost, which is 68 | applied for every [Request Batch]. 69 | 70 | cost_table = [base_cost, [id,cost],...] 71 | base_cost = positive integer cost applied to a request batch. 72 | id = identifier of an individual PIP message type 73 | cost = positive integer to apply to cost calculations for this message type 74 | 75 | ### Announcement (0x01) 76 | 77 | `[head_blockhash, head_blocknum, total_difficulty, reorg_depth, [key0, value0], [key1, value1], ...]` 78 | 79 | - `reorg_depth` is positive integer containing the reorganization depth to the common 80 | ancestor of the new head and the last announced head. 81 | - Other elements have the same meaning as in the [Status] message with the exception of 82 | `reorg_depth`. 83 | 84 | ### Request Batch (0x02) 85 | 86 | `[request-id, [req1, ...]]` 87 | 88 | where 89 | 90 | - `request-id` is a unique scalar request identifier for request-reply correlation. 91 | - `[req1, ...]` is the list of request messages, as described in the [PIP Request/Response Messages] 92 | section. 93 | 94 | This message, sent from client to server, requests that the given request messages should 95 | be executed. The server responds with a Response Batch. 96 | 97 | ### Response Batch (0x03) 98 | 99 | `[request-id, cr, [resp1, ...]]` 100 | 101 | where 102 | 103 | - `request-id` is the unique scalar correlating with a previously received request message. 104 | - `cr` is an updated amount of request credits prior to recharge events at the time of 105 | processing on the server (please see throttling below). 106 | - `[resp1, ...]` is the list of response messages. 107 | 108 | There must be a response message for each request contained in the corresponding request batch.
109 | The individual responses must supply all elements of the response message specifications. 110 | The PIP protocol considers messages missing any of these elements *incomplete*. 111 | 112 | ### UpdateCreditParameters (0x04) 113 | 114 | `[max, recharge, cost_table]` 115 | 116 | where 117 | 118 | - `max` is a positive integer, the new maximum credit depth for the token bucket. 119 | - `recharge` a positive integer, the new recharge rate in credits per second. 120 | - `cost_table` is the updated [Cost Table]. 121 | 122 | The server may periodically update the token-bucket parameters, such as depth, message 123 | cost and recharge rate, for the particular client. Received updates must be acknowledged 124 | with an AcknowledgeUpdate message. 125 | 126 | ### AcknowledgeUpdate (0x05) 127 | 128 | This message acknowledges receipt of updated credit parameters and has no payload. 129 | 130 | ### RelayTransactions (0x06) 131 | 132 | `[tx1, tx2, ...]` 133 | 134 | where 135 | 136 | `tx1`, `tx2` are RLP encoded transactions as per [ETH] documentation. 137 | 138 | This message requests that the given transactions should be relayed 139 | to the eth network. 140 | 141 | ## PIP Request/Response Messages 142 | 143 | PIP request and response messages are batched and cannot be sent individually. Unlike LES, 144 | PIP batches may contain multiple messages of different types. The [Request Batch] is used 145 | to send messages of the types described below to the server. 146 | 147 | Each message type also specifies its corresponding response message (referred to as 148 | *outputs*). Response messages are sent as a [Response Batch] by the server when requests 149 | have executed. 150 | 151 | PIP tries to further optimise client-server round trips by allowing the individual 152 | requests in the batch to include references to what their responses would contain if 153 | processed sequentially.
For clarification, an example PIP batch request could contain two 154 | request messages in order, where the second message specifies that an input is a specific 155 | 'output' of the first message, where 'output' means the server response to that request. 156 | 157 | Referencing a field in a response to a batched request is achieved with *loose inputs* and 158 | *reusable outputs*. Response message fields are documented as being **reusable as `n`** 159 | where `n` is an identifier labelling the field in the response message body. 160 | 161 | *Loose inputs* may be a back-reference to a *reusable output* or may be hard data. 162 | 163 | loose_input = [raw_flag, input] 164 | raw_flag = is 0 or 1 (a.k.a. 'discriminant') 165 | input = if raw_flag is 0, this is the RLP encoded value 166 | if raw_flag is 1, this is back_reference 167 | back_reference = [request_message_index, reusable_output] 168 | request_message_index = the 0-based position of a prior message in the request batch 169 | reusable_output = the unsigned integer identifying the corresponding response message field 170 | 171 | The following are the individual messages, paired as requests and their responses. 172 | 173 | ### Headers (0x00) 174 | 175 | Request and retrieve block headers from the server. 176 | 177 | #### Request 178 | 179 | `[message-id, [start, skip, max, reverse]]` 180 | 181 | - `start` Loose, of type either 32 byte hash (block hash), or unsigned integer block number 182 | - `skip` unsigned integer N, specifying the server should return every Nth block 183 | - `max` unsigned integer, the maximum number of blocks to return 184 | - `reverse` 0 if the block numbers should be increasing, 1 to return in reverse order 185 | 186 | #### Response 187 | 188 | `[message-id, [header1, header2, ...]]` 189 | 190 | - `header1, header2, ...` the requested block headers 191 | 192 | ### HeaderProof (0x01) 193 | 194 | Request for a header proof.
195 | 196 | #### Request 197 | 198 | `[message-id, [block]]` 199 | 200 | - `block` Loose, of type unsigned integer, referring to the block number 201 | 202 | #### Response 203 | 204 | `[message-id, [cht_inclusion_proof, block_hash, total_difficulty]]` 205 | 206 | - `cht_inclusion_proof` is `[[node1, node2, ...], ...]` 207 | - `node1` merkle tree node as byte array 208 | - `block_hash` hash of the requested block **reusable as 0** 209 | - `total_difficulty` unsigned integer, the requested block total difficulty 210 | 211 | ### TransactionIndex (0x02) 212 | 213 | Request for transaction inclusion information by transaction hash. 214 | 215 | #### Request 216 | 217 | `[message-id, [hash]]` 218 | 219 | - `hash` Loose, of type 32 byte hash, referring to the transaction hash. 220 | 221 | #### Response 222 | 223 | `[message-id, [block_number, block_hash, index]]` 224 | 225 | - `block_number` the block number of the block containing the transaction **reusable as 0** 226 | - `block_hash` hash of the requested block **reusable as 1** 227 | - `index` index in the block 228 | 229 | ### BlockReceipts (0x03) 230 | 231 | Request for a block's receipts. 232 | 233 | #### Request 234 | 235 | `[message-id, [hash]]` 236 | 237 | - `hash` Loose, of type 32 byte hash, referring to the block hash. 238 | 239 | #### Response 240 | 241 | `[message-id, [receipts]]` 242 | 243 | - `receipts` is `[receipt1, receipt2, ...]` 244 | - `receipt1` a receipt, as per ETH spec. 245 | 246 | ### BlockBody (0x04) 247 | 248 | Request for a block's transactions. 
249 | 250 | #### Request 251 | 252 | `[message-id, [hash]]` 253 | 254 | - `hash` Loose, of type 32 byte hash, referring to the block hash 255 | 256 | #### Response 257 | 258 | `[message-id, [transactions, uncles]]` 259 | 260 | - `transactions` is `[tx1, tx2, ...]` 261 | - `tx1` a transaction, as per ETH spec 262 | - `uncles` is `[header1, header2,...]` 263 | - `header1` an uncle block header as per ETH spec 264 | 265 | ### Account (0x05) 266 | 267 | Request for proof of specific account in the state. 268 | 269 | #### Request 270 | 271 | `[message-id, [block_hash, address_hash]]` 272 | 273 | - `block_hash` Loose, of type 32 byte hash, referring to the block hash 274 | - `address_hash` Loose, of type 32 byte hash, referring to the account address hash 275 | 276 | #### Response 277 | 278 | `[message-id, [cht_inclusion_proof, nonce, balance, code_hash, storage_root]]` 279 | 280 | - `cht_inclusion_proof` is `[[node1, node2, ...], ...]` 281 | - `node1` merkle tree node as byte array 282 | - `nonce` the account nonce (unsigned integer) 283 | - `balance` the account balance (unsigned integer) 284 | - `code_hash` 32 byte hash **reusable as 0** 285 | - `storage_root` 32 byte storage root hash **reusable as 1** 286 | 287 | ### Storage (0x06) 288 | 289 | Request for a proof of contract storage. 290 | 291 | #### Request 292 | 293 | `[message-id, [block_hash, address_hash, storage_key_hash]]` 294 | 295 | - `block_hash` Loose, of type 32 byte hash, referring to the block hash 296 | - `address_hash` Loose, of type 32 byte hash, referring to the account address hash 297 | - `storage_key_hash` Loose, of type 32 byte hash, referring to the storage key 298 | 299 | #### Response 300 | 301 | `[message-id, [cht_inclusion_proof, storage_value]]` 302 | 303 | - `cht_inclusion_proof` is `[[node1, node2, ...], ...]` 304 | - `node1` merkle tree node as byte array 305 | - `storage_value` 32 byte hash **reusable as 0** 306 | 307 | ### Code (0x07) 308 | 309 | Request for contract code.
310 | 311 | #### Request 312 | 313 | `[message-id, [block_hash, code_hash]]` 314 | 315 | - `block_hash` Loose, of type 32 byte hash, identifying the block. 316 | - `code_hash` Loose, of type 32 byte hash, identifying the code. 317 | 318 | #### Response 319 | 320 | `[message-id, [bytecode]]` 321 | 322 | - `bytecode` byte array of the contract code 323 | 324 | ### Execution (0x08) 325 | 326 | Request for Merkle proofs of a contract execution. 327 | 328 | #### Request 329 | 330 | `[message-id, [block_hash, from_address, call_or_create_address, gas_to_prove, gas_price, value, data]]` 331 | 332 | - `block_hash` Loose, of type 32 byte hash, identifying the block 333 | - `from_address` Type 32 byte hash, referring to the caller account address hash 334 | - `call_or_create_address` 32 byte hash, call contract if address, otherwise create contract if empty 335 | - `gas_to_prove` 32 byte unsigned integer of gas to prove 336 | - `gas_price` 32 byte unsigned integer of gas price 337 | - `value` 32 byte unsigned integer of value to transfer 338 | - `data` byte array of relevant data 339 | 340 | #### Response 341 | 342 | `[message-id, [proof]]` 343 | 344 | - `proof` is `[[node1, node2, ...], ...]`, the necessary execution proof 345 | - `node1` merkle tree node as byte array 346 | 347 | [LES specification]: ./les.md 348 | [ETH]: ./eth.md 349 | [Cost Table]: #cost-table 350 | [Canonical Hash Tries]: ./les.md#canonical-hash-trie 351 | [token-bucket rate limiter]: https://en.wikipedia.org/wiki/Token_bucket 352 | [Status]: #status-0x00 353 | [Request Batch]: #request-batch-0x02 354 | [Response Batch]: #response-batch-0x03 355 | [PIP Request/Response Messages]: #pip-requestresponse-messages 356 | [Headers]: #headers-0x00 357 | [Execution]: #execution-0x08 358 | -------------------------------------------------------------------------------- /discv5/discv5-wire.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 - Wire 
Protocol 2 | 3 | **Protocol version v5.1** 4 | 5 | This document specifies the wire protocol of Node Discovery v5. 6 | 7 | ## Notation 8 | 9 | Here we present the notation that is used throughout this document. 10 | 11 | `[ .. , .. , .. ]`\ 12 |     is recursive encoding as an RLP list\ 13 | `a || b`\ 14 |     means binary concatenation of `a` and `b`\ 15 | `xor(a, b)`\ 16 |     means binary XOR of `a` and `b`\ 17 | `sha256(x)`\ 18 |     is the SHA256 digest of `x`\ 19 | `aesctr_encrypt(key, iv, pt)`\ 20 |     is unauthenticated AES/CTR symmetric encryption with the given `key` and `iv`.\ 21 |     Size of `key` and `iv` is 16 bytes (AES-128).\ 22 | `aesgcm_encrypt(key, nonce, pt, ad)`\ 23 |     is AES-GCM encryption/authentication with the given `key`, `nonce` and additional\ 24 |     authenticated data `ad`. Size of `key` is 16 bytes (AES-128), size of `nonce` 12 bytes. 25 | 26 | ## UDP Communication 27 | 28 | Node discovery messages are sent as UDP datagrams. Since UDP is a lossy transport, packets 29 | may be received in any order or not at all. Implementations should not re-send packets if 30 | the recipient doesn't respond. 31 | 32 | The maximum size of any packet is 1280 bytes. Implementations should not generate or 33 | process packets larger than this size. Most messages are smaller than this limit by 34 | definition, the exception being the NODES message. FINDNODE returns up to 16 records, plus 35 | other data, and TOPICQUERY may also distribute a significantly long list of ENRs. As per 36 | specification the maximum size of an ENR is 300 bytes. A NODES message containing all 37 | FINDNODE response records would be at least 4800 bytes, not including additional data such 38 | as the header. To stay below the size limit, NODES responses are sent as multiple messages 39 | and specify the total number of responses in the message. 40 | 41 | The minimum size of any Discovery v5 packet is 63 bytes. 
Implementations should reject 42 | packets smaller than this size. 43 | 44 | Since low-latency communication is expected, implementations should place short timeouts 45 | on request/response interactions. Good timeout values are 500ms for a single 46 | request/response and 1s for the handshake. 47 | 48 | When responding to a request, the response should be sent to the UDP envelope address of 49 | the request. 50 | 51 | ## Packet Encoding 52 | 53 | The protocol deals with three distinct kinds of packets: 54 | 55 | - Ordinary message packets, which carry an encrypted/authenticated message. 56 | - WHOAREYOU packets, which are sent when the recipient of an ordinary message packet 57 | cannot decrypt/authenticate the packet's message. 58 | - Handshake message packets, which are sent following WHOAREYOU. These packets establish a 59 | new session and carry handshake-related data in addition to the encrypted/authenticated 60 | message. 61 | 62 | In the following definitions, we assume that the sender of a packet has knowledge of its 63 | own 256-bit node ID (`src-id`) and the node ID of the packet destination (`dest-id`). When 64 | sending any packet except WHOAREYOU, the sender also generates a unique 96-bit `nonce` 65 | value. 66 | 67 | ### Protocol Header 68 | 69 | All discovery packets contain a header followed by an optional encrypted and authenticated 70 | message. 71 | 72 | Header information is 'masked' using symmetric encryption in order to avoid static 73 | identification of the protocol by firewalls. 74 | 75 | packet = masking-iv || masked-header || message 76 | masked-header = aesctr_encrypt(masking-key, masking-iv, header) 77 | masking-key = dest-id[:16] 78 | masking-iv = uint128 -- random data unique to packet 79 | 80 | The `masked-header` contains the actual packet header, which starts with a fixed-size 81 | `static-header`, followed by a variable-length `authdata` section (of size `authdata-size`). 
82 | 83 | header = static-header || authdata 84 | static-header = protocol-id || version || flag || nonce || authdata-size 85 | protocol-id = "discv5" 86 | version = 0x0001 87 | authdata-size = uint16 -- byte length of authdata 88 | flag = uint8 -- packet type identifier 89 | nonce = uint96 -- nonce of message 90 | 91 | Decrypting the masked header data works as follows: The recipient constructs an AES/CTR 92 | stream cipher using its own node ID (`dest-id`) as the key and taking the IV from the 93 | packet. It can then decrypt the `static-header` and verify that `protocol-id` matches the 94 | expected string. If it does, the recipient can read `authdata-size` and unmask the 95 | remaining `authdata`. 96 | 97 | Implementations should not respond to packets with mismatching `protocol-id`. 98 | 99 | In ordinary message packets and handshake message packets, the packet contains an 100 | authenticated message after the `authdata` section. For WHOAREYOU packets, the `message` 101 | is empty. Implementations must generate a unique `nonce` value for every message packet. 102 | 103 | message = aesgcm_encrypt(initiator-key, nonce, message-pt, message-ad) 104 | message-pt = message-type || message-data 105 | message-ad = masking-iv || header 106 | 107 | The `flag` field of the header identifies the kind of packet and determines the encoding 108 | of `authdata`, which differs depending on the packet type. 109 | 110 | ### Ordinary Message Packet (`flag = 0`) 111 | 112 | For message packets, the `authdata` section is just the source node ID. 113 | 114 | authdata = src-id 115 | authdata-size = 32 116 | 117 | ![message packet layout](./img/message-packet-layout.png) 118 | 119 | ### WHOAREYOU Packet (`flag = 1`) 120 | 121 | In WHOAREYOU packets, the `authdata` section contains information for the identity 122 | verification procedure. The `message` part of WHOAREYOU packets is always empty. 
The 123 | `nonce` part of the packet must be set to the `nonce` of the message packet that caused 124 | the WHOAREYOU response. 125 | 126 | authdata = id-nonce || enr-seq 127 | authdata-size = 24 128 | id-nonce = uint128 -- random bytes 129 | enr-seq = uint64 -- ENR sequence number of the requesting node 130 | 131 | ![whoareyou packet layout](./img/whoareyou-packet-layout.png) 132 | 133 | ### Handshake Message Packet (`flag = 2`) 134 | 135 | For handshake message packets, the `authdata` section has variable size since public key 136 | and signature sizes depend on the ENR identity scheme. For the "v4" identity scheme, we 137 | assume 64-byte signature size and 33 bytes of (compressed) public key size. 138 | 139 | `authdata` starts with a fixed-size `authdata-head` component, followed by the ID 140 | signature, ephemeral public key and optional node record. 141 | 142 | The `record` field may be omitted if the `enr-seq` of WHOAREYOU is recent enough, i.e. 143 | when it matches the current sequence number of the sending node. If `enr-seq` is zero, the 144 | record must be sent. Node records are encoded and verified as specified in [EIP-778]. 145 | 146 | Please refer to the [handshake section] for more information about the content of the 147 | handshake packet. 148 | 149 | authdata = authdata-head || id-signature || eph-pubkey || record 150 | authdata-head = src-id || sig-size || eph-key-size 151 | authdata-size = 34 + sig-size + eph-key-size + len(record) 152 | sig-size = uint8 -- value: 64 for ID scheme "v4" 153 | eph-key-size = uint8 -- value: 33 for ID scheme "v4" 154 | 155 | ![handshake packet layout](./img/handshake-packet-layout.png) 156 | 157 | ## Protocol Messages 158 | 159 | This section lists all defined messages which can be sent and received. The hexadecimal 160 | value in parentheses is the `message-type`. 161 | 162 | The first element of every `message-data` list is the request ID. `request-id` is an RLP 163 | byte array of length <= 8 bytes. 
For requests, this value is assigned by the requester. 164 | The recipient of a message must mirror the value in the `request-id` element of the 165 | response. The selection of appropriate values for request IDs is left to the implementation. 166 | 167 | ### PING Request (0x01) 168 | 169 | message-data = [request-id, enr-seq] 170 | message-type = 0x01 171 | enr-seq = local ENR sequence number of sender 172 | 173 | PING checks whether the recipient is alive and informs it about the sender's ENR sequence 174 | number. 175 | 176 | ### PONG Response (0x02) 177 | 178 | message-data = [request-id, enr-seq, recipient-ip, recipient-port] 179 | message-type = 0x02 180 | enr-seq = ENR sequence number of sender 181 | recipient-ip = 16 or 4 byte IP address of the intended recipient 182 | recipient-port = recipient UDP port, a 16-bit integer 183 | 184 | PONG is the reply to PING. 185 | 186 | ### FINDNODE Request (0x03) 187 | 188 | message-data = [request-id, [distance₁, distance₂, ..., distanceₙ]] 189 | message-type = 0x03 190 | distanceₙ = requested log2 distance, a positive integer 191 | 192 | FINDNODE queries for nodes at the given logarithmic distances from the recipient's node 193 | ID. When distance `0` is requested, the result set should contain the recipient's current 194 | record. 195 | 196 | The recipient should create the result set by collecting nodes from its local node table 197 | according to the requested distances. Implementations should limit the number of nodes in 198 | the result set. The recommended result limit for FINDNODE queries is 16 nodes. 199 | 200 | ### NODES Response (0x04) 201 | 202 | message-data = [request-id, total, [ENR, ...]] 203 | message-type = 0x04 204 | total = total number of responses to the request 205 | 206 | NODES is the response to a FINDNODE or TOPICQUERY message. Multiple NODES messages may be 207 | sent as responses to a single query. Implementations may place a limit on the allowed 208 | maximum for `total`. 
If exceeded, additional responses may be ignored. 209 | 210 | When handling NODES as a response to FINDNODE, the recipient should verify that the 211 | received nodes match the requested distances. 212 | 213 | ### TALKREQ Request (0x05) 214 | 215 | message-data = [request-id, protocol, request] 216 | message-type = 0x05 217 | 218 | TALKREQ sends an application-level request. The purpose of this message is pre-negotiating 219 | connections made through another application-specific protocol identified by `protocol`. 220 | `protocol` and `request` are RLP byte arrays. 221 | 222 | The recipient must respond with a TALKRESP message containing the response to the request. 223 | If the `protocol` is unknown to the recipient, it must respond with a TALKRESP response 224 | containing empty `response` data. 225 | 226 | ### TALKRESP Response (0x06) 227 | 228 | message-data = [request-id, response] 229 | message-type = 0x06 230 | request-id = request-id of TALKREQ 231 | 232 | TALKRESP is the response to TALKREQ. The `response` is an RLP byte array containing the 233 | response data. 234 | 235 | ### REGTOPIC Request (0x07) 236 | 237 | **NOTE: the content and semantics of this message are not final.** 238 | **Implementations should not respond to or send these messages.** 239 | 240 | message-data = [request-id, topic, ENR, ticket] 241 | message-type = 0x07 242 | ENR = current node record of sender 243 | ticket = byte array containing ticket content 244 | 245 | REGTOPIC attempts to register the sender for the given topic. If the requesting node has a 246 | ticket from a previous registration attempt, it must present the ticket. Otherwise 247 | `ticket` is the empty byte array (RLP: `0x80`). The ticket must be valid and its waiting 248 | time must have elapsed before using the ticket. 249 | 250 | REGTOPIC is always answered by a TICKET response. The requesting node may also receive a 251 | REGCONFIRMATION response when registration is successful.
It may take up to 10s for the 252 | confirmation to be sent. 253 | 254 | ### TICKET Response (0x08) 255 | 256 | **NOTE: the content and semantics of this message are not final.** 257 | **Implementations should not respond to or send these messages.** 258 | 259 | message-data = [request-id, ticket, wait-time] 260 | message-type = 0x08 261 | ticket = an opaque byte array representing the ticket 262 | wait-time = time to wait before registering, in seconds 263 | 264 | TICKET is the response to REGTOPIC. It contains a ticket which can be used to register for 265 | the requested topic after `wait-time` has elapsed. See the [theory section on tickets] for 266 | more information. 267 | 268 | ### REGCONFIRMATION Response (0x09) 269 | 270 | **NOTE: the content and semantics of this message are not final.** 271 | **Implementations should not respond to or send these messages.** 272 | 273 | message-data = [request-id, topic] 274 | message-type = 0x09 275 | request-id = request-id of REGTOPIC 276 | 277 | REGCONFIRMATION notifies the recipient about a successful registration for the given 278 | topic. This call is sent by the advertisement medium after the time window for 279 | registration has elapsed on a topic queue. 280 | 281 | ### TOPICQUERY Request (0x0A) 282 | 283 | **NOTE: the content and semantics of this message are not final.** 284 | **Implementations should not respond to or send these messages.** 285 | 286 | message-data = [request-id, topic] 287 | message-type = 0x0a 288 | topic = 32-byte topic hash 289 | 290 | TOPICQUERY requests nodes in the [topic queue] of the given topic. The recipient of this 291 | request must send one or more NODES messages containing node records registered for the 292 | topic. 293 | 294 | ## Test Vectors 295 | 296 | A collection of test vectors for this specification can be found at 297 | [discv5 wire test vectors]. 
298 | 299 | [handshake section]: ./discv5-theory.md#handshake-steps 300 | [topic queue]: ./discv5-theory.md#topic-table 301 | [theory section on tickets]: ./discv5-theory.md#tickets 302 | [EIP-778]: ../enr.md 303 | [discv5 wire test vectors]: ./discv5-wire-test-vectors.md 304 | -------------------------------------------------------------------------------- /rlpx.md: -------------------------------------------------------------------------------- 1 | # The RLPx Transport Protocol 2 | 3 | This specification defines the RLPx transport protocol, a TCP-based transport protocol 4 | used for communication among Ethereum nodes. The protocol carries encrypted messages 5 | belonging to one or more 'capabilities' which are negotiated during connection 6 | establishment. RLPx is named after the [RLP] serialization format. The name is not an 7 | acronym and has no particular meaning. 8 | 9 | The current protocol version is **5**. You can find a list of changes in past versions at 10 | the end of this document. 11 | 12 | ## Notation 13 | 14 | `X || Y`\ 15 |     denotes concatenation of X and Y.\ 16 | `X ^ Y`\ 17 |     is byte-wise XOR of X and Y.\ 18 | `X[:N]`\ 19 |     denotes an N-byte prefix of X.\ 20 | `[X, Y, Z, ...]`\ 21 |     denotes recursive encoding as an RLP list.\ 22 | `keccak256(MESSAGE)`\ 23 |     is the Keccak256 hash function as used by Ethereum.\ 24 | `ecies.encrypt(PUBKEY, MESSAGE, AUTHDATA)`\ 25 |     is the asymmetric authenticated encryption function as used by RLPx.\ 26 |     AUTHDATA is authenticated data which is not part of the resulting ciphertext,\ 27 |     but written to HMAC-256 before generating the message tag.\ 28 | `ecdh.agree(PRIVKEY, PUBKEY)`\ 29 |     is elliptic curve Diffie-Hellman key agreement between PRIVKEY and PUBKEY. 30 | 31 | ## ECIES Encryption 32 | 33 | ECIES (Elliptic Curve Integrated Encryption Scheme) is an asymmetric encryption method 34 | used in the RLPx handshake. 
The cryptosystem used by RLPx is 35 | 36 | - The elliptic curve secp256k1 with generator `G`. 37 | - `KDF(k, len)`: the NIST SP 800-56 Concatenation Key Derivation Function 38 | - `MAC(k, m)`: HMAC using the SHA-256 hash function. 39 | - `AES(k, iv, m)`: the AES-128 encryption function in CTR mode. 40 | 41 | Alice wants to send an encrypted message that can be decrypted by Bob's static private key 42 | kB. Alice knows about Bob's static public key 43 | KB. 44 | 45 | To encrypt the message `m`, Alice generates a random number `r` and corresponding elliptic 46 | curve public key `R = r * G` and computes the shared secret S = Px 47 | where (Px, Py) = r * KB. She derives key 48 | material for encryption and authentication as 49 | kE || kM = KDF(S, 32) as well as a random 50 | initialization vector `iv`. Alice sends the encrypted message `R || iv || c || d` where 51 | c = AES(kE, iv, m) and 52 | d = MAC(sha256(kM), iv || c) to Bob. 53 | 54 | For Bob to decrypt the message `R || iv || c || d`, he derives the shared secret 55 | S = Px where 56 | (Px, Py) = kB * R as well as the encryption and 57 | authentication keys kE || kM = KDF(S, 32). Bob verifies 58 | the authenticity of the message by checking whether 59 | d == MAC(sha256(kM), iv || c) then obtains the plaintext as 60 | m = AES(kE, iv, c). 61 | 62 | ## Node Identity 63 | 64 | All cryptographic operations are based on the secp256k1 elliptic curve. Each node is 65 | expected to maintain a static secp256k1 private key which is saved and restored between 66 | sessions. It is recommended that the private key can only be reset manually, for example, 67 | by deleting a file or database entry. 68 | 69 | ## Initial Handshake 70 | 71 | An RLPx connection is established by creating a TCP connection and agreeing on ephemeral 72 | key material for further encrypted and authenticated communication.
The process of 73 | creating those session keys is the 'handshake' and is carried out between the 'initiator' 74 | (the node which opened the TCP connection) and the 'recipient' (the node which accepted it). 75 | 76 | 1. initiator connects to recipient and sends its `auth` message 77 | 2. recipient accepts, decrypts and verifies `auth` (checks that recovery of signature == 78 | `keccak256(ephemeral-pubk)`) 79 | 3. recipient generates `auth-ack` message from `remote-ephemeral-pubk` and `nonce` 80 | 4. recipient derives secrets and sends the first encrypted frame containing the [Hello] message 81 | 5. initiator receives `auth-ack` and derives secrets 82 | 6. initiator sends its first encrypted frame containing initiator [Hello] message 83 | 7. recipient receives and authenticates first encrypted frame 84 | 8. initiator receives and authenticates first encrypted frame 85 | 9. cryptographic handshake is complete if MAC of first encrypted frame is valid on both sides 86 | 87 | Either side may disconnect if authentication of the first framed packet fails. 88 | 89 | Handshake messages: 90 | 91 | auth = auth-size || enc-auth-body 92 | auth-size = size of enc-auth-body, encoded as a big-endian 16-bit integer 93 | auth-vsn = 4 94 | auth-body = [sig, initiator-pubk, initiator-nonce, auth-vsn, ...] 95 | enc-auth-body = ecies.encrypt(recipient-pubk, auth-body || auth-padding, auth-size) 96 | auth-padding = arbitrary data 97 | 98 | ack = ack-size || enc-ack-body 99 | ack-size = size of enc-ack-body, encoded as a big-endian 16-bit integer 100 | ack-vsn = 4 101 | ack-body = [recipient-ephemeral-pubk, recipient-nonce, ack-vsn, ...] 102 | enc-ack-body = ecies.encrypt(initiator-pubk, ack-body || ack-padding, ack-size) 103 | ack-padding = arbitrary data 104 | 105 | Implementations must ignore any mismatches in `auth-vsn` and `ack-vsn`. Implementations 106 | must also ignore any additional list elements in `auth-body` and `ack-body`. 
107 | 108 | Secrets generated following the exchange of handshake messages: 109 | 110 | static-shared-secret = ecdh.agree(privkey, remote-pubk) 111 | ephemeral-key = ecdh.agree(ephemeral-privkey, remote-ephemeral-pubk) 112 | shared-secret = keccak256(ephemeral-key || keccak256(nonce || initiator-nonce)) 113 | aes-secret = keccak256(ephemeral-key || shared-secret) 114 | mac-secret = keccak256(ephemeral-key || aes-secret) 115 | 116 | ## Framing 117 | 118 | All messages following the initial handshake are framed. A frame carries a single 119 | encrypted message belonging to a capability. 120 | 121 | The purpose of framing is multiplexing multiple capabilities over a single connection. 122 | Secondarily, as framed messages yield reasonable demarcation points for message 123 | authentication codes, supporting an encrypted and authenticated stream becomes 124 | straight-forward. Frames are encrypted and authenticated via key material generated during 125 | the handshake. 126 | 127 | The frame header provides information about the size of the message and the message's 128 | source capability. Padding is used to prevent buffer starvation, such that frame 129 | components are byte-aligned to block size of cipher. 130 | 131 | frame = header-ciphertext || header-mac || frame-ciphertext || frame-mac 132 | header-ciphertext = aes(aes-secret, header) 133 | header = frame-size || header-data || header-padding 134 | header-data = [capability-id, context-id] 135 | capability-id = integer, always zero 136 | context-id = integer, always zero 137 | header-padding = zero-fill header to 16-byte boundary 138 | frame-ciphertext = aes(aes-secret, frame-data || frame-padding) 139 | frame-padding = zero-fill frame-data to 16-byte boundary 140 | 141 | See the [Capability Messaging] section for definitions of `frame-data` and `frame-size.` 142 | 143 | ### MAC 144 | 145 | Message authentication in RLPx uses two keccak256 states, one for each direction of 146 | communication. 
The `egress-mac` and `ingress-mac` keccak states are continuously updated 147 | with the ciphertext of bytes sent (egress) or received (ingress). Following the initial 148 | handshake, the MAC states are initialized as follows: 149 | 150 | Initiator: 151 | 152 | egress-mac = keccak256.init((mac-secret ^ recipient-nonce) || auth) 153 | ingress-mac = keccak256.init((mac-secret ^ initiator-nonce) || ack) 154 | 155 | Recipient: 156 | 157 | egress-mac = keccak256.init((mac-secret ^ initiator-nonce) || ack) 158 | ingress-mac = keccak256.init((mac-secret ^ recipient-nonce) || auth) 159 | 160 | When a frame is sent, the corresponding MAC values are computed by updating the 161 | `egress-mac` state with the data to be sent. The update is performed by XORing the header 162 | with the encrypted output of its corresponding MAC. This is done to ensure uniform 163 | operations are performed for both plaintext MAC and ciphertext. All MACs are sent 164 | cleartext. 165 | 166 | header-mac-seed = aes(mac-secret, keccak256.digest(egress-mac)[:16]) ^ header-ciphertext 167 | egress-mac = keccak256.update(egress-mac, header-mac-seed) 168 | header-mac = keccak256.digest(egress-mac)[:16] 169 | 170 | Computing `frame-mac`: 171 | 172 | egress-mac = keccak256.update(egress-mac, frame-ciphertext) 173 | frame-mac-seed = aes(mac-secret, keccak256.digest(egress-mac)[:16]) ^ keccak256.digest(egress-mac)[:16] 174 | egress-mac = keccak256.update(egress-mac, frame-mac-seed) 175 | frame-mac = keccak256.digest(egress-mac)[:16] 176 | 177 | Verifying the MAC on ingress frames is done by updating the `ingress-mac` state in the 178 | same way as `egress-mac` and comparing to the values of `header-mac` and `frame-mac` in 179 | the ingress frame. This should be done before decrypting `header-ciphertext` and 180 | `frame-ciphertext`. 181 | 182 | # Capability Messaging 183 | 184 | All messages following the initial handshake are associated with a 'capability'. 
Any 185 | number of capabilities can be used concurrently on a single RLPx connection. 186 | 187 | A capability is identified by a short ASCII name (max eight characters) and version number. The capabilities 188 | supported on either side of the connection are exchanged in the [Hello] message belonging 189 | to the 'p2p' capability which is required to be available on all connections. 190 | 191 | ## Message Encoding 192 | 193 | The initial [Hello] message is encoded as follows: 194 | 195 | frame-data = msg-id || msg-data 196 | frame-size = length of frame-data, encoded as a 24bit big-endian integer 197 | 198 | where `msg-id` is an RLP-encoded integer identifying the message and `msg-data` is an RLP 199 | list containing the message data. 200 | 201 | All messages following Hello are compressed using the Snappy algorithm. 202 | 203 | frame-data = msg-id || snappyCompress(msg-data) 204 | frame-size = length of frame-data encoded as a 24bit big-endian integer 205 | 206 | Note that the `frame-size` of compressed messages refers to the compressed size of 207 | `msg-data`. Since compressed messages may inflate to a very large size after 208 | decompression, implementations should check for the uncompressed size of the data before 209 | decoding the message. This is possible because the [snappy format] contains a length 210 | header. Messages carrying uncompressed data larger than 16 MiB should be rejected by 211 | closing the connection. 212 | 213 | ## Message ID-based Multiplexing 214 | 215 | While the framing layer supports a `capability-id`, the current version of RLPx doesn't 216 | use that field for multiplexing between different capabilities. Instead, multiplexing 217 | relies purely on the message ID. 218 | 219 | Each capability is given as much of the message-ID space as it needs. All such 220 | capabilities must statically specify how many message IDs they require. 
On connection and 221 | reception of the [Hello] message, both peers have equivalent information about what 222 | capabilities they share (including versions) and are able to form consensus over the 223 | composition of message ID space. 224 | 225 | Message IDs are assumed to be compact from ID 0x10 onwards (0x00-0x0f is reserved for the 226 | "p2p" capability) and given to each shared (equal-version, equal-name) capability in 227 | alphabetic order. Capability names are case-sensitive. Capabilities which are not shared 228 | are ignored. If multiple versions are shared of the same (equal name) capability, the 229 | numerically highest wins, others are ignored. 230 | 231 | ## "p2p" Capability 232 | 233 | The "p2p" capability is present on all connections. After the initial handshake, both 234 | sides of the connection must send either [Hello] or a [Disconnect] message. Upon receiving 235 | the [Hello] message a session is active and any other message may be sent. Implementations 236 | must ignore any difference in protocol version for forward-compatibility reasons. When 237 | communicating with a peer of lower version, implementations should try to mimic that 238 | version. 239 | 240 | At any time after protocol negotiation, a [Disconnect] message may be sent. 241 | 242 | ### Hello (0x00) 243 | 244 | `[protocolVersion: P, clientId: B, capabilities, listenPort: P, nodeId: B_64, ...]` 245 | 246 | First packet sent over the connection, and sent once by both sides. No other messages may 247 | be sent until a Hello is received. Implementations must ignore any additional list elements 248 | in Hello because they may be used by a future version. 249 | 250 | - `protocolVersion` the version of the "p2p" capability, **5**. 251 | - `clientId` Specifies the client software identity, as a human-readable string (e.g. 252 | "Ethereum(++)/1.0.0").
253 | - `capabilities` is the list of supported capabilities and their versions: 254 | `[[cap1, capVersion1], [cap2, capVersion2], ...]`. 255 | - `listenPort` (legacy) specifies the port that the client is listening on (on the 256 | interface that the present connection traverses). If 0 it indicates the client is 257 | not listening. This field should be ignored. 258 | - `nodeId` is the secp256k1 public key corresponding to the node's private key. 259 | 260 | ### Disconnect (0x01) 261 | 262 | `[reason: P]` 263 | 264 | Inform the peer that a disconnection is imminent; if received, a peer should disconnect 265 | immediately. When sending, well-behaved hosts give their peers a fighting chance (read: 266 | wait 2 seconds) to disconnect to before disconnecting themselves. 267 | 268 | `reason` is an optional integer specifying one of a number of reasons for disconnect: 269 | 270 | | Reason | Meaning | 271 | |--------|:-------------------------------------------------------------| 272 | | `0x00` | Disconnect requested | 273 | | `0x01` | TCP sub-system error | 274 | | `0x02` | Breach of protocol, e.g. a malformed message, bad RLP, ... | 275 | | `0x03` | Useless peer | 276 | | `0x04` | Too many peers | 277 | | `0x05` | Already connected | 278 | | `0x06` | Incompatible P2P protocol version | 279 | | `0x07` | Null node identity received - this is automatically invalid | 280 | | `0x08` | Client quitting | 281 | | `0x09` | Unexpected identity in handshake | 282 | | `0x0a` | Identity is the same as this node (i.e. connected to itself) | 283 | | `0x0b` | Ping timeout | 284 | | `0x10` | Some other reason specific to a subprotocol | 285 | 286 | ### Ping (0x02) 287 | 288 | `[]` 289 | 290 | Requests an immediate reply of [Pong] from the peer. 291 | 292 | ### Pong (0x03) 293 | 294 | `[]` 295 | 296 | Reply to the peer's [Ping] packet. 
297 | 298 | # Change Log 299 | 300 | ### Known Issues in the current version 301 | 302 | - The frame encryption/MAC scheme is considered 'broken' because `aes-secret` and 303 | `mac-secret` are reused for both reading and writing. The two sides of a RLPx connection 304 | generate two CTR streams from the same key, nonce and IV. If an attacker knows one 305 | plaintext, they can decrypt unknown plaintexts of the reused keystream. 306 | - General feedback from reviewers has been that the use of a keccak256 state as a MAC 307 | accumulator and the use of AES in the MAC algorithm is an uncommon and overly complex 308 | way to perform message authentication but can be considered safe. 309 | - The frame encoding provides `capability-id` and `context-id` fields for multiplexing 310 | purposes, but these fields are unused. 311 | 312 | ### Version 5 (EIP-706, September 2017) 313 | 314 | [EIP-706] added Snappy message compression. 315 | 316 | ### Version 4 (EIP-8, December 2015) 317 | 318 | [EIP-8] changed the encoding of `auth-body` and `ack-body` in the initial handshake to 319 | RLP, added a version number to the handshake and mandated that implementations should 320 | ignore additional list elements in handshake messages and [Hello]. 321 | 322 | # References 323 | 324 | - Elaine Barker, Don Johnson, and Miles Smid. NIST Special Publication 800-56A Section 5.8.1, 325 | Concatenation Key Derivation Function. 2017.\ 326 | URL  327 | 328 | - Victor Shoup. A proposal for an ISO standard for public key encryption, Version 2.1. 2001.\ 329 | URL  330 | 331 | - Mike Belshe and Roberto Peon. SPDY Protocol - Draft 3. 2014.\ 332 | URL  333 | 334 | - Snappy compressed format description. 2011.\ 335 | URL  336 | 337 | Copyright © 2014 Alex Leverington. 338 | 339 | This work is licensed under a 340 | Creative Commons Attribution-NonCommercial-ShareAlike 341 | 4.0 International License. 
342 | 343 | [Hello]: #hello-0x00 344 | [Disconnect]: #disconnect-0x01 345 | [Ping]: #ping-0x02 346 | [Pong]: #pong-0x03 347 | [Capability Messaging]: #capability-messaging 348 | [EIP-8]: https://eips.ethereum.org/EIPS/eip-8 349 | [EIP-706]: https://eips.ethereum.org/EIPS/eip-706 350 | [RLP]: https://ethereum.org/en/developers/docs/data-structures-and-encoding/rlp 351 | [snappy format]: https://github.com/google/snappy/blob/master/format_description.txt 352 | -------------------------------------------------------------------------------- /caps/snap.md: -------------------------------------------------------------------------------- 1 | # Ethereum Snapshot Protocol (SNAP) 2 | 3 | The `snap` protocol runs on top of [RLPx], facilitating the exchange of Ethereum state 4 | snapshots between peers. The protocol is an optional extension for peers supporting (or 5 | caring about) the dynamic snapshot format. 6 | 7 | The current version is `snap/1`. 8 | 9 | ## Overview 10 | 11 | The `snap` protocol is designed for semi real-time data retrieval. Its goal is to make 12 | dynamic snapshots of recent states available for peers. The `snap` protocol does not take 13 | part in chain maintenance (block and transaction propagation); and it is **meant to be run 14 | side-by-side with the `eth` protocol**, not standalone (e.g. chain progression is 15 | announced via `eth`). 16 | 17 | The protocol itself is simplistic by design (take note, the supporting implementation is 18 | everything but simple). In its crux, `snap` supports retrieving a contiguous segment of 19 | accounts from the Ethereum state trie, or a contiguous segment of storage slots from one 20 | particular storage trie. Both replies are Merkle proven for immediate verification. In 21 | addition batches of bytecodes can also be retrieved similarly to the `eth` protocol. 
22 | 23 | The synchronization mechanism the protocol enables is for peers to retrieve and verify all 24 | the account and storage data without downloading intermediate Merkle trie nodes. The final 25 | state trie is reassembled locally. An additional complexity nodes must be aware of, is 26 | that state is ephemeral and moves with the chain, so syncers need to support reassembling 27 | partially consistent state segments. This is supported by trie node retrieval similar to 28 | `eth`, which can be used to heal trie inconsistencies (more on this later). 29 | 30 | The `snap` protocol permits downloading the entire Ethereum state without having to 31 | download all the intermediate Merkle proofs, which can be regenerated locally. This 32 | reduces the networking load enormously: 33 | 34 | - Ingress bandwidth is reduced from `O(accounts * log account + SUM(states * log states))` 35 | (Merkle trie nodes) to `O(accounts + SUM(states))` (actual state data). 36 | - Egress bandwidth is reduced from `O(accounts * log account + SUM(states * log states)) * 37 | 32 bytes` (Merkle trie node hashes) to `O(accounts + SUM(states)) / 100000 bytes` 38 | (number of 100KB chunks to cover the state). 39 | - Round trip time is reduced from `O(accounts * log account + SUM(states * log states)) / 40 | 384` (states retrieval packets) to `O(accounts + SUM(states)) / 100000 bytes` (number of 41 | 100KB chunks to cover the state). 42 | 43 | ### Expected results 44 | 45 | To put some numbers on the above abstract orders of magnitudes, synchronizing Ethereum 46 | mainnet state (i.e. ignoring blocks and receipts, as those are the same) with `eth` vs. 
47 | the `snap` protocol: 48 | 49 | Block ~#11,177,000: 50 | 51 | - Accounts: 107,598,788 @ 19.70GiB 52 | - Byte codes: 319,654 @ 1.48GiB 53 | - Storage slots: 365,787,020 @ 49.88GiB 54 | - Trie nodes: 617,045,138 55 | 56 | | | Time | Upload | Download | Packets | Serving disk reads* | 57 | |:------:|:------:|:-------:|:--------:|:--------:|:-------------------:| 58 | | `eth` | 10h50m | 20.38GB | 43.8GB | 1607M | 15.68TB | 59 | | `snap` | 2h6m | 0.15GB | 20.44GB | 0.099M | 0.096TB | 60 | | | -80.6% | -99.26% | -53.33% | -99.993% | -99.39% | 61 | 62 | *\*Also accounts for other peer requests during the time span.* 63 | 64 | Post snap state heal: 65 | 66 | - Additional trie nodes: 541,260 @ 160.44MiB 67 | - Additional byte codes: 34 @ 234.98KiB 68 | 69 | ## Relation to `eth` 70 | 71 | The `snap` protocol is a *dependent satellite* of `eth` (i.e. to run `snap`, you need to 72 | run `eth` too), not a fully standalone protocol. This is a deliberate design decision: 73 | 74 | - `snap` is meant to be a bootstrap aid for newly joining full nodes. By enforcing all 75 | `snap` peers to also speak `eth`, we can avoid non-full nodes from lingering attached to 76 | `snap` indefinitely. 77 | - `eth` already contains well established chain and fork negotiation mechanisms, as well 78 | as remote peer staleness detection during sync. By running both protocols side-by-side, 79 | `snap` can benefit of all these mechanisms without having to duplicate them. 80 | 81 | This *satellite* status may be changed later, but it's better to launch with a more 82 | restricted protocol first and then expand if need be vs. trying to withdraw depended-upon 83 | features. 84 | 85 | The `snap` protocol is not an extension / next version of `eth` as it relies on the 86 | availability of a *snapshot* acceleration structure that can iterate accounts and storage 87 | slots linearly. Its purpose is also one specific sync method that might not be suitable 88 | for all clients. 
Keeping `snap` as a separate protocol permits every client to decide to 89 | pursue it or not, without hindering their capacity to participate in the `eth` protocol. 90 | 91 | ## Synchronization algorithm 92 | 93 | The crux of the snapshot synchronization is making contiguous ranges of accounts and 94 | storage slots available for remote retrieval. The sort order is the same as the state trie 95 | iteration order, which makes it possible to not only request N subsequent accounts, but 96 | also to Merkle prove them. Some important properties of this simple algorithm: 97 | 98 | - Opposed to *fast sync*, we only need to transfer the useful leaf data from the state 99 | trie and can reconstruct internal nodes locally. 100 | - Opposed to *warp sync*, we can download small chunks of accounts and storage slots and 101 | immediately verify their Merkle proofs, making junk attacks impossible. 102 | - Opposed to *warp sync*, random account ranges can be retrieved, thus synchronization 103 | concurrency is totally dependent on client implementation and is not forced by the 104 | protocol. 105 | 106 | The gotcha of the snapshot synchronization is that serving nodes need to be able to 107 | provide **fast** iterable access to the state of the most recent `N` (128) blocks. 108 | Iterating the Merkle trie itself might be functional, but it's not viable (iterating the 109 | state trie at the time of writing takes 9h 30m on an idle machine). Geth introduced 110 | support for [dynamic snapshots], which allows iterating all the accounts in 7m 111 | (see [blog for more]). Some important properties of the dynamic snapshots: 112 | 113 | - Serving a contiguous range of accounts or storage slots take `O(n)` operations, and more 114 | importantly, it's the same for disk access too, being stored contiguously on disk (not 115 | counting the database read amplification). 
116 | - Maintaining a live dynamic snapshot means: 117 | - Opposed to *warp sync*, syncing nodes can always get the latest data, thus they don't 118 | need to process days' worth of blocks afterwards. 119 | - Opposed to *warp sync*, there is no pre-computation to generate a snapshot (it's 120 | updated live), so there's no periodic burden on the nodes to iterate the tries (there 121 | is an initial burden to create the first snapshot after sync though). 122 | - Providing access to 128 recent snapshots permits `O(1)` direct access to any account 123 | and state, which can be used during EVM execution for `SLOAD`. 124 | 125 | The caveat of the snapshot synchronization is that as with *fast sync* (and opposed to 126 | *warp sync*), the available data constantly moves (as new blocks arrive). The probability 127 | of finishing sync before the 128 block window (15m) moves out is asymptotically zero. This 128 | is not a problem, because we can self-heal. It is fine to import state snapshot chunks 129 | from different tries, because the inconsistencies can be fixed by running a 130 | *fast-sync-style-state-sync* on top of the assembled semi-correct state afterwards. Some 131 | important properties of the self-healing: 132 | 133 | - Synchronization can be aborted at any time and resumed later. It might cause 134 | self-healing to run longer, but it will fix the data either way. 135 | - Synchronization on slow connections is guaranteed to finish too (as long as the node can 136 | download data faster than it's being produced by the network), the data cannot disappear 137 | from the network (opposed to warp sync). 
138 | 139 | ## Data format 140 | 141 | The accounts in the `snap` protocol are analogous to the Ethereum RLP consensus encoding 142 | (same fields, same order), but in a **slim** format: 143 | 144 | - The code hash is `empty list` instead of `Keccak256("")` 145 | - The root hash is `empty list` instead of `Hash()` 146 | 147 | This is done to avoid having to transfer the same 32+32 bytes for all plain accounts over 148 | the network. 149 | 150 | ## Protocol Messages 151 | 152 | ### GetAccountRange (0x00) 153 | 154 | `[reqID: P, rootHash: B_32, startingHash: B_32, limitHash: B_32, responseBytes: P]` 155 | 156 | Requests an unknown number of accounts from a given account trie, starting at the 157 | specified account hash and capped by the maximum allowed response size in bytes. The 158 | intended purpose of this message is to fetch a large number of subsequent accounts from a 159 | remote node and reconstruct a state subtrie locally. 160 | 161 | - `reqID`: Request ID to match up responses with 162 | - `rootHash`: Root hash of the account trie to serve 163 | - `startingHash`: Account hash of the first to retrieve 164 | - `limitHash`: Account hash after which to stop serving data 165 | - `responseBytes`: Soft limit at which to stop returning data 166 | 167 | Notes: 168 | 169 | - Nodes **must** always respond to the query. 170 | - If the node does **not** have the state for the requested state root, it **must** return 171 | an empty reply. It is the responsibility of the caller to query a state not older than 172 | 128 blocks. 173 | - The responding node is allowed to return **less** data than requested (own QoS limits), 174 | but the node **must** return at least one account. If no accounts exist between `startingHash` and `limitHash`, then 175 | the first (if any) account **after** `limitHash` must be provided. 
176 | - The responding node **must** Merkle prove the starting hash (even if it does not exist) 177 | and the last returned account (if any exists after the starting hash). 178 | 179 | Rationale: 180 | 181 | - The starting account is identified deliberately by hash and not by address. As the 182 | accounts in the Ethereum Merkle trie are sorted by hash, the address is irrelevant. In 183 | addition, there is no consensus requirement for full nodes to be aware of the address 184 | pre-images. 185 | - The response is capped by byte size and not by number of accounts, because it makes the 186 | network traffic more deterministic. As the state density is unknowable, it's also 187 | impossible to delimit the query with an ending hash. 188 | 189 | Caveats: 190 | 191 | - When requesting accounts from a starting hash, malicious nodes may skip ahead and return 192 | a gapped reply. Such a reply would cause sync to finish early with a lot of missing data. 193 | Proof of non-existence for the starting hash prevents this attack, completely covering 194 | the range from start to end. 195 | - No special signaling is needed if there are no more accounts after the last returned 196 | one, as the attached Merkle proof for the last account will have all trie nodes right of 197 | the proven path zero. 198 | 199 | ### AccountRange (0x01) 200 | 201 | `[reqID: P, accounts: [[accHash: B_32, accBody: B], ...], proof: [node_1: B, node_2, ...]]` 202 | 203 | Returns a number of consecutive accounts and the Merkle proofs for the entire range 204 | (boundary proofs). The left-side proof must be for the requested origin hash (even if an 205 | associated account does not exist) and the right-side proof must be for the last returned 206 | account. 
207 | 208 | - `reqID`: ID of the request this is a response for 209 | - `accounts`: List of consecutive accounts from the trie 210 | - `accHash`: Hash of the account address (trie path) 211 | - `accBody`: Account body in slim format 212 | - `proof`: List of trie nodes proving the account range 213 | 214 | Notes: 215 | 216 | - If the account range is the entire state (requested origin was `0x00..0` and all 217 | accounts fit into the response), no proofs should be sent along the response. This is 218 | unlikely for accounts, but since it's a common situation for storage slots, this clause 219 | keeps the behavior the same across both. 220 | 221 | ### GetStorageRanges (0x02) 222 | 223 | `[reqID: P, rootHash: B_32, accountHashes: [B_32], startingHash: B, limitHash: B, responseBytes: P]` 224 | 225 | Requests the storage slots of multiple accounts' storage tries. Since certain contracts 226 | have huge state, the method can also request storage slots from a single account, starting 227 | at a specific storage key hash. The intended purpose of this message is to fetch a large 228 | number of subsequent storage slots from a remote node and reconstruct a state subtrie 229 | locally. 230 | 231 | - `reqID`: Request ID to match up responses with 232 | - `rootHash`: Root hash of the account trie to serve 233 | - `accountHashes`: Account hashes of the storage tries to serve 234 | - `startingHash`: Storage slot hash of the first to retrieve 235 | - `limitHash`: Storage slot hash after which to stop serving 236 | - `responseBytes`: Soft limit at which to stop returning data 237 | 238 | Notes: 239 | 240 | - Nodes **must** always respond to the query. 241 | - If the node does **not** have the state for the requested state root or for **any** 242 | requested account hash, it **must** return an empty reply. It is the responsibility of 243 | the caller to query a state not older than 128 blocks; and the caller is expected to 244 | only ever query existing accounts. 
245 | - The responding node is allowed to return **less** data than requested (serving QoS 246 | limits), but the node **must** return at least one slot, unless none exists. 247 | - If multiple accounts' storage is requested, serving nodes should reply with the entire 248 | storage ranges (thus no Merkle proofs needed), up to the first contract which exceeds 249 | the packet limit. If the last included storage range does not fit entirely, a Merkle 250 | proof **must** be attached to that and **only** that. 251 | - If a single account's storage is requested, serving nodes should only return slots 252 | starting with the requested starting hash, up to the last one or until the packet fills 253 | up. If the entire storage range is not being returned, a Merkle proof **must** be 254 | attached. 255 | - If a proof is attached, the responding node **must** Merkle prove the starting hash 256 | (even if it does not exist) and the last returned slot (if any exists after the starting 257 | hash). 258 | 259 | Rationale: 260 | 261 | - The response is capped by byte size and not by number of slots, because it makes the 262 | network traffic more deterministic. 263 | - The request supports querying multiple contracts at the same time as most storage tries 264 | are in the order of 100s of bytes. Querying these individually would produce a lot of 265 | network round trips. 266 | 267 | Caveats: 268 | 269 | - When requesting storage slots from a starting hash, malicious nodes may skip ahead and 270 | return a prefix-gapped reply. Such a reply would cause sync to finish early with a lot 271 | of missing data. Proof of non-existence for the starting hash prevents this attack, 272 | completely covering the range from start to end. 273 | - Although serving nodes should respect the response limit requested by the caller, it is 274 | valuable to slightly force the limit (consider it soft only) when adding the last 275 | contract to avoid having to split it and prove it. 
276 | - No special signaling is needed if there are no more slots after the last returned one, 277 | as the attached Merkle proof for the last account will have all trie nodes right of the 278 | proven path zero. 279 | 280 | ### StorageRanges (0x03) 281 | 282 | `[reqID: P, slots: [[[slotHash: B_32, slotData: B], ...], ...], proof: [node_1: B, node_2, ...]]` 283 | 284 | Returns a number of consecutive storage slots for the requested account (i.e. list of list 285 | of slots) and optionally the Merkle proofs for the last range (boundary proofs) if it only 286 | partially covers the storage trie. The left-side proof must be for the requested origin 287 | slots (even if it does not exist) and the right-side proof must be for the last returned 288 | slots. 289 | 290 | - `reqID`: ID of the request this is a response for 291 | - `slots`: List of list of consecutive slots from the trie (one list per account) 292 | - `slotHash`: Hash of the storage slot key (trie path) 293 | - `slotData`: Data content of the slot 294 | - `proof`: List of trie nodes proving the slot range 295 | 296 | Notes: 297 | 298 | - If the slot range is the entire storage state, no proofs will be sent along the response. 299 | 300 | ### GetByteCodes (0x04) 301 | 302 | `[reqID: P, hashes: [hash1: B_32, hash2: B_32, ...], bytes: P]` 303 | 304 | Requests a number of contract byte-codes by hash. This is analogous to the `eth/63` 305 | `GetNodeData`, but restricted to only bytecode to break the generality that causes issues 306 | with database optimizations. The intended purpose of this request is to allow retrieving 307 | the code associated with accounts retrieved via GetAccountRange, but it's needed during 308 | healing too. 
309 | 310 | - `reqID`: Request ID to match up responses with 311 | - `hashes`: Code hashes to retrieve the code for 312 | - `bytes`: Soft limit at which to stop returning data 313 | 314 | *This functionality was duplicated into `snap` from `eth/65` to permit `eth` long term to 315 | become a chain maintenance protocol only and move synchronization primitives out into 316 | satellite protocols only.* 317 | 318 | Notes: 319 | 320 | - Nodes **must** always respond to the query. 321 | - The returned codes **must** be in the request order. 322 | - The responding node is allowed to return **less** data than requested (serving QoS 323 | limits), but the node **must** return at least one bytecode, unless none requested are 324 | available, in which case it **must** answer with an empty response. 325 | - If a bytecode is unavailable, the node **must** skip that slot and proceed to the next 326 | one. The node **must not** return `nil` or other placeholders. 327 | 328 | Rationale: 329 | 330 | - The response is capped by byte size and not by number of slots, because it makes the 331 | network traffic more deterministic, as contract sizes can vary randomly up to 24KB with 332 | current consensus rules. 333 | - By retaining the original request order and skipping unavailable bytecodes, the 334 | requesting node can differentiate between unavailable data (gaps in the hashes) and QoS 335 | limitations (missing suffix). 336 | 337 | Caveats: 338 | 339 | - Implementations are free to request as many or as few bytecodes in a single request, but 340 | they should keep in mind that requesting too few results in wasted time due to network 341 | latency; but requesting too many results in wasted bandwidth if the response doesn't 342 | fit. Average (unique) contract size on mainnet is about 5-6KB, so `bytes / 6KB` is a 343 | good heuristic for the number of codes to request in a single packet (e.g. for 512KB 344 | desired response size, 80-100 bytecodes per request is a good choice). 
345 | 346 | ### ByteCodes (0x05) 347 | 348 | `[reqID: P, codes: [code1: B, code2: B, ...]]` 349 | 350 | Returns a number of requested contract codes. The order is the same as in the request, but 351 | there might be gaps if not all codes are available or there might be fewer if QoS limits 352 | are reached. 353 | 354 | ### GetTrieNodes (0x06) 355 | 356 | `[reqID: P, rootHash: B_32, paths: [[accPath: B, slotPath1: B, slotPath2: B, ...]...], bytes: P]` 357 | 358 | Requests a number of state (either account or storage) Merkle trie nodes **by path**. This 359 | is analogous in functionality to the `eth/63` `GetNodeData`, but restricted to only tries 360 | and queried by path, to break the generality that causes issues with database 361 | optimizations. 362 | 363 | - `reqID`: Request ID to match up responses with 364 | - `rootHash`: Root hash of the account trie to serve 365 | - `paths`: Trie paths to retrieve the nodes for, grouped by account 366 | - `bytes`: Soft limit at which to stop returning data 367 | 368 | The `paths` is one array of trie node paths to retrieve per account (i.e. list of list of 369 | paths). Each list in the array special cases the first element as the path in the account 370 | trie and the remaining elements as paths in the storage trie. To address an account node, 371 | the inner list should have a length of 1 consisting of only the account path. Partial 372 | paths (<32 bytes) should be compact encoded per the Ethereum wire protocol, full paths 373 | should be plain binary encoded. 374 | 375 | *This functionality was mutated into `snap` from `eth/65` to permit `eth` long term to 376 | become a chain maintenance protocol only and move synchronization primitives out into 377 | satellite protocols only.* 378 | 379 | Notes: 380 | 381 | - Nodes **must** always respond to the query. 382 | - The returned nodes **must** be in the request order. 
383 | - If the node does **not** have the state for the requested state root or for **any** 384 | requested account paths, it **must** return an empty reply. It is the responsibility of 385 | the caller to query a state not older than 128 blocks; and the caller is expected to 386 | only ever query existing trie nodes. 387 | - The responding node is allowed to return **less** data than requested (serving QoS 388 | limits), but the node **must** return at least one trie node. 389 | 390 | Rationale: 391 | 392 | - The response is capped by byte size and not by number of slots, because it makes the 393 | network traffic more deterministic. Although opposed to the previous request types 394 | (accounts, slots, codes), trie nodes are relatively deterministic (100-500B), the 395 | protocol remains cleaner if all packets follow the same traffic shaping rules. 396 | - A naive way to represent trie nodes would be a simple list of `account || storage` path 397 | segments concatenated, but that would be very wasteful on the network as it would 398 | duplicate the account hash for every storage trie node. 399 | 400 | ### TrieNodes (0x07) 401 | 402 | `[reqID: P, nodes: [node1: B, node2: B, ...]]` 403 | 404 | Returns a number of requested state trie nodes. The order is the same as in the request, 405 | but there might be fewer if QoS limits are reached. 406 | 407 | ## Change Log 408 | 409 | ### snap/1 (November 2020) 410 | 411 | Version 1 was the introduction of the snapshot protocol. 
412 | 413 | [RLPx]: ../rlpx.md 414 | [dynamic snapshots]: https://github.com/ethereum/go-ethereum/pull/20152 415 | [blog for more]: https://blog.ethereum.org/2020/07/17/ask-about-geth-snapshot-acceleration/ 416 | -------------------------------------------------------------------------------- /caps/les.md: -------------------------------------------------------------------------------- 1 | # Light Ethereum Subprotocol (LES) 2 | 3 | The Light Ethereum Subprotocol (LES) is the protocol used by "light" clients, which only 4 | download block headers as they appear and fetch other parts of the blockchain on-demand. 5 | They provide full functionality in terms of safely accessing the blockchain, but do not 6 | mine and therefore do not take part in the consensus process. Full and archive nodes can 7 | also support the 'les' protocol besides 'eth' in order to be able to serve light nodes. 8 | 9 | The current protocol version is **les/4**. See end of document for a list of changes in 10 | past protocol versions. Some of the les protocol messages are similar to of the [Ethereum 11 | Wire Protocol], with the addition of a few new fields. 12 | 13 | ## Canonical Hash Trie 14 | 15 | Canonical Hash Trie (CHT) structures are used by LES for quick initial syncing and secure 16 | on-demand retrieval of canonical hash mappings, block headers and total difficulty (TD) 17 | values. 18 | 19 | A CHT is a Merkle trie (specifically '[Merkle Patricia Trie]' as used for Ethereum state) 20 | that contains `blockNumber -> [blockHash, TD]` mappings where keys are binary big endian 21 | encoded 64 bit integers and values are RLP-encoded `[hash, number]` pairs. 22 | 23 | CHTs are generated by LES servers for every 32768 blocks, `CHT[i]` containing data for 24 | blocks `0..(i+1) * 32768 - 1`. 
If a client knows the root hash of `CHT[i]` and wants to fetch 25 | header number `N` (where `N < (i+1) * 32768`), it can obtain the header and the corresponding 26 | Merkle proof of the CHT with a [GetHelperTrieProofs] request. 27 | 28 | CHTs are only generated after 2048 confirmations, making it sure they will not be changed 29 | by a chain reorg. In the current version of the light client there is a hard-coded 30 | `[chtNumber, chtRoot]` pair associated with the genesis block hash of both the mainnet and 31 | the testnet. A trustless validation algorithm is planned for later protocol versions. 32 | 33 | ## BloomBits 34 | 35 | The BloomBits data structure optimizes log searching by doing a bitwise transformation 36 | that makes it cheaper to retrieve bloom filter data relevant to a specific filter. 37 | 38 | When searching in a long section of the block history, we are checking three specific bits 39 | of each bloom filter per queried address/topic. In order to do that, LES must retrieve a 40 | ~550 byte block header per filtered block. 41 | 42 | The BloomBits structure optimizes bloom filter lookups through a "bitwise 90 degree 43 | rotation" of the bloom filters. Blocks are grouped into fixed length sections (section 44 | size for the LES BloomBits Trie is 32768 blocks), `BloomBits[bitIdx][sectionIdx]` is a 45 | 32768 bit (4096 byte) long bit vector that contains a single bit of each bloom filter from 46 | the block range `sectionIdx*SectionSize ... (sectionIdx+1)*SectionSize-1`. Since bloom 47 | filters are usually sparse, a simple data compression makes this structure even more 48 | efficient, especially for on-demand retrieval. By reading and binary AND-ing three 49 | BloomBits sections, we can filter for an address/topic in 32768 blocks at once ("1" bits 50 | in the binary AND result mean bloom matches). 51 | 52 | ### Compression Algorithm 53 | 54 | BloomBits data is stored in compressed form. 
The compression algorithm is optimized for 55 | sparse input data which contains a lot of zero bytes. Decompression requires knowledge of 56 | the decompressed data length. 57 | 58 | The algorithm can be described with this pseudo-code: 59 | 60 | if data only contains zeroes, 61 | CompressBytes(data) == nil 62 | otherwise if len(data) <= 1, 63 | CompressBytes(data) == data 64 | otherwise: 65 | CompressBytes(data) == append(CompressBytes(nonZeroBitset(data)), nonZeroBytes(data)...) 66 | where 67 | nonZeroBitset(data) is a bit vector with len(data) bits (MSB first): 68 | nonZeroBitset(data)[i/8] && (1 << (7-i%8)) != 0 if data[i] != 0 69 | len(nonZeroBitset(data)) == (len(data)+7)/8 70 | nonZeroBytes(data) contains the non-zero bytes of data in the same order 71 | 72 | ### BloomBits Trie 73 | 74 | In order to make this data structure retrievable on-demand for the light client, we put 75 | the generated vectors in a trie. Parts of this trie can be retrieved with the 76 | [GetHelperTrieProofs] message. Currently the trie root is part of the trusted syncing 77 | checkpoint but trustless validation of the BloomBits trie is part of the development 78 | plans. The trie consists of the compressed bit vectors as values stored at keys 79 | constructed from the bloom bit index encoded as a 2-byte big endian, followed by the 80 | section index encoded as an 8-byte big endian. Since all-zero bit vectors have a zero 81 | length when compressed, these vectors are not added to the trie at all. 82 | 83 | BloomBits tries are generated for each new section of transformed bloom filter data by 84 | adding the vectors belonging to the latest section index to the previous trie. 85 | 86 | ## Client Side Flow Control 87 | 88 | Any node which takes on a server role in the LES protocol needs to be able to somehow 89 | limit the amount of work it does for each client peer during a given time period. 
They can 90 | always just serve requests slowly if they are overloaded, but it is beneficial to give 91 | some sort of flow control feedback to the clients. This way, clients could (and would have 92 | incentive to) behave nicely and not send requests too quickly in the first place (and then 93 | possibly timeout and resend while the server is still working on them). They could also 94 | distribute requests better between multiple servers they are connected to. And if clients 95 | can do this, servers can expect them to do this and throttle or drop them if they break 96 | the flow control rules. 97 | 98 | ### The Model 99 | 100 | Let us assume that serving each request has a cost (depending on type and parameters) for 101 | the server. This cost is determined by the server, but it has an upper limit for any valid 102 | request. The server assigns a "buffer" for each client from which the cost of each request 103 | is deducted. The buffer has an upper limit (the "buffer limit") and a recharge rate (cost 104 | per second). The server can decide to recharge it more quickly at any time if it has more 105 | free resources, but there is a guaranteed minimum recharge rate. If a request is received 106 | that would drain the client's buffer below zero, the client has broken the flow control 107 | rules and is throttled or disconnected. 108 | 109 | ### The Protocol 110 | 111 | The server announces three parameters in the [Status] message: 112 | 113 | - `"flowControl/BL"`: Buffer Limit, an integer value 114 | - `"flowControl/MRR"`: Minimum Rate of Recharge, an integer value 115 | - `"flowControl/MRC"`: Maximum Request Cost table. The value of this parameter is a 116 | table assigning cost values to every on-demand retrieval message in the LES protocol. 
117 | The table is encoded as a list of integer triples: `[[MsgCode, BaseCost, ReqCost], ...]` 118 | 119 | On the server side: 120 | 121 | When a client connects, the server sets the initial Buffer Value (`BV`) of the client to 122 | `BL` and announces `BL` in [Status]. When a request is received from the client, it 123 | calculates the cost according to its own estimates (but not higher than `MaxCost`, which 124 | equals `BaseCost + ReqCost * N`, where `N` is the number of individual elements asked in 125 | the request), then deducts it from `BV`. If `BV` goes negative, drops the peer, otherwise 126 | starts serving the request. The reply message contains a `BV` value that is the previously 127 | calculated `BV` plus the amount recharged during the time spent serving. Note that since 128 | the server can always determine any cost up to `MaxCost` for a request (and a client 129 | should not assume otherwise), it can reject a message without processing it if received 130 | while `BV < MaxCost` because that's already a flow control breach. 131 | 132 | On the client side: 133 | 134 | The client always has a lowest estimate for its current `BV`, called `BLE`. It 135 | 136 | - sets `BLE` to `BL` received in [Status] 137 | - doesn't send any request to the server when `BLE < MaxCost` 138 | - deducts `MaxCost` when sending a request 139 | - recharges `BLE` at the rate of `MRR` when less than `BL` 140 | 141 | When a reply message with a new `BV` value is received, it sets `BLE` to `BV - 142 | Sum(MaxCost)`, summing the `MaxCost` values of requests sent after the one belonging to 143 | this reply. 144 | 145 | #### Buffer underrun 146 | 147 | Before **les/3** buffer underruns always resulted in immediate disconnection. Now it is 148 | possible and recommended to send a [StopMsg] instead and then a [ResumeMsg] when the 149 | buffer has been at least partially recharged. 
This allows clients to treat the buffer 150 | feedback as an optional performance optimization hint instead of a mandatory mechanism 151 | and allows simple implementations that do not care about the buffer at all. 152 | 153 | ## Request ID 154 | 155 | Every on-demand request message contains a `reqID` field, which is simply returned by the 156 | server in the corresponding reply message. This helps matching replies for requests on the 157 | client side so that each reply doesn't need to be matched against each pending request. 158 | 159 | ## Protocol Messages 160 | 161 | ### Status (0x00) 162 | 163 | `[[key_0, value_0], [key_1, value_1], ...]` 164 | 165 | Inform a peer of the sender's current LES state. This message should be sent just after 166 | the connection is established and prior to any other LES messages. The following keys 167 | are required (value types are noted after the key string): 168 | 169 | - `"protocolVersion"` `P`: is 1 for protocol version one. 170 | - `"networkId"` `P`: specifies the network ID of the chain, as in the [Ethereum Wire Protocol]. 171 | - `"headTd"` `P`: Total Difficulty of the best chain. Integer, as found in block header. 172 | - `"headHash"` `B_32`: the hash of the best (i.e. highest TD) known block. 173 | - `"headNum"` `P`: the number of the best (i.e. highest TD) known block. 174 | - `"genesisHash"` `B_32`: the hash of the Genesis block. 175 | - `"forkID"` `[crc32, nextFork: P]`: mandatory since **les/4**. 176 | The value identifies the chain/fork the node is operating on. 177 | - `"recentTxLookup"` `P`: announced by servers since **les/4**. Transaction status 178 | is served for transactions included in the N-1 most recent blocks (N=1 means that 179 | mined transactions are not served at all). N=0 means all transactions are available. 180 | 181 | There are several optional key/value pairs which can be set: 182 | 183 | - `"announceType"` `P`: set by clients, this field affects the [Announce] messages of the 184 | server. 
Allowed integer values are: 185 | 186 | - none (`0`): no [Announce] messages are sent, i.e. the client is not interested in announcements. 187 | - simple (`1`): Default. [Announce] messages use the **les/1** format. 188 | - signed (`2`): there is a `"sign"` key in the key/value list of [Announce] messages. The 189 | associated value is a signature of an RLP encoded `[headHash: B_32, headNumber: P, headTd: P]` 190 | structure by the server's node key. 191 | 192 | - `"serveHeaders"` (empty value): present if the peer can serve header chain downloads. 193 | 194 | - `"serveChainSince"` `P`: present if the peer can serve Body/Receipts ODR requests 195 | starting from the given block number. 196 | 197 | - `"serveRecentChain"` `P`: if present then the availability of chain data is only guaranteed 198 | for the given number of recent blocks. If the node serves chain data then `"serveChainSince"` 199 | should always be present while `"serveRecentChain"` is optional. Chain availability can 200 | be assumed for blocks with `blockNumber >= MAX(serveChainSince, headNumber-serveRecentChain+1)`. 201 | 202 | - `"serveStateSince"` `P`: present if the peer can serve Proof/Code ODR requests starting 203 | from the given block number. 204 | 205 | - `"serveRecentState"` `P`: if present then the availability of state data is only guaranteed 206 | for the given number of recent blocks. If the node serves state data then `"serveStateSince"` 207 | should always be present while `"serveRecentState"` is optional. State availability can 208 | be assumed for blocks with `blockNumber >= MAX(serveStateSince, headNumber-serveRecentState+1)`. 209 | 210 | - `"txRelay"` (no value): present if the peer can relay transactions to the ETH network. 211 | 212 | - `"flowControl/BL"`, `"flowControl/MRC"`, `"flowControl/MRR"`: see [Client Side Flow Control] 213 | 214 | Unknown keys should be ignored by both sides. 
This allows announcing additional 215 | capabilities while staying compatible with past protocol versions. 216 | 217 | ### Announce (0x01) 218 | 219 | `[headHash: B_32, headNumber: P, headTd: P, reorgDepth: P, [[key_0, value_0], [key_1, value_1], ...]]` 220 | 221 | Announce a new chain head and optionally also a change to some of the values announced at 222 | handshake. A restrictive change of server capabilities (for example, an increase of 223 | `"serveStateSince"` due to state pruning) should be announced at least 10 seconds prior to 224 | actually restricting those capabilities in order to avoid asynchronous problems. Changes 225 | to unknown keys should be ignored. Changes to known keys that make no sense lead to 226 | disconnection. 227 | 228 | Announcing a head with a lower or equal TD than previously announced or a head that the 229 | sending node later refuses to honor with a subsequent [GetBlockHeaders] message (with 230 | number and TD also matching) is considered bad form, and may lead to disconnection or 231 | reduce the reputation of the sending node. 232 | 233 | The field `reorgDepth` contains the number of blocks to be rolled back from the last head 234 | announced by the same node in order to find the last common ancestor of the last and 235 | current heaviest chain. Adding this field helps the client to minimize the number of 236 | requests and the amount of bandwidth required to fetch new headers. 237 | 238 | ### GetBlockHeaders (0x02) 239 | 240 | `[reqID: P, [block: {P, B_32}, maxHeaders: P, skip: P, reverse: P in {0, 1}]]` 241 | 242 | Require peer to return a [BlockHeaders] message. Reply must contain a number of block 243 | headers, of rising number when `reverse` is `0`, falling when `1`, `skip` blocks apart, 244 | beginning at block `block` (denoted by either number or hash) in the canonical chain, and 245 | with at most `maxHeaders` items. 
246 | 247 | ### BlockHeaders (0x03) 248 | 249 | `[reqID: P, BV: P, [blockHeader_0, blockHeader_1, ...]]` 250 | 251 | Reply to [GetBlockHeaders]. The items in the list (following the message ID) are block 252 | headers in the format described in the main Ethereum specification, previously asked for 253 | in a [GetBlockHeaders] message. The list may be empty if none of the requested block 254 | headers were available on the server side. 255 | 256 | ### GetBlockBodies (0x04) 257 | 258 | `[reqID: P, [hash_0: B_32, hash_1: B_32, ...]]` 259 | 260 | Require peer to return a [BlockBodies] message. Specify the set of blocks that we're 261 | interested in with the hashes. 262 | 263 | ### BlockBodies (0x05) 264 | 265 | `[reqID: P, BV: P, [[transactions_0, uncles_0] , ...]]` 266 | 267 | Reply to [GetBlockBodies]. The items in the list (following the message ID) are some of 268 | the blocks, minus the header, in the format described in the main Ethereum specification, 269 | previously asked for in a [GetBlockBodies] message. 270 | 271 | ### GetReceipts (0x06) 272 | 273 | `[reqID: P, [hash_0: B_32, hash_1: B_32, ...]]` 274 | 275 | Require peer to return a [Receipts] message. 276 | 277 | ### Receipts (0x07) 278 | 279 | `[reqID: P, BV: P, [[receipt_0, receipt_1, ...], ...]]` 280 | 281 | Provide a set of receipts which correspond to the block hashes previously asked for in 282 | [GetReceipts]. 283 | 284 | ### GetProofs (0x08) 285 | 286 | `[reqID: P, [[blockhash: B_32, key: B_32, key2: B_32, fromLevel: P], ...]]` 287 | 288 | Require peer to return a [Proofs] message, containing one or more Merkle proofs, each 289 | proving the value of index `key` from the state trie of the given block (if `key2` is 290 | empty), or the storage value of index `key2` from the storage trie referenced in the 291 | account at `key`. If `fromLevel` is greater than zero, the given number of trie nodes 292 | closest to the root can be omitted from the proof. 
293 | 294 | This message was deprecated in **les/2**, use [GetProofsV2] instead. 295 | 296 | ### Proofs (0x09) 297 | 298 | `[reqID: P, BV: P, [[node_1, node_2, ...], ...]]` 299 | 300 | Return a set of Merkle proofs, each consisting of a set of nodes that must be processed in 301 | order to access the trie entry value (or prove the absence of an entry) requested in 302 | [GetProofs]. 303 | 304 | ### GetContractCodes (0x0a) 305 | 306 | `[reqID: P, [[blockhash: B_32, key: B_32], ...]]` 307 | 308 | Require peer to return a [ContractCodes] message. 309 | 310 | ### ContractCodes (0x0b) 311 | 312 | `[reqID: P, BV: P, [value_0: B, value_1: B, ...]]` 313 | 314 | Provide a set of contract codes which correspond to the block hashes and account keys 315 | previously asked in [GetContractCodes]. 316 | 317 | ### GetHeaderProofs (0x0d) 318 | 319 | `[reqID: P, [[chtNumber: P, blockNumber: P, fromLevel: P], ...]]` 320 | 321 | Require peer to return a [HeaderProofs] message, containing one or more canonical block 322 | headers (of block number `blockNumber`) and corresponding Merkle proofs of the [CHT] 323 | (Canonical Hash Trie) identified by `chtNumber`. If `fromLevel` is greater than zero, the 324 | given number of trie nodes closest to the root can be omitted from the proof. 325 | 326 | This message was deprecated in **les/2**, use [GetHelperTrieProofs] instead. 327 | 328 | ### HeaderProofs (0x0e) 329 | 330 | `[reqID: P, BV: P, [[blockHeader, [node_1, node_2...]], ...]]` 331 | 332 | Return a set of structures, each containing a block header and a Merkle proof proving the 333 | header hash and belonging TD against a given CHT requested in [GetHeaderProofs]. 334 | 335 | ### SendTx (0x0c) 336 | 337 | `[txdata_1, txdata_2, ...]` 338 | 339 | Require peer to add a set of transactions into its transaction pool and relay them to the 340 | ETH network. 341 | 342 | This message was deprecated in **les/2**, use [SendTxV2] instead. 
343 | 344 | ### GetProofsV2 (0x0f) 345 | 346 | `[reqID: P, [[blockhash: B_32, key: B_32, key2: B_32, fromLevel: P], ...]]` 347 | 348 | Require peer to return a [ProofsV2] message, containing a single (and smallest possible) 349 | set of trie nodes that proves for each request the value of index `key` from the state 350 | trie of the given block (if `key2` is empty), or the storage value of index `key2` from 351 | the storage trie referenced in the account at `key`. If `fromLevel` is greater than zero, 352 | the given number of trie nodes closest to the root can be omitted from the proof. 353 | 354 | ### ProofsV2 (0x10) 355 | 356 | `[reqID: P, BV: P, [node_1, node_2, ...]]` 357 | 358 | Return the smallest set of trie nodes required to access the trie entry value (or prove 359 | the absence of an entry) requested in [GetProofsV2]. This set will be called a *proof 360 | set*. Compared to [Proofs], this message contains a single list of nodes satisfying all 361 | requested proofs. The list shouldn't contain duplicate nodes. 362 | 363 | ### GetHelperTrieProofs (0x11) 364 | 365 | `[reqID: P, [[subType: P, sectionIdx: P, key: B, fromLevel: P, auxReq: P], ...]]` 366 | 367 | Require peer to return a [HelperTrieProofs] message, containing a *proof set* and optional 368 | auxiliary data for each request. 369 | 370 | Note: this request is a generalization of the **les/1** [GetHeaderProofs] message. It 371 | retrieves Merkle proofs from different types of "helper tries" which are generated for 372 | every fixed-length section of the canonical chain. `subType` identifies the helper trie 373 | that is being requested for the section marked by `sectionIdx`. `key` and `fromLevel` are 374 | interpreted like in case of proof requests. 375 | 376 | If `auxReq` is greater than zero then auxiliary data is requested too. If `auxReq` is 1 377 | then the root hash of the specified trie (according to the server) is returned and no trie 378 | nodes are added to the proof set. 
This special request will be required for trustless 379 | validation of helper tries. The interpretation of `auxReq` values greater than 1 is 380 | subject to `subType`. 381 | 382 | The following `subType` integer values are allowed in **les/2**: 383 | 384 | - CHT (`0`): request a key from the [Canonical Hash Trie]. If `auxReq` is 2 then the 385 | belonging header is returned as `auxData`. `key` is the block number encoded as an 386 | 8-byte big endian. Note that the section size for CHTs has been raised to 32k instead of 387 | 4k blocks so for example a `sectionIdx` of 100 equals a `chtNumber` of 807 in case of 388 | the **les/1** [GetHeaderProofs] message. 389 | - BloomBits (`1`): request a key from the [BloomBits Trie]. In this trie `key` is 10 bytes 390 | long, it consists of the bloom bit index encoded as a 2-byte big endian, followed by the 391 | section index encoded as an 8-byte big endian. The returned value is the corresponding 392 | compressed bloom bit vector. 393 | 394 | ### HelperTrieProofs (0x12) 395 | 396 | `[reqID: P, BV: P, [[node_1, node_2...], [auxData_0, auxData_1, ...]]]` 397 | 398 | Return a proof set and a set of `auxData` requested in [GetHelperTrieProofs]. The length 399 | of the `auxData` list equals the number of requests with a non-zero `auxReq`. 400 | 401 | ### SendTxV2 (0x13) 402 | 403 | `[reqID: P, [txdata_1, txdata_2, ...]]` 404 | 405 | Require peer to add a set of transactions into its transaction pool and relay them to the 406 | ETH network, then return a [TxStatus] message containing the status of the sent 407 | transactions. 408 | 409 | ### GetTxStatus (0x14) 410 | 411 | `[reqID: P, [txHash_1, txHash_2, ...]]` 412 | 413 | Require peer to return a [TxStatus] message containing the status of the referenced 414 | transactions. This message is intended for inquiry about past transactions sent by the 415 | client. Note that the server is not required to make every transaction available 416 | indefinitely. 
417 | 418 | ### TxStatus (0x15) 419 | 420 | `[reqID: P, BV: P, [[status: P, data: B], ...]]` 421 | 422 | Return the current status of the sent/queried transactions. Possible `status` values are: 423 | 424 | - Unknown (`0`): transaction is unknown 425 | - Queued (`1`): transaction is queued (not processable yet) 426 | - Pending (`2`): transaction is pending (processable) 427 | - Included (`3`): transaction is already included in the canonical chain. `data` contains 428 | an RLP-encoded `[blockHash: B_32, blockNumber: P, txIndex: P]` structure. 429 | - Error (`4`): transaction sending failed. `data` contains a text error message. 430 | 431 | ### StopMsg (0x16) 432 | 433 | Instruct the client to temporarily stop sending requests and to not expect responses to those requests it did not already receive a reply for. 434 | 435 | Implementer's note: this message can be used to handle transient server overloads or individual client flow control buffer underruns. The server should avoid sending [StopMsg] too often though if the client also avoids buffer underruns. It should try to regulate its own utilization (and thereby also the frequency of transient overload occurrences) with the flow control feedback. Receiving [StopMsg] more than once every few minutes in long term average or not receiving [ResumeMsg] in a few seconds can be considered bad service quality by the clients. 436 | 437 | ### ResumeMsg (0x17) 438 | 439 | `[BV: P]` 440 | 441 | Update flow control buffer and allow sending requests again. Note that the requests not answered before [StopMsg] were permanently canceled and will not be answered after [ResumeMsg]. If a [ResumeMsg] is received without a preceding [StopMsg] then it should be treated as a simple flow control buffer update (assuming that the server has already deducted the cost of the previously answered messages). 
442 | 443 | ## Change Log 444 | 445 | ### les/4 (March 2021) 446 | 447 | - Keys `"forkID"` and `"recentTxLookup"` were added to the [Status] message. 448 | 449 | ### les/3 (May 2019) 450 | 451 | - Keys `"serveRecentChain"` and `"serveRecentState"` were added to the [Status] message. 452 | - Messages [StopMsg] and [ResumeMsg] were added to improve handling transient overloads 453 | and flow control buffer underruns. 454 | 455 | ### les/2 (November 2017) 456 | 457 | - The `"announceType"` key was added to the [Status] message. 458 | - The BloomBits Trie and associated messages [GetHelperTrieProofs], [HelperTrieProofs] 459 | were added to facilitate server-assisted log search. **les/1** clients would frequently 460 | download large ranges of receipts to search for specific logs. 461 | - Messages [GetProofsV2], [ProofsV2] were added to de-duplicate result nodes when 462 | requesting multiple proofs at the same time. 463 | - Messages [SendTxV2], [GetTxStatus] and [TxStatus] were added to allow querying for past 464 | transactions and to enable user-level error reporting for non-includable transactions at 465 | the time of submission. 466 | - The [GetHeaderProofs], [HeaderProofs], [GetProofs], [Proofs] and [SendTx] messages from 467 | **les/1** are no longer supported in **les/2**. 
468 | 469 | [Client Side Flow Control]: #client-side-flow-control 470 | [Canonical Hash Trie]: #canonical-hash-trie 471 | [CHT]: #canonical-hash-trie 472 | [BloomBits Trie]: #bloombits-trie 473 | [Status]: #status-0x00 474 | [Announce]: #announce-0x01 475 | [GetBlockHeaders]: #getblockheaders-0x02 476 | [BlockHeaders]: #blockheaders-0x03 477 | [GetBlockBodies]: #getblockbodies-0x04 478 | [BlockBodies]: #blockbodies-0x05 479 | [GetReceipts]: #getreceipts-0x06 480 | [Receipts]: #receipts-0x07 481 | [GetProofs]: #getproofs-0x08 482 | [Proofs]: #proofs-0x09 483 | [GetContractCodes]: #getcontractcodes-0x0a 484 | [ContractCodes]: #contractcodes-0x0b 485 | [GetHeaderProofs]: #getheaderproofs-0x0d 486 | [HeaderProofs]: #headerproofs-0x0e 487 | [SendTx]: #sendtx-0x0c 488 | [GetProofsV2]: #getproofsv2-0x0f 489 | [ProofsV2]: #proofsv2-0x10 490 | [GetHelperTrieProofs]: #gethelpertrieproofs-0x11 491 | [HelperTrieProofs]: #helpertrieproofs-0x12 492 | [SendTxV2]: #sendtxv2-0x13 493 | [GetTxStatus]: #gettxstatus-0x14 494 | [TxStatus]: #txstatus-0x15 495 | [StopMsg]: #stopmsg-0x16 496 | [ResumeMsg]: #resumemsg-0x17 497 | [Ethereum Wire Protocol]: ./eth.md 498 | [Merkle Patricia Trie]: https://github.com/ethereum/wiki/wiki/Patricia-Tree 499 | -------------------------------------------------------------------------------- /discv5/discv5-rationale.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 - Rationale 2 | 3 | **Protocol version v5.1** 4 | 5 | Note that this specification is a work in progress and may change incompatibly without 6 | prior notice. 7 | 8 | This document explains the design requirements and security needs of Discovery v5. In 9 | addition, the document tries to gather the various vulnerabilities and threats that 10 | pertain to Kademlia-like p2p networks. 
Our aim is to make it plain which issues are 11 | addressed and how they are mitigated, so that the design of the [wire protocol] may be 12 | verified. 13 | 14 | # Design Requirements 15 | 16 | ## Basic Goals 17 | 18 | #### 1.1.1 Replace the Discovery v4 Endpoint Proof 19 | 20 | The existing mutual endpoint verification process is unreliable because either side may 21 | forget about a previously performed endpoint proof. If node A assumes that node B already 22 | knows about a recent PING/PONG interaction and sends FINDNODE, the request may fail. 23 | Implementations of Discovery v4 may guard against this flaw using retries, but retrying is 24 | really slow and usually not done. 25 | 26 | #### 1.1.2 Require knowledge of destination node ID for communication 27 | 28 | Make it expensive to obtain the logical node ID from discovery communications. In 29 | Discovery v4, any node can provoke responses knowing IP alone, and obtain information 30 | about a node without knowing its ID. This encourages sloppy implementations to not perform 31 | proper validation of FINDNODE results and increases the risk of DHT misuse for DDoS 32 | purposes. 33 | 34 | #### 1.1.3 Support more than one node ID cryptosystem 35 | 36 | Ensure the DHT can accommodate ENR's with multiple identity systems. This will allow 37 | identity cryptosystems other than *secp256k1/keccak256*. 38 | 39 | #### 1.1.4 Replace node information tuples with ENRs 40 | 41 | ENRs include discovery information and more. These signed, versioned records fulfill 42 | multiple requirements, such as permitting capability advertisement and transport 43 | negotiation. 44 | 45 | #### 1.1.5 Guard against Kademlia implementation flaws 46 | 47 | Discovery v4 trusts other nodes to return neighbors according to an agreed distance 48 | metric. Mismatches in implementation can make it hard for nodes to join the network, or 49 | lead to network fragmentation. 
50 | 51 | #### 1.1.6 Secondary topic-based node index 52 | 53 | The protocol must support discovery of nodes via an arbitrary topic identifier. Finding 54 | nodes belonging to a topic should be as fast or faster than finding a node with a certain 55 | ID. 56 | 57 | #### 1.1.7 Change replay prevention 58 | 59 | The use of timestamps as a replay prevention mechanism in Discovery v4 has led to many 60 | complaints about connectivity when the host's clock was wrong. The protocol should be 61 | independent of the clock. 62 | 63 | #### 1.1.8 Message obfuscation 64 | 65 | The protocol should obfuscate traffic to prevent accidental packet mangling or trivial 66 | sniffing. It must also avoid inclusion of obvious markers to prevent naive blocking of 67 | discovery traffic using hard-coded packet signatures. Defense against advanced traffic 68 | analysis systems, e.g. using inter-packet timing is a secondary concern. 69 | 70 | ## Security Goals 71 | 72 | Individual potential vulnerabilities are identified below. These each represent their own 73 | risk mitigation goal. 74 | 75 | #### 1.2.1 Replay of the handshake 76 | 77 | The handshake, if successfully replayed from an older session, would allow a malicious 78 | node to occupy a former IP location, or pollute the routing table with old information. 79 | 80 | #### 1.2.2 Replay NODES 81 | 82 | A NODES response, if successfully replayed, would pollute the routing table with stale 83 | information. 84 | 85 | #### 1.2.3 Replay PONG 86 | 87 | A PONG, if successfully replayed, could convince a node that a node is live and 88 | participating when it isn't. 89 | 90 | #### 1.2.4 Kademlia redirection 91 | 92 | A FindNode response contains false endpoint information intended at directing traffic at a 93 | victim / polluting the routing table. A topic query results in fake endpoint information, 94 | directing traffic at a victim. 
95 | 96 | #### 1.2.5 Kademlia redirection + self-propagation 97 | 98 | As 1.2.4 but the responses attempt to replicate the malicious node throughout the routing 99 | table, to amplify the source of pollution and traffic. 100 | 101 | #### 1.2.6 Unsolicited replies 102 | 103 | A malicious node is attempting to spam a node with fake responses to typical requests. 104 | These messages may be replayed from previous communications, or may be new messages with 105 | spoofed source endpoints. The aim is to disrupt weak implementations or have their 106 | information be received as authentic, to pollute the recipient's routing table. 107 | 108 | #### 1.2.7 Amplification 109 | 110 | Malicious requests of small message size are sent from spoofed source IPs to direct larger 111 | response messages at the victim. 112 | 113 | #### 1.2.8 Kademlia direct validation 114 | 115 | Direct validation of a newly discovered node can be an attack vector. A malicious node may 116 | supply false node information with the IP of a victim. Validation traffic is then directed 117 | at the victim. 118 | 119 | #### 1.2.9 Kademlia ID count per address validations 120 | 121 | There are various attacks facilitated by being able to associate multiple fake (or even 122 | real) malicious node ids with a single IP endpoint. One mitigation method that is 123 | sometimes considered is to globally limit the number of logical node IDs that can be 124 | associated with an IP address. However, this is an attack vector. A malicious actor can 125 | supply many logical node ids for a single IP address and thus prevent the correct node 126 | from being able to join the network. 127 | 128 | #### 1.2.10 Sybil/Eclipse attacks 129 | 130 | These attacks rely on being able to create many real nodes, or spoof many logical node IDs 131 | for a small number of physical endpoints, to form a large, isolated area of the network 132 | under the control of the malicious actor. 
The victim's discovery findings are directed 133 | into that part of the network, either to manipulate their traffic or to fully isolate them 134 | from the network. 135 | 136 | ## Version Interoperability / Upgrade Paths 137 | 138 | There are several considerations regarding the coexistence of v4 and v5 network members. 139 | 140 | #### 1.3.1 Transition period during network formation 141 | 142 | Discovery v4 clients should be able to serve as discovery v5 bootstrap nodes while the 143 | number of new discovery v5 clients is still low. 144 | 145 | #### 1.3.2 Circumvention of 1.1.2 with v4 PING 146 | 147 | While a client supports both the old v4 and newer versions, it is possible for malicious 148 | actors to pose as a v4 node and recover node IDs from arbitrary IP addresses. This should 149 | somehow be avoided. 150 | 151 | # Rationale 152 | 153 | ## Why UDP? 154 | 155 | The wire protocol specification mandates the use of UDP. This may seem restrictive, but 156 | use of UDP communication is an important part of the design. While there is no single 157 | reason which ultimately dictates this choice, there are many reasons why the system as a 158 | whole will function a lot better in the context of UDP. 159 | 160 | For discovery to work, all nodes must be able to communicate with each other on equal 161 | footing. The network won't form properly if some nodes can only communicate with certain 162 | other nodes. Uncooperative NAT in between the node and the Internet can cause 163 | communication failure. UDP is fundamentally easier to work with when it comes to NAT 164 | traversal. No explicit hole-punching is required if the NAT setup is capable of full-cone 165 | translation, i.e. a single packet sent to any other node establishes a port mapping which 166 | allows packets from others to reach the node behind NAT. 167 | 168 | Unlike other DHT systems such as IPFS, the node discovery protocol mandates a single wire 169 | protocol to be implemented by everyone. 
This avoids communication failures due to 170 | incompatible transports and strengthens the DHT because all participants are guaranteed to 171 | be reachable on the declared endpoint. It is also fundamentally simpler to reason about 172 | and implement: the protocol either works in a certain context or it doesn't. If the 173 | protocol cannot be used because the networking environment doesn't support UDP, another 174 | discovery mechanism must be chosen. 175 | 176 | Another reason for UDP is communication latency: participants in the discovery protocol 177 | must be able to communicate with a large number of other nodes within a short time frame 178 | to establish and maintain the neighbor set and must perform regular liveness checks on 179 | their neighbors. For the topic advertisement system, registrants collect tickets and must 180 | use them as soon as the ticket expires to place an ad in a topic queue. 181 | 182 | These protocol interactions are difficult to implement in a TCP setting where connections 183 | require multiple round-trips before application data can be sent and the connection 184 | lifecycle needs to be maintained. An implementation of the wire protocol on a TCP-based 185 | transport would either need permanent connection to hundreds of nodes, in which case the 186 | application would be short on file descriptors, or establish many short-lived TCP 187 | connections per second to communicate with specific nodes. 188 | 189 | Yet another useful property of UDP is that packets aren't required to reach their 190 | destination --- intermediaries may drop arbitrary packets. This strengthens the protocol 191 | because it must be designed to function even under bad connectivity. Implementations may 192 | exploit the possibility of packet loss to their advantage. A participant can never tell 193 | whether a certain request wasn't answered in time because the recipient chose to ignore it 194 | or because their own connection isn't working. 
An implementation that tries to minimize 195 | traffic or CPU overhead could simply drop a certain amount of packets at application level 196 | to stay within self-imposed limits. 197 | 198 | ## Why Kademlia? 199 | 200 | Kademlia is a simple distributed hash table design proposed in 2002. It is commonly used 201 | for file-sharing systems where content is stored by hash and distributed among 202 | participants based on their 'proximity' according to the XOR distance metric. 203 | 204 | Node discovery is a Kademlia-inspired system but doesn't store any files, only node 205 | information is relayed. We chose Kademlia primarily because the algorithm is simple and 206 | understandable while providing a distributed database that scales with the number of 207 | participants. Our system also relies on the routing table to allow enumeration and random 208 | traversal of the whole network, i.e. all participants can be found. Most importantly, 209 | having a structured network with routing enables thinking about DHT 'address space' and 210 | 'regions of address space'. These concepts are used to build the [topic-based node index]. 211 | 212 | Kademlia is often criticized as a naive design with obvious weaknesses. We believe that 213 | most issues with simple Kademlia can be overcome by careful programming and the benefits 214 | of a simple design outweigh the cost and risks of maintaining a more complex system. 215 | 216 | ## Sybil and Eclipse Attacks 217 | 218 | The well-known 'sybil attack' is based on the observation that creating node identities is 219 | essentially free. In any system using a measure of proximity among node identities, an 220 | adversary may place nodes close to a chosen node by generating suitable identities. For 221 | basic node discovery through network enumeration, the 'sybil attack' poses no significant 222 | challenge. 
Sybils are a serious issue for the topic-based node index, especially for 223 | topics provided by few participants, because the index relies on node distance. 224 | 225 | An 'eclipse attack' is usually based on generating sybil nodes with the goal of polluting 226 | the victim node's routing table. Once the table is overtaken, the victim has no way to 227 | find any other nodes but those controlled by the adversary. Even if creating sybil nodes 228 | were somehow impossible, 'eclipsing' a node might still be achieved through other means 229 | such as directing large amounts of traffic to the node. When the victim node is unable to 230 | keep up regular communication with the rest of the network it may lose connection and be 231 | forced into re-bootstrapping its routing table --- a situation in which it is most 232 | vulnerable. 233 | 234 | Both the 'sybil attack' and the 'eclipse attack' must be considered for any structured 235 | overlay network, and there is no single optimal solution to fully protect against these 236 | attacks. However, certain implementation decisions can make them more expensive or render 237 | them ineffective. 238 | 239 | As a general measure, implementations can place IP-based limits on the content of their 240 | routing table. For example, limiting Kademlia table buckets to two nodes from every /24 IP 241 | subnetwork and the whole table to 10 nodes per /24 IP subnetwork significantly increases 242 | the number of hosts an attacker must control to overtake the routing table. Such limits 243 | are effective because IPv4 addresses are a scarce resource. Subnetwork-based limits remain 244 | effective even as IPv6 adoption progresses. 245 | 246 | To counter being eclipsed via repeated contact by an adversary, implementations of the 247 | Kademlia table should avoid taking on new members on incoming contact unless the table is 248 | well-stocked from outbound queries. 
Readers of the original Kademlia paper may easily 249 | assume that liveness checks on bucket members should be performed just when a new node 250 | tries to enter the bucket, but doing so increases the risk of emptying the table through 251 | DoS. We therefore recommend to perform liveness checks on a separate schedule which is 252 | independent of incoming requests. Checks may also be paused or delayed when the node is 253 | under high load. The number of past liveness checks performed on a bucket member is an 254 | important indicator of its age: Implementations should favor long-lived nodes and may 255 | relax liveness checks according to node age. 256 | 257 | A well-researched countermeasure to sybil attacks is to make creation of identities 258 | computationally expensive. While effective in theory, there are significant downsides to 259 | this approach. Nodes on resource-constrained devices such as mobile phones may not be able 260 | to solve the computational puzzle in time to join the network. Continuous advances in 261 | hashing technology which speed up cryptocurrency proof-of-work algorithms show that this 262 | way of securing the network requires constant adjustments to thresholds and can never beat 263 | determined attackers. 264 | 265 | Support for mixed ENR identity schemes, described later in this document, allows for an 266 | escape hatch to introduce arbitrary optional constraints (including proof-of-work) on node 267 | identities. Thus, while the issue is not directly addressed at wire protocol level, there 268 | is no inherent blocker for solving it as the need arises. 269 | 270 | ## Node Records and Their Properties 271 | 272 | In Discovery v5, all node information is exchanged using [node records]. Records are 273 | self-signed by the node they describe and contain arbitrary key-value pairs. They also 274 | contain a sequence number to determine which copy of the record is newer when multiple 275 | copies are available. 
When a node record is changed by its owner, the sequence number 276 | increases. The new record 'syncs' to neighboring nodes because they will request it during 277 | liveness revalidation. The record is also 'pushed' on to newly seen nodes as part of the 278 | handshake. 279 | 280 | Signing records prevents any intermediary node from changing the content of a record. Any 281 | node's information is either available in the exact form it was published or not at all. 282 | To make the system secure, proper validation of records is important. Implementations must 283 | verify the signature of all received records. Implementations should also avoid sharing 284 | records containing no usable IP addresses or ports and check that Internet hosts do not 285 | attempt to share records containing LAN IP addresses. 286 | 287 | ## On Encryption 288 | 289 | An early draft of Discovery v5 integrated weak obfuscation based on XORing packet content 290 | as an optional facility. As development of the protocol progressed, we understood that 291 | traffic amplification, replay and packet authentication could all be solved by introducing 292 | a real encryption scheme. The way the handshake and encryption works is primarily aimed at 293 | these issues and is not supposed to ensure complete anonymity of DHT users. While it does 294 | protect against passive observers, the handshake is not forward-secure and active protocol 295 | participants can access node information by simply asking for it. 296 | 297 | Node identities can use different kinds of keys depending on the identity scheme used in 298 | the node record. This has implications on the handshake because it deals with the public 299 | key used to derive the identity. Implementations of Discovery v5 must agree on the set of 300 | supported identity schemes to keep the network interoperable and custom code to verify the 301 | handshake is required for every new scheme. 
We believe this is an acceptable tradeoff 302 | because introducing a new kind of node identity is a rare event. 303 | 304 | Since the handshake performs complex cryptographic operations (ECDH, signature 305 | verification) performance of the handshake is a big concern. Benchmarking the experimental 306 | Go implementation shows that the handshake computation takes 500µs on a 2014-era laptop 307 | using the default secp256k1/keccak256 identity scheme. That's a lot, but note the cost 308 | amortizes because nodes commonly exchange multiple packets. Subsequent packets in the same 309 | conversation can be decrypted and authenticated in just 2µs. The most common protocol 310 | interaction is a FINDNODE or TOPICQUERY request on an unknown node with 4 NODES responses. 311 | 312 | To put things into perspective: encryption and authentication in Discovery v5 is still a 313 | significant improvement over the authentication scheme used in Discovery v4, which 314 | performs secp256k1 signature 'recovery' (benchmark: ~170µs) on every packet. A FINDNODE 315 | interaction with an unknown v4 node takes 7 packets (2x PING/PONG, FINDNODE, 2x NEIGHBORS) 316 | and costs 1.2ms on each side for the crypto alone. In addition, the v5 handshake reduces 317 | the risk of computational DoS because it costs as much to create as it costs to verify and 318 | cannot be replayed. 319 | 320 | ## On Amplification and Replay 321 | 322 | Any openly accessible packet-based system must consider misuse of the protocol for traffic 323 | amplification purposes. There are two possible avenues of attack: In the first, an 324 | adversary who wishes to attack a third-party host may send packets with 'spoofed' source 325 | IP address to a node, attempting to make the node send a larger response to the victim 326 | endpoint. In the second, the adversary attempts to install a node record containing the 327 | victim's endpoint in the DHT, causing other nodes to direct packets to the victim. 
328 | 329 | The handshake handles the first kind of attack by responding with a small WHOAREYOU packet 330 | whenever any request is received from an unknown endpoint. This is safe because the 331 | adversary's packet is always larger than the WHOAREYOU response, removing the incentive 332 | for the attack. To make the countermeasure work, implementations must keep session secrets 333 | not just per node ID, but also per node IP. 334 | 335 | The second kind of attack --- installing the victim as a node --- is handled by requiring 336 | that implementations mustn't answer queries with nodes whose liveness hasn't been 337 | verified. When a node is added to the Kademlia table, it must pass at least one check on 338 | the IP declared in the node record before it can be returned in a NODES response. 339 | 340 | An adversary may also try to replay previously sent/seen packets to impersonate a node or 341 | disturb the operation of the protocol. Session keys per node-ID/IP generally prevent 342 | replay across sessions. The `request-id`, mirrored in response packets, prevents replay of 343 | responses within a session. 344 | 345 | ## The Topic Index 346 | 347 | Using FINDNODE queries with appropriately chosen targets, the entire DHT can be sampled by 348 | a random walk to find all other participants. When building a distributed application, it 349 | is often desirable to restrict the search to participants which provide a certain service. 350 | A simple solution to this problem would be to simply split up the network and require 351 | participation in many smaller application-specific networks. However, such networks are 352 | hard to bootstrap and also more vulnerable to attacks which could isolate nodes. 353 | 354 | The topic index provides discovery by provided service in a different way. Nodes maintain 355 | a single node table tracking their neighbors and advertise 'topics' on nodes found by 356 | randomly walking the DHT.
While the 'global' topic index can also be spammed, it makes 357 | complete isolation a lot harder. To prevent nodes interested in a certain topic from 358 | finding each other, the entire discovery network would have to be overpowered. 359 | 360 | To make the index useful, searching for nodes by topic must be efficient regardless of the 361 | number of advertisers. This is achieved by estimating the topic 'radius', i.e. the 362 | percentage of all live nodes which are advertising the topic. Advertisement and search 363 | activities are restricted to a region of DHT address space around the topic's 'center'. 364 | 365 | We also want the index to satisfy another property: When a topic advertisement is placed, 366 | it should last for a well-defined amount of time. This ensures nodes may rely on their 367 | advertisements staying placed rather than worrying about keeping them alive. 368 | 369 | Finally, the index should consume limited resources. Just as the node table is limited in 370 | number and size of buckets, the size of the index data structure on each node is limited. 371 | 372 | ### Why should advertisers wait? 373 | 374 | Advertisers must wait a certain amount of time before they can be registered. Enforcing 375 | this time limit prevents misuse of the topic index because any topic must be important 376 | enough to outweigh the cost of waiting. Imagine a group phone call: announcing the 377 | participants of the call using topic advertisement isn't a good use of the system because 378 | the topic exists only for a short time and will have very few participants. The waiting 379 | time prevents using the index for this purpose because the call might already be over 380 | before everyone could get registered. 381 | 382 | ### Dealing with Topic Spam 383 | 384 | Our model is based on the following assumptions: 385 | 386 | - Anyone can place their own advertisements under any topics and the rate of placing ads 387 | is not limited globally.
The number of active ads for any node is roughly proportional 388 | to the resources (network bandwidth, mostly) spent on advertising. 389 | - Honest actors whose purpose is to connect to other honest actors will spend an adequate 390 | amount of efforts on registering and searching for ads, depending on the rate of newly 391 | established connections they are targeting. If the given topic is used only by honest 392 | actors, a few registrations per minute will be satisfactory, regardless of the size of 393 | the subnetwork. 394 | - Dishonest actors may want to place an excessive amount of ads just to disrupt the 395 | discovery service. This will reduce the effectiveness of honest registration efforts by 396 | increasing the topic radius and/or topic queue waiting times. If the attacker(s) can 397 | place a comparable amount or more ads than all honest actors combined then the rate of 398 | new (useful) connections established throughout the network will reduce proportionally 399 | to the `honest / (dishonest + honest)` registration rates. 400 | 401 | This adverse effect can be countered by honest actors increasing their registration and 402 | search efforts. Fortunately, the rate of established connections between them will 403 | increase proportionally both with increased honest registration and search efforts. If 404 | both are increased in response to an attack, the required factor of increased efforts from 405 | honest actors is proportional to the square root of the attacker's efforts. 406 | 407 | ### Detecting a useless registration attack 408 | 409 | In the case of a symmetrical protocol, where nodes are both searching and advertising 410 | under the same topic, it is easy to detect when most of the found ads turn out to be 411 | useless and increase both registration and query frequency. 
It is a bit harder but still 412 | possible with asymmetrical (client-server) protocols, where only clients can easily detect 413 | useless registrations, while advertisers (servers) do not have a direct way of detecting 414 | when they should increase their advertising efforts. One possible solution is for servers 415 | to also act as clients just to test the server capabilities of other advertisers. It is 416 | also possible to implement a feedback system between trusted clients and servers. 417 | 418 | # References 419 | 420 | - Petar Maymounkov and David Mazières. 421 | *Kademlia: A Peer-to-peer Information System Based on the XOR Metric.* 2002.\ 422 | 423 | 424 | - Atul Singh, Tsuen-Wan “Johnny” Ngan, Peter Druschel, Dan S. Wallach. 425 | *Eclipse Attacks on Overlay Networks: Threats and Defenses*. 2006.\ 426 | 427 | 428 | - Ingmar Baumgart and Sebastian Mies. 429 | *S/Kademlia: A Practicable Approach Towards Secure Key-Based Routing.* 2007.\ 430 | 431 | 432 | - Xin Sun, Ruben Torres and Sanjay Rao. *Feasibility of DDoS Attacks with P2P Systems and 433 | Prevention through Robust Membership Management.* 2007.\ 434 | 435 | 436 | - Erik Hjelmvik, Wolfgang John. *Breaking and Improving Protocol Obfuscation.* 2010.\ 437 | 438 | 439 | - Adam Langley, Wan-Teh Chang. *QUIC Crypto*. 2016.\ 440 | 441 | 442 | - W3C Credentials Community Group. *Decentralized Identifiers (DIDs) Spec.* 2017.\ 443 | 444 | 445 | - Seoung Kyun Kim, Zane Ma, Siddharth Murali, Joshua Mason, Andrew Miller, Michael Bailey. 446 | *Measuring Ethereum Network Peers*. 2018.\ 447 | 448 | 449 | - Yuval Marcus, Ethan Heilman, Sharon Goldberg.
450 | *Low-Resource Eclipse Attacks on Ethereum’s Peer-to-Peer Network.* 2018.\ 451 | 452 | 453 | [wire protocol]: ./discv5-wire.md 454 | [topic-based node index]: ./discv5-theory.md#topic-advertisement 455 | [node records]: ../enr.md 456 | -------------------------------------------------------------------------------- /discv5/discv5-theory.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 - Theory 2 | 3 | **Protocol version v5.1** 4 | 5 | This document explains the algorithms and data structures used by the protocol. 6 | 7 | ## Nodes, Records and Distances 8 | 9 | A participant in the Node Discovery Protocol is represented by a 'node record' as defined 10 | in [EIP-778]. The node record keeps arbitrary information about the node. For the purposes 11 | of this protocol, the node must at least provide an IP address (`"ip"` or `"ip6"` key) and 12 | UDP port (`"udp"` key) in order to have its record relayed in the DHT. 13 | 14 | Node records are signed according to an 'identity scheme'. Any scheme can be used with 15 | Node Discovery Protocol, and nodes using different schemes can communicate. 16 | 17 | The identity scheme of a node record defines how a 32-byte 'node ID' is derived from the 18 | information contained in the record. The 'distance' between two node IDs is the bitwise 19 | XOR of the IDs, taken as the big-endian number. 20 | 21 | distance(n₁, n₂) = n₁ XOR n₂ 22 | 23 | In many situations, the logarithmic distance (i.e. length of differing suffix in bits) is 24 | used in place of the actual distance. 25 | 26 | logdistance(n₁, n₂) = log2(distance(n₁, n₂)) 27 | 28 | ### Maintaining The Local Node Record 29 | 30 | Participants should update their record, increase the sequence number and sign a new 31 | version of the record whenever their information changes. This is especially important for 32 | changes to the node's IP address and port.
Implementations should determine the external 33 | endpoint (the Internet-facing IP address and port on which the node can be reached) and 34 | include it in their record. 35 | 36 | If communication flows through a NAT device, the UPnP/NAT-PMP protocols or the mirrored 37 | UDP envelope IP and port found in the [PONG] message can be used to determine the external 38 | IP address and port. 39 | 40 | If the endpoint cannot be determined (e.g. when the NAT doesn't support 'full-cone' 41 | translation), implementations should omit IP address and UDP port from the record. 42 | 43 | ## Sessions 44 | 45 | Discovery communication is encrypted and authenticated using session keys, established in 46 | the handshake. Since every node participating in the network acts as both client and 47 | server, a handshake can be initiated by either side of communication at any time. 48 | 49 | ### Handshake Steps 50 | 51 | #### Step 1: Node A sends message packet 52 | 53 | In the following definitions, we assume that node A wishes to communicate with node B, 54 | e.g. to send a FINDNODE message. Node A must have a copy of node B's record in order to 55 | communicate with it. 56 | 57 | If node A has session keys from prior communication with B, it encrypts its request with 58 | those keys. If no keys are known, it initiates the handshake by sending an ordinary 59 | message packet with random message content. 60 | 61 | A -> B FINDNODE message packet encrypted with unknown key 62 | 63 | #### Step 2: Node B responds with challenge 64 | 65 | Node B receives the message packet and extracts the source node ID from the packet header. 66 | If node B has session keys from prior communication with A, it attempts to decrypt the 67 | message data. If decryption and authentication of the message succeeds, there is no need 68 | for a handshake and node B can simply respond to the request. 
69 | 70 | If node B does not have session keys or decryption is not successful, it must initiate a 71 | handshake by responding with a [WHOAREYOU packet]. 72 | 73 | It first generates a unique `id-nonce` value and includes it in the packet. Node B also 74 | checks if it has a copy of node A's record. If it does, it also includes the sequence 75 | number of this record in the challenge packet, otherwise it sets the `enr-seq` field to 76 | zero. 77 | 78 | Node B must also store node A's record and the WHOAREYOU challenge for a short duration 79 | after sending it to node A because they will be needed again in step 4. 80 | 81 | A <- B WHOAREYOU packet including id-nonce, enr-seq 82 | 83 | #### Step 3: Node A processes the challenge 84 | 85 | Node A receives the challenge sent by node B, which confirms that node B is alive and is 86 | ready to perform the handshake. The challenge can be traced back to the request packet 87 | which solicited it by checking the `nonce`, which mirrors the request packet's `nonce`. 88 | 89 | Node A proceeds with the handshake by re-sending the FINDNODE request as a [handshake 90 | message packet]. This packet contains three parts in addition to the message: 91 | `id-signature`, `ephemeral-pubkey` and `record`. 92 | 93 | The handshake uses the unmasked WHOAREYOU challenge as an input: 94 | 95 | challenge-data = masking-iv || static-header || authdata 96 | 97 | Node A can now derive the new session keys. To do so, it first generates an ephemeral key 98 | pair on the elliptic curve used by node B's identity scheme. As an example, let's assume 99 | the node record of B uses the "v4" scheme. In this case the `ephemeral-pubkey` will be a 100 | public key on the secp256k1 curve.
101 | 102 | ephemeral-key = random private key generated by node A 103 | ephemeral-pubkey = public key corresponding to ephemeral-key 104 | 105 | The ephemeral key is used to perform Diffie-Hellman key agreement with node B's static 106 | public key and the session keys are derived from it using the HKDF key derivation 107 | function. 108 | 109 | dest-pubkey = public key corresponding to node B's static private key 110 | secret = ecdh(dest-pubkey, ephemeral-key) 111 | kdf-info = "discovery v5 key agreement" || node-id-A || node-id-B 112 | prk = HKDF-Extract(secret, challenge-data) 113 | key-data = HKDF-Expand(prk, kdf-info) 114 | initiator-key = key-data[:16] 115 | recipient-key = key-data[16:] 116 | 117 | Node A creates the `id-signature`, which proves that it controls the private key which 118 | signed its node record. The signature also prevents replay of the handshake. 119 | 120 | id-signature-text = "discovery v5 identity proof" 121 | id-signature-input = id-signature-text || challenge-data || ephemeral-pubkey || node-id-B 122 | id-signature = id_sign(sha256(id-signature-input)) 123 | 124 | Finally, node A compares the `enr-seq` element of the WHOAREYOU challenge against its own 125 | node record sequence number. If the sequence number in the challenge is lower, it includes 126 | its record into the handshake message packet. 127 | 128 | The request is now re-sent, with the message encrypted using the new session keys. 129 | 130 | A -> B FINDNODE handshake message packet, encrypted with new initiator-key 131 | 132 | #### Step 4: Node B receives handshake message 133 | 134 | When node B receives the handshake message packet, it first loads the node record and 135 | WHOAREYOU challenge which it sent and stored earlier. 136 | 137 | If node B did not have the node record of node A, the handshake message packet must 138 | contain a node record. A record may also be present if node A determined that its record 139 | is newer than B's current copy. 
If the packet contains a node record, B must first 140 | validate it by checking the record's signature. 141 | 142 | Node B then verifies the `id-signature` against the identity public key of A's record. 143 | 144 | After that, B can perform the key derivation using its own static private key and the 145 | `ephemeral-pubkey` from the handshake packet. Using the resulting session keys, it 146 | attempts to decrypt the message contained in the packet. 147 | 148 | If the message can be decrypted and authenticated, Node B considers the new session keys 149 | valid and responds to the message. In our example case, the response is a `NODES` message: 150 | 151 | A <- B NODES encrypted with new recipient-key 152 | 153 | #### Step 5: Node A receives response message 154 | 155 | Node A receives the message packet response and authenticates/decrypts it with the new 156 | session keys. If decryption/authentication succeeds, node B's identity is verified and 157 | node A also considers the new session keys valid. 158 | 159 | ### Identity-Specific Cryptography in the Handshake 160 | 161 | Establishment of session keys is dependent on the [identity scheme] used by the recipient 162 | (i.e. the node which sends WHOAREYOU). Likewise, the signature over `id-sig-input` is made 163 | by the identity key of the initiator. It is not required that initiator and recipient use 164 | the same identity scheme in their respective node records. Implementations must be able to 165 | perform the handshake for all supported identity schemes. 166 | 167 | At this time, the only supported identity scheme is "v4". 168 | 169 | `id_sign(hash)` creates a signature over `hash` using the node's static private key. The 170 | signature is encoded as the 64-byte array `r || s`, i.e. as the concatenation of the 171 | signature values. 172 | 173 | `ecdh(pubkey, privkey)` creates a secret through elliptic-curve Diffie-Hellman key 174 | agreement. 
The public key is multiplied by the private key to create a secret ephemeral 175 | key `eph = pubkey * privkey`. The 33-byte secret output is `y || eph.x` where `y` is 176 | `0x02` when `eph.y` is even or `0x03` when `eph.y` is odd. 177 | 178 | ### Handshake Implementation Considerations 179 | 180 | Since a handshake may happen at any time, UDP packets may be reordered by transmitting 181 | networking equipment, implementations must deal with certain subtleties regarding the 182 | handshake. 183 | 184 | In general, implementations should keep a reference to all sent request packets until the 185 | request either times out, is answered by the corresponding response packet or answered by 186 | WHOAREYOU. If WHOAREYOU is received as the answer to a request, the request must be 187 | re-sent as a handshake packet. 188 | 189 | If an implementation supports sending concurrent requests, multiple responses may be 190 | pending when WHOAREYOU is received, as in the following example: 191 | 192 | A -> B FINDNODE 193 | A -> B PING 194 | A -> B TOPICQUERY 195 | A <- B WHOAREYOU (nonce references PING) 196 | 197 | When this happens, all buffered requests can be considered invalid (the remote end cannot 198 | decrypt them) and the packet referenced by the WHOAREYOU `nonce` (in this example: PING) 199 | must be re-sent as a handshake. When the response to the re-sent is received, the new 200 | session is established and other pending requests (example: FINDNODE, TOPICQUERY) may be 201 | re-sent. 202 | 203 | Note that WHOAREYOU is only ever valid as a response to a previously sent request. If 204 | WHOAREYOU is received but no requests are pending, the handshake attempt can be ignored. 205 | 206 | Another important issue is the processing of message packets while a challenge is 207 | received: consider the case where node A has sent a packet that B cannot decrypt, and B 208 | has responded with WHOAREYOU. 
209 | 210 | A -> B FINDNODE 211 | A <- B WHOAREYOU 212 | 213 | Node B is now waiting for a handshake message packet to complete the new session, but 214 | instead receives another ordinary message packet. 215 | 216 | A -> B ORDINARY MESSAGE PACKET 217 | 218 | In this case, implementations should respond with a new WHOAREYOU challenge referencing 219 | the message packet. 220 | 221 | ### Session Cache 222 | 223 | Nodes should store session keys for communication with other recently-seen nodes. Since 224 | sessions are ephemeral and can be re-established whenever necessary, it is sufficient to 225 | store a limited number of sessions in an in-memory LRU cache. 226 | 227 | To prevent IP spoofing attacks, implementations must ensure that session secrets and the 228 | handshake are tied to a specific UDP endpoint. This is simple to implement by using the 229 | node ID and IP/port as the 'key' into the in-memory session cache. When a node switches 230 | endpoints, e.g. when roaming between different wireless networks, sessions will have to be 231 | re-established by handshaking again. This requires no effort on behalf of the roaming node 232 | because the recipients of protocol messages will simply refuse to decrypt messages from 233 | the new endpoint and reply with WHOAREYOU. 234 | 235 | The number of messages which can be encrypted with a certain session key is limited 236 | because encryption of each message requires a unique nonce for AES-GCM. In addition to the 237 | keys, the session cache must also keep track of the count of outgoing messages to ensure 238 | the uniqueness of nonce values. Since the wire protocol uses 96 bit AES-GCM nonces, it is 239 | strongly recommended to generate them by encoding the current outgoing message count into 240 | the first 32 bits of the nonce and filling the remaining 64 bits with random data 241 | generated by a cryptographically secure random number generator. 
242 | 243 | ## Node Table 244 | 245 | Nodes keep information about other nodes in their neighborhood. Neighbor nodes are stored 246 | in a routing table consisting of 'k-buckets'. For each `0 ≤ i < 256`, every node keeps a 247 | k-bucket for nodes of `logdistance(self, n) == i`. The Node Discovery Protocol uses `k = 248 | 16`, i.e. every k-bucket contains up to 16 node entries. The entries are sorted by time 249 | last seen — least-recently seen node at the head, most-recently seen at the tail. 250 | 251 | Whenever a new node N₁ is encountered, it can be inserted into the corresponding bucket. 252 | If the bucket contains less than `k` entries N₁ can simply be added as the first entry. If 253 | the bucket already contains `k` entries, the liveness of the least recently seen node in 254 | the bucket, N₂, needs to be revalidated. If no reply is received from N₂ it is considered 255 | dead, removed and N₁ added to the front of the bucket. 256 | 257 | Neighbors of very low distance are unlikely to occur in practice. Implementations may omit 258 | k-buckets for low distances. 259 | 260 | ### Table Maintenance In Practice 261 | 262 | Nodes are expected to keep track of their close neighbors and regularly refresh their 263 | information. To do so, a lookup targeting the least recently refreshed bucket should be 264 | performed at regular intervals. 265 | 266 | Checking node liveness whenever a node is to be added to a bucket is impractical and 267 | creates a DoS vector. Implementations should perform liveness checks asynchronously with 268 | bucket addition and occasionally verify that a random node in a random bucket is live by 269 | sending [PING]. When the PONG response indicates that a new version of the node record is 270 | available, the liveness check should pull the new record and update it in the local table. 
271 | 272 | If a node's liveness has been verified many times, implementations may consider occasional 273 | non-responsiveness permissible and assume the node is live. 274 | 275 | When responding to FINDNODE, implementations must avoid relaying any nodes whose liveness 276 | has not been verified. This is easy to achieve by storing an additional flag per node in 277 | the table, tracking whether the node has ever successfully responded to a PING request. 278 | 279 | In order to keep all k-bucket positions occupied even when bucket members fail liveness 280 | checks, it is strongly recommended to maintain a 'replacement cache' alongside each 281 | bucket. This cache holds recently-seen nodes which would fall into the corresponding bucket 282 | but cannot become a member of the bucket because it is already at capacity. Once a bucket 283 | member becomes unresponsive, a replacement can be chosen from the cache. 284 | 285 | ### Lookup 286 | 287 | A 'lookup' locates the `k` closest nodes to a node ID. 288 | 289 | The lookup initiator starts by picking `α` closest nodes to the target it knows of from 290 | the local table. The initiator then sends [FINDNODE] requests to those nodes. `α` is an 291 | implementation-defined concurrency parameter, typically `3`. As NODES responses are 292 | received, the initiator resends FINDNODE to nodes it has learned about from previous 293 | queries. Of the `k` nodes the initiator has heard of closest to the target, it picks `α` 294 | that it has not yet queried and sends FINDNODE to them. The lookup terminates when the 295 | initiator has queried and gotten responses from the `k` closest nodes it has seen. 296 | 297 | To improve the resilience of lookups against adversarial nodes, the algorithm may be 298 | adapted to perform network traversal on multiple disjoint paths. Not only does this 299 | approach benefit security, it also improves effectiveness because more nodes are visited 300 | during a single lookup. 
The initial `k` closest nodes are partitioned into multiple 301 | independent 'path' buckets, and concurrent FINDNODE requests executed as described above, 302 | with one difference: results discovered on one path are not reused on another, i.e. each 303 | path attempts to reach the closest nodes to the lookup target independently without 304 | reusing intermediate results found on another path. Note that it is still necessary to 305 | track previously asked nodes across all paths to keep the paths disjoint. 306 | 307 | ### Lookup Protocol 308 | 309 | This section shows how the wire protocol messages can be used to perform a lookup 310 | interaction against a single node. 311 | 312 | Node `A` is looking for target `x`. It selects node `B` from the local table or 313 | intermediate lookup results. To query for nodes close to `x` on `B`, node `A` computes the 314 | query distance `d = logdistance(B, x)` and sends its request. 315 | 316 | A -> B FINDNODE [d] 317 | 318 | Node `B` responds with multiple nodes messages containing the nodes at the queried 319 | distance. 320 | 321 | A <- B NODES [N₁, N₂, N₃] 322 | A <- B NODES [N₄, N₅] 323 | 324 | Depending on the value of `d` and the content of `B`'s table, the response to the initial 325 | query might contain very few nodes or no nodes at all. Should this be the case, `A` varies 326 | the distance to retrieve more nodes from adjacent k-buckets on `B`: 327 | 328 | A -> B FINDNODE [d+1] 329 | 330 | `B` responds with more nodes: 331 | 332 | A <- B NODES [N₆, N₇] 333 | 334 | Node `A` now sorts all received nodes by distance to the lookup target and proceeds by 335 | repeating the lookup procedure on another, closer node. 336 | 337 | ## Topic Advertisement 338 | 339 | The topic advertisement subsystem indexes participants by their provided services. A 340 | node's provided services are identified by arbitrary strings called 'topics'.
A node 341 | providing a certain service is said to 'place an ad' for itself when it makes itself 342 | discoverable under that topic. Depending on the needs of the application, a node can 343 | advertise multiple topics or no topics at all. Every node participating in the discovery 344 | protocol acts as an advertisement medium, meaning that it accepts topic ads from other 345 | nodes and later returns them to nodes searching for the same topic. 346 | 347 | ### Topic Table 348 | 349 | Nodes store ads for any number of topics and a limited number of ads for each topic. The 350 | data structure holding advertisements is called the 'topic table'. The list of ads for a 351 | particular topic is called the 'topic queue' because it functions like a FIFO queue of 352 | limited length. The image below depicts a topic table containing three queues. The queue 353 | for topic `T₁` is at capacity. 354 | 355 | ![topic table](./img/topic-queue-diagram.png) 356 | 357 | The queue size limit is implementation-defined. Implementations should place a global 358 | limit on the number of ads in the topic table regardless of the topic queue which contains 359 | them. Reasonable limits are 100 ads per queue and 50000 ads across all queues. Since ENRs 360 | are at most 300 bytes in size, these limits ensure that a full topic table consumes 361 | approximately 15MB of memory. 362 | 363 | Any node may appear at most once in any topic queue, that is, registration of a node which 364 | is already registered for a given topic fails. Implementations may impose other 365 | restrictions on the table, such as restrictions on the number of IP-addresses in a certain 366 | range or number of occurrences of the same node across queues. 367 | 368 | ### Tickets 369 | 370 | Ads should remain in the queue for a constant amount of time, the `target-ad-lifetime`. 
To 371 | maintain this guarantee, new registrations are throttled and registrants must wait for a 372 | certain amount of time before they are admitted. When a node attempts to place an ad, it 373 | receives a 'ticket' which tells them how long they must wait before they will be accepted. 374 | It is up to the registrant node to keep the ticket and present it to the advertisement 375 | medium when the waiting time has elapsed. 376 | 377 | The waiting time constant is: 378 | 379 | target-ad-lifetime = 15min 380 | 381 | The assigned waiting time for any registration attempt is determined according to the 382 | following rules: 383 | 384 | - When the table is full, the waiting time is assigned based on the lifetime of the oldest 385 | ad across the whole table, i.e. the registrant must wait for a table slot to become 386 | available. 387 | - When the topic queue is full, the waiting time depends on the lifetime of the oldest ad 388 | in the queue. The assigned time is `target-ad-lifetime - oldest-ad-lifetime` in this 389 | case. 390 | - Otherwise the ad may be placed immediately. 391 | 392 | Tickets are opaque objects storing arbitrary information determined by the issuing node. 393 | While details of encoding and ticket validation are up to the implementation, tickets must 394 | contain enough information to verify that: 395 | 396 | - The node attempting to use the ticket is the node which requested it. 397 | - The ticket is valid for a single topic only. 398 | - The ticket can only be used within the registration window. 399 | - The ticket can't be used more than once. 400 | 401 | Implementations may choose to include arbitrary other information in the ticket, such as 402 | the cumulative wait time spent by the advertiser. 
A practical way to handle tickets is to 403 | encrypt and authenticate them with a dedicated secret key: 404 | 405 | ticket = aesgcm_encrypt(ticket-key, ticket-nonce, ticket-pt, '') 406 | ticket-pt = [src-node-id, src-ip, topic, req-time, wait-time, cum-wait-time] 407 | src-node-id = node ID that requested the ticket 408 | src-ip = IP address that requested the ticket 409 | topic = the topic that the ticket is valid for 410 | req-time = absolute time of REGTOPIC request 411 | wait-time = waiting time assigned when ticket was created 412 | cum-wait-time = cumulative waiting time of this node 413 | 414 | ### Registration Window 415 | 416 | The image below depicts a single ticket's validity over time. When the ticket is issued, 417 | the node keeping it must wait until the registration window opens. The length of the 418 | registration window is 10 seconds. The ticket becomes invalid after the registration 419 | window has passed. 420 | 421 | ![ticket validity over time](./img/ticket-validity.png) 422 | 423 | Since all ticket waiting times are assigned to expire when a slot in the queue opens, the 424 | advertisement medium may receive multiple valid tickets during the registration window and 425 | must choose one of them to be admitted in the topic queue. The winning node is notified 426 | using a [REGCONFIRMATION] response. 427 | 428 | Picking the winner can be achieved by keeping track of a single 'next ticket' per queue 429 | during the registration window. Whenever a new ticket is submitted, first determine its 430 | validity and compare it against the current 'next ticket' to determine which of the two is 431 | better according to an implementation-defined metric such as the cumulative wait time 432 | stored in the ticket. 433 | 434 | ### Advertisement Protocol 435 | 436 | This section explains how the topic-related protocol messages are used to place an ad. 437 | 438 | Let us assume that node `A` provides topic `T`.
It selects node `C` as advertisement 439 | medium and wants to register an ad, so that when node `B` (who is searching for topic `T`) 440 | asks `C`, `C` can return the registration entry of `A` to `B`. 441 | 442 | Node `A` first attempts to register without a ticket by sending [REGTOPIC] to `C`. 443 | 444 | A -> C REGTOPIC [T, ""] 445 | 446 | `C` replies with a ticket and waiting time. 447 | 448 | A <- C TICKET [ticket, wait-time] 449 | 450 | Node `A` now waits for the duration of the waiting time. When the wait is over, `A` sends 451 | another registration request including the ticket. `C` does not need to remember its 452 | issued tickets since the ticket is authenticated and contains enough information for `C` 453 | to determine its validity. 454 | 455 | A -> C REGTOPIC [T, ticket] 456 | 457 | Node `C` replies with another ticket. Node `A` must keep this ticket in place of the 458 | earlier one, and must also be prepared to handle a confirmation call in case registration 459 | was successful. 460 | 461 | A <- C TICKET [ticket, wait-time] 462 | 463 | Node `C` waits for the registration window to end on the queue and selects `A` as the node 464 | which is registered. Node `C` places `A` into the topic queue for `T` and sends a 465 | [REGCONFIRMATION] response. 466 | 467 | A <- C REGCONFIRMATION [T] 468 | 469 | ### Ad Placement And Topic Radius 470 | 471 | Since every node may act as an advertisement medium for any topic, advertisers and nodes 472 | looking for ads must agree on a scheme by which ads for a topic are distributed. When the 473 | number of nodes advertising a topic is at least a certain percentage of the whole 474 | discovery network (rough estimate: at least 1%), ads may simply be placed on random nodes 475 | because searching for the topic on randomly selected nodes will locate the ads quickly enough. 
476 | 477 | However, topic search should be fast even when the number of advertisers for a topic is 478 | much smaller than the number of all live nodes. Advertisers and searchers must agree on a 479 | subset of nodes to serve as advertisement media for the topic. This subset is simply a 480 | region of the node ID address space, consisting of nodes whose Kademlia address is within a 481 | certain distance to the topic hash `sha256(T)`. This distance is called the 'topic 482 | radius'. 483 | 484 | Example: for a topic `f3b2529e...` with a radius of 2^240, the subset covers all nodes 485 | whose IDs have prefix `f3b2...`. A radius of 2^256 means the entire network, in which case 486 | advertisements are distributed uniformly among all nodes. The diagram below depicts a 487 | region of the address space with topic hash `t` in the middle and several nodes close to 488 | `t` surrounding it. Dots above the nodes represent entries in the node's queue for the 489 | topic. 490 | 491 | ![diagram explaining the topic radius concept](./img/topic-radius-diagram.png) 492 | 493 | To place their ads, participants simply perform a random walk within the currently 494 | estimated radius and run the advertisement protocol by collecting tickets from all nodes 495 | encountered during the walk and using them when their waiting time is over. 496 | 497 | ### Topic Radius Estimation 498 | 499 | Advertisers must estimate the topic radius continuously in order to place their ads on 500 | nodes where they will be found. The radius mustn't fall below a certain size because 501 | restricting registration to too few nodes leaves the topic vulnerable to censorship and 502 | leads to long waiting times. If the radius were too large, searching nodes would take too 503 | long to find the ads. 504 | 505 | Estimating the radius uses the waiting time as an indicator of how many other nodes are 506 | attempting to place ads in a certain region. 
This is achieved by keeping track of the 507 | average time to successful registration within segments of the address space surrounding 508 | the topic hash. Advertisers initially assume the radius is 2^256, i.e. the entire network. 509 | As tickets are collected, the advertiser samples the time it takes to place an ad in each 510 | segment and adjusts the radius such that registration at the chosen distance takes 511 | approximately `target-ad-lifetime / 2` to complete. 512 | 513 | ## Topic Search 514 | 515 | Finding nodes that provide a certain topic is a continuous process which reads the content 516 | of topic queues inside the approximated topic radius. This is a much simpler process than 517 | topic advertisement because collecting tickets and waiting on them is not required. 518 | 519 | To find nodes for a topic, the searcher generates random node IDs inside the estimated 520 | topic radius and performs Kademlia lookups for these IDs. All (intermediate) nodes 521 | encountered during lookup are asked for topic queue entries using the [TOPICQUERY] packet. 522 | 523 | Radius estimation for topic search is similar to the estimation procedure for 524 | advertisement, but samples the average number of results from TOPICQUERY instead of 525 | average time to registration. The radius estimation value can be shared with the 526 | registration algorithm if the same topic is being registered and searched for. 
527 | 528 | [EIP-778]: ../enr.md 529 | [identity scheme]: ../enr.md#record-structure 530 | [handshake message packet]: ./discv5-wire.md#handshake-message-packet-flag--2 531 | [WHOAREYOU packet]: ./discv5-wire.md#whoareyou-packet-flag--1 532 | [PING]: ./discv5-wire.md#ping-request-0x01 533 | [PONG]: ./discv5-wire.md#pong-response-0x02 534 | [FINDNODE]: ./discv5-wire.md#findnode-request-0x03 535 | [REGTOPIC]: ./discv5-wire.md#regtopic-request-0x07 536 | [REGCONFIRMATION]: ./discv5-wire.md#regconfirmation-response-0x09 537 | [TOPICQUERY]: ./discv5-wire.md#topicquery-request-0x0a 538 | --------------------------------------------------------------------------------