├── .lint ├── .gitignore ├── lint.sh ├── remark-lint-config.js └── package.json ├── etherdog.png ├── discv5 ├── img │ ├── ticket-validity.png │ ├── message-packet-layout.png │ ├── topic-queue-diagram.png │ ├── topic-radius-diagram.png │ ├── handshake-packet-layout.png │ └── whoareyou-packet-layout.png ├── discv5.md ├── discv5-wire-test-vectors.md ├── discv5-wire.md ├── discv5-rationale.md └── discv5-theory.md ├── .circleci └── config.yml ├── enr-entries ├── eth.md └── les.md ├── README.md ├── caps ├── wit.md ├── pip.md ├── snap.md └── les.md ├── enr.md ├── dnsdisc.md ├── discv4.md └── rlpx.md /.lint/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | -------------------------------------------------------------------------------- /etherdog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/etherdog.png -------------------------------------------------------------------------------- /discv5/img/ticket-validity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/ticket-validity.png -------------------------------------------------------------------------------- /discv5/img/message-packet-layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/message-packet-layout.png -------------------------------------------------------------------------------- /discv5/img/topic-queue-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/topic-queue-diagram.png -------------------------------------------------------------------------------- /discv5/img/topic-radius-diagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/topic-radius-diagram.png -------------------------------------------------------------------------------- /discv5/img/handshake-packet-layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/handshake-packet-layout.png -------------------------------------------------------------------------------- /discv5/img/whoareyou-packet-layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethereum/devp2p/HEAD/discv5/img/whoareyou-packet-layout.png -------------------------------------------------------------------------------- /.lint/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | d=$(dirname $0) 4 | $d/node_modules/remark-cli/cli.js --no-stdout --frail --rc-path $d/remark-lint-config.js $* 5 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.0 2 | jobs: 3 | build: 4 | docker: 5 | - image: cimg/node:lts 6 | steps: 7 | - checkout 8 | - restore_cache: 9 | key: lint-node-modules-{{ checksum ".lint/package-lock.json" }} 10 | - run: 11 | name: Install the Markdown Linter 12 | command: "cd .lint && npm install" 13 | - run: 14 | name: Run the Markdown Linter 15 | command: ".lint/lint.sh ." 
16 | - save_cache: 17 | key: lint-node-modules-{{ checksum ".lint/package-lock.json" }} 18 | paths: 19 | - .lint/node_modules 20 | -------------------------------------------------------------------------------- /enr-entries/eth.md: -------------------------------------------------------------------------------- 1 | # The "eth" ENR entry 2 | 3 | This specification defines the "eth" ENR entry, which provides information 4 | about the [eth capability] on a certain node. 5 | 6 | ## Entry Format 7 | 8 | entry-key = "eth" 9 | entry-value = [[ forkHash, forkNext ]] 10 | 11 | At this time, the "eth" entry is a single element list containing an [EIP-2124] fork ID 12 | value. Please see the EIP for definitions of `forkHash` and `forkNext`. 13 | 14 | In order to be compatible with future versions of this specifications, implementations 15 | should ignore any additional list elements in `entry-value`. 16 | 17 | ## Change Log 18 | 19 | ### EIP-2124 (May 2019) 20 | 21 | The initial version of the "eth" entry was proposed in [EIP-2124]. 22 | 23 | [eth capability]: ../caps/eth.md 24 | [EIP-2124]: https://eips.ethereum.org/EIPS/eip-2124 25 | -------------------------------------------------------------------------------- /.lint/remark-lint-config.js: -------------------------------------------------------------------------------- 1 | // Get list of plugins from package.json. 
2 | var fs = require('fs'), path = require('path'); 3 | var packageFile = path.resolve(__dirname, 'package.json'); 4 | var deps = Object.keys(JSON.parse(fs.readFileSync(packageFile)).dependencies); 5 | 6 | var pluginOptions = { 7 | 'remark-lint-code-block-style': 'indented', 8 | 'remark-lint-emphasis-marker': '*', 9 | 'remark-lint-strong-marker': '*', 10 | 'remark-lint-heading-style': 'atx', 11 | 'remark-lint-list-item-indent': 'space', 12 | 'remark-lint-no-heading-punctuation': '.,;:!', 13 | 'remark-lint-unordered-list-marker-style': '-', 14 | 'remark-lint-no-dead-urls': { skipOffline: true }, 15 | 'remark-lint-no-missing-blank-lines': { exceptTightLists: true }, 16 | }; 17 | 18 | exports.plugins = []; 19 | deps.forEach(function (d) { 20 | if (d.match(/^remark-(lint|validate)/)) { 21 | var option = pluginOptions[d]; 22 | exports.plugins.push(option ? [d, option] : d); 23 | } 24 | }); 25 | -------------------------------------------------------------------------------- /enr-entries/les.md: -------------------------------------------------------------------------------- 1 | # The "les" ENR entry 2 | 3 | This specification defines the "les" ENR entry, which provides information about the [les 4 | capability] provided by a node. The presence of this entry in a node's ENR indicates that 5 | the node is acting as a light client server. ENRs containing the "les" entry must also 6 | contain an [eth entry], which provides information about the specific Ethereum blockchain 7 | served by LES. 8 | 9 | ## Entry Format 10 | 11 | entry-key = "les" 12 | entry-value = [ vflux-version ] 13 | 14 | At this time, the "les" entry is a single element list containing the version number of 15 | the 'vflux' payment protocol. 16 | 17 | In order to be compatible with future versions of this specifications, implementations 18 | should ignore any additional list elements in `entry-value`. 
19 | 20 | ## Change Log 21 | 22 | ### vflux-version (March 2021) 23 | 24 | In March 2021, the les entry was updated to include the vflux version number. 25 | 26 | ### Initial Version (October 2019) 27 | 28 | The initial version of the les entry was an empty list with the sole purpose of 29 | signaling LES server support. 30 | 31 | [les capability]: ../caps/les.md 32 | [eth entry]: ./eth.md 33 | -------------------------------------------------------------------------------- /.lint/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "devp2p-specs-lint-setup", 3 | "description": "markdown linter setup for devp2p specs", 4 | "version": "0.0.0", 5 | "keywords": [], 6 | "repository": "https://github.com/ethereum/devp2p", 7 | "bugs": "https://github.com/ethereum/devp2p/issues", 8 | "dependencies": { 9 | "remark-cli": "^10.0.0", 10 | "remark-lint": "^6.0.6", 11 | "remark-lint-blockquote-indentation": "^1.0.4", 12 | "remark-lint-code-block-style": "^1.0.4", 13 | "remark-lint-definition-spacing": "^1.0.5", 14 | "remark-lint-emphasis-marker": "^1.0.4", 15 | "remark-lint-fenced-code-flag": "^1.0.4", 16 | "remark-lint-fenced-code-marker": "^1.0.4", 17 | "remark-lint-final-definition": "^1.0.4", 18 | "remark-lint-final-newline": "^1.0.5", 19 | "remark-lint-hard-break-spaces": "^1.0.5", 20 | "remark-lint-heading-style": "^1.0.4", 21 | "remark-lint-linebreak-style": "^1.0.4", 22 | "remark-lint-link-title-style": "^1.0.5", 23 | "remark-lint-list-item-bullet-indent": "^1.0.4", 24 | "remark-lint-list-item-indent": "^1.0.5", 25 | "remark-lint-no-auto-link-without-protocol": "^1.0.4", 26 | "remark-lint-no-blockquote-without-marker": "^2.0.4", 27 | "remark-lint-no-consecutive-blank-lines": "^1.0.4", 28 | "remark-lint-no-dead-urls": "^0.4.1", 29 | "remark-lint-no-duplicate-definitions": "^1.0.6", 30 | "remark-lint-no-duplicate-headings-in-section": "^1.0.5", 31 | "remark-lint-no-empty-sections": "^3.0.0", 32 | 
"remark-lint-no-empty-url": "^1.0.6", 33 | "remark-lint-no-heading-content-indent": "^1.0.4", 34 | "remark-lint-no-heading-indent": "^1.0.4", 35 | "remark-lint-no-heading-like-paragraph": "^1.0.4", 36 | "remark-lint-no-heading-punctuation": "^1.0.4", 37 | "remark-lint-no-inline-padding": "^1.0.5", 38 | "remark-lint-no-literal-urls": "^1.0.4", 39 | "remark-lint-no-missing-blank-lines": "^1.0.4", 40 | "remark-lint-no-paragraph-content-indent": "^1.0.7", 41 | "remark-lint-no-reference-like-url": "^1.0.5", 42 | "remark-lint-no-shortcut-reference-image": "^1.0.4", 43 | "remark-lint-no-table-indentation": "^1.0.5", 44 | "remark-lint-no-tabs": "^1.0.4", 45 | "remark-lint-no-undefined-references": "^1.1.2", 46 | "remark-lint-no-unneeded-full-reference-image": "^1.0.1", 47 | "remark-lint-no-unneeded-full-reference-link": "^1.0.1", 48 | "remark-lint-no-unused-definitions": "^1.0.6", 49 | "remark-lint-ordered-list-marker-style": "^1.0.4", 50 | "remark-lint-ordered-list-marker-value": "^1.0.5", 51 | "remark-lint-rule-style": "^1.0.4", 52 | "remark-lint-strong-marker": "^1.0.4", 53 | "remark-lint-table-cell-padding": "^1.0.5", 54 | "remark-lint-table-pipe-alignment": "^1.0.4", 55 | "remark-lint-table-pipes": "^1.0.4", 56 | "remark-lint-unordered-list-marker-style": "^1.0.4", 57 | "remark-validate-links": "^8.0.0" 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /discv5/discv5.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 2 | 3 | **Protocol version v5.1** 4 | 5 | Welcome to the Node Discovery Protocol v5 specification! 6 | 7 | Note that this specification is a work in progress and may change incompatibly without 8 | prior notice. 9 | 10 | Node Discovery is a system for finding other participants in a peer-to-peer network. 
The 11 | system can be used by any node, for any purpose, at no cost other than running the network 12 | protocol and storing a limited number of other nodes' records. Any node can be used as an 13 | entry point into the network. 14 | 15 | The system's design is loosely inspired by the Kademlia DHT, but unlike most DHTs no 16 | arbitrary keys and values are stored. Instead, the DHT stores and relays 'node records', 17 | which are signed documents providing information about nodes in the network. Node 18 | Discovery acts as a database of all live nodes in the network and performs three basic 19 | functions: 20 | 21 | - Sampling the set of all live participants: by walking the DHT, the network can be 22 | enumerated. 23 | - Searching for participants providing a certain service: Node Discovery v5 includes a 24 | scalable facility for registering 'topic advertisements'. These advertisements can be 25 | queried and nodes advertising a topic found. 26 | - Authoritative resolution of node records: if a node's ID is known, the most recent 27 | version of its record can be retrieved. 28 | 29 | ## Specification Overview 30 | 31 | The specification has three parts: 32 | 33 | - [discv5-wire.md] defines the wire protocol. 34 | - [discv5-theory.md] describes the algorithms and data structures. 35 | - [discv5-rationale.md] contains the design rationale. 36 | 37 | ## Comparison With Other Discovery Mechanisms 38 | 39 | Systems such as MDNS/Bonjour allow finding hosts in a local-area network. The Node 40 | Discovery Protocol is designed to work on the Internet and is most useful for applications 41 | with a large number of participants spread across the Internet. 42 | 43 | Systems using a rendezvous server: these systems are commonly used by desktop applications 44 | or cloud services to connect participants to each other. While undoubtedly efficient, this 45 | requires trust in the operator of the rendezvous server and these systems are prone to 46 | censorship. 
Compared to a rendezvous server, The Node Discovery Protocol doesn't rely on a 47 | single operator and places a small amount of trust in every participant. It becomes more 48 | resistant to censorship as the size of the network increases and participants of multiple 49 | distinct peer-to-peer networks can share the discovery network to further increase its 50 | resilience. 51 | 52 | The Achilles heel of the Node Discovery Protocol is the process of joining the network: 53 | while any other node may be used as an entry point, such a node must first be located 54 | through some other mechanism. Several approaches including scalable listing of initial 55 | entry points in DNS or discovery of participants in the local network can be used for 56 | reasonable secure entry into the network. 57 | 58 | ## Comparison With Node Discovery v4 59 | 60 | - Topic advertisement was added. 61 | - Arbitrary node metadata can be stored/relayed. 62 | - Node identity crypto is extensible, use of secp256k1 keys isn't strictly required. 63 | - The protocol no longer relies on the system clock. 64 | - Communication is encrypted, protecting topic searches and record lookups against passive 65 | observers. 66 | 67 | [discv5-wire.md]: ./discv5-wire.md 68 | [discv5-theory.md]: ./discv5-theory.md 69 | [discv5-rationale.md]: ./discv5-rationale.md 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | This repository contains specifications for the peer-to-peer networking protocols used by 4 | Ethereum. The issue tracker here is for discussions of protocol changes. It's also OK to 5 | open an issue if you just have a question. 6 | 7 | Protocol level security issues are valuable! Please report serious issues responsibly 8 | through the [Ethereum Foundation Bounty Program]. 9 | 10 | We have several specifications for low-level protocols: 11 | 12 | - [Ethereum Node Records] 13 | - [DNS Node Lists] 14 | - [Node Discovery Protocol v4] 15 | - [Node Discovery Protocol v5] 16 | - [RLPx protocol] 17 | 18 | The repository also contains specifications of many RLPx-based application-level protocols: 19 | 20 | - [Ethereum Wire Protocol] (eth/68) 21 | - [Ethereum Snapshot Protocol] (snap/1) 22 | - [Light Ethereum Subprotocol] (les/4) 23 | - [Parity Light Protocol] (pip/1) 24 | - [Ethereum Witness Protocol] (wit/0) 25 | 26 | ### The Mission 27 | 28 | devp2p is a set of network protocols which form the Ethereum peer-to-peer network. 29 | 'Ethereum network' is meant in a broad sense, i.e. devp2p isn't specific to a particular 30 | blockchain, but should serve the needs of any networked application associated with the 31 | Ethereum umbrella. 32 | 33 | We aim for an integrated system of orthogonal parts, implemented in multiple programming 34 | environments. The system provides discovery of other participants throughout the Internet 35 | as well as secure communication with those participants. 36 | 37 | The network protocols in devp2p should be easy to implement from scratch given only the 38 | specification, and must work within the limits of a consumer-grade Internet connection. We 39 | usually design protocols in a 'specification first' approach, but any specification 40 | proposed must be accompanied by a working prototype or implementable within reasonable 41 | time. 
42 | 43 | ### Relationship with libp2p 44 | 45 | The [libp2p] project was started at about the same time as devp2p and seeks to be a 46 | collection of modules for assembling a peer-to-peer network from modular components. 47 | Questions about the relationship between devp2p and libp2p come up rather often. 48 | 49 | It's hard to compare the two projects because they have different scope and are designed 50 | with different goals in mind. devp2p is an integrated system definition that wants to 51 | serve Ethereum's needs well (although it may be a good fit for other applications, too) 52 | while libp2p is a collection of programming library parts serving no single application in 53 | particular. 54 | 55 | That said, both projects are very similar in spirit and devp2p is slowly adopting parts of 56 | libp2p as they mature. 57 | 58 | ### Implementations 59 | 60 | devp2p is part of most Ethereum clients. Implementations include: 61 | 62 | - C#: Nethermind 63 | - C++: Aleth 64 | - C: Breadwallet 65 | - Elixir: Exthereum 66 | - Go: go-ethereum/geth 67 | - Java: Tuweni RLPx library 68 | - Java: Besu 69 | - JavaScript: EthereumJS 70 | - Kotlin: Tuweni Discovery library 71 | - Nim: Nimbus nim-eth 72 | - Python: Trinity 73 | - Ruby: Ciri 74 | - Ruby: ruby-devp2p 75 | - Rust: rust-devp2p 76 | - Rust: openethereum 77 | - Rust: reth 78 | 79 | WireShark dissectors are available here: 80 | 81 | [Ethereum Foundation Bounty Program]: https://bounty.ethereum.org 82 | [Ethereum Wire Protocol]: ./caps/eth.md 83 | [Ethereum Snapshot Protocol]: ./caps/snap.md 84 | [Light Ethereum Subprotocol]: ./caps/les.md 85 | [Ethereum Witness Protocol]: ./caps/wit.md 86 | [Ethereum Node Records]: ./enr.md 87 | [DNS Node Lists]: ./dnsdisc.md 88 | [Node Discovery Protocol v4]: ./discv4.md 89 | [Node Discovery Protocol v5]: ./discv5/discv5.md 90 | [Parity Light Protocol]: ./caps/pip.md 91 | [RLPx protocol]: ./rlpx.md 92 | [libp2p]: https://libp2p.io 93 | 
-------------------------------------------------------------------------------- /caps/wit.md: -------------------------------------------------------------------------------- 1 | # Ethereum Witness Protocol (wit) 2 | 3 | The `wit` protocol runs on top of [RLPx], facilitating the exchange of Ethereum state 4 | witnesses between peers. The protocol is an optional extension for peers supporting (or 5 | caring about) the state witnesses for Ethereum blocks. 6 | 7 | The current version is `wit/0`. 8 | 9 | ### Overview 10 | 11 | The `wit` protocol is designed to assist clients in syncing up to the tip of the chain. 12 | Eventually, it also aspires to assist in stateless client operation. The `wit` protocol 13 | does not take part in chain maintenance (block and transaction propagation); and it is 14 | **meant to be run side-by-side with the `eth` protocol**, not standalone (e.g. chain 15 | progression is announced via `eth`). (like the `snap` protocol) 16 | 17 | Despite the name, version 0 will not provide actual witnesses. It will provide meta-data 18 | about the witness, which can be used to download the witness over the `eth` protocol. 19 | 20 | For now, the known use case is to assist Beam Syncing peers. By requesting witness 21 | metadata, these peers will keep up with the tip of the network and become fully-synced 22 | nodes faster. 23 | 24 | Using the `wit` protocol, peers ask each other for the list of trie node hashes read 25 | during the execution of a particular block. This includes the following data: 26 | 27 | - Storage nodes 28 | - Bytecodes 29 | - Account nodes 30 | - Read during EVM execution 31 | - Read during transaction validation 32 | - Read during block reward calculation 33 | - Nodes read when generating the final state root (i.e. sometimes deleting data requires a 34 | trie refactor that reads nearby trie nodes) 35 | 36 | The trie node hashes which are generated at the end of the block from existing data are 37 | *not* included. 
For example, the final state root hash is not included. 38 | 39 | ### Relation to `eth` 40 | 41 | The `wit` protocol follows the same pattern as `snap`. It is a *dependent satellite* of 42 | `eth` (i.e. to run `wit`, you need to run `eth` too), not a fully standalone protocol. 43 | This is a deliberate design decision: 44 | 45 | - `wit` is meant to be a bootstrap aid for newly joining full nodes. By enforcing all 46 | `wit` peers to also speak `eth`, we can avoid non-full nodes from lingering attached to 47 | `wit` indefinitely. 48 | - `eth` already contains well established chain and fork negotiation mechanisms, as well 49 | as remote peer staleness detection during sync. By running both protocols side-by-side, 50 | `wit` can benefit of all these mechanisms without having to duplicate them. 51 | 52 | This *satellite* status may be changed later, but it's better to launch with a more 53 | restricted protocol first and then expand if need be vs. trying to withdraw depended-upon 54 | features. 55 | 56 | In order to follow the `wit` protocol, clients must generate witness metadata when 57 | executing blocks. For now, its primary purpose is also one specific sync method that might 58 | not be suitable for all clients. Keeping `wit` as a separate protocol permits every client 59 | to decide to pursue it or not, without hindering their capacity to participate in the 60 | `eth` protocol. 61 | 62 | ### Accelerating Beam Sync 63 | 64 | At its most naive, Beam Sync needs to download any missing state one trie node at a time. 65 | According to a recent test, after Beam Syncing for 22 hours, the median block still 66 | required more than 300 new trie nodes. At an optimistic 100ms round-trip time, that means 67 | 30 seconds per block of data download. This is where witness metadata can help 68 | tremendously. 69 | 70 | If a client can request the trie node hashes used by a block up front, those 300 trie 71 | nodes can likely be accessed in a fraction of a second. 
That's easily enough to keep 72 | synced with mainnet. 73 | 74 | Unfortunately, the list of trie node hashes cannot be verified before the block is 75 | imported. This would be a huge problem for a stateless client, which would be permanently 76 | at risk to a DoS attack where peers feed it a long list of incorrect hashes. But Beam 77 | Syncing clients are only vulnerable until they've finished downloading the full network 78 | state, so the payoff for such an attack is smaller. 79 | 80 | ## Protocol Messages 81 | 82 | ### RESERVED (0x00) 83 | 84 | This command is undefined, held in place for a possible future Status message. 85 | 86 | ### GetBlockWitnessHashes (0x01) 87 | 88 | `[reqID: P, blockHash: B_32]` 89 | 90 | Requests a list of trie node hashes used by a given block. 91 | 92 | - `reqID`: Request ID to match up responses with 93 | - `blockHash`: Hash of the header to request the witness hashes for 94 | 95 | Notes: 96 | 97 | - Nodes **must** always respond to the query. 98 | - If the node does **not** have the trie hashes requested block, it **must** return an 99 | empty reply. 100 | 101 | ### BlockWitnessHashes (0x02) 102 | 103 | `[reqID: P, witnessHashes: [trieNodeHash: B_32, ...]]` 104 | 105 | Returns a list of the trie node hashes that were read during execution and validation of 106 | the given block. 107 | 108 | - `reqID`: ID of the request this is a response for 109 | - `witnessHashes`: List of trie node hashes 110 | 111 | ## Change Log 112 | 113 | ### wit/0 (October 2020) 114 | 115 | Version 0 was the introduction of the witness protocol. 116 | 117 | [RLPx]: ../rlpx.md 118 | -------------------------------------------------------------------------------- /enr.md: -------------------------------------------------------------------------------- 1 | # Ethereum Node Records 2 | 3 | This specification defines Ethereum Node Records (ENR), an open format for p2p 4 | connectivity information. 
A node record usually contains the network endpoints of a node, 5 | i.e. the node's IP addresses and ports. It also holds information about the node's purpose 6 | on the network so others can decide whether to connect to the node. 7 | 8 | Ethereum Node Records were originally proposed in [EIP-778]. 9 | 10 | ## Record Structure 11 | 12 | The components of a node record are: 13 | 14 | - `signature`: cryptographic signature of record contents 15 | - `seq`: The sequence number, a 64-bit unsigned integer. Nodes should increase the number 16 | whenever the record changes and republish the record. 17 | - The remainder of the record consists of arbitrary key/value pairs 18 | 19 | A record's signature is made and validated according to an *identity scheme*. The identity 20 | scheme is also responsible for deriving a node's address in the DHT. 21 | 22 | The key/value pairs must be sorted by key and must be unique, i.e. any key may be present 23 | only once. The keys can technically be any byte sequence, but ASCII text is preferred. Key 24 | names in the table below have pre-defined meaning. 25 | 26 | | Key | Value | 27 | |:------------|:-------------------------------------------| 28 | | `id` | name of identity scheme, e.g. "v4" | 29 | | `secp256k1` | compressed secp256k1 public key, 33 bytes | 30 | | `ip` | IPv4 address, 4 bytes | 31 | | `tcp` | TCP port, big endian integer | 32 | | `udp` | UDP port, big endian integer | 33 | | `ip6` | IPv6 address, 16 bytes | 34 | | `tcp6` | IPv6-specific TCP port, big endian integer | 35 | | `udp6` | IPv6-specific UDP port, big endian integer | 36 | 37 | All keys except `id` are optional, including IP addresses and ports. A record without 38 | endpoint information is still valid as long as its signature is valid. If no `tcp6` / 39 | `udp6` port is provided, the `tcp` / `udp` port applies to both IP addresses. 
Declaring 40 | the same port number in both `tcp`, `tcp6` or `udp`, `udp6` should be avoided but doesn't 41 | render the record invalid. 42 | 43 | ### RLP Encoding 44 | 45 | The canonical encoding of a node record is an RLP list of `[signature, seq, k, v, ...]`. 46 | The maximum encoded size of a node record is 300 bytes. Implementations should reject 47 | records larger than this size. 48 | 49 | Records are signed and encoded as follows: 50 | 51 | content = [seq, k, v, ...] 52 | signature = sign(content) 53 | record = [signature, seq, k, v, ...] 54 | 55 | ### Text Encoding 56 | 57 | The textual form of a node record is the base64 encoding of its RLP representation, 58 | prefixed by `enr:`. Implementations should use the [URL-safe base64 alphabet] 59 | and omit padding characters. 60 | 61 | ### "v4" Identity Scheme 62 | 63 | This specification defines a single identity scheme to be used as the default until other 64 | schemes are defined by further EIPs. The "v4" scheme is backwards-compatible with the 65 | cryptosystem used by Node Discovery v4. 66 | 67 | - To sign record `content` with this scheme, apply the keccak256 hash function (as used by 68 | the EVM) to `content`, then create a signature of the hash. The resulting 64-byte 69 | signature is encoded as the concatenation of the `r` and `s` signature values (the 70 | recovery ID `v` is omitted). 71 | 72 | - To verify a record, check that the signature was made by the public key in the 73 | "secp256k1" key/value pair of the record. 74 | 75 | - To derive a node address, take the keccak256 hash of the uncompressed public key, i.e. 76 | `keccak256(x || y)`. Note that `x` and `y` must be zero-padded up to length 32. 77 | 78 | ## Rationale 79 | 80 | The format is meant to suit future needs in two ways: 81 | 82 | - Adding new key/value pairs: This is always possible and doesn't require implementation 83 | consensus. 
Existing clients will accept any key/value pairs regardless of whether they 84 | can interpret their content. 85 | - Adding identity schemes: these need implementation consensus because the network won't 86 | accept the signature otherwise. To introduce a new identity scheme, propose an EIP and 87 | get it implemented. The scheme can be used as soon as most clients accept it. 88 | 89 | The size of a record is limited because records are relayed frequently and may be included 90 | in size-constrained protocols such as DNS. A record containing a IPv4 address, when signed 91 | using the "v4" scheme occupies roughly 120 bytes, leaving plenty of room for additional 92 | metadata. 93 | 94 | You might wonder about the need for so many pre-defined keys related to IP addresses and 95 | ports. This need arises because residential and mobile network setups often put IPv4 96 | behind NAT while IPv6 traffic—if supported—is directly routed to the same host. Declaring 97 | both address types ensures a node is reachable from IPv4-only locations and those 98 | supporting both protocols. 99 | 100 | ## Test Vectors 101 | 102 | This is an example record containing the IPv4 address `127.0.0.1` and UDP port `30303`. 103 | The node ID is `a448f24c6d18e575453db13171562b71999873db5b286df957af199ec94617f7`. 
104 | 105 | enr:-IS4QHCYrYZbAKWCBRlAy5zzaDZXJBGkcnh4MHcBFZntXNFrdvJjX04jRzjzCBOonrkTfj499SZuOh8R33Ls8RRcy5wBgmlkgnY0gmlwhH8AAAGJc2VjcDI1NmsxoQPKY0yuDUmstAHYpMa2_oxVtw0RW_QAdpzBQA8yWM0xOIN1ZHCCdl8 106 | 107 | The record is signed using the "v4" identity scheme using sequence number `1` and this 108 | private key: 109 | 110 | b71c71a67e1177ad4e901695e1b4b9ee17ae16c6668d313eac2f96dbcda3f291 111 | 112 | The RLP structure of the record is: 113 | 114 | [ 115 | 7098ad865b00a582051940cb9cf36836572411a47278783077011599ed5cd16b76f2635f4e234738f30813a89eb9137e3e3df5266e3a1f11df72ecf1145ccb9c, 116 | 01, 117 | "id", 118 | "v4", 119 | "ip", 120 | 7f000001, 121 | "secp256k1", 122 | 03ca634cae0d49acb401d8a4c6b6fe8c55b70d115bf400769cc1400f3258cd3138, 123 | "udp", 124 | 765f, 125 | ] 126 | 127 | [EIP-778]: https://eips.ethereum.org/EIPS/eip-778 128 | [URL-safe base64 alphabet]: https://tools.ietf.org/html/rfc4648#section-5 129 | -------------------------------------------------------------------------------- /discv5/discv5-wire-test-vectors.md: -------------------------------------------------------------------------------- 1 | # Test Vectors 2 | 3 | This document provides a collection of test vectors for the Discovery v5 wire protocol 4 | aimed to aid new implementations conform to the specification. 5 | 6 | ## Packet Encodings 7 | 8 | This section provides test vectors for the different packet types. Your implementation 9 | should load the `node-b-key` and then be able to decrypt and authenticate these as-is. 
10 | 11 | The secp256k1 private keys used here are: 12 | 13 | node-a-key = 0xeef77acb6c6a6eebc5b363a475ac583ec7eccdb42b6481424c60f59aa326547f 14 | node-b-key = 0x66fb62bfbd66b9177a138c1e5cddbe4f7c30c343e94e68df8769459cb1cde628 15 | 16 | Ping message packet (flag 0): 17 | 18 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 19 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 20 | # nonce = 0xffffffffffffffffffffffff 21 | # read-key = 0x00000000000000000000000000000000 22 | # ping.req-id = 0x00000001 23 | # ping.enr-seq = 2 24 | 25 | 00000000000000000000000000000000088b3d4342774649325f313964a39e55 26 | ea96c005ad52be8c7560413a7008f16c9e6d2f43bbea8814a546b7409ce783d3 27 | 4c4f53245d08dab84102ed931f66d1492acb308fa1c6715b9d139b81acbdcc 28 | 29 | WHOAREYOU packet (flag 1): 30 | 31 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 32 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 33 | # whoareyou.challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 34 | # whoareyou.request-nonce = 0x0102030405060708090a0b0c 35 | # whoareyou.id-nonce = 0x0102030405060708090a0b0c0d0e0f10 36 | # whoareyou.enr-seq = 0 37 | 38 | 00000000000000000000000000000000088b3d434277464933a1ccc59f5967ad 39 | 1d6035f15e528627dde75cd68292f9e6c27d6b66c8100a873fcbaed4e16b8d 40 | 41 | Ping handshake packet (flag 2): 42 | 43 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 44 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 45 | # nonce = 0xffffffffffffffffffffffff 46 | # read-key = 0x4f9fac6de7567d1e3b1241dffe90f662 47 | # ping.req-id = 0x00000001 48 | # ping.enr-seq = 1 49 | # 50 | # handshake inputs: 51 | # 52 | # whoareyou.challenge-data = 
0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000001 53 | # whoareyou.request-nonce = 0x0102030405060708090a0b0c 54 | # whoareyou.id-nonce = 0x0102030405060708090a0b0c0d0e0f10 55 | # whoareyou.enr-seq = 1 56 | # ephemeral-key = 0x0288ef00023598499cb6c940146d050d2b1fb914198c327f76aad590bead68b6 57 | # ephemeral-pubkey = 0x039a003ba6517b473fa0cd74aefe99dadfdb34627f90fec6362df85803908f53a5 58 | 59 | 00000000000000000000000000000000088b3d4342774649305f313964a39e55 60 | ea96c005ad521d8c7560413a7008f16c9e6d2f43bbea8814a546b7409ce783d3 61 | 4c4f53245d08da4bb252012b2cba3f4f374a90a75cff91f142fa9be3e0a5f3ef 62 | 268ccb9065aeecfd67a999e7fdc137e062b2ec4a0eb92947f0d9a74bfbf44dfb 63 | a776b21301f8b65efd5796706adff216ab862a9186875f9494150c4ae06fa4d1 64 | f0396c93f215fa4ef524f1eadf5f0f4126b79336671cbcf7a885b1f8bd2a5d83 65 | 9cf8 66 | 67 | Ping handshake message packet (flag 2, with ENR): 68 | 69 | # src-node-id = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 70 | # dest-node-id = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 71 | # nonce = 0xffffffffffffffffffffffff 72 | # read-key = 0x53b1c075f41876423154e157470c2f48 73 | # ping.req-id = 0x00000001 74 | # ping.enr-seq = 1 75 | # 76 | # handshake inputs: 77 | # 78 | # whoareyou.challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 79 | # whoareyou.request-nonce = 0x0102030405060708090a0b0c 80 | # whoareyou.id-nonce = 0x0102030405060708090a0b0c0d0e0f10 81 | # whoareyou.enr-seq = 0 82 | # ephemeral-key = 0x0288ef00023598499cb6c940146d050d2b1fb914198c327f76aad590bead68b6 83 | # ephemeral-pubkey = 0x039a003ba6517b473fa0cd74aefe99dadfdb34627f90fec6362df85803908f53a5 84 | 85 | 00000000000000000000000000000000088b3d4342774649305f313964a39e55 86 | ea96c005ad539c8c7560413a7008f16c9e6d2f43bbea8814a546b7409ce783d3 87 | 
4c4f53245d08da4bb23698868350aaad22e3ab8dd034f548a1c43cd246be9856 88 | 2fafa0a1fa86d8e7a3b95ae78cc2b988ded6a5b59eb83ad58097252188b902b2 89 | 1481e30e5e285f19735796706adff216ab862a9186875f9494150c4ae06fa4d1 90 | f0396c93f215fa4ef524e0ed04c3c21e39b1868e1ca8105e585ec17315e755e6 91 | cfc4dd6cb7fd8e1a1f55e49b4b5eb024221482105346f3c82b15fdaae36a3bb1 92 | 2a494683b4a3c7f2ae41306252fed84785e2bbff3b022812d0882f06978df84a 93 | 80d443972213342d04b9048fc3b1d5fcb1df0f822152eced6da4d3f6df27e70e 94 | 4539717307a0208cd208d65093ccab5aa596a34d7511401987662d8cf62b1394 95 | 71 96 | 97 | ## Cryptographic Primitives 98 | 99 | This section provides test vectors for the currently supported "v4" identity scheme. 100 | 101 | ### ECDH 102 | 103 | The ECDH function takes the elliptic-curve scalar multiplication of a public key and a 104 | private key. The wire protocol describes this process. 105 | 106 | public-key = 0x039961e4c2356d61bedb83052c115d311acb3a96f5777296dcf297351130266231 107 | secret-key = 0xfb757dc581730490a1d7a00deea65e9b1936924caaea8f44d476014856b68736 108 | 109 | This output is the result of the ECDH function which will be used by the KDF. 110 | 111 | shared-secret = 0x033b11a2a1f214567e1537ce5e509ffd9b21373247f2a3ff6841f4976f53165e7e 112 | 113 | ### Key Derivation 114 | 115 | This test vector checks the complete key derivation as used by the handshake. 116 | 117 | ephemeral-key = 0xfb757dc581730490a1d7a00deea65e9b1936924caaea8f44d476014856b68736 118 | dest-pubkey = 0x0317931e6e0840220642f230037d285d122bc59063221ef3226b1f403ddc69ca91 119 | node-id-a = 0xaaaa8419e9f49d0083561b48287df592939a8d19947d8c0ef88f2a4856a69fbb 120 | node-id-b = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 121 | challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 122 | 123 | The expected outputs, resulting from the HKDF-EXPAND function. 
124 | 125 | initiator-key = 0xdccc82d81bd610f4f76d3ebe97a40571 126 | recipient-key = 0xac74bb8773749920b0d3a8881c173ec5 127 | 128 | ### ID Nonce Signing 129 | 130 | This test vector checks the ID signature as used by the handshake. 131 | The `static-key` is the secp256k1 private key used for signing. 132 | 133 | static-key = 0xfb757dc581730490a1d7a00deea65e9b1936924caaea8f44d476014856b68736 134 | challenge-data = 0x000000000000000000000000000000006469736376350001010102030405060708090a0b0c00180102030405060708090a0b0c0d0e0f100000000000000000 135 | ephemeral-pubkey = 0x039961e4c2356d61bedb83052c115d311acb3a96f5777296dcf297351130266231 136 | node-id-B = 0xbbbb9d047f0488c0b5a93c1c3f2d8bafc7c8ff337024a55434a0d0555de64db9 137 | 138 | The expected output is the `id-signature`. You can also apply this test vector in reverse 139 | by verifying the signature against the inputs above. 140 | 141 | id-signature = 0x94852a1e2318c4e5e9d422c98eaf19d1d90d876b29cd06ca7cb7546d0fff7b484fe86c09a064fe72bdbef73ba8e9c34df0cd2b53e9d65528c2c7f336d5dfc6e6 142 | 143 | ### Encryption/Decryption 144 | 145 | This test vector demonstrates the `AES_GCM` encryption/decryption used in the wire 146 | protocol. 147 | 148 | encryption-key: 0x9f2d77db7004bf8a1a85107ac686990b 149 | nonce: 0x27b5af763c446acd2749fe8e 150 | pt: 0x01c20101 151 | ad: 0x93a7400fa0d6a694ebc24d5cf570f65d04215b6ac00757875e3f3a5f42107903 152 | 153 | Note that the 16 byte MAC is appended to the ciphertext. 154 | 155 | message-ciphertext: 0xa5d12a2d94b8ccb3ba55558229867dc13bfa3648 156 | -------------------------------------------------------------------------------- /dnsdisc.md: -------------------------------------------------------------------------------- 1 | # DNS Node Lists 2 | 3 | Peer-to-peer node software often contains hard-coded bootstrap node lists. Updating those 4 | lists requires a software update and, effort is required from the software maintainers to 5 | ensure the list is up-to-date. 
As a result, the provided lists are usually small, giving 6 | the software little choice of initial entry point into the network. 7 | 8 | This specification describes a scheme for authenticated, updateable node lists retrievable 9 | via DNS. In order to use such a list, the client only requires information about the DNS 10 | name and the public key that signs the list. 11 | 12 | DNS-based discovery was initially proposed in [EIP-1459]. 13 | 14 | ## Node Lists 15 | 16 | A 'node list' is a list of ['node records' (ENRs)](./enr.md) of arbitrary length. Lists 17 | may refer to other lists using links. The entire list is signed using a secp256k1 private 18 | key. The corresponding public key must be known to the client in order to verify the list. 19 | 20 | ## URL Scheme 21 | 22 | To refer to a DNS node list, clients use a URL with 'enrtree' scheme. The URL contains the 23 | DNS name on which the list can be found as well as the public key that signed the list. 24 | The public key is contained in the username part of the URL and is the base32 encoding of 25 | the compressed 32-byte binary public key. 26 | 27 | Example: 28 | 29 | enrtree://AM5FCQLWIZX2QFPNJAP7VUERCCRNGRHWZG3YYHIUV7BVDQ5FDPRT2@nodes.example.org 30 | 31 | This URL refers to a node list at the DNS name 'nodes.example.org' and is signed by the 32 | public key 33 | 34 | 0x049f88229042fef9200246f49f94d9b77c4e954721442714e85850cb6d9e5daf2d880ea0e53cb3ac1a75f9923c2726a4f941f7d326781baa6380754a360de5c2b6 35 | 36 | ## DNS Record Structure 37 | 38 | The nodes in a list are encoded as a merkle tree for distribution via the DNS protocol. 39 | Entries of the merkle tree are contained in DNS TXT records. The root of the tree is a TXT 40 | record with the following content: 41 | 42 | enrtree-root:v1 e= l= seq= sig= 43 | 44 | where 45 | 46 | - `enr-root` and `link-root` refer to the root hashes of subtrees containing nodes and 47 | links subtrees. 
48 | - `sequence-number` is the tree's update sequence number, a decimal integer. 49 | - `signature` is a 65-byte secp256k1 EC signature over the keccak256 hash of the record 50 | content, excluding the `sig=` part, encoded as URL-safe base64. 51 | 52 | Further TXT records on subdomains map hashes to one of three entry types. The subdomain 53 | name of any entry is the base32 encoding of the (abbreviated) keccak256 hash of its text 54 | content. 55 | 56 | - `enrtree-branch:,,...,` is an intermediate tree entry containing hashes of 57 | subtree entries. 58 | - `enrtree://@` is a leaf pointing to a different list located at another fully 59 | qualified domain name. Note that this format matches the URL encoding. This type of 60 | entry may only appear in the subtree pointed to by `link-root`. 61 | - `enr:` is a leaf containing a node record. The node record is encoded as a 62 | URL-safe base64 string. Note that this type of entry matches the canonical ENR text 63 | encoding. It may only appear in the `enr-root` subtree. 64 | 65 | No particular ordering or structure is defined for the tree. Whenever the tree is updated, 66 | its sequence number should increase. The content of any TXT record should be small enough 67 | to fit into the 512 byte limit imposed on UDP DNS packets. This limits the number of 68 | hashes that can be placed into an `enrtree-branch` entry. 
69 | 70 | Example in zone file format: 71 | 72 | ; name ttl class type content 73 | @ 60 IN TXT enrtree-root:v1 e=JWXYDBPXYWG6FX3GMDIBFA6CJ4 l=C7HRFPF3BLGF3YR4DY5KX3SMBE seq=1 sig=o908WmNp7LibOfPsr4btQwatZJ5URBr2ZAuxvK4UWHlsB9sUOTJQaGAlLPVAhM__XJesCHxLISo94z5Z2a463gA 74 | C7HRFPF3BLGF3YR4DY5KX3SMBE 86900 IN TXT enrtree://AM5FCQLWIZX2QFPNJAP7VUERCCRNGRHWZG3YYHIUV7BVDQ5FDPRT2@morenodes.example.org 75 | JWXYDBPXYWG6FX3GMDIBFA6CJ4 86900 IN TXT enrtree-branch:2XS2367YHAXJFGLZHVAWLQD4ZY,H4FHT4B454P6UXFD7JCYQ5PWDY,MHTDO6TMUBRIA2XWG5LUDACK24 76 | 2XS2367YHAXJFGLZHVAWLQD4ZY 86900 IN TXT enr:-HW4QOFzoVLaFJnNhbgMoDXPnOvcdVuj7pDpqRvh6BRDO68aVi5ZcjB3vzQRZH2IcLBGHzo8uUN3snqmgTiE56CH3AMBgmlkgnY0iXNlY3AyNTZrMaECC2_24YYkYHEgdzxlSNKQEnHhuNAbNlMlWJxrJxbAFvA 77 | H4FHT4B454P6UXFD7JCYQ5PWDY 86900 IN TXT enr:-HW4QAggRauloj2SDLtIHN1XBkvhFZ1vtf1raYQp9TBW2RD5EEawDzbtSmlXUfnaHcvwOizhVYLtr7e6vw7NAf6mTuoCgmlkgnY0iXNlY3AyNTZrMaECjrXI8TLNXU0f8cthpAMxEshUyQlK-AM0PW2wfrnacNI 78 | MHTDO6TMUBRIA2XWG5LUDACK24 86900 IN TXT enr:-HW4QLAYqmrwllBEnzWWs7I5Ev2IAs7x_dZlbYdRdMUx5EyKHDXp7AV5CkuPGUPdvbv1_Ms1CPfhcGCvSElSosZmyoqAgmlkgnY0iXNlY3AyNTZrMaECriawHKWdDRk2xeZkrOXBQ0dfMFLHY4eENZwdufn1S1o 79 | 80 | ## Client Protocol 81 | 82 | To find nodes at a given DNS name, say "mynodes.org": 83 | 84 | 1. Resolve the TXT record of the name and check whether it contains a valid 85 | "enrtree-root=v1" entry. Let's say the `enr-root` hash contained in the entry is 86 | "CFZUWDU7JNQR4VTCZVOJZ5ROV4". 87 | 2. Verify the signature on the root against the known public key and check whether the 88 | sequence number is larger than or equal to any previous number seen for that name. 89 | 3. Resolve the TXT record of the hash subdomain, e.g. 90 | "CFZUWDU7JNQR4VTCZVOJZ5ROV4.mynodes.org" and verify whether the content matches the 91 | hash. 92 | 4. The next step depends on the entry type found: 93 | - for `enrtree-branch`: parse the list of hashes and continue resolving them (step 3). 
94 | - for `enr`: decode, verify the node record and import it to local node storage. 95 | 96 | During traversal, the client must track hashes and domains which are already resolved to 97 | avoid going into an infinite loop. It's in the client's best interest to traverse the tree 98 | in random order. 99 | 100 | Client implementations should avoid downloading the entire tree at once during normal 101 | operation. It's much better to request entries via DNS when-needed, i.e. at the time when 102 | the client is looking for peers. 103 | 104 | ## Rationale 105 | 106 | DNS is used because it is a low-latency protocol that is pretty much guaranteed to be 107 | available. 108 | 109 | Being a merkle tree, any node list can be authenticated by a single signature on the root. 110 | Hash subdomains protect the integrity of the list. At worst intermediate resolvers can 111 | block access to the list or disallow updates to it, but cannot corrupt its content. The 112 | sequence number prevents replacing the root with an older version. 113 | 114 | Synchronizing updates on the client side can be done incrementally, which matters for 115 | large lists. Individual entries of the tree are small enough to fit into a single UDP 116 | packet, ensuring compatibility with environments where only basic UDP DNS can be used. The 117 | tree format also works well with caching resolvers: only the root of the tree needs a 118 | short TTL. Intermediate entries and leaves can be cached for days. 119 | 120 | ### Why does the link subtree exist? 121 | 122 | Links between lists enable federation and web-of-trust functionality. The operator of a 123 | large list can delegate maintenance to other list providers. If two node lists link to 124 | each other, users can use either list and get nodes from both. 125 | 126 | The link subtree is separate from the tree containing ENRs. This is done to enable client 127 | implementations to sync these trees independently. 
A client wanting to get as many nodes 128 | as possible will sync the link tree first and add all linked names to the sync horizon. 129 | 130 | ## References 131 | 132 | 1. The base64 and base32 encodings used to represent binary data are defined in [RFC 133 | 4648]. No padding is used for base64 and base32 data. 134 | 135 | [EIP-1459]: https://eips.ethereum.org/EIPS/eip-1459 136 | [RFC 4648]: https://tools.ietf.org/html/rfc4648 137 | -------------------------------------------------------------------------------- /discv4.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol 2 | 3 | This specification defines the Node Discovery protocol version 4, a Kademlia-like DHT that 4 | stores information about Ethereum nodes. The Kademlia structure was chosen because it is 5 | an efficient way to organize a distributed index of nodes and yields a topology of low 6 | diameter. 7 | 8 | The current protocol version is **4**. You can find a list of changes in past protocol 9 | versions at the end of this document. 10 | 11 | ## Node Identities 12 | 13 | Every node has a cryptographic identity, a key on the secp256k1 elliptic curve. The public 14 | key of the node serves as its identifier or 'node ID'. 15 | 16 | The 'distance' between two node keys is the bitwise exclusive or on the hashes of the 17 | public keys, taken as the number. 18 | 19 | distance(n₁, n₂) = keccak256(n₁) XOR keccak256(n₂) 20 | 21 | ## Node Records 22 | 23 | Participants in the Discovery Protocol are expected to maintain a [node record] \(ENR\) 24 | containing up-to-date information. All records must use the "v4" identity scheme. Other 25 | nodes may request the local record at any time by sending an [ENRRequest] packet. 26 | 27 | To resolve the current record of any node public key, perform a Kademlia lookup using 28 | [FindNode] packets. When the node is found, send ENRRequest to it and return the record 29 | from the response. 
30 | 31 | ## Kademlia Table 32 | 33 | Nodes in the Discovery Protocol keep information about other nodes in their neighborhood. 34 | Neighbor nodes are stored in a routing table consisting of 'k-buckets'. For each `i` in 35 | `0 ≤ i < 256`, every node keeps a k-bucket of neighbors with distance 36 | `2^i ≤ distance < 2^(i+1)` from itself. 37 | 38 | The Node Discovery Protocol uses `k = 16`, i.e. every k-bucket contains up to 16 node 39 | entries. The entries are sorted by time last seen — least-recently seen node at the head, 40 | most-recently seen at the tail. 41 | 42 | Whenever a new node N₁ is encountered, it can be inserted into the corresponding bucket. 43 | If the bucket contains less than `k` entries N₁ can simply be added as the last entry. If 44 | the bucket already contains `k` entries, the least recently seen node in the bucket, N₂, 45 | needs to be revalidated by sending a [Ping] packet. If no reply is received from N₂ it is 46 | considered dead, removed and N₁ added to the tail of the bucket. 47 | 48 | ## Endpoint Proof 49 | 50 | To prevent traffic amplification attacks, implementations must verify that the sender of a 51 | query participates in the discovery protocol. The sender of a packet is considered 52 | verified if it has sent a valid [Pong] response with matching ping hash within the last 12 53 | hours. 54 | 55 | ## Recursive Lookup 56 | 57 | A 'lookup' locates the `k` closest nodes to a node ID. 58 | 59 | The lookup initiator starts by picking `α` closest nodes to the target it knows of. The 60 | initiator then sends concurrent [FindNode] packets to those nodes. `α` is a system-wide 61 | concurrency parameter, such as 3. In the recursive step, the initiator resends FindNode to 62 | nodes it has learned about from previous queries. Of the `k` nodes the initiator has heard 63 | of closest to the target, it picks `α` that it has not yet queried and resends [FindNode] 64 | to them. 
Nodes that fail to respond quickly are removed from consideration until and 65 | unless they do respond. 66 | 67 | If a round of FindNode queries fails to return a node any closer than the closest already 68 | seen, the initiator resends the find node to all of the `k` closest nodes it has not 69 | already queried. The lookup terminates when the initiator has queried and gotten responses 70 | from the `k` closest nodes it has seen. 71 | 72 | ## Wire Protocol 73 | 74 | Node discovery messages are sent as UDP datagrams. The maximum size of any packet is 1280 75 | bytes. 76 | 77 | packet = packet-header || packet-data 78 | 79 | Every packet starts with a header: 80 | 81 | packet-header = hash || signature || packet-type 82 | hash = keccak256(signature || packet-type || packet-data) 83 | signature = sign(packet-type || packet-data) 84 | 85 | The `hash` exists to make the packet format recognizable when running multiple protocols 86 | on the same UDP port. It serves no other purpose. 87 | 88 | Every packet is signed by the node's identity key. The `signature` is encoded as a byte 89 | array of length 65 as the concatenation of the signature values `r`, `s` and the 'recovery 90 | id' `v`. 91 | 92 | The `packet-type` is a single byte defining the type of message. Valid packet types are 93 | listed below. Data after the header is specific to the packet type and is encoded as an 94 | RLP list. Implementations should ignore any additional elements in the `packet-data` list 95 | as well as any extra data after the list. 96 | 97 | ### Ping Packet (0x01) 98 | 99 | packet-data = [version, from, to, expiration, enr-seq, ...] 100 | version = 4 101 | from = [sender-ip, sender-udp-port, sender-tcp-port] 102 | to = [recipient-ip, recipient-udp-port, 0] 103 | 104 | The `expiration` field is an absolute UNIX time stamp. Packets containing a time stamp 105 | that lies in the past are expired and may not be processed.
106 | 107 | The `enr-seq` field is the current ENR sequence number of the sender. This field is 108 | optional. 109 | 110 | When a ping packet is received, the recipient should reply with a [Pong] packet. It may 111 | also consider the sender for addition into the local table. Implementations should ignore 112 | any mismatches in version. 113 | 114 | If no communication with the sender has occurred within the last 12h, a ping should be 115 | sent in addition to pong in order to receive an endpoint proof. 116 | 117 | ### Pong Packet (0x02) 118 | 119 | packet-data = [to, ping-hash, expiration, enr-seq, ...] 120 | 121 | Pong is the reply to ping. 122 | 123 | `ping-hash` should be equal to `hash` of the corresponding ping packet. Implementations 124 | should ignore unsolicited pong packets that do not contain the hash of the most recent 125 | ping packet. 126 | 127 | The `enr-seq` field is the current ENR sequence number of the sender. This field is 128 | optional. 129 | 130 | ### FindNode Packet (0x03) 131 | 132 | packet-data = [target, expiration, ...] 133 | 134 | A FindNode packet requests information about nodes close to `target`. The `target` is a 135 | 64-byte secp256k1 public key. When FindNode is received, the recipient should reply with 136 | [Neighbors] packets containing the closest 16 nodes to target found in its local table. 137 | 138 | To guard against traffic amplification attacks, Neighbors replies should only be sent if 139 | the sender of FindNode has been verified by the endpoint proof procedure. 140 | 141 | ### Neighbors Packet (0x04) 142 | 143 | packet-data = [nodes, expiration, ...] 144 | nodes = [[ip, udp-port, tcp-port, node-id], ...] 145 | 146 | Neighbors is the reply to [FindNode]. 147 | 148 | ### ENRRequest Packet (0x05) 149 | 150 | packet-data = [expiration] 151 | 152 | When a packet of this type is received, the node should reply with an ENRResponse packet 153 | containing the current version of its [node record]. 
154 | 155 | To guard against amplification attacks, the sender of ENRRequest should have replied to a 156 | ping packet recently (just like for FindNode). The `expiration` field, a UNIX timestamp, 157 | should be handled as for all other existing packets i.e. no reply should be sent if it 158 | refers to a time in the past. 159 | 160 | ### ENRResponse Packet (0x06) 161 | 162 | packet-data = [request-hash, ENR] 163 | 164 | This packet is the response to ENRRequest. 165 | 166 | - `request-hash` is the hash of the entire ENRRequest packet being replied to. 167 | - `ENR` is the node record. 168 | 169 | The recipient of the packet should verify that the node record is signed by the public key 170 | which signed the response packet. 171 | 172 | # Change Log 173 | 174 | ## Known Issues in the Current Version 175 | 176 | The `expiration` field present in all packets is supposed to prevent packet replay. Since 177 | it is an absolute time stamp, the node's clock must be accurate to verify it correctly. 178 | Since the protocol's launch in 2016 we have received countless reports about connectivity 179 | issues related to the user's clock being wrong. 180 | 181 | The endpoint proof is imprecise because the sender of FindNode can never be sure whether 182 | the recipient has seen a recent enough pong. Geth handles it as follows: If no 183 | communication with the recipient has occurred within the last 12h, initiate the procedure 184 | by sending a ping. Wait for a ping from the other side, reply to it and then send 185 | FindNode. 186 | 187 | ## EIP-868 (October 2019) 188 | 189 | [EIP-868] adds the [ENRRequest] and [ENRResponse] packets. It also modifies [Ping] and 190 | [Pong] to include the local ENR sequence number. 191 | 192 | ## EIP-8 (December 2017) 193 | 194 | [EIP-8] mandated that implementations ignore mismatches in Ping version and any additional 195 | list elements in `packet-data`. 
196 | 197 | [Ping]: #ping-packet-0x01 198 | [Pong]: #pong-packet-0x02 199 | [FindNode]: #findnode-packet-0x03 200 | [Neighbors]: #neighbors-packet-0x04 201 | [ENRRequest]: #enrrequest-packet-0x05 202 | [ENRResponse]: #enrresponse-packet-0x06 203 | [EIP-8]: https://eips.ethereum.org/EIPS/eip-8 204 | [EIP-868]: https://eips.ethereum.org/EIPS/eip-868 205 | [node record]: ./enr.md 206 | -------------------------------------------------------------------------------- /caps/pip.md: -------------------------------------------------------------------------------- 1 | # Parity Light Protocol (PIP) 2 | 3 | The Parity Light Protocol is a variation of LES designed and implemented by Parity Tech 4 | for the Parity Ethereum client. Please refer to the [LES specification] for information on 5 | the purpose of the light client protocol. 6 | 7 | Like LES, PIP adopts a flow-control mechanism closely analogous to a [token-bucket rate 8 | limiter] where the client is expected to mirror the server token-bucket state (as 9 | exceeding the 'burstiness' depth is a violation that results in disconnection). PIP 10 | utilises [Canonical Hash Tries] \(CHTs), which are also described in the LES documentation. 11 | Unlike LES, a PIP CHT is generated once every 2048 blocks. One 32-byte trie root is stored 12 | for every range of 2048 blocks. 13 | 14 | The current version is **pip/1**. This specification was derived from the official 15 | specification at `https://wiki.parity.io`. However, the official specification has since 16 | been deleted. 17 | 18 | ## Notation 19 | 20 | Throughout this document, and in accordance with other devp2p documents, when referring to 21 | wire message formats the following symbols apply: 22 | 23 | `[ .. , .. , .. 
]` means an RLP list 24 | 25 | `a || b` means concatenation of `a` and `b` 26 | 27 | `...` means additional list elements 28 | 29 | ## Handshake 30 | 31 | After the initial RLPx handshake, the first message that must be communicated is from the 32 | server to the light peer and is a status message. Updates to information in the status 33 | message are supplied with announcements. 34 | 35 | ### Status (0x00) 36 | 37 | `[[key0, value0], [key1, value1], ...]` 38 | 39 | Keys are strings. Mandatory keys and values are as follows: 40 | 41 | - `"protocol_version"` 1 for this PIP/1 protocol version. 42 | - `"network_id"` 0 for testnet, 1 for mainnet 43 | - `"total_difficulty"` integer total difficulty of the best chain as found in the block header. 44 | - `"head_blockhash"` the hash of the best (i.e. highest total difficulty) known block. 45 | - `"head_blocknum"` the number of the best (i.e. highest total difficulty) known block. 46 | - `"genesisHash"` the hash of the genesis block. 47 | 48 | Optional keys and values are as follows: 49 | 50 | - `"serve_headers"` any value and key-pair present if the peer can serve header chain 51 | downloads. 52 | - `"serve_chain_since"` present if the peer can serve Body/Receipts ODR requests starting 53 | from the given block number. 54 | - `"serve_state_since"` present if the peer can serve Proof/Code ODR requests starting 55 | from the given block number. 56 | - `"tx_relay"` present if the peer can relay transactions to the network. 57 | - `"flow_control_bl"` max credits (positive integer describing the burst-depth of the 58 | token bucket), 59 | - `"flow_control_mrc"` the initial cost table (see below) 60 | - `"flow_control_mrr"` rate of recharge (positive integer of credits recharged per second) 61 | 62 | #### Cost Table 63 | 64 | The cost table includes a mapping of individual [PIP Request/Response Messages] to costs, 65 | which are applied in the token-bucket rate limiter.
The [Headers] and [Execution] request 66 | messages are special cases where the cost is multiplied by the maximum number of requested 67 | header or gas requested, respectively. The table also includes a base cost, which is 68 | applied for every [Request Batch]. 69 | 70 | cost_table = [base_cost, [id,cost],...] 71 | base_cost = positive integer cost applied to a request batch. 72 | id = identifier of an individual PIP message type 73 | cost = positive integer to apply to cost calculations for this message type 74 | 75 | ### Announcement (0x01) 76 | 77 | `[head_blockhash, head_blocknum, total_difficulty, reorg_depth, [key0, value0], [key1, value1], ...]` 78 | 79 | - `reorg_depth` is positive integer containing the reorganization depth to the common 80 | ancestor of the new head and the last announced head. 81 | - Other elements have the same meaning as in the [Status] message with the exception of 82 | `reorg_depth`. 83 | 84 | ### Request Batch (0x02) 85 | 86 | `[request-id, [req1, ...]]` 87 | 88 | where 89 | 90 | - `request-id` is a unique scalar request identifier for request-reply correlation. 91 | - `[req1, ...]` is the list of request messages, as described in the [PIP Request/Response Messages] 92 | section. 93 | 94 | This message, sent from client to server, requests that the given request messages should 95 | be executed. The server responds with a Response Batch. 96 | 97 | ### Response Batch (0x03) 98 | 99 | `[request-id, cr, [resp1, ...]]` 100 | 101 | where 102 | 103 | - `request-id` is the unique scalar correlating with a previously received request message. 104 | - `cr` is an updated amount of request credits prior to recharge events at the time of 105 | processing on the server (please see throttling below). 106 | - `[resp1, ...]` is the list of response messages. 107 | 108 | There must be a response message for each request contained in the corresponding request batch.
109 | The individual responses must supply all elements of the response message specifications. 110 | The PIP protocol considers messages missing any of these elements *incomplete*. 111 | 112 | ### UpdateCreditParameters (0x04) 113 | 114 | `[max, recharge, cost_table]` 115 | 116 | where 117 | 118 | - `max` is a positive integer, the new maximum credit depth for the token bucket. 119 | - `recharge` a positive integer, the new recharge rate in credits per second. 120 | - `cost_table` is the updated [Cost Table]. 121 | 122 | The server may periodically update the token-bucket parameters, such as depth, message 123 | cost and recharge rate, for the particular client. Received updates must be acknowledged 124 | with an AcknowledgeUpdate message. 125 | 126 | ### AcknowledgeUpdate (0x05) 127 | 128 | This message acknowledges receipt of updated credit parameters and has no payload. 129 | 130 | ### RelayTransactions (0x06) 131 | 132 | `[tx1, tx2, ...]` 133 | 134 | where 135 | 136 | `tx1`, `tx2` are RLP encoded transactions as per [ETH] documentation. 137 | 138 | This message requests that the given transactions should be relayed 139 | to the eth network. 140 | 141 | ## PIP Request/Response Messages 142 | 143 | PIP request and response messages are batched and cannot be sent individually. Unlike LES, 144 | PIP batches may contain multiple messages of different types. The [Request Batch] is used 145 | to send messages of the types described below to the server. 146 | 147 | Each message type also specifies its corresponding response message (referred to as 148 | *outputs*). Response messages are sent as a [Response Batch] by the server when requests 149 | have executed. 150 | 151 | PIP tries to further optimise client-server round trips by allowing the individual 152 | requests in the batch to include references to what their responses would contain if 153 | processed sequentially.
For clarification, an example PIP batch request could contain two 154 | request messages in order, where the second message specifies that an input is a specific 155 | 'output' of the first message, where 'output' means the server response to that request. 156 | 157 | Referencing a field in a response to a batched request is achieved with *loose inputs* and 158 | *reusable outputs*. Response message fields are documented as being **reusable as `n`** 159 | where `n` is an identifier labelling the field in the response message body. 160 | 161 | *Loose inputs* may be a back-reference to a *reusable output* or may be hard data. 162 | 163 | loose_input = [raw_flag, input] 164 | raw_flag = is 0 or 1 (a.k.a. 'discriminant') 165 | input = if raw_flag is 0, this is the RLP encoded value 166 | if raw_flag is 1, this is back_reference 167 | back_reference = [request_message_index, reusable_output] 168 | request_message_index = the 0-based position of a prior message in the request batch 169 | reusable_output = the unsigned integer identifying the corresponding response message field 170 | 171 | The following are the individual messages, paired as requests and their responses. 172 | 173 | ### Headers (0x00) 174 | 175 | Request and retrieve block headers from the server. 176 | 177 | #### Request 178 | 179 | `[message-id, [start, skip, max, reverse]]` 180 | 181 | - `start` Loose, of type either 32 byte hash (block hash), or unsigned integer block number 182 | - `skip` unsigned integer N, specifying the server should return every Nth block 183 | - `max` unsigned integer, the maximum number of blocks to return 184 | - `reverse` 0 if the block numbers should be increasing, 1 to return in reverse order 185 | 186 | #### Response 187 | 188 | `[message-id, [header1, header2, ...]]` 189 | 190 | - `header1, header2, ...` the requested block headers 191 | 192 | ### HeaderProof (0x01) 193 | 194 | Request for a header proof.
195 | 196 | #### Request 197 | 198 | `[message-id, [block]]` 199 | 200 | - `block` Loose, of type unsigned integer, referring to the block number 201 | 202 | #### Response 203 | 204 | `[message-id, [cht_inclusion_proof, block_hash, total_difficulty]]` 205 | 206 | - `cht_inclusion_proof` is `[[node1, node2, ...], ...]` 207 | - `node1` merkle tree node as byte array 208 | - `block_hash` hash of the requested block **reusable as 0** 209 | - `total_difficulty` unsigned integer, the requested block total difficulty 210 | 211 | ### TransactionIndex (0x02) 212 | 213 | Request for transaction inclusion information by transaction hash. 214 | 215 | #### Request 216 | 217 | `[message-id, [hash]]` 218 | 219 | - `hash` Loose, of type 32 byte hash, referring to the transaction hash. 220 | 221 | #### Response 222 | 223 | `[message-id, [block_number, block_hash, index]]` 224 | 225 | - `block_number` the block number of the block containing the transaction **reusable as 0** 226 | - `block_hash` hash of the requested block **reusable as 1** 227 | - `index` index in the block 228 | 229 | ### BlockReceipts (0x03) 230 | 231 | Request for a block's receipts. 232 | 233 | #### Request 234 | 235 | `[message-id, [hash]]` 236 | 237 | - `hash` Loose, of type 32 byte hash, referring to the block hash. 238 | 239 | #### Response 240 | 241 | `[message-id, [receipts]]` 242 | 243 | - `receipts` is `[receipt1, receipt2, ...]` 244 | - `receipt1` a receipt, as per ETH spec. 245 | 246 | ### BlockBody (0x04) 247 | 248 | Request for a block's transactions. 
249 | 250 | #### Request 251 | 252 | `[message-id, [hash]]` 253 | 254 | - `hash` Loose, of type 32 byte hash, referring to the block hash 255 | 256 | #### Response 257 | 258 | `[message-id, [transactions, uncles]]` 259 | 260 | - `transactions` is `[tx1, tx2, ...]` 261 | - `tx1` a transaction, as per ETH spec 262 | - `uncles` is `[header1, header2,...]` 263 | - `header1` an uncle block header as per ETH spec 264 | 265 | ### Account (0x05) 266 | 267 | Request for proof of specific account in the state. 268 | 269 | #### Request 270 | 271 | `[message-id, [block_hash, address_hash]]` 272 | 273 | - `block_hash` Loose, of type 32 byte hash, referring to the block hash 274 | - `address_hash` Loose, of type 32 byte hash, referring to the account address hash 275 | 276 | #### Response 277 | 278 | `[message-id, [cht_inclusion_proof, nonce, balance, code_hash, storage_root]]` 279 | 280 | - `cht_inclusion_proof` is `[[node1, node2, ...], ...]` 281 | - `node1` merkle tree node as byte array 282 | - `nonce` the account nonce (unsigned integer) 283 | - `balance` the account balance (unsigned integer) 284 | - `code_hash` 32 byte hash **reusable as 0** 285 | - `storage_root` 32 byte storage root hash **reusable as 1** 286 | 287 | ### Storage (0x06) 288 | 289 | Request for a proof of contract storage. 290 | 291 | #### Request 292 | 293 | `[message-id, [block_hash, address_hash, storage_key_hash]]` 294 | 295 | - `block_hash` Loose, of type 32 byte hash, referring to the block hash 296 | - `address_hash` Loose, of type 32 byte hash, referring to the account address hash 297 | - `storage_key_hash` Loose, of type 32 byte hash, referring to the storage key 298 | 299 | #### Response 300 | 301 | `[message-id, [cht_inclusion_proof, storage_value]]` 302 | 303 | - `cht_inclusion_proof` is `[[node1, node2, ...], ...]` 304 | - `node1` merkle tree node as byte array 305 | - `storage_value` 32 byte hash **reusable as 0** 306 | 307 | ### Code (0x07) 308 | 309 | Request for contract code.
310 | 311 | #### Request 312 | 313 | `[message-id, [block_hash, code_hash]]` 314 | 315 | - `block_hash` Loose, of type 32 byte hash, identifying the block. 316 | - `code_hash` Loose, of type 32 byte hash, identifying the code. 317 | 318 | #### Response 319 | 320 | `[message-id, [bytecode]]` 321 | 322 | - `bytecode` byte array of the contract code 323 | 324 | ### Execution (0x08) 325 | 326 | Request for Merkle proofs of a contract execution. 327 | 328 | #### Request 329 | 330 | `[message-id, [block_hash, from_address, call_or_create_address, gas_to_prove, gas_price, value, data]]` 331 | 332 | - `block_hash` Loose, of type 32 byte hash, identifying the block 333 | - `from_address` Type 32 byte hash, referring to the caller account address hash 334 | - `call_or_create_address` 32 byte hash, call contract if address, otherwise create contract if empty 335 | - `gas_to_prove` 32 byte unsigned integer of gas to prove 336 | - `gas_price` 32 byte unsigned integer of gas price 337 | - `value` 32 byte unsigned integer of value to transfer 338 | - `data` byte array of relevant data 339 | 340 | #### Response 341 | 342 | `[message-id, [proof]]` 343 | 344 | - `proof` is `[[node1, node2, ...], ...]`, the necessary execution proof 345 | - `node1` merkle tree node as byte array 346 | 347 | [LES specification]: ./les.md 348 | [ETH]: ./eth.md 349 | [Cost Table]: #cost-table 350 | [Canonical Hash Tries]: ./les.md#canonical-hash-trie 351 | [token-bucket rate limiter]: https://en.wikipedia.org/wiki/Token_bucket 352 | [Status]: #status-0x00 353 | [Request Batch]: #request-batch-0x02 354 | [Response Batch]: #response-batch-0x03 355 | [PIP Request/Response Messages]: #pip-requestresponse-messages 356 | [Headers]: #headers-0x00 357 | [Execution]: #execution-0x08 358 | -------------------------------------------------------------------------------- /discv5/discv5-wire.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 - Wire 
Protocol 2 | 3 | **Protocol version v5.1** 4 | 5 | This document specifies the wire protocol of Node Discovery v5. 6 | 7 | ## Notation 8 | 9 | Here we present the notation that is used throughout this document. 10 | 11 | `[ .. , .. , .. ]`\ 12 |     is recursive encoding as an RLP list\ 13 | `a || b`\ 14 |     means binary concatenation of `a` and `b`\ 15 | `xor(a, b)`\ 16 |     means binary XOR of `a` and `b`\ 17 | `sha256(x)`\ 18 |     is the SHA256 digest of `x`\ 19 | `aesctr_encrypt(key, iv, pt)`\ 20 |     is unauthenticated AES/CTR symmetric encryption with the given `key` and `iv`.\ 21 |     Size of `key` and `iv` is 16 bytes (AES-128).\ 22 | `aesgcm_encrypt(key, nonce, pt, ad)`\ 23 |     is AES-GCM encryption/authentication with the given `key`, `nonce` and additional\ 24 |     authenticated data `ad`. Size of `key` is 16 bytes (AES-128), size of `nonce` 12 bytes. 25 | 26 | ## UDP Communication 27 | 28 | Node discovery messages are sent as UDP datagrams. Since UDP is a lossy transport, packets 29 | may be received in any order or not at all. Implementations should not re-send packets if 30 | the recipient doesn't respond. 31 | 32 | The maximum size of any packet is 1280 bytes. Implementations should not generate or 33 | process packets larger than this size. Most messages are smaller than this limit by 34 | definition, the exception being the NODES message. FINDNODE returns up to 16 records, plus 35 | other data, and TOPICQUERY may also distribute a significantly long list of ENRs. As per 36 | specification the maximum size of an ENR is 300 bytes. A NODES message containing all 37 | FINDNODE response records would be at least 4800 bytes, not including additional data such 38 | as the header. To stay below the size limit, NODES responses are sent as multiple messages 39 | and specify the total number of responses in the message. 40 | 41 | The minimum size of any Discovery v5 packet is 63 bytes. 
Implementations should reject 42 | packets smaller than this size. 43 | 44 | Since low-latency communication is expected, implementations should place short timeouts 45 | on request/response interactions. Good timeout values are 500ms for a single 46 | request/response and 1s for the handshake. 47 | 48 | When responding to a request, the response should be sent to the UDP envelope address of 49 | the request. 50 | 51 | ## Packet Encoding 52 | 53 | The protocol deals with three distinct kinds of packets: 54 | 55 | - Ordinary message packets, which carry an encrypted/authenticated message. 56 | - WHOAREYOU packets, which are sent when the recipient of an ordinary message packet 57 | cannot decrypt/authenticate the packet's message. 58 | - Handshake message packets, which are sent following WHOAREYOU. These packets establish a 59 | new session and carry handshake-related data in addition to the encrypted/authenticated 60 | message. 61 | 62 | In the following definitions, we assume that the sender of a packet has knowledge of its 63 | own 256-bit node ID (`src-id`) and the node ID of the packet destination (`dest-id`). When 64 | sending any packet except WHOAREYOU, the sender also generates a unique 96-bit `nonce` 65 | value. 66 | 67 | ### Protocol Header 68 | 69 | All discovery packets contain a header followed by an optional encrypted and authenticated 70 | message. 71 | 72 | Header information is 'masked' using symmetric encryption in order to avoid static 73 | identification of the protocol by firewalls. 74 | 75 | packet = masking-iv || masked-header || message 76 | masked-header = aesctr_encrypt(masking-key, masking-iv, header) 77 | masking-key = dest-id[:16] 78 | masking-iv = uint128 -- random data unique to packet 79 | 80 | The `masked-header` contains the actual packet header, which starts with a fixed-size 81 | `static-header`, followed by a variable-length `authdata` section (of size `authdata-size`). 
82 | 83 | header = static-header || authdata 84 | static-header = protocol-id || version || flag || nonce || authdata-size 85 | protocol-id = "discv5" 86 | version = 0x0001 87 | authdata-size = uint16 -- byte length of authdata 88 | flag = uint8 -- packet type identifier 89 | nonce = uint96 -- nonce of message 90 | 91 | Decrypting the masked header data works as follows: The recipient constructs an AES/CTR 92 | stream cipher using its own node ID (`dest-id`) as the key and taking the IV from the 93 | packet. It can then decrypt the `static-header` and verify that `protocol-id` matches the 94 | expected string. If it does, the recipient can read `authdata-size` and unmask the 95 | remaining `authdata`. 96 | 97 | Implementations should not respond to packets with mismatching `protocol-id`. 98 | 99 | In ordinary message packets and handshake message packets, the packet contains an 100 | authenticated message after the `authdata` section. For WHOAREYOU packets, the `message` 101 | is empty. Implementations must generate a unique `nonce` value for every message packet. 102 | 103 | message = aesgcm_encrypt(initiator-key, nonce, message-pt, message-ad) 104 | message-pt = message-type || message-data 105 | message-ad = masking-iv || header 106 | 107 | The `flag` field of the header identifies the kind of packet and determines the encoding 108 | of `authdata`, which differs depending on the packet type. 109 | 110 | ### Ordinary Message Packet (`flag = 0`) 111 | 112 | For message packets, the `authdata` section is just the source node ID. 113 | 114 | authdata = src-id 115 | authdata-size = 32 116 | 117 | ![message packet layout](./img/message-packet-layout.png) 118 | 119 | ### WHOAREYOU Packet (`flag = 1`) 120 | 121 | In WHOAREYOU packets, the `authdata` section contains information for the identity 122 | verification procedure. The `message` part of WHOAREYOU packets is always empty. 
The 123 | `nonce` part of the packet must be set to the `nonce` of the message packet that caused 124 | the WHOAREYOU response. 125 | 126 | authdata = id-nonce || enr-seq 127 | authdata-size = 24 128 | id-nonce = uint128 -- random bytes 129 | enr-seq = uint64 -- ENR sequence number of the requesting node 130 | 131 | ![whoareyou packet layout](./img/whoareyou-packet-layout.png) 132 | 133 | ### Handshake Message Packet (`flag = 2`) 134 | 135 | For handshake message packets, the `authdata` section has variable size since public key 136 | and signature sizes depend on the ENR identity scheme. For the "v4" identity scheme, we 137 | assume 64-byte signature size and 33 bytes of (compressed) public key size. 138 | 139 | `authdata` starts with a fixed-size `authdata-head` component, followed by the ID 140 | signature, ephemeral public key and optional node record. 141 | 142 | The `record` field may be omitted if the `enr-seq` of WHOAREYOU is recent enough, i.e. 143 | when it matches the current sequence number of the sending node. If `enr-seq` is zero, the 144 | record must be sent. Node records are encoded and verified as specified in [EIP-778]. 145 | 146 | Please refer to the [handshake section] for more information about the content of the 147 | handshake packet. 148 | 149 | authdata = authdata-head || id-signature || eph-pubkey || record 150 | authdata-head = src-id || sig-size || eph-key-size 151 | authdata-size = 34 + sig-size + eph-key-size + len(record) 152 | sig-size = uint8 -- value: 64 for ID scheme "v4" 153 | eph-key-size = uint8 -- value: 33 for ID scheme "v4" 154 | 155 | ![handshake packet layout](./img/handshake-packet-layout.png) 156 | 157 | ## Protocol Messages 158 | 159 | This section lists all defined messages which can be sent and received. The hexadecimal 160 | value in parentheses is the `message-type`. 161 | 162 | The first element of every `message-data` list is the request ID. `request-id` is an RLP 163 | byte array of length <= 8 bytes. 
For requests, this value is assigned by the requester. 164 | The recipient of a message must mirror the value in the `request-id` element of the 165 | response. The selection of appropriate values for request IDs is left to the implementation. 166 | 167 | ### PING Request (0x01) 168 | 169 | message-data = [request-id, enr-seq] 170 | message-type = 0x01 171 | enr-seq = local ENR sequence number of sender 172 | 173 | PING checks whether the recipient is alive and informs it about the sender's ENR sequence 174 | number. 175 | 176 | ### PONG Response (0x02) 177 | 178 | message-data = [request-id, enr-seq, recipient-ip, recipient-port] 179 | message-type = 0x02 180 | enr-seq = ENR sequence number of sender 181 | recipient-ip = 16 or 4 byte IP address of the intended recipient 182 | recipient-port = recipient UDP port, a 16-bit integer 183 | 184 | PONG is the reply to PING. 185 | 186 | ### FINDNODE Request (0x03) 187 | 188 | message-data = [request-id, [distance₁, distance₂, ..., distanceₙ]] 189 | message-type = 0x03 190 | distanceₙ = requested log2 distance, a positive integer 191 | 192 | FINDNODE queries for nodes at the given logarithmic distances from the recipient's node 193 | ID. When distance `0` is requested, the result set should contain the recipient's current 194 | record. 195 | 196 | The recipient should create the result set by collecting nodes from its local node table 197 | according to the requested distances. Implementations should limit the number of nodes in 198 | the result set. The recommended result limit for FINDNODE queries is 16 nodes. 199 | 200 | ### NODES Response (0x04) 201 | 202 | message-data = [request-id, total, [ENR, ...]] 203 | message-type = 0x04 204 | total = total number of responses to the request 205 | 206 | NODES is the response to a FINDNODE or TOPICQUERY message. Multiple NODES messages may be 207 | sent as responses to a single query. Implementations may place a limit on the allowed 208 | maximum for `total`. 
If exceeded, additional responses may be ignored. 209 | 210 | When handling NODES as a response to FINDNODE, the recipient should verify that the 211 | received nodes match the requested distances. 212 | 213 | ### TALKREQ Request (0x05) 214 | 215 | message-data = [request-id, protocol, request] 216 | message-type = 0x05 217 | 218 | TALKREQ sends an application-level request. The purpose of this message is pre-negotiating 219 | connections made through another application-specific protocol identified by `protocol`. 220 | `protocol` and `request` are RLP byte arrays. 221 | 222 | The recipient must respond with a TALKRESP message containing the response to the request. 223 | If the `protocol` is unknown to the recipient, it must respond with a TALKRESP response 224 | containing empty `response` data. 225 | 226 | ### TALKRESP Response (0x06) 227 | 228 | message-data = [request-id, response] 229 | message-type = 0x06 230 | request-id = request-id of TALKREQ 231 | 232 | TALKRESP is the response to TALKREQ. The `response` is an RLP byte array containing the 233 | response data. 234 | 235 | ### REGTOPIC Request (0x07) 236 | 237 | **NOTE: the content and semantics of this message are not final.** 238 | **Implementations should not respond to or send these messages.** 239 | 240 | message-data = [request-id, topic, ENR, ticket] 241 | message-type = 0x07 242 | ENR = current node record of sender 243 | ticket = byte array containing ticket content 244 | 245 | REGTOPIC attempts to register the sender for the given topic. If the requesting node has a 246 | ticket from a previous registration attempt, it must present the ticket. Otherwise 247 | `ticket` is the empty byte array (RLP: `0x80`). The ticket must be valid and its waiting 248 | time must have elapsed before using the ticket. 249 | 250 | REGTOPIC is always answered by a TICKET response. The requesting node may also receive a 251 | REGCONFIRMATION response when registration is successful.
It may take up to 10s for the 252 | confirmation to be sent. 253 | 254 | ### TICKET Response (0x08) 255 | 256 | **NOTE: the content and semantics of this message are not final.** 257 | **Implementations should not respond to or send these messages.** 258 | 259 | message-data = [request-id, ticket, wait-time] 260 | message-type = 0x08 261 | ticket = an opaque byte array representing the ticket 262 | wait-time = time to wait before registering, in seconds 263 | 264 | TICKET is the response to REGTOPIC. It contains a ticket which can be used to register for 265 | the requested topic after `wait-time` has elapsed. See the [theory section on tickets] for 266 | more information. 267 | 268 | ### REGCONFIRMATION Response (0x09) 269 | 270 | **NOTE: the content and semantics of this message are not final.** 271 | **Implementations should not respond to or send these messages.** 272 | 273 | message-data = [request-id, topic] 274 | message-type = 0x09 275 | request-id = request-id of REGTOPIC 276 | 277 | REGCONFIRMATION notifies the recipient about a successful registration for the given 278 | topic. This call is sent by the advertisement medium after the time window for 279 | registration has elapsed on a topic queue. 280 | 281 | ### TOPICQUERY Request (0x0A) 282 | 283 | **NOTE: the content and semantics of this message are not final.** 284 | **Implementations should not respond to or send these messages.** 285 | 286 | message-data = [request-id, topic] 287 | message-type = 0x0a 288 | topic = 32-byte topic hash 289 | 290 | TOPICQUERY requests nodes in the [topic queue] of the given topic. The recipient of this 291 | request must send one or more NODES messages containing node records registered for the 292 | topic. 293 | 294 | ## Test Vectors 295 | 296 | A collection of test vectors for this specification can be found at 297 | [discv5 wire test vectors]. 
298 | 299 | [handshake section]: ./discv5-theory.md#handshake-steps 300 | [topic queue]: ./discv5-theory.md#topic-table 301 | [theory section on tickets]: ./discv5-theory.md#tickets 302 | [EIP-778]: ../enr.md 303 | [discv5 wire test vectors]: ./discv5-wire-test-vectors.md 304 | -------------------------------------------------------------------------------- /rlpx.md: -------------------------------------------------------------------------------- 1 | # The RLPx Transport Protocol 2 | 3 | This specification defines the RLPx transport protocol, a TCP-based transport protocol 4 | used for communication among Ethereum nodes. The protocol carries encrypted messages 5 | belonging to one or more 'capabilities' which are negotiated during connection 6 | establishment. RLPx is named after the [RLP] serialization format. The name is not an 7 | acronym and has no particular meaning. 8 | 9 | The current protocol version is **5**. You can find a list of changes in past versions at 10 | the end of this document. 11 | 12 | ## Notation 13 | 14 | `X || Y`\ 15 |     denotes concatenation of X and Y.\ 16 | `X ^ Y`\ 17 |     is byte-wise XOR of X and Y.\ 18 | `X[:N]`\ 19 |     denotes an N-byte prefix of X.\ 20 | `[X, Y, Z, ...]`\ 21 |     denotes recursive encoding as an RLP list.\ 22 | `keccak256(MESSAGE)`\ 23 |     is the Keccak256 hash function as used by Ethereum.\ 24 | `ecies.encrypt(PUBKEY, MESSAGE, AUTHDATA)`\ 25 |     is the asymmetric authenticated encryption function as used by RLPx.\ 26 |     AUTHDATA is authenticated data which is not part of the resulting ciphertext,\ 27 |     but written to HMAC-256 before generating the message tag.\ 28 | `ecdh.agree(PRIVKEY, PUBKEY)`\ 29 |     is elliptic curve Diffie-Hellman key agreement between PRIVKEY and PUBKEY. 30 | 31 | ## ECIES Encryption 32 | 33 | ECIES (Elliptic Curve Integrated Encryption Scheme) is an asymmetric encryption method 34 | used in the RLPx handshake. 
The cryptosystem used by RLPx is 35 | 36 | - The elliptic curve secp256k1 with generator `G`. 37 | - `KDF(k, len)`: the NIST SP 800-56 Concatenation Key Derivation Function 38 | - `MAC(k, m)`: HMAC using the SHA-256 hash function. 39 | - `AES(k, iv, m)`: the AES-128 encryption function in CTR mode. 40 | 41 | Alice wants to send an encrypted message that can be decrypted by Bob's static private key 42 | kB. Alice knows about Bob's static public key 43 | KB. 44 | 45 | To encrypt the message `m`, Alice generates a random number `r` and corresponding elliptic 46 | curve public key `R = r * G` and computes the shared secret S = Px 47 | where (Px, Py) = r * KB. She derives key 48 | material for encryption and authentication as 49 | kE || kM = KDF(S, 32) as well as a random 50 | initialization vector `iv`. Alice sends the encrypted message `R || iv || c || d` where 51 | c = AES(kE, iv, m) and 52 | d = MAC(sha256(kM), iv || c) to Bob. 53 | 54 | For Bob to decrypt the message `R || iv || c || d`, he derives the shared secret 55 | S = Px where 56 | (Px, Py) = kB * R as well as the encryption and 57 | authentication keys kE || kM = KDF(S, 32). Bob verifies 58 | the authenticity of the message by checking whether 59 | d == MAC(sha256(kM), iv || c) then obtains the plaintext as 60 | m = AES(kE, iv, c). 61 | 62 | ## Node Identity 63 | 64 | All cryptographic operations are based on the secp256k1 elliptic curve. Each node is 65 | expected to maintain a static secp256k1 private key which is saved and restored between 66 | sessions. It is recommended that the private key can only be reset manually, for example, 67 | by deleting a file or database entry. 68 | 69 | ## Initial Handshake 70 | 71 | An RLPx connection is established by creating a TCP connection and agreeing on ephemeral 72 | key material for further encrypted and authenticated communication.
The process of 73 | creating those session keys is the 'handshake' and is carried out between the 'initiator' 74 | (the node which opened the TCP connection) and the 'recipient' (the node which accepted it). 75 | 76 | 1. initiator connects to recipient and sends its `auth` message 77 | 2. recipient accepts, decrypts and verifies `auth` (checks that recovery of signature == 78 | `keccak256(ephemeral-pubk)`) 79 | 3. recipient generates `auth-ack` message from `remote-ephemeral-pubk` and `nonce` 80 | 4. recipient derives secrets and sends the first encrypted frame containing the [Hello] message 81 | 5. initiator receives `auth-ack` and derives secrets 82 | 6. initiator sends its first encrypted frame containing initiator [Hello] message 83 | 7. recipient receives and authenticates first encrypted frame 84 | 8. initiator receives and authenticates first encrypted frame 85 | 9. cryptographic handshake is complete if MAC of first encrypted frame is valid on both sides 86 | 87 | Either side may disconnect if authentication of the first framed packet fails. 88 | 89 | Handshake messages: 90 | 91 | auth = auth-size || enc-auth-body 92 | auth-size = size of enc-auth-body, encoded as a big-endian 16-bit integer 93 | auth-vsn = 4 94 | auth-body = [sig, initiator-pubk, initiator-nonce, auth-vsn, ...] 95 | enc-auth-body = ecies.encrypt(recipient-pubk, auth-body || auth-padding, auth-size) 96 | auth-padding = arbitrary data 97 | 98 | ack = ack-size || enc-ack-body 99 | ack-size = size of enc-ack-body, encoded as a big-endian 16-bit integer 100 | ack-vsn = 4 101 | ack-body = [recipient-ephemeral-pubk, recipient-nonce, ack-vsn, ...] 102 | enc-ack-body = ecies.encrypt(initiator-pubk, ack-body || ack-padding, ack-size) 103 | ack-padding = arbitrary data 104 | 105 | Implementations must ignore any mismatches in `auth-vsn` and `ack-vsn`. Implementations 106 | must also ignore any additional list elements in `auth-body` and `ack-body`. 
107 | 108 | Secrets generated following the exchange of handshake messages: 109 | 110 | static-shared-secret = ecdh.agree(privkey, remote-pubk) 111 | ephemeral-key = ecdh.agree(ephemeral-privkey, remote-ephemeral-pubk) 112 | shared-secret = keccak256(ephemeral-key || keccak256(nonce || initiator-nonce)) 113 | aes-secret = keccak256(ephemeral-key || shared-secret) 114 | mac-secret = keccak256(ephemeral-key || aes-secret) 115 | 116 | ## Framing 117 | 118 | All messages following the initial handshake are framed. A frame carries a single 119 | encrypted message belonging to a capability. 120 | 121 | The purpose of framing is multiplexing multiple capabilities over a single connection. 122 | Secondarily, as framed messages yield reasonable demarcation points for message 123 | authentication codes, supporting an encrypted and authenticated stream becomes 124 | straight-forward. Frames are encrypted and authenticated via key material generated during 125 | the handshake. 126 | 127 | The frame header provides information about the size of the message and the message's 128 | source capability. Padding is used to prevent buffer starvation, such that frame 129 | components are byte-aligned to block size of cipher. 130 | 131 | frame = header-ciphertext || header-mac || frame-ciphertext || frame-mac 132 | header-ciphertext = aes(aes-secret, header) 133 | header = frame-size || header-data || header-padding 134 | header-data = [capability-id, context-id] 135 | capability-id = integer, always zero 136 | context-id = integer, always zero 137 | header-padding = zero-fill header to 16-byte boundary 138 | frame-ciphertext = aes(aes-secret, frame-data || frame-padding) 139 | frame-padding = zero-fill frame-data to 16-byte boundary 140 | 141 | See the [Capability Messaging] section for definitions of `frame-data` and `frame-size.` 142 | 143 | ### MAC 144 | 145 | Message authentication in RLPx uses two keccak256 states, one for each direction of 146 | communication. 
The `egress-mac` and `ingress-mac` keccak states are continuously updated 147 | with the ciphertext of bytes sent (egress) or received (ingress). Following the initial 148 | handshake, the MAC states are initialized as follows: 149 | 150 | Initiator: 151 | 152 | egress-mac = keccak256.init((mac-secret ^ recipient-nonce) || auth) 153 | ingress-mac = keccak256.init((mac-secret ^ initiator-nonce) || ack) 154 | 155 | Recipient: 156 | 157 | egress-mac = keccak256.init((mac-secret ^ initiator-nonce) || ack) 158 | ingress-mac = keccak256.init((mac-secret ^ recipient-nonce) || auth) 159 | 160 | When a frame is sent, the corresponding MAC values are computed by updating the 161 | `egress-mac` state with the data to be sent. The update is performed by XORing the header 162 | with the encrypted output of its corresponding MAC. This is done to ensure uniform 163 | operations are performed for both plaintext MAC and ciphertext. All MACs are sent 164 | cleartext. 165 | 166 | header-mac-seed = aes(mac-secret, keccak256.digest(egress-mac)[:16]) ^ header-ciphertext 167 | egress-mac = keccak256.update(egress-mac, header-mac-seed) 168 | header-mac = keccak256.digest(egress-mac)[:16] 169 | 170 | Computing `frame-mac`: 171 | 172 | egress-mac = keccak256.update(egress-mac, frame-ciphertext) 173 | frame-mac-seed = aes(mac-secret, keccak256.digest(egress-mac)[:16]) ^ keccak256.digest(egress-mac)[:16] 174 | egress-mac = keccak256.update(egress-mac, frame-mac-seed) 175 | frame-mac = keccak256.digest(egress-mac)[:16] 176 | 177 | Verifying the MAC on ingress frames is done by updating the `ingress-mac` state in the 178 | same way as `egress-mac` and comparing to the values of `header-mac` and `frame-mac` in 179 | the ingress frame. This should be done before decrypting `header-ciphertext` and 180 | `frame-ciphertext`. 181 | 182 | # Capability Messaging 183 | 184 | All messages following the initial handshake are associated with a 'capability'. 
Any 185 | number of capabilities can be used concurrently on a single RLPx connection. 186 | 187 | A capability is identified by a short ASCII name (max eight characters) and version number. The capabilities 188 | supported on either side of the connection are exchanged in the [Hello] message belonging 189 | to the 'p2p' capability which is required to be available on all connections. 190 | 191 | ## Message Encoding 192 | 193 | The initial [Hello] message is encoded as follows: 194 | 195 | frame-data = msg-id || msg-data 196 | frame-size = length of frame-data, encoded as a 24bit big-endian integer 197 | 198 | where `msg-id` is an RLP-encoded integer identifying the message and `msg-data` is an RLP 199 | list containing the message data. 200 | 201 | All messages following Hello are compressed using the Snappy algorithm. 202 | 203 | frame-data = msg-id || snappyCompress(msg-data) 204 | frame-size = length of frame-data encoded as a 24bit big-endian integer 205 | 206 | Note that the `frame-size` of compressed messages refers to the compressed size of 207 | `msg-data`. Since compressed messages may inflate to a very large size after 208 | decompression, implementations should check for the uncompressed size of the data before 209 | decoding the message. This is possible because the [snappy format] contains a length 210 | header. Messages carrying uncompressed data larger than 16 MiB should be rejected by 211 | closing the connection. 212 | 213 | ## Message ID-based Multiplexing 214 | 215 | While the framing layer supports a `capability-id`, the current version of RLPx doesn't 216 | use that field for multiplexing between different capabilities. Instead, multiplexing 217 | relies purely on the message ID. 218 | 219 | Each capability is given as much of the message-ID space as it needs. All such 220 | capabilities must statically specify how many message IDs they require. 
On connection and 221 | reception of the [Hello] message, both peers have equivalent information about what 222 | capabilities they share (including versions) and are able to form consensus over the 223 | composition of message ID space. 224 | 225 | Message IDs are assumed to be compact from ID 0x10 onwards (0x00-0x0f is reserved for the 226 | "p2p" capability) and given to each shared (equal-version, equal-name) capability in 227 | alphabetic order. Capability names are case-sensitive. Capabilities which are not shared 228 | are ignored. If multiple versions are shared of the same (equal name) capability, the 229 | numerically highest wins, others are ignored. 230 | 231 | ## "p2p" Capability 232 | 233 | The "p2p" capability is present on all connections. After the initial handshake, both 234 | sides of the connection must send either [Hello] or a [Disconnect] message. Upon receiving 235 | the [Hello] message a session is active and any other message may be sent. Implementations 236 | must ignore any difference in protocol version for forward-compatibility reasons. When 237 | communicating with a peer of lower version, implementations should try to mimic that 238 | version. 239 | 240 | At any time after protocol negotiation, a [Disconnect] message may be sent. 241 | 242 | ### Hello (0x00) 243 | 244 | `[protocolVersion: P, clientId: B, capabilities, listenPort: P, nodeId: B_64, ...]` 245 | 246 | First packet sent over the connection, and sent once by both sides. No other messages may 247 | be sent until a Hello is received. Implementations must ignore any additional list elements 248 | in Hello because they may be used by a future version. 249 | 250 | - `protocolVersion` the version of the "p2p" capability, **5**. 251 | - `clientId` Specifies the client software identity, as a human-readable string (e.g. 252 | "Ethereum(++)/1.0.0").
253 | - `capabilities` is the list of supported capabilities and their versions: 254 | `[[cap1, capVersion1], [cap2, capVersion2], ...]`. 255 | - `listenPort` (legacy) specifies the port that the client is listening on (on the 256 | interface that the present connection traverses). If 0 it indicates the client is 257 | not listening. This field should be ignored. 258 | - `nodeId` is the secp256k1 public key corresponding to the node's private key. 259 | 260 | ### Disconnect (0x01) 261 | 262 | `[reason: P]` 263 | 264 | Inform the peer that a disconnection is imminent; if received, a peer should disconnect 265 | immediately. When sending, well-behaved hosts give their peers a fighting chance (read: 266 | wait 2 seconds) to disconnect to before disconnecting themselves. 267 | 268 | `reason` is an optional integer specifying one of a number of reasons for disconnect: 269 | 270 | | Reason | Meaning | 271 | |--------|:-------------------------------------------------------------| 272 | | `0x00` | Disconnect requested | 273 | | `0x01` | TCP sub-system error | 274 | | `0x02` | Breach of protocol, e.g. a malformed message, bad RLP, ... | 275 | | `0x03` | Useless peer | 276 | | `0x04` | Too many peers | 277 | | `0x05` | Already connected | 278 | | `0x06` | Incompatible P2P protocol version | 279 | | `0x07` | Null node identity received - this is automatically invalid | 280 | | `0x08` | Client quitting | 281 | | `0x09` | Unexpected identity in handshake | 282 | | `0x0a` | Identity is the same as this node (i.e. connected to itself) | 283 | | `0x0b` | Ping timeout | 284 | | `0x10` | Some other reason specific to a subprotocol | 285 | 286 | ### Ping (0x02) 287 | 288 | `[]` 289 | 290 | Requests an immediate reply of [Pong] from the peer. 291 | 292 | ### Pong (0x03) 293 | 294 | `[]` 295 | 296 | Reply to the peer's [Ping] packet. 
297 | 298 | # Change Log 299 | 300 | ### Known Issues in the current version 301 | 302 | - The frame encryption/MAC scheme is considered 'broken' because `aes-secret` and 303 | `mac-secret` are reused for both reading and writing. The two sides of a RLPx connection 304 | generate two CTR streams from the same key, nonce and IV. If an attacker knows one 305 | plaintext, they can decrypt unknown plaintexts of the reused keystream. 306 | - General feedback from reviewers has been that the use of a keccak256 state as a MAC 307 | accumulator and the use of AES in the MAC algorithm is an uncommon and overly complex 308 | way to perform message authentication but can be considered safe. 309 | - The frame encoding provides `capability-id` and `context-id` fields for multiplexing 310 | purposes, but these fields are unused. 311 | 312 | ### Version 5 (EIP-706, September 2017) 313 | 314 | [EIP-706] added Snappy message compression. 315 | 316 | ### Version 4 (EIP-8, December 2015) 317 | 318 | [EIP-8] changed the encoding of `auth-body` and `ack-body` in the initial handshake to 319 | RLP, added a version number to the handshake and mandated that implementations should 320 | ignore additional list elements in handshake messages and [Hello]. 321 | 322 | # References 323 | 324 | - Elaine Barker, Don Johnson, and Miles Smid. NIST Special Publication 800-56A Section 5.8.1, 325 | Concatenation Key Derivation Function. 2017.\ 326 | URL  327 | 328 | - Victor Shoup. A proposal for an ISO standard for public key encryption, Version 2.1. 2001.\ 329 | URL  330 | 331 | - Mike Belshe and Roberto Peon. SPDY Protocol - Draft 3. 2014.\ 332 | URL  333 | 334 | - Snappy compressed format description. 2011.\ 335 | URL  336 | 337 | Copyright © 2014 Alex Leverington. 338 | 339 | This work is licensed under a 340 | Creative Commons Attribution-NonCommercial-ShareAlike 341 | 4.0 International License. 
342 | 343 | [Hello]: #hello-0x00 344 | [Disconnect]: #disconnect-0x01 345 | [Ping]: #ping-0x02 346 | [Pong]: #pong-0x03 347 | [Capability Messaging]: #capability-messaging 348 | [EIP-8]: https://eips.ethereum.org/EIPS/eip-8 349 | [EIP-706]: https://eips.ethereum.org/EIPS/eip-706 350 | [RLP]: https://ethereum.org/en/developers/docs/data-structures-and-encoding/rlp 351 | [snappy format]: https://github.com/google/snappy/blob/master/format_description.txt 352 | -------------------------------------------------------------------------------- /caps/snap.md: -------------------------------------------------------------------------------- 1 | # Ethereum Snapshot Protocol (SNAP) 2 | 3 | The `snap` protocol runs on top of [RLPx], facilitating the exchange of Ethereum state 4 | snapshots between peers. The protocol is an optional extension for peers supporting (or 5 | caring about) the dynamic snapshot format. 6 | 7 | The current version is `snap/1`. 8 | 9 | ## Overview 10 | 11 | The `snap` protocol is designed for semi real-time data retrieval. Its goal is to make 12 | dynamic snapshots of recent states available for peers. The `snap` protocol does not take 13 | part in chain maintenance (block and transaction propagation); and it is **meant to be run 14 | side-by-side with the `eth` protocol**, not standalone (e.g. chain progression is 15 | announced via `eth`). 16 | 17 | The protocol itself is simplistic by design (take note, the supporting implementation is 18 | everything but simple). In its crux, `snap` supports retrieving a contiguous segment of 19 | accounts from the Ethereum state trie, or a contiguous segment of storage slots from one 20 | particular storage trie. Both replies are Merkle proven for immediate verification. In 21 | addition batches of bytecodes can also be retrieved similarly to the `eth` protocol. 
22 | 23 | The synchronization mechanism the protocol enables is for peers to retrieve and verify all 24 | the account and storage data without downloading intermediate Merkle trie nodes. The final 25 | state trie is reassembled locally. An additional complexity nodes must be aware of, is 26 | that state is ephemeral and moves with the chain, so syncers need to support reassembling 27 | partially consistent state segments. This is supported by trie node retrieval similar to 28 | `eth`, which can be used to heal trie inconsistencies (more on this later). 29 | 30 | The `snap` protocol permits downloading the entire Ethereum state without having to 31 | download all the intermediate Merkle proofs, which can be regenerated locally. This 32 | reduces the networking load enormously: 33 | 34 | - Ingress bandwidth is reduced from `O(accounts * log account + SUM(states * log states))` 35 | (Merkle trie nodes) to `O(accounts + SUM(states))` (actual state data). 36 | - Egress bandwidth is reduced from `O(accounts * log account + SUM(states * log states)) * 37 | 32 bytes` (Merkle trie node hashes) to `O(accounts + SUM(states)) / 100000 bytes` 38 | (number of 100KB chunks to cover the state). 39 | - Round trip time is reduced from `O(accounts * log account + SUM(states * log states)) / 40 | 384` (states retrieval packets) to `O(accounts + SUM(states)) / 100000 bytes` (number of 41 | 100KB chunks to cover the state). 42 | 43 | ### Expected results 44 | 45 | To put some numbers on the above abstract orders of magnitudes, synchronizing Ethereum 46 | mainnet state (i.e. ignoring blocks and receipts, as those are the same) with `eth` vs. 
47 | the `snap` protocol: 48 | 49 | Block ~#11,177,000: 50 | 51 | - Accounts: 107,598,788 @ 19.70GiB 52 | - Byte codes: 319,654 @ 1.48GiB 53 | - Storage slots: 365,787,020 @ 49.88GiB 54 | - Trie nodes: 617,045,138 55 | 56 | | | Time | Upload | Download | Packets | Serving disk reads* | 57 | |:------:|:------:|:-------:|:--------:|:--------:|:-------------------:| 58 | | `eth` | 10h50m | 20.38GB | 43.8GB | 1607M | 15.68TB | 59 | | `snap` | 2h6m | 0.15GB | 20.44GB | 0.099M | 0.096TB | 60 | | | -80.6% | -99.26% | -53.33% | -99.993% | -99.39% | 61 | 62 | *\*Also accounts for other peer requests during the time span.* 63 | 64 | Post snap state heal: 65 | 66 | - Additional trie nodes: 541,260 @ 160.44MiB 67 | - Additional byte codes: 34 @ 234.98KiB 68 | 69 | ## Relation to `eth` 70 | 71 | The `snap` protocol is a *dependent satellite* of `eth` (i.e. to run `snap`, you need to 72 | run `eth` too), not a fully standalone protocol. This is a deliberate design decision: 73 | 74 | - `snap` is meant to be a bootstrap aid for newly joining full nodes. By enforcing all 75 | `snap` peers to also speak `eth`, we can avoid non-full nodes from lingering attached to 76 | `snap` indefinitely. 77 | - `eth` already contains well established chain and fork negotiation mechanisms, as well 78 | as remote peer staleness detection during sync. By running both protocols side-by-side, 79 | `snap` can benefit of all these mechanisms without having to duplicate them. 80 | 81 | This *satellite* status may be changed later, but it's better to launch with a more 82 | restricted protocol first and then expand if need be vs. trying to withdraw depended-upon 83 | features. 84 | 85 | The `snap` protocol is not an extension / next version of `eth` as it relies on the 86 | availability of a *snapshot* acceleration structure that can iterate accounts and storage 87 | slots linearly. Its purpose is also one specific sync method that might not be suitable 88 | for all clients. 
Keeping `snap` as a separate protocol permits every client to decide to 89 | pursue it or not, without hindering their capacity to participate in the `eth` protocol. 90 | 91 | ## Synchronization algorithm 92 | 93 | The crux of the snapshot synchronization is making contiguous ranges of accounts and 94 | storage slots available for remote retrieval. The sort order is the same as the state trie 95 | iteration order, which makes it possible to not only request N subsequent accounts, but 96 | also to Merkle prove them. Some important properties of this simple algorithm: 97 | 98 | - Opposed to *fast sync*, we only need to transfer the useful leaf data from the state 99 | trie and can reconstruct internal nodes locally. 100 | - Opposed to *warp sync*, we can download small chunks of accounts and storage slots and 101 | immediately verify their Merkle proofs, making junk attacks impossible. 102 | - Opposed to *warp sync*, random account ranges can be retrieved, thus synchronization 103 | concurrency is totally dependent on client implementation and is not forced by the 104 | protocol. 105 | 106 | The gotcha of the snapshot synchronization is that serving nodes need to be able to 107 | provide **fast** iterable access to the state of the most recent `N` (128) blocks. 108 | Iterating the Merkle trie itself might be functional, but it's not viable (iterating the 109 | state trie at the time of writing takes 9h 30m on an idle machine). Geth introduced 110 | support for [dynamic snapshots], which allows iterating all the accounts in 7m 111 | (see [blog for more]). Some important properties of the dynamic snapshots: 112 | 113 | - Serving a contiguous range of accounts or storage slots take `O(n)` operations, and more 114 | importantly, it's the same for disk access too, being stored contiguously on disk (not 115 | counting the database read amplification). 
116 | - Maintaining a live dynamic snapshot means: 117 | - Opposed to *warp sync*, syncing nodes can always get the latest data, thus they don't 118 | need to process days' worth of blocks afterwards. 119 | - Opposed to *warp sync*, there is no pre-computation to generate a snapshot (it's 120 | updated live), so there's no periodic burden on the nodes to iterate the tries (there 121 | is an initial burden to create the first snapshot after sync though). 122 | - Providing access to 128 recent snapshots permits `O(1)` direct access to any account 123 | and state, which can be used during EVM execution for `SLOAD`. 124 | 125 | The caveat of the snapshot synchronization is that as with *fast sync* (and opposed to 126 | *warp sync*), the available data constantly moves (as new blocks arrive). The probability 127 | of finishing sync before the 128 block window (15m) moves out is asymptotically zero. This 128 | is not a problem, because we can self-heal. It is fine to import state snapshot chunks 129 | from different tries, because the inconsistencies can be fixed by running a 130 | *fast-sync-style-state-sync* on top of the assembled semi-correct state afterwards. Some 131 | important properties of the self-healing: 132 | 133 | - Synchronization can be aborted at any time and resumed later. It might cause 134 | self-healing to run longer, but it will fix the data either way. 135 | - Synchronization on slow connections is guaranteed to finish too (as long as the node can 136 | download data faster than it's being produced by the network), the data cannot disappear 137 | from the network (opposed to warp sync). 
138 | 139 | ## Data format 140 | 141 | The accounts in the `snap` protocol are analogous to the Ethereum RLP consensus encoding 142 | (same fields, same order), but in a **slim** format: 143 | 144 | - The code hash is `empty list` instead of `Keccak256("")` 145 | - The root hash is `empty list` instead of `Hash()` 146 | 147 | This is done to avoid having to transfer the same 32+32 bytes for all plain accounts over 148 | the network. 149 | 150 | ## Protocol Messages 151 | 152 | ### GetAccountRange (0x00) 153 | 154 | `[reqID: P, rootHash: B_32, startingHash: B_32, limitHash: B_32, responseBytes: P]` 155 | 156 | Requests an unknown number of accounts from a given account trie, starting at the 157 | specified account hash and capped by the maximum allowed response size in bytes. The 158 | intended purpose of this message is to fetch a large number of subsequent accounts from a 159 | remote node and reconstruct a state subtrie locally. 160 | 161 | - `reqID`: Request ID to match up responses with 162 | - `rootHash`: Root hash of the account trie to serve 163 | - `startingHash`: Account hash of the first to retrieve 164 | - `limitHash`: Account hash after which to stop serving data 165 | - `responseBytes`: Soft limit at which to stop returning data 166 | 167 | Notes: 168 | 169 | - Nodes **must** always respond to the query. 170 | - If the node does **not** have the state for the requested state root, it **must** return 171 | an empty reply. It is the responsibility of the caller to query a state not older than 172 | 128 blocks. 173 | - The responding node is allowed to return **less** data than requested (own QoS limits), 174 | but the node **must** return at least one account. If no accounts exist between `startingHash` and `limitHash`, then 175 | the first (if any) account **after** `limitHash` must be provided. 
176 | - The responding node **must** Merkle prove the starting hash (even if it does not exist) 177 | and the last returned account (if any exists after the starting hash). 178 | 179 | Rationale: 180 | 181 | - The starting account is identified deliberately by hash and not by address. As the 182 | accounts in the Ethereum Merkle trie are sorted by hash, the address is irrelevant. In 183 | addition, there is no consensus requirement for full nodes to be aware of the address 184 | pre-images. 185 | - The response is capped by byte size and not by number of accounts, because it makes the 186 | network traffic more deterministic. As the state density is unknowable, it's also 187 | impossible to delimit the query with an ending hash. 188 | 189 | Caveats: 190 | 191 | - When requesting accounts from a starting hash, malicious nodes may skip ahead and return 192 | a gapped reply. Such a reply would cause sync to finish early with a lot of missing data. 193 | Proof of non-existence for the starting hash prevents this attack, completely covering 194 | the range from start to end. 195 | - No special signaling is needed if there are no more accounts after the last returned 196 | one, as the attached Merkle proof for the last account will have all trie nodes right of 197 | the proven path zero. 198 | 199 | ### AccountRange (0x01) 200 | 201 | `[reqID: P, accounts: [[accHash: B_32, accBody: B], ...], proof: [node_1: B, node_2, ...]]` 202 | 203 | Returns a number of consecutive accounts and the Merkle proofs for the entire range 204 | (boundary proofs). The left-side proof must be for the requested origin hash (even if an 205 | associated account does not exist) and the right-side proof must be for the last returned 206 | account. 
207 | 208 | - `reqID`: ID of the request this is a response for 209 | - `accounts`: List of consecutive accounts from the trie 210 | - `accHash`: Hash of the account address (trie path) 211 | - `accBody`: Account body in slim format 212 | - `proof`: List of trie nodes proving the account range 213 | 214 | Notes: 215 | 216 | - If the account range is the entire state (requested origin was `0x00..0` and all 217 | accounts fit into the response), no proofs should be sent along the response. This is 218 | unlikely for accounts, but since it's a common situation for storage slots, this clause 219 | keeps the behavior the same across both. 220 | 221 | ### GetStorageRanges (0x02) 222 | 223 | `[reqID: P, rootHash: B_32, accountHashes: [B_32], startingHash: B, limitHash: B, responseBytes: P]` 224 | 225 | Requests the storage slots of multiple accounts' storage tries. Since certain contracts 226 | have huge state, the method can also request storage slots from a single account, starting 227 | at a specific storage key hash. The intended purpose of this message is to fetch a large 228 | number of subsequent storage slots from a remote node and reconstruct a state subtrie 229 | locally. 230 | 231 | - `reqID`: Request ID to match up responses with 232 | - `rootHash`: Root hash of the account trie to serve 233 | - `accountHashes`: Account hashes of the storage tries to serve 234 | - `startingHash`: Storage slot hash of the first to retrieve 235 | - `limitHash`: Storage slot hash after which to stop serving 236 | - `responseBytes`: Soft limit at which to stop returning data 237 | 238 | Notes: 239 | 240 | - Nodes **must** always respond to the query. 241 | - If the node does **not** have the state for the requested state root or for **any** 242 | requested account hash, it **must** return an empty reply. It is the responsibility of 243 | the caller to query a state not older than 128 blocks; and the caller is expected to 244 | only ever query existing accounts. 
245 | - The responding node is allowed to return **less** data than requested (serving QoS 246 | limits), but the node **must** return at least one slot, unless none exists. 247 | - If multiple accounts' storage is requested, serving nodes should reply with the entire 248 | storage ranges (thus no Merkle proofs needed), up to the first contract which exceeds 249 | the packet limit. If the last included storage range does not fit entirely, a Merkle 250 | proof **must** be attached to that and **only** that. 251 | - If a single account's storage is requested, serving nodes should only return slots 252 | starting with the requested starting hash, up to the last one or until the packet fills 253 | up. If the entire storage range is not being returned, a Merkle proof **must** be 254 | attached. 255 | - If a proof is attached, the responding node **must** Merkle prove the starting hash 256 | (even if it does not exist) and the last returned slot (if any exists after the starting 257 | hash). 258 | 259 | Rationale: 260 | 261 | - The response is capped by byte size and not by number of slots, because it makes the 262 | network traffic more deterministic. 263 | - The request supports querying multiple contracts at the same time as most storage tries 264 | are in the order of 100s of bytes. Querying these individually would produce a lot of 265 | network round trips. 266 | 267 | Caveats: 268 | 269 | - When requesting storage slots from a starting hash, malicious nodes may skip ahead and 270 | return a prefix-gapped reply. Such a reply would cause sync to finish early with a lot 271 | of missing data. Proof of non-existence for the starting hash prevents this attack, 272 | completely covering the range from start to end. 273 | - Although serving nodes should respect the response limit requested by the caller, it is 274 | valuable to slightly force the limit (consider it soft only) when adding the last 275 | contract to avoid having to split it and prove it. 
276 | - No special signaling is needed if there are no more slots after the last returned one, 277 | as the attached Merkle proof for the last account will have all trie nodes right of the 278 | proven path zero. 279 | 280 | ### StorageRanges (0x03) 281 | 282 | `[reqID: P, slots: [[[slotHash: B_32, slotData: B], ...], ...], proof: [node_1: B, node_2, ...]]` 283 | 284 | Returns a number of consecutive storage slots for the requested account (i.e. list of list 285 | of slots) and optionally the Merkle proofs for the last range (boundary proofs) if it only 286 | partially covers the storage trie. The left-side proof must be for the requested origin 287 | slots (even if it does not exist) and the right-side proof must be for the last returned 288 | slots. 289 | 290 | - `reqID`: ID of the request this is a response for 291 | - `slots`: List of list of consecutive slots from the trie (one list per account) 292 | - `slotHash`: Hash of the storage slot key (trie path) 293 | - `slotData`: Data content of the slot 294 | - `proof`: List of trie nodes proving the slot range 295 | 296 | Notes: 297 | 298 | - If the slot range is the entire storage state, no proofs will be sent along the response. 299 | 300 | ### GetByteCodes (0x04) 301 | 302 | `[reqID: P, hashes: [hash1: B_32, hash2: B_32, ...], bytes: P]` 303 | 304 | Requests a number of contract byte-codes by hash. This is analogous to the `eth/63` 305 | `GetNodeData`, but restricted to only bytecode to break the generality that causes issues 306 | with database optimizations. The intended purpose of this request is to allow retrieving 307 | the code associated with accounts retrieved via GetAccountRange, but it's needed during 308 | healing too. 
309 | 310 | - `reqID`: Request ID to match up responses with 311 | - `hashes`: Code hashes to retrieve the code for 312 | - `bytes`: Soft limit at which to stop returning data 313 | 314 | *This functionality was duplicated into `snap` from `eth/65` to permit `eth` long term to 315 | become a chain maintenance protocol only and move synchronization primitives out into 316 | satellite protocols only.* 317 | 318 | Notes: 319 | 320 | - Nodes **must** always respond to the query. 321 | - The returned codes **must** be in the request order. 322 | - The responding node is allowed to return **less** data than requested (serving QoS 323 | limits), but the node **must** return at least one bytecode, unless none requested are 324 | available, in which case it **must** answer with an empty response. 325 | - If a bytecode is unavailable, the node **must** skip that slot and proceed to the next 326 | one. The node **must not** return `nil` or other placeholders. 327 | 328 | Rationale: 329 | 330 | - The response is capped by byte size and not by number of slots, because it makes the 331 | network traffic more deterministic, as contract sizes can vary randomly up to 24KB with 332 | current consensus rules. 333 | - By retaining the original request order and skipping unavailable bytecodes, the 334 | requesting node can differentiate between unavailable data (gaps in the hashes) and QoS 335 | limitations (missing suffix). 336 | 337 | Caveats: 338 | 339 | - Implementations are free to request as many or as few bytecodes in a single request, but 340 | they should keep in mind that requesting too few results in wasted time due to network 341 | latency; but requesting too many results in wasted bandwidth if the response doesn't 342 | fit. Average (unique) contract size on mainnet is about 5-6KB, so `bytes / 6KB` is a 343 | good heuristic for the number of codes to request in a single packet (e.g. for 512KB 344 | desired response size, 80-100 bytecodes per request is a good choice). 
345 | 346 | ### ByteCodes (0x05) 347 | 348 | `[reqID: P, codes: [code1: B, code2: B, ...]]` 349 | 350 | Returns a number of requested contract codes. The order is the same as in the request, but 351 | there might be gaps if not all codes are available or there might be fewer if QoS limits 352 | are reached. 353 | 354 | ### GetTrieNodes (0x06) 355 | 356 | `[reqID: P, rootHash: B_32, paths: [[accPath: B, slotPath1: B, slotPath2: B, ...]...], bytes: P]` 357 | 358 | Requests a number of state (either account or storage) Merkle trie nodes **by path**. This 359 | is analogous in functionality to the `eth/63` `GetNodeData`, but restricted to only tries 360 | and queried by path, to break the generality that causes issues with database 361 | optimizations. 362 | 363 | - `reqID`: Request ID to match up responses with 364 | - `rootHash`: Root hash of the account trie to serve 365 | - `paths`: Trie paths to retrieve the nodes for, grouped by account 366 | - `bytes`: Soft limit at which to stop returning data 367 | 368 | The `paths` is one array of trie node paths to retrieve per account (i.e. list of list of 369 | paths). Each list in the array special cases the first element as the path in the account 370 | trie and the remaining elements as paths in the storage trie. To address an account node, 371 | the inner list should have a length of 1 consisting of only the account path. Partial 372 | paths (<32 bytes) should be compact encoded per the Ethereum wire protocol, full paths 373 | should be plain binary encoded. 374 | 375 | *This functionality was mutated into `snap` from `eth/65` to permit `eth` long term to 376 | become a chain maintenance protocol only and move synchronization primitives out into 377 | satellite protocols only.* 378 | 379 | Notes: 380 | 381 | - Nodes **must** always respond to the query. 382 | - The returned nodes **must** be in the request order. 
383 | - If the node does **not** have the state for the requested state root or for **any** 384 | requested account paths, it **must** return an empty reply. It is the responsibility of 385 | the caller to query a state not older than 128 blocks; and the caller is expected to 386 | only ever query existing trie nodes. 387 | - The responding node is allowed to return **less** data than requested (serving QoS 388 | limits), but the node **must** return at least one trie node. 389 | 390 | Rationale: 391 | 392 | - The response is capped by byte size and not by number of slots, because it makes the 393 | network traffic more deterministic. Although opposed to the previous request types 394 | (accounts, slots, codes), trie nodes are relatively deterministic (100-500B), the 395 | protocol remains cleaner if all packets follow the same traffic shaping rules. 396 | - A naive way to represent trie nodes would be a simple list of `account || storage` path 397 | segments concatenated, but that would be very wasteful on the network as it would 398 | duplicate the account hash for every storage trie node. 399 | 400 | ### TrieNodes (0x07) 401 | 402 | `[reqID: P, nodes: [node1: B, node2: B, ...]]` 403 | 404 | Returns a number of requested state trie nodes. The order is the same as in the request, 405 | but there might be fewer if QoS limits are reached. 406 | 407 | ## Change Log 408 | 409 | ### snap/1 (November 2020) 410 | 411 | Version 1 was the introduction of the snapshot protocol. 
412 | 413 | [RLPx]: ../rlpx.md 414 | [dynamic snapshots]: https://github.com/ethereum/go-ethereum/pull/20152 415 | [blog for more]: https://blog.ethereum.org/2020/07/17/ask-about-geth-snapshot-acceleration/ 416 | -------------------------------------------------------------------------------- /caps/les.md: -------------------------------------------------------------------------------- 1 | # Light Ethereum Subprotocol (LES) 2 | 3 | The Light Ethereum Subprotocol (LES) is the protocol used by "light" clients, which only 4 | download block headers as they appear and fetch other parts of the blockchain on-demand. 5 | They provide full functionality in terms of safely accessing the blockchain, but do not 6 | mine and therefore do not take part in the consensus process. Full and archive nodes can 7 | also support the 'les' protocol besides 'eth' in order to be able to serve light nodes. 8 | 9 | The current protocol version is **les/4**. See end of document for a list of changes in 10 | past protocol versions. Some of the les protocol messages are similar to of the [Ethereum 11 | Wire Protocol], with the addition of a few new fields. 12 | 13 | ## Canonical Hash Trie 14 | 15 | Canonical Hash Trie (CHT) structures are used by LES for quick initial syncing and secure 16 | on-demand retrieval of canonical hash mappings, block headers and total difficulty (TD) 17 | values. 18 | 19 | A CHT is a Merkle trie (specifically '[Merkle Patricia Trie]' as used for Ethereum state) 20 | that contains `blockNumber -> [blockHash, TD]` mappings where keys are binary big endian 21 | encoded 64 bit integers and values are RLP-encoded `[hash, number]` pairs. 22 | 23 | CHTs are generated by LES servers for every 32768 blocks, `CHT[i]` containing data for 24 | blocks `0..(i+1) * 32768 - 1`. 
If a client knows the root hash of `CHT[i]` and wants to fetch 25 | header number `N` (where `N < (i+1) * 32768`), it can obtain the header and the corresponding 26 | Merkle proof of the CHT with a [GetHelperTrieProofs] request. 27 | 28 | CHTs are only generated after 2048 confirmations, making it sure they will not be changed 29 | by a chain reorg. In the current version of the light client there is a hard-coded 30 | `[chtNumber, chtRoot]` pair associated with the genesis block hash of both the mainnet and 31 | the testnet. A trustless validation algorithm is planned for later protocol versions. 32 | 33 | ## BloomBits 34 | 35 | The BloomBits data structure optimizes log searching by doing a bitwise transformation 36 | that makes it cheaper to retrieve bloom filter data relevant to a specific filter. 37 | 38 | When searching in a long section of the block history, we are checking three specific bits 39 | of each bloom filter per queried address/topic. In order to do that, LES must retrieve a 40 | ~550 byte block header per filtered block. 41 | 42 | The BloomBits structure optimizes bloom filter lookups through a "bitwise 90 degree 43 | rotation" of the bloom filters. Blocks are grouped into fixed length sections (section 44 | size for the LES BloomBits Trie is 32768 blocks), `BloomBits[bitIdx][sectionIdx]` is a 45 | 32768 bit (4096 byte) long bit vector that contains a single bit of each bloom filter from 46 | the block range `sectionIdx*SectionSize ... (sectionIdx+1)*SectionSize-1`. Since bloom 47 | filters are usually sparse, a simple data compression makes this structure even more 48 | efficient, especially for on-demand retrieval. By reading and binary AND-ing three 49 | BloomBits sections, we can filter for an address/topic in 32768 blocks at once ("1" bits 50 | in the binary AND result mean bloom matches). 51 | 52 | ### Compression Algorithm 53 | 54 | BloomBits data is stored in compressed form. 
The compression algorithm is optimized for 55 | sparse input data which contains a lot of zero bytes. Decompression requires knowledge of 56 | the decompressed data length. 57 | 58 | The algorithm can be described with this pseudo-code: 59 | 60 | if data only contains zeroes, 61 | CompressBytes(data) == nil 62 | otherwise if len(data) <= 1, 63 | CompressBytes(data) == data 64 | otherwise: 65 | CompressBytes(data) == append(CompressBytes(nonZeroBitset(data)), nonZeroBytes(data)...) 66 | where 67 | nonZeroBitset(data) is a bit vector with len(data) bits (MSB first): 68 | nonZeroBitset(data)[i/8] && (1 << (7-i%8)) != 0 if data[i] != 0 69 | len(nonZeroBitset(data)) == (len(data)+7)/8 70 | nonZeroBytes(data) contains the non-zero bytes of data in the same order 71 | 72 | ### BloomBits Trie 73 | 74 | In order to make this data structure retrievable on-demand for the light client, we put 75 | the generated vectors in a trie. Parts of this trie can be retrieved with the 76 | [GetHelperTrieProofs] message. Currently the trie root is part of the trusted syncing 77 | checkpoint but trustless validation of the BloomBits trie is part of the development 78 | plans. The trie consists of the compressed bit vectors as values stored at keys 79 | constructed from the bloom bit index encoded as a 2-byte big endian, followed by the 80 | section index encoded as an 8-byte big endian. Since all-zero bit vectors have a zero 81 | length when compressed, these vectors are not added to the trie at all. 82 | 83 | BloomBits tries are generated for each new section of transformed bloom filter data by 84 | adding the vectors belonging to the latest section index to the previous trie. 85 | 86 | ## Client Side Flow Control 87 | 88 | Any node which takes on a server role in the LES protocol needs to be able to somehow 89 | limit the amount of work it does for each client peer during a given time period. 
They can 90 | always just serve requests slowly if they are overloaded, but it is beneficial to give 91 | some sort of flow control feedback to the clients. This way, clients could (and would have 92 | incentive to) behave nicely and not send requests too quickly in the first place (and then 93 | possibly timeout and resend while the server is still working on them). They could also 94 | distribute requests better between multiple servers they are connected to. And if clients 95 | can do this, servers can expect them to do this and throttle or drop them if they break 96 | the flow control rules. 97 | 98 | ### The Model 99 | 100 | Let us assume that serving each request has a cost (depending on type and parameters) for 101 | the server. This cost is determined by the server, but it has an upper limit for any valid 102 | request. The server assigns a "buffer" for each client from which the cost of each request 103 | is deducted. The buffer has an upper limit (the "buffer limit") and a recharge rate (cost 104 | per second). The server can decide to recharge it more quickly at any time if it has more 105 | free resources, but there is a guaranteed minimum recharge rate. If a request is received 106 | that would drain the client's buffer below zero, the client has broken the flow control 107 | rules and is throttled or disconnected. 108 | 109 | ### The Protocol 110 | 111 | The server announces three parameters in the [Status] message: 112 | 113 | - `"flowControl/BL"`: Buffer Limit, an integer value 114 | - `"flowControl/MRR"`: Minimum Rate of Recharge, an integer value 115 | - `"flowControl/MRC"`: Maximum Request Cost table. The value of this parameter is a 116 | table assigning cost values to every on-demand retrieval message in the LES protocol. 
117 | The table is encoded as a list of integer triples: `[[MsgCode, BaseCost, ReqCost], ...]` 118 | 119 | On the server side: 120 | 121 | When a client connects, the server sets the initial Buffer Value (`BV`) of the client to 122 | `BL` and announces `BL` in [Status]. When a request is received from the client, it 123 | calculates the cost according to its own estimates (but not higher than `MaxCost`, which 124 | equals `BaseCost + ReqCost * N`, where `N` is the number of individual elements asked in 125 | the request), then deducts it from `BV`. If `BV` goes negative, drops the peer, otherwise 126 | starts serving the request. The reply message contains a `BV` value that is the previously 127 | calculated `BV` plus the amount recharged during the time spent serving. Note that since 128 | the server can always determine any cost up to `MaxCost` for a request (and a client 129 | should not assume otherwise), it can reject a message without processing it if received 130 | while `BV < MaxCost` because that's already a flow control breach. 131 | 132 | On the client side: 133 | 134 | The client always has a lowest estimate for its current `BV`, called `BLE`. It 135 | 136 | - sets `BLE` to `BL` received in [Status] 137 | - doesn't send any request to the server when `BLE < MaxCost` 138 | - deducts `MaxCost` when sending a request 139 | - recharges `BLE` at the rate of `MRR` when less than `BL` 140 | 141 | When a reply message with a new `BV` value is received, it sets `BLE` to `BV - 142 | Sum(MaxCost)`, summing the `MaxCost` values of requests sent after the one belonging to 143 | this reply. 144 | 145 | #### Buffer underrun 146 | 147 | Before **les/3** buffer underruns always resulted in immediate disconnection. Now it is 148 | possible and recommended to send a [StopMsg] instead and then a [ResumeMsg] when the 149 | buffer has been at least partially recharged. 
This allows clients to treat the buffer 150 | feedback as an optional performance optimization hint instead of a mandatory mechanism 151 | and allows simple implementations that do not care about the buffer at all. 152 | 153 | ## Request ID 154 | 155 | Every on-demand request message contains a `reqID` field, which is simply returned by the 156 | server in the corresponding reply message. This helps matching replies for requests on the 157 | client side so that each reply doesn't need to be matched against each pending request. 158 | 159 | ## Protocol Messages 160 | 161 | ### Status (0x00) 162 | 163 | `[[key_0, value_0], [key_1, value_1], ...]` 164 | 165 | Inform a peer of the sender's current LES state. This message should be sent just after 166 | the connection is established and prior to any other LES messages. The following keys 167 | are required (value types are noted after the key string): 168 | 169 | - `"protocolVersion"` `P`: is 1 for protocol version one. 170 | - `"networkId"` `P`: specifies the network ID of the chain, as in the [Ethereum Wire Protocol]. 171 | - `"headTd"` `P`: Total Difficulty of the best chain. Integer, as found in block header. 172 | - `"headHash"` `B_32`: the hash of the best (i.e. highest TD) known block. 173 | - `"headNum"` `P`: the number of the best (i.e. highest TD) known block. 174 | - `"genesisHash"` `B_32`: the hash of the Genesis block. 175 | - `"forkID"` `[crc32, nextFork: P]`: mandatory since **les/4**. 176 | The value identifies the chain/fork the node is operating on. 177 | - `"recentTxLookup"` `P`: announced by servers since **les/4**. Transaction status 178 | is served for transactions included in the N-1 most recent blocks (N=1 means that 179 | mined transactions are not served at all). N=0 means all transactions are available. 180 | 181 | There are several optional key/value pairs which can be set: 182 | 183 | - `"announceType"` `P`: set by clients, this field affects the [Announce] messages of the 184 | server. 
Allowed integer values are: 185 | 186 | - none (`0`): no [Announce] messages are sent, i.e. the client is not interested in announcements. 187 | - simple (`1`): Default. [Announce] messages use the **les/1** format. 188 | - signed (`2`): there is a `"sign"` key in the key/value list of [Announce] messages. The 189 | associated value is a signature of an RLP encoded `[headHash: B_32, headNumber: P, headTd: P]` 190 | structure by the server's node key. 191 | 192 | - `"serveHeaders"` (empty value): present if the peer can serve header chain downloads. 193 | 194 | - `"serveChainSince"` `P`: present if the peer can serve Body/Receipts ODR requests 195 | starting from the given block number. 196 | 197 | - `"serveRecentChain"` `P`: if present then the availability of chain data is only guaranteed 198 | for the given number of recent blocks. If the node serves chain data then `"serveChainSince"` 199 | should always be present while `"serveRecentChain"` is optional. Chain availability can 200 | be assumed for blocks with `blockNumber >= MAX(serveChainSince, headNumber-serveRecentChain+1)`. 201 | 202 | - `"serveStateSince"` `P`: present if the peer can serve Proof/Code ODR requests starting 203 | from the given block number. 204 | 205 | - `"serveRecentState"` `P`: if present then the availability of state data is only guaranteed 206 | for the given number of recent blocks. If the node serves state data then `"serveStateSince"` 207 | should always be present while `"serveRecentState"` is optional. State availability can 208 | be assumed for blocks with `blockNumber >= MAX(serveStateSince, headNumber-serveRecentState+1)`. 209 | 210 | - `"txRelay"` (no value): present if the peer can relay transactions to the ETH network. 211 | 212 | - `"flowControl/BL"`, `"flowControl/MRC"`, `"flowControl/MRR"`: see [Client Side Flow Control] 213 | 214 | Unknown keys should be ignored by both sides. 
This allows announcing additional 215 | capabilities while staying compatible with past protocol versions. 216 | 217 | ### Announce (0x01) 218 | 219 | `[headHash: B_32, headNumber: P, headTd: P, reorgDepth: P, [[key_0, value_0], [key_1, value_1], ...]]` 220 | 221 | Announce a new chain head and optionally also a change to some of the values announced at 222 | handshake. A restrictive change of server capabilities (for example, an increase of 223 | `"serveStateSince"` due to state pruning) should be announced at least 10 seconds prior to 224 | actually restricting those capabilities in order to avoid asynchronous problems. Changes 225 | to unknown keys should be ignored. Changes to known keys that make no sense lead to 226 | disconnection. 227 | 228 | Announcing a head with a lower or equal TD than previously announced or a head that the 229 | sending node later refuses to honor with a subsequent [GetBlockHeaders] message (with 230 | number and TD also matching) is considered bad form, and may lead to disconnection or 231 | reduce the reputation of the sending node. 232 | 233 | The field `reorgDepth` contains the number of blocks to be rolled back from the last head 234 | announced by the same node in order to find the last common ancestor of the last and 235 | current heaviest chain. Adding this field helps the client to minimize the number of 236 | requests and the amount of bandwidth required to fetch new headers. 237 | 238 | ### GetBlockHeaders (0x02) 239 | 240 | `[reqID: P, [block: {P, B_32}, maxHeaders: P, skip: P, reverse: P in {0, 1}]]` 241 | 242 | Require peer to return a [BlockHeaders] message. Reply must contain a number of block 243 | headers, of rising number when `reverse` is `0`, falling when `1`, `skip` blocks apart, 244 | beginning at block `block` (denoted by either number or hash) in the canonical chain, and 245 | with at most `maxHeaders` items. 
246 | 247 | ### BlockHeaders (0x03) 248 | 249 | `[reqID: P, BV: P, [blockHeader_0, blockHeader_1, ...]]` 250 | 251 | Reply to [GetBlockHeaders]. The items in the list (following the message ID) are block 252 | headers in the format described in the main Ethereum specification, previously asked for 253 | in a [GetBlockHeaders] message. The list may be empty if none of the requested block 254 | headers were available on the server side. 255 | 256 | ### GetBlockBodies (0x04) 257 | 258 | `[reqID: P, [hash_0: B_32, hash_1: B_32, ...]]` 259 | 260 | Require peer to return a [BlockBodies] message. Specify the set of blocks that we're 261 | interested in with the hashes. 262 | 263 | ### BlockBodies (0x05) 264 | 265 | `[reqID: P, BV: P, [[transactions_0, uncles_0] , ...]]` 266 | 267 | Reply to [GetBlockBodies]. The items in the list (following the message ID) are some of 268 | the blocks, minus the header, in the format described in the main Ethereum specification, 269 | previously asked for in a [GetBlockBodies] message. 270 | 271 | ### GetReceipts (0x06) 272 | 273 | `[reqID: P, [hash_0: B_32, hash_1: B_32, ...]]` 274 | 275 | Require peer to return a [Receipts] message. 276 | 277 | ### Receipts (0x07) 278 | 279 | `[reqID: P, BV: P, [[receipt_0, receipt_1, ...], ...]]` 280 | 281 | Provide a set of receipts which correspond to the block hashes previously asked for in 282 | [GetReceipts]. 283 | 284 | ### GetProofs (0x08) 285 | 286 | `[reqID: P, [[blockhash: B_32, key: B_32, key2: B_32, fromLevel: P], ...]]` 287 | 288 | Require peer to return a [Proofs] message, containing one or more Merkle proofs, each 289 | proving the value of index `key` from the state trie of the given block (if `key2` is 290 | empty), or the storage value of index `key2` from the storage trie referenced in the 291 | account at `key`. If `fromLevel` is greater than zero, the given number of trie nodes 292 | closest to the root can be omitted from the proof. 
293 | 294 | This message was deprecated in **les/2**, use [GetProofsV2] instead. 295 | 296 | ### Proofs (0x09) 297 | 298 | `[reqID: P, BV: P, [[node_1, node_2, ...], ...]]` 299 | 300 | Return a set of Merkle proofs, each consisting of a set of nodes that must be processed in 301 | order to access the trie entry value (or prove the absence of an entry) requested in 302 | [GetProofs]. 303 | 304 | ### GetContractCodes (0x0a) 305 | 306 | `[reqID: P, [[blockhash: B_32, key: B_32], ...]]` 307 | 308 | Require peer to return a [ContractCodes] message. 309 | 310 | ### ContractCodes (0x0b) 311 | 312 | `[reqID: P, BV: P, [value_0: B, value_1: B, ...]]` 313 | 314 | Provide a set of contract codes which correspond to the block hashes and account keys 315 | previously asked in [GetContractCodes]. 316 | 317 | ### GetHeaderProofs (0x0d) 318 | 319 | `[reqID: P, [[chtNumber: P, blockNumber: P, fromLevel: P], ...]]` 320 | 321 | Require peer to return a [HeaderProofs] message, containing one or more canonical block 322 | headers (of block number `blockNumber`) and corresponding Merkle proofs of the [CHT] 323 | (Canonical Hash Trie) identified by `chtNumber`. If `fromLevel` is greater than zero, the 324 | given number of trie nodes closest to the root can be omitted from the proof. 325 | 326 | This message was deprecated in **les/2**, use [GetHelperTrieProofs] instead. 327 | 328 | ### HeaderProofs (0x0e) 329 | 330 | `[reqID: P, BV: P, [[blockHeader, [node_1, node_2...]], ...]]` 331 | 332 | Return a set of structures, each containing a block header and a Merkle proof proving the 333 | header hash and belonging TD against a given CHT requested in [GetHeaderProofs]. 334 | 335 | ### SendTx (0x0c) 336 | 337 | `[txdata_1, txdata_2, ...]` 338 | 339 | Require peer to add a set of transactions into its transaction pool and relay them to the 340 | ETH network. 341 | 342 | This message was deprecated in **les/2**, use [SendTxV2] instead. 
343 | 344 | ### GetProofsV2 (0x0f) 345 | 346 | `[reqID: P, [[blockhash: B_32, key: B_32, key2: B_32, fromLevel: P], ...]]` 347 | 348 | Require peer to return a [ProofsV2] message, containing a single (and smallest possible) 349 | set of trie nodes that proves for each request the value of index `key` from the state 350 | trie of the given block (if `key2` is empty), or the storage value of index `key2` from 351 | the storage trie referenced in the account at `key`. If `fromLevel` is greater than zero, 352 | the given number of trie nodes closest to the root can be omitted from the proof. 353 | 354 | ### ProofsV2 (0x10) 355 | 356 | `[reqID: P, BV: P, [node_1, node_2, ...]]` 357 | 358 | Return the smallest set of trie nodes required to access the trie entry value (or prove 359 | the absence of an entry) requested in [GetProofsV2]. This set will be called a *proof 360 | set*. Compared to [Proofs], this message contains a single list of nodes satisfying all 361 | requested proofs. The list shouldn't contain duplicate nodes. 362 | 363 | ### GetHelperTrieProofs (0x11) 364 | 365 | `[reqID: P, [[subType: P, sectionIdx: P, key: B, fromLevel: P, auxReq: P], ...]]` 366 | 367 | Require peer to return a [HelperTrieProofs] message, containing a *proof set* and optional 368 | auxiliary data for each request. 369 | 370 | Note: this request is a generalization of the **les/1** [GetHeaderProofs] message. It 371 | retrieves Merkle proofs from different types of "helper tries" which are generated for 372 | every fixed-length section of the canonical chain. `subType` identifies the helper trie 373 | that is being requested for the section marked by `sectionIdx`. `key` and `fromLevel` are 374 | interpreted like in case of proof requests. 375 | 376 | If `auxReq` is greater than zero then auxiliary data is requested too. If `auxReq` is 1 377 | then the root hash of the specified trie (according to the server) is returned and no trie 378 | nodes are added to the proof set. 
This special request will be required for trustless 379 | validation of helper tries. The interpretation of `auxReq` values greater than 1 is 380 | subject to `subType`. 381 | 382 | The following `subType` integer values are allowed in **les/2**: 383 | 384 | - CHT (`0`): request a key from the [Canonical Hash Trie]. If `auxReq` is 2 then the 385 | belonging header is returned as `auxData`. `key` is the block number encoded as an 386 | 8-byte big endian. Note that the section size for CHTs has been raised to 32k instead of 387 | 4k blocks so for example a `sectionIdx` of 100 equals a `chtNumber` of 807 in case of 388 | the **les/1** [GetHeaderProofs] message. 389 | - BloomBits (`1`): request a key from the [BloomBits Trie]. In this trie `key` is 10 bytes 390 | long, it consists of the bloom bit index encoded as a 2-byte big endian, followed by the 391 | section index encoded as an 8-byte big endian. The returned value is the corresponding 392 | compressed bloom bit vector. 393 | 394 | ### HelperTrieProofs (0x12) 395 | 396 | `[reqID: P, BV: P, [[node_1, node_2...], [auxData_0, auxData_1, ...]]]` 397 | 398 | Return a proof set and a set of `auxData` requested in [GetHelperTrieProofs]. The length 399 | of the `auxData` list equals the number of requests with a non-zero `auxReq`. 400 | 401 | ### SendTxV2 (0x13) 402 | 403 | `[reqID: P, [txdata_1, txdata_2, ...]]` 404 | 405 | Require peer to add a set of transactions into its transaction pool and relay them to the 406 | ETH network, then return a [TxStatus] message containing the status of the sent 407 | transactions. 408 | 409 | ### GetTxStatus (0x14) 410 | 411 | `[reqID: P, [txHash_1, txHash_2, ...]]` 412 | 413 | Require peer to return a [TxStatus] message containing the status of the referenced 414 | transactions. This message is intended for inquiry about past transactions sent by the 415 | client. Note that the server is not required to make every transaction available 416 | indefinitely. 
417 | 418 | ### TxStatus (0x15) 419 | 420 | `[reqID: P, BV: P, [[status: P, data: B], ...]]` 421 | 422 | Return the current status of the sent/queried transactions. Possible `status` values are: 423 | 424 | - Unknown (`0`): transaction is unknown 425 | - Queued (`1`): transaction is queued (not processable yet) 426 | - Pending (`2`): transaction is pending (processable) 427 | - Included (`3`): transaction is already included in the canonical chain. `data` contains 428 | an RLP-encoded `[blockHash: B_32, blockNumber: P, txIndex: P]` structure. 429 | - Error (`4`): transaction sending failed. `data` contains a text error message. 430 | 431 | ### StopMsg (0x16) 432 | 433 | Instruct the client to temporarily stop sending requests and to not expect responses to those requests it did not already receive a reply for. 434 | 435 | Implementer's note: this message can be used to handle transient server overloads or individual client flow control buffer underruns. The server should avoid sending [StopMsg] too often though if the client also avoids buffer underruns. It should try to regulate its own utilization (and thereby also the frequency of transient overload occurrences) with the flow control feedback. Receiving [StopMsg] more than once every few minutes in long term average or not receiving [ResumeMsg] in a few seconds can be considered bad service quality by the clients. 436 | 437 | ### ResumeMsg (0x17) 438 | 439 | `[BV: P]` 440 | 441 | Update flow control buffer and allow sending requests again. Note that the requests not answered before [StopMsg] were permanently canceled and will not be answered after [ResumeMsg]. If a [ResumeMsg] is received without a preceding [StopMsg] then it should be treated as a simple flow control buffer update (assuming that the server has already deducted the cost of the previously answered messages). 
442 | 443 | ## Change Log 444 | 445 | ### les/4 (March 2021) 446 | 447 | - Keys `"forkID"` and `"recentTxLookup"` were added to the [Status] message. 448 | 449 | ### les/3 (May 2019) 450 | 451 | - Keys `"serveRecentChain"` and `"serveRecentState"` were added to the [Status] message. 452 | - Messages [StopMsg] and [ResumeMsg] were added to improve handling transient overloads 453 | and flow control buffer underruns. 454 | 455 | ### les/2 (November 2017) 456 | 457 | - The `"announceType"` key was added to the [Status] message. 458 | - The BloomBits Trie and associated messages [GetHelperTrieProofs], [HelperTrieProofs] 459 | were added to facilitate server-assisted log search. **les/1** clients would frequently 460 | download large ranges of receipts to search for specific logs. 461 | - Messages [GetProofsV2], [ProofsV2] were added to de-duplicate result nodes when 462 | requesting multiple proofs at the same time. 463 | - Messages [SendTxV2], [GetTxStatus] and [TxStatus] were added to allow querying for past 464 | transactions and to enable user-level error reporting for non-includable transactions at 465 | the time of submission. 466 | - The [GetHeaderProofs], [HeaderProofs], [GetProofs], [Proofs] and [SendTx] messages from 467 | **les/1** are no longer supported in **les/2**. 
468 | 469 | [Client Side Flow Control]: #client-side-flow-control 470 | [Canonical Hash Trie]: #canonical-hash-trie 471 | [CHT]: #canonical-hash-trie 472 | [BloomBits Trie]: #bloombits-trie 473 | [Status]: #status-0x00 474 | [Announce]: #announce-0x01 475 | [GetBlockHeaders]: #getblockheaders-0x02 476 | [BlockHeaders]: #blockheaders-0x03 477 | [GetBlockBodies]: #getblockbodies-0x04 478 | [BlockBodies]: #blockbodies-0x05 479 | [GetReceipts]: #getreceipts-0x06 480 | [Receipts]: #receipts-0x07 481 | [GetProofs]: #getproofs-0x08 482 | [Proofs]: #proofs-0x09 483 | [GetContractCodes]: #getcontractcodes-0x0a 484 | [ContractCodes]: #contractcodes-0x0b 485 | [GetHeaderProofs]: #getheaderproofs-0x0d 486 | [HeaderProofs]: #headerproofs-0x0e 487 | [SendTx]: #sendtx-0x0c 488 | [GetProofsV2]: #getproofsv2-0x0f 489 | [ProofsV2]: #proofsv2-0x10 490 | [GetHelperTrieProofs]: #gethelpertrieproofs-0x11 491 | [HelperTrieProofs]: #helpertrieproofs-0x12 492 | [SendTxV2]: #sendtxv2-0x13 493 | [GetTxStatus]: #gettxstatus-0x14 494 | [TxStatus]: #txstatus-0x15 495 | [StopMsg]: #stopmsg-0x16 496 | [ResumeMsg]: #resumemsg-0x17 497 | [Ethereum Wire Protocol]: ./eth.md 498 | [Merkle Patricia Trie]: https://github.com/ethereum/wiki/wiki/Patricia-Tree 499 | -------------------------------------------------------------------------------- /discv5/discv5-rationale.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 - Rationale 2 | 3 | **Protocol version v5.1** 4 | 5 | Note that this specification is a work in progress and may change incompatibly without 6 | prior notice. 7 | 8 | This document explains the design requirements and security needs of Discovery v5. In 9 | addition, the document tries to gather the various vulnerabilities and threats that 10 | pertain to Kademlia-like p2p networks. 
Our aim is to make it plain which issues are 11 | addressed and how they are mitigated, so that the design of the [wire protocol] may be 12 | verified. 13 | 14 | # Design Requirements 15 | 16 | ## Basic Goals 17 | 18 | #### 1.1.1 Replace the Discovery v4 Endpoint Proof 19 | 20 | The existing mutual endpoint verification process is unreliable because either side may 21 | forget about a previously performed endpoint proof. If node A assumes that node B already 22 | knows about a recent PING/PONG interaction and sends FINDNODE, the request may fail. 23 | Implementations of Discovery v4 may guard against this flaw using retries, but retrying is 24 | really slow and usually not done. 25 | 26 | #### 1.1.2 Require knowledge of destination node ID for communication 27 | 28 | Make it expensive to obtain the logical node ID from discovery communications. In 29 | Discovery v4, any node can provoke responses knowing IP alone, and obtain information 30 | about a node without knowing its ID. This encourages sloppy implementations to not perform 31 | proper validation of FINDNODE results and increases the risk of DHT misuse for DDoS 32 | purposes. 33 | 34 | #### 1.1.3 Support more than one node ID cryptosystem 35 | 36 | Ensure the DHT can accommodate ENR's with multiple identity systems. This will allow 37 | identity cryptosystems other than *secp256k1/keccak256*. 38 | 39 | #### 1.1.4 Replace node information tuples with ENRs 40 | 41 | ENRs include discovery information and more. These signed, versioned records fulfill 42 | multiple requirements, such as permitting capability advertisement and transport 43 | negotiation. 44 | 45 | #### 1.1.5 Guard against Kademlia implementation flaws 46 | 47 | Discovery v4 trusts other nodes to return neighbors according to an agreed distance 48 | metric. Mismatches in implementation can make it hard for nodes to join the network, or 49 | lead to network fragmentation. 
50 | 51 | #### 1.1.6 Secondary topic-based node index 52 | 53 | The protocol must support discovery of nodes via an arbitrary topic identifier. Finding 54 | nodes belonging to a topic should be as fast or faster than finding a node with a certain 55 | ID. 56 | 57 | #### 1.1.7 Change replay prevention 58 | 59 | The use of timestamps as a replay prevention mechanism in Discovery v4 has led to many 60 | complaints about connectivity when the host's clock was wrong. The protocol should be 61 | independent of the clock. 62 | 63 | #### 1.1.8 Message obfuscation 64 | 65 | The protocol should obfuscate traffic to prevent accidental packet mangling or trivial 66 | sniffing. It must also avoid inclusion of obvious markers to prevent naive blocking of 67 | discovery traffic using hard-coded packet signatures. Defense against advanced traffic 68 | analysis systems, e.g. using inter-packet timing is a secondary concern. 69 | 70 | ## Security Goals 71 | 72 | Individual potential vulnerabilities are identified below. These each represent their own 73 | risk mitigation goal. 74 | 75 | #### 1.2.1 Replay of the handshake 76 | 77 | The handshake, if successfully replayed from an older session, would allow a malicious 78 | node to occupy a former IP location, or pollute the routing table with old information. 79 | 80 | #### 1.2.2 Replay NODES 81 | 82 | A NODES response, if successfully replayed, would pollute the routing table with stale 83 | information. 84 | 85 | #### 1.2.3 Replay PONG 86 | 87 | A PONG, if successfully replayed, could convince a node that a node is live and 88 | participating when it isn't. 89 | 90 | #### 1.2.4 Kademlia redirection 91 | 92 | A FindNode response contains false endpoint information intended at directing traffic at a 93 | victim / polluting the routing table. A topic query results in fake endpoint information, 94 | directing traffic at a victim. 
95 | 96 | #### 1.2.5 Kademlia redirection + self-propagation 97 | 98 | As 1.2.4 but the responses attempt to replicate the malicious node throughout the routing 99 | table, to amplify the source of pollution and traffic. 100 | 101 | #### 1.2.6 Unsolicited replies 102 | 103 | A malicious node is attempting to spam a node with fake responses to typical requests. 104 | These messages may be replayed from previous communications, or may be new messages with 105 | spoofed source endpoints. The aim is to disrupt weak implementations or have their 106 | information be received as authentic, to pollute the recipient's routing table. 107 | 108 | #### 1.2.7 Amplification 109 | 110 | Malicious requests of small message size are sent from spoofed source IPs to direct larger 111 | response messages at the victim. 112 | 113 | #### 1.2.8 Kademlia direct validation 114 | 115 | Direct validation of a newly discovered node can be an attack vector. A malicious node may 116 | supply false node information with the IP of a victim. Validation traffic is then directed 117 | at the victim. 118 | 119 | #### 1.2.9 Kademlia ID count per address validations 120 | 121 | There are various attacks facilitated by being able to associate multiple fake (or even 122 | real) malicious node ids with a single IP endpoint. One mitigation method that is 123 | sometimes considered is to globally limit the number of logical node IDs that can be 124 | associated with an IP address. However, this is an attack vector. A malicious actor can 125 | supply many logical node ids for a single IP address and thus prevent the correct node 126 | from being able to join the network. 127 | 128 | #### 1.2.10 Sybil/Eclipse attacks 129 | 130 | These attacks rely on being able to create many real nodes, or spoof many logical node IDs 131 | for a small number of physical endpoints, to form a large, isolated area of the network 132 | under the control of the malicious actor. 
The victim's discovery findings are directed 133 | into that part of the network, either to manipulate their traffic or to fully isolate them 134 | from the network. 135 | 136 | ## Version Interoperability / Upgrade Paths 137 | 138 | There are several considerations regarding the coexistence of v4 and v5 network members. 139 | 140 | #### 1.3.1 Transition period during network formation 141 | 142 | Discovery v4 clients should be able to serve as discovery v5 bootstrap nodes while the 143 | number of new discovery v5 clients is still low. 144 | 145 | #### 1.3.2 Circumvention of 1.1.2 with v4 PING 146 | 147 | While a client supports both the old v4 and newer versions, it is possible for malicious 148 | actors to pose as a v4 node and recover node IDs from arbitrary IP addresses. This should 149 | somehow be avoided. 150 | 151 | # Rationale 152 | 153 | ## Why UDP? 154 | 155 | The wire protocol specification mandates the use of UDP. This may seem restrictive, but 156 | use of UDP communication is an important part of the design. While there is no single 157 | reason which ultimately dictates this choice, there are many reasons why the system as a 158 | whole will function a lot better in the context of UDP. 159 | 160 | For discovery to work, all nodes must be able to communicate with each other on equal 161 | footing. The network won't form properly if some nodes can only communicate with certain 162 | other nodes. Uncooperative NAT in between the node and the Internet can cause 163 | communication failure. UDP is fundamentally easier to work with when it comes to NAT 164 | traversal. No explicit hole-punching is required if the NAT setup is capable of full-cone 165 | translation, i.e. a single packet sent to any other node establishes a port mapping which 166 | allows packets from others to reach the node behind NAT. 167 | 168 | Unlike other DHT systems such as IPFS, the node discovery protocol mandates a single wire 169 | protocol to be implemented by everyone. 
This avoids communication failures due to 170 | incompatible transports and strengthens the DHT because all participants are guaranteed to 171 | be reachable on the declared endpoint. It is also fundamentally simpler to reason about 172 | and implement: the protocol either works in a certain context or it doesn't. If the 173 | protocol cannot be used because the networking environment doesn't support UDP, another 174 | discovery mechanism must be chosen. 175 | 176 | Another reason for UDP is communication latency: participants in the discovery protocol 177 | must be able to communicate with a large number of other nodes within a short time frame 178 | to establish and maintain the neighbor set and must perform regular liveness checks on 179 | their neighbors. For the topic advertisement system, registrants collect tickets and must 180 | use them as soon as the ticket expires to place an ad in a topic queue. 181 | 182 | These protocol interactions are difficult to implement in a TCP setting where connections 183 | require multiple round-trips before application data can be sent and the connection 184 | lifecycle needs to be maintained. An implementation of the wire protocol on a TCP-based 185 | transport would either need permanent connection to hundreds of nodes, in which case the 186 | application would be short on file descriptors, or establish many short-lived TCP 187 | connections per second to communicate with specific nodes. 188 | 189 | Yet another useful property of UDP is that packets aren't required to reach their 190 | destination --- intermediaries may drop arbitrary packets. This strengthens the protocol 191 | because it must be designed to function even under bad connectivity. Implementations may 192 | exploit the possibility of packet loss to their advantage. A participant can never tell 193 | whether a certain request wasn't answered in time because the recipient chose to ignore it 194 | or because their own connection isn't working. 
An implementation that tries to minimize 195 | traffic or CPU overhead could simply drop a certain amount of packets at application level 196 | to stay within self-imposed limits. 197 | 198 | ## Why Kademlia? 199 | 200 | Kademlia is a simple distributed hash table design proposed in 2002. It is commonly used 201 | for file-sharing systems where content is stored by hash and distributed among 202 | participants based on their 'proximity' according to the XOR distance metric. 203 | 204 | Node discovery is a Kademlia-inspired system but doesn't store any files, only node 205 | information is relayed. We chose Kademlia primarily because the algorithm is simple and 206 | understandable while providing a distributed database that scales with the number of 207 | participants. Our system also relies on the routing table to allow enumeration and random 208 | traversal of the whole network, i.e. all participants can be found. Most importantly, 209 | having a structured network with routing enables thinking about DHT 'address space' and 210 | 'regions of address space'. These concepts are used to build the [topic-based node index]. 211 | 212 | Kademlia is often criticized as a naive design with obvious weaknesses. We believe that 213 | most issues with simple Kademlia can be overcome by careful programming and the benefits 214 | of a simple design outweigh the cost and risks of maintaining a more complex system. 215 | 216 | ## Sybil and Eclipse Attacks 217 | 218 | The well-known 'sybil attack' is based on the observation that creating node identities is 219 | essentially free. In any system using a measure of proximity among node identities, an 220 | adversary may place nodes close to a chosen node by generating suitable identities. For 221 | basic node discovery through network enumeration, the 'sybil attack' poses no significant 222 | challenge. 
Sybils are a serious issue for the topic-based node index, especially for 223 | topics provided by few participants, because the index relies on node distance. 224 | 225 | An 'eclipse attack' is usually based on generating sybil nodes with the goal of polluting 226 | the victim node's routing table. Once the table is overtaken, the victim has no way to 227 | find any other nodes but those controlled by the adversary. Even if creating sybil nodes 228 | were somehow impossible, 'eclipsing' a node might still be achieved through other means 229 | such as directing large amounts of traffic to the node. When the victim node is unable to 230 | keep up regular communication with the rest of the network it may lose connection and be 231 | forced into re-bootstrapping its routing table --- a situation in which it is most 232 | vulnerable. 233 | 234 | Both the 'sybil attack' and the 'eclipse attack' must be considered for any structured 235 | overlay network, and there is no single optimal solution to fully protect against these 236 | attacks. However, certain implementation decisions can make them more expensive or render 237 | them ineffective. 238 | 239 | As a general measure, implementations can place IP-based limits on the content of their 240 | routing table. For example, limiting Kademlia table buckets to two nodes from every /24 IP 241 | subnetwork and the whole table to 10 nodes per /24 IP subnetwork significantly increases 242 | the number of hosts an attacker must control to overtake the routing table. Such limits 243 | are effective because IPv4 addresses are a scarce resource. Subnetwork-based limits remain 244 | effective even as IPv6 adoption progresses. 245 | 246 | To counter being eclipsed via repeated contact by an adversary, implementations of the 247 | Kademlia table should avoid taking on new members on incoming contact unless the table is 248 | well-stocked from outbound queries. 
Readers of the original Kademlia paper may easily 249 | assume that liveness checks on bucket members should be performed just when a new node 250 | tries to enter the bucket, but doing so increases the risk of emptying the table through 251 | DoS. We therefore recommend to perform liveness checks on a separate schedule which is 252 | independent of incoming requests. Checks may also be paused or delayed when the node is 253 | under high load. The number of past liveness checks performed on a bucket member is an 254 | important indicator of its age: Implementations should favor long-lived nodes and may 255 | relax liveness checks according to node age. 256 | 257 | A well-researched countermeasure to sybil attacks is to make creation of identities 258 | computationally expensive. While effective in theory, there are significant downsides to 259 | this approach. Nodes on resource-constrained devices such as mobile phones may not be able 260 | to solve the computational puzzle in time to join the network. Continuous advances in 261 | hashing technology which speed up cryptocurrency proof-of-work algorithms show that this 262 | way of securing the network requires constant adjustments to thresholds and can never beat 263 | determined attackers. 264 | 265 | Support for mixed ENR identity schemes, described later in this document, allows for an 266 | escape hatch to introduce arbitrary optional constraints (including proof-of-work) on node 267 | identities. Thus, while the issue is not directly addressed at wire protocol level, there 268 | is no inherent blocker for solving it as the need arises. 269 | 270 | ## Node Records and Their Properties 271 | 272 | In Discovery v5, all node information is exchanged using [node records]. Records are 273 | self-signed by the node they describe and contain arbitrary key-value pairs. They also 274 | contain a sequence number to determine which copy of the record is newer when multiple 275 | copies are available. 
When a node record is changed by its owner, the sequence number 276 | increases. The new record 'syncs' to neighboring nodes because they will request it during 277 | liveness revalidation. The record is also 'pushed' on to newly seen nodes as part of the 278 | handshake. 279 | 280 | Signing records prevents any intermediary node from changing the content of a record. Any 281 | node's information is either available in the exact form it was published or not at all. 282 | To make the system secure, proper validation of records is important. Implementations must 283 | verify the signature of all received records. Implementations should also avoid sharing 284 | records containing no usable IP addresses or ports and check that Internet hosts do not 285 | attempt to share records containing LAN IP addresses. 286 | 287 | ## On Encryption 288 | 289 | An early draft of Discovery v5 integrated weak obfuscation based on XORing packet content 290 | as an optional facility. As development of the protocol progressed, we understood that 291 | traffic amplification, replay and packet authentication could all be solved by introducing 292 | a real encryption scheme. The way the handshake and encryption works is primarily aimed at 293 | these issues and is not supposed to ensure complete anonymity of DHT users. While it does 294 | protect against passive observers, the handshake is not forward-secure and active protocol 295 | participants can access node information by simply asking for it. 296 | 297 | Node identities can use different kinds of keys depending on the identity scheme used in 298 | the node record. This has implications on the handshake because it deals with the public 299 | key used to derive the identity. Implementations of Discovery v5 must agree on the set of 300 | supported identity schemes to keep the network interoperable and custom code to verify the 301 | handshake is required for every new scheme. 
We believe this is an acceptable tradeoff 302 | because introducing a new kind of node identity is a rare event. 303 | 304 | Since the handshake performs complex cryptographic operations (ECDH, signature 305 | verification) performance of the handshake is a big concern. Benchmarking the experimental 306 | Go implementation shows that the handshake computation takes 500µs on a 2014-era laptop 307 | using the default secp256k1/keccak256 identity scheme. That's a lot, but note the cost 308 | amortizes because nodes commonly exchange multiple packets. Subsequent packets in the same 309 | conversation can be decrypted and authenticated in just 2µs. The most common protocol 310 | interaction is a FINDNODE or TOPICQUERY request on an unknown node with 4 NODES responses. 311 | 312 | To put things into perspective: encryption and authentication in Discovery v5 is still a 313 | significant improvement over the authentication scheme used in Discovery v4, which 314 | performs secp256k1 signature 'recovery' (benchmark: ~170µs) on every packet. A FINDNODE 315 | interaction with an unknown v4 node takes 7 packets (2x PING/PONG, FINDNODE, 2x NEIGHBORS) 316 | and costs 1.2ms on each side for the crypto alone. In addition, the v5 handshake reduces 317 | the risk of computational DoS because it costs as much to create as it costs to verify and 318 | cannot be replayed. 319 | 320 | ## On Amplification and Replay 321 | 322 | Any openly accessible packet-based system must consider misuse of the protocol for traffic 323 | amplification purposes. There are two possible avenues of attack: In the first, an 324 | adversary who wishes to attack a third-party host may send packets with 'spoofed' source 325 | IP address to a node, attempting to make the node send a larger response to the victim 326 | endpoint. In the second, the adversary attempts to install a node record containing the 327 | victim's endpoint in the DHT, causing other nodes to direct packets to the victim. 
328 | 329 | The handshake handles the first kind of attack by responding with a small WHOAREYOU packet 330 | whenever any request is received from an unknown endpoint. This is safe because the 331 | adversary's packet is always larger than the WHOAREYOU response, removing the incentive 332 | for the attack. To make the countermeasure work, implementations must keep session secrets 333 | not just per node ID, but also per node IP. 334 | 335 | The second kind of attack --- installing the victim as a node --- is handled by requiring 336 | that implementations mustn't answer queries with nodes whose liveness hasn't been 337 | verified. When a node is added to the Kademlia table, it must pass at least one check on 338 | the IP declared in the node record before it can be returned in a NODES response. 339 | 340 | An adversary may also try to replay previously sent/seen packets to impersonate a node or 341 | disturb the operation of the protocol. Session keys per node-ID/IP generally prevent 342 | replay across sessions. The `request-id`, mirrored in response packets, prevents replay of 343 | responses within a session. 344 | 345 | ## The Topic Index 346 | 347 | Using FINDNODE queries with appropriately chosen targets, the entire DHT can be sampled by 348 | a random walk to find all other participants. When building a distributed application, it 349 | is often desirable to restrict the search to participants which provide a certain service. 350 | A simple solution to this problem would be to simply split up the network and require 351 | participation in many smaller application-specific networks. However, such networks are 352 | hard to bootstrap and also more vulnerable to attacks which could isolate nodes. 353 | 354 | The topic index provides discovery by provided service in a different way. Nodes maintain 355 | a single node table tracking their neighbors and advertise 'topics' on nodes found by 356 | randomly walking the DHT.
While the 'global' topic index can also be spammed, it makes 357 | complete isolation a lot harder. To prevent nodes interested in a certain topic from 358 | finding each other, the entire discovery network would have to be overpowered. 359 | 360 | To make the index useful, searching for nodes by topic must be efficient regardless of the 361 | number of advertisers. This is achieved by estimating the topic 'radius', i.e. the 362 | percentage of all live nodes which are advertising the topic. Advertisement and search 363 | activities are restricted to a region of DHT address space around the topic's 'center'. 364 | 365 | We also want the index to satisfy another property: When a topic advertisement is placed, 366 | it should last for a well-defined amount of time. This ensures nodes may rely on their 367 | advertisements staying placed rather than worrying about keeping them alive. 368 | 369 | Finally, the index should consume limited resources. Just as the node table is limited in 370 | number and size of buckets, the size of the index data structure on each node is limited. 371 | 372 | ### Why should advertisers wait? 373 | 374 | Advertisers must wait a certain amount of time before they can be registered. Enforcing 375 | this time limit prevents misuse of the topic index because any topic must be important 376 | enough to outweigh the cost of waiting. Imagine a group phone call: announcing the 377 | participants of the call using topic advertisement isn't a good use of the system because 378 | the topic exists only for a short time and will have very few participants. The waiting 379 | time prevents using the index for this purpose because the call might already be over 380 | before everyone could get registered. 381 | 382 | ### Dealing with Topic Spam 383 | 384 | Our model is based on the following assumptions: 385 | 386 | - Anyone can place their own advertisements under any topics and the rate of placing ads 387 | is not limited globally.
The number of active ads for any node is roughly proportional 388 | to the resources (network bandwidth, mostly) spent on advertising. 389 | - Honest actors whose purpose is to connect to other honest actors will spend an adequate 390 | amount of efforts on registering and searching for ads, depending on the rate of newly 391 | established connections they are targeting. If the given topic is used only by honest 392 | actors, a few registrations per minute will be satisfactory, regardless of the size of 393 | the subnetwork. 394 | - Dishonest actors may want to place an excessive amount of ads just to disrupt the 395 | discovery service. This will reduce the effectiveness of honest registration efforts by 396 | increasing the topic radius and/or topic queue waiting times. If the attacker(s) can 397 | place a comparable amount or more ads than all honest actors combined then the rate of 398 | new (useful) connections established throughout the network will reduce proportionally 399 | to the `honest / (dishonest + honest)` registration rates. 400 | 401 | This adverse effect can be countered by honest actors increasing their registration and 402 | search efforts. Fortunately, the rate of established connections between them will 403 | increase proportionally both with increased honest registration and search efforts. If 404 | both are increased in response to an attack, the required factor of increased efforts from 405 | honest actors is proportional to the square root of the attacker's efforts. 406 | 407 | ### Detecting a useless registration attack 408 | 409 | In the case of a symmetrical protocol, where nodes are both searching and advertising 410 | under the same topic, it is easy to detect when most of the found ads turn out to be 411 | useless and increase both registration and query frequency. 
It is a bit harder but still 412 | possible with asymmetrical (client-server) protocols, where only clients can easily detect 413 | useless registrations, while advertisers (servers) do not have a direct way of detecting 414 | when they should increase their advertising efforts. One possible solution is for servers 415 | to also act as clients just to test the server capabilities of other advertisers. It is 416 | also possible to implement a feedback system between trusted clients and servers. 417 | 418 | # References 419 | 420 | - Petar Maymounkov and David Mazières. 421 | *Kademlia: A Peer-to-peer Information System Based on the XOR Metric.* 2002.\ 422 | 423 | 424 | - Atul Singh, Tsuen-Wan “Johnny” Ngan, Peter Druschel, Dan S. Wallach. 425 | *Eclipse Attacks on Overlay Networks: Threats and Defenses*. 2006.\ 426 | 427 | 428 | - Ingmar Baumgart and Sebastian Mies. 429 | *S/Kademlia: A Practicable Approach Towards Secure Key-Based Routing.* 2007.\ 430 | 431 | 432 | - Xin Sun, Ruben Torres and Sanjay Rao. *Feasibility of DDoS Attacks with P2P Systems and 433 | Prevention through Robust Membership Management.* 2007.\ 434 | 435 | 436 | - Erik Hjelmvik, Wolfgang John. *Breaking and Improving Protocol Obfuscation.* 2010.\ 437 | 438 | 439 | - Adam Langley, Wan-Teh Chang. *QUIC Crypto*. 2016.\ 440 | 441 | 442 | - W3C Credentials Community Group. *Decentralized Identifiers (DIDs) Spec.* 2017.\ 443 | 444 | 445 | - Seoung Kyun Kim, Zane Ma, Siddharth Murali, Joshua Mason, Andrew Miller, Michael Bailey. 446 | *Measuring Ethereum Network Peers*. 2018.\ 447 | 448 | 449 | - Yuval Marcus, Ethan Heilman, Sharon Goldberg.
450 | *Low-Resource Eclipse Attacks on Ethereum’s Peer-to-Peer Network.* 2018.\ 451 | 452 | 453 | [wire protocol]: ./discv5-wire.md 454 | [topic-based node index]: ./discv5-theory.md#topic-advertisement 455 | [node records]: ../enr.md 456 | -------------------------------------------------------------------------------- /discv5/discv5-theory.md: -------------------------------------------------------------------------------- 1 | # Node Discovery Protocol v5 - Theory 2 | 3 | **Protocol version v5.1** 4 | 5 | This document explains the algorithms and data structures used by the protocol. 6 | 7 | ## Nodes, Records and Distances 8 | 9 | A participant in the Node Discovery Protocol is represented by a 'node record' as defined 10 | in [EIP-778]. The node record keeps arbitrary information about the node. For the purposes 11 | of this protocol, the node must at least provide an IP address (`"ip"` or `"ip6"` key) and 12 | UDP port (`"udp"` key) in order to have its record relayed in the DHT. 13 | 14 | Node records are signed according to an 'identity scheme'. Any scheme can be used with 15 | Node Discovery Protocol, and nodes using different schemes can communicate. 16 | 17 | The identity scheme of a node record defines how a 32-byte 'node ID' is derived from the 18 | information contained in the record. The 'distance' between two node IDs is the bitwise 19 | XOR of the IDs, taken as the big-endian number. 20 | 21 | distance(n₁, n₂) = n₁ XOR n₂ 22 | 23 | In many situations, the logarithmic distance (i.e. length of differing suffix in bits) is 24 | used in place of the actual distance. 25 | 26 | logdistance(n₁, n₂) = log2(distance(n₁, n₂)) 27 | 28 | ### Maintaining The Local Node Record 29 | 30 | Participants should update their record, increase the sequence number and sign a new 31 | version of the record whenever their information changes. This is especially important for 32 | changes to the node's IP address and port.
Implementations should determine the external 33 | endpoint (the Internet-facing IP address and port on which the node can be reached) and 34 | include it in their record. 35 | 36 | If communication flows through a NAT device, the UPnP/NAT-PMP protocols or the mirrored 37 | UDP envelope IP and port found in the [PONG] message can be used to determine the external 38 | IP address and port. 39 | 40 | If the endpoint cannot be determined (e.g. when the NAT doesn't support 'full-cone' 41 | translation), implementations should omit IP address and UDP port from the record. 42 | 43 | ## Sessions 44 | 45 | Discovery communication is encrypted and authenticated using session keys, established in 46 | the handshake. Since every node participating in the network acts as both client and 47 | server, a handshake can be initiated by either side of communication at any time. 48 | 49 | ### Handshake Steps 50 | 51 | #### Step 1: Node A sends message packet 52 | 53 | In the following definitions, we assume that node A wishes to communicate with node B, 54 | e.g. to send a FINDNODE message. Node A must have a copy of node B's record in order to 55 | communicate with it. 56 | 57 | If node A has session keys from prior communication with B, it encrypts its request with 58 | those keys. If no keys are known, it initiates the handshake by sending an ordinary 59 | message packet with random message content. 60 | 61 | A -> B FINDNODE message packet encrypted with unknown key 62 | 63 | #### Step 2: Node B responds with challenge 64 | 65 | Node B receives the message packet and extracts the source node ID from the packet header. 66 | If node B has session keys from prior communication with A, it attempts to decrypt the 67 | message data. If decryption and authentication of the message succeeds, there is no need 68 | for a handshake and node B can simply respond to the request. 
69 | 70 | If node B does not have session keys or decryption is not successful, it must initiate a 71 | handshake by responding with a [WHOAREYOU packet]. 72 | 73 | It first generates a unique `id-nonce` value and includes it in the packet. Node B also 74 | checks if it has a copy of node A's record. If it does, it also includes the sequence 75 | number of this record in the challenge packet, otherwise it sets the `enr-seq` field to 76 | zero. 77 | 78 | Node B must also store node A's record and the WHOAREYOU challenge for a short duration 79 | after sending it to node A because they will be needed again in step 4. 80 | 81 | A <- B WHOAREYOU packet including id-nonce, enr-seq 82 | 83 | #### Step 3: Node A processes the challenge 84 | 85 | Node A receives the challenge sent by node B, which confirms that node B is alive and is 86 | ready to perform the handshake. The challenge can be traced back to the request packet 87 | which solicited it by checking the `nonce`, which mirrors the request packet's `nonce`. 88 | 89 | Node A proceeds with the handshake by re-sending the FINDNODE request as a [handshake 90 | message packet]. This packet contains three parts in addition to the message: 91 | `id-signature`, `ephemeral-pubkey` and `record`. 92 | 93 | The handshake uses the unmasked WHOAREYOU challenge as an input: 94 | 95 | challenge-data = masking-iv || static-header || authdata 96 | 97 | Node A can now derive the new session keys. To do so, it first generates an ephemeral key 98 | pair on the elliptic curve used by node B's identity scheme. As an example, let's assume 99 | the node record of B uses the "v4" scheme. In this case the `ephemeral-pubkey` will be a 100 | public key on the secp256k1 curve.
101 | 102 | ephemeral-key = random private key generated by node A 103 | ephemeral-pubkey = public key corresponding to ephemeral-key 104 | 105 | The ephemeral key is used to perform Diffie-Hellman key agreement with node B's static 106 | public key and the session keys are derived from it using the HKDF key derivation 107 | function. 108 | 109 | dest-pubkey = public key corresponding to node B's static private key 110 | secret = ecdh(dest-pubkey, ephemeral-key) 111 | kdf-info = "discovery v5 key agreement" || node-id-A || node-id-B 112 | prk = HKDF-Extract(secret, challenge-data) 113 | key-data = HKDF-Expand(prk, kdf-info) 114 | initiator-key = key-data[:16] 115 | recipient-key = key-data[16:] 116 | 117 | Node A creates the `id-signature`, which proves that it controls the private key which 118 | signed its node record. The signature also prevents replay of the handshake. 119 | 120 | id-signature-text = "discovery v5 identity proof" 121 | id-signature-input = id-signature-text || challenge-data || ephemeral-pubkey || node-id-B 122 | id-signature = id_sign(sha256(id-signature-input)) 123 | 124 | Finally, node A compares the `enr-seq` element of the WHOAREYOU challenge against its own 125 | node record sequence number. If the sequence number in the challenge is lower, it includes 126 | its record into the handshake message packet. 127 | 128 | The request is now re-sent, with the message encrypted using the new session keys. 129 | 130 | A -> B FINDNODE handshake message packet, encrypted with new initiator-key 131 | 132 | #### Step 4: Node B receives handshake message 133 | 134 | When node B receives the handshake message packet, it first loads the node record and 135 | WHOAREYOU challenge which it sent and stored earlier. 136 | 137 | If node B did not have the node record of node A, the handshake message packet must 138 | contain a node record. A record may also be present if node A determined that its record 139 | is newer than B's current copy. 
If the packet contains a node record, B must first 140 | validate it by checking the record's signature. 141 | 142 | Node B then verifies the `id-signature` against the identity public key of A's record. 143 | 144 | After that, B can perform the key derivation using its own static private key and the 145 | `ephemeral-pubkey` from the handshake packet. Using the resulting session keys, it 146 | attempts to decrypt the message contained in the packet. 147 | 148 | If the message can be decrypted and authenticated, Node B considers the new session keys 149 | valid and responds to the message. In our example case, the response is a `NODES` message: 150 | 151 | A <- B NODES encrypted with new recipient-key 152 | 153 | #### Step 5: Node A receives response message 154 | 155 | Node A receives the message packet response and authenticates/decrypts it with the new 156 | session keys. If decryption/authentication succeeds, node B's identity is verified and 157 | node A also considers the new session keys valid. 158 | 159 | ### Identity-Specific Cryptography in the Handshake 160 | 161 | Establishment of session keys is dependent on the [identity scheme] used by the recipient 162 | (i.e. the node which sends WHOAREYOU). Likewise, the signature over `id-sig-input` is made 163 | by the identity key of the initiator. It is not required that initiator and recipient use 164 | the same identity scheme in their respective node records. Implementations must be able to 165 | perform the handshake for all supported identity schemes. 166 | 167 | At this time, the only supported identity scheme is "v4". 168 | 169 | `id_sign(hash)` creates a signature over `hash` using the node's static private key. The 170 | signature is encoded as the 64-byte array `r || s`, i.e. as the concatenation of the 171 | signature values. 172 | 173 | `ecdh(pubkey, privkey)` creates a secret through elliptic-curve Diffie-Hellman key 174 | agreement. 
The public key is multiplied by the private key to create a secret ephemeral 175 | key `eph = pubkey * privkey`. The 33-byte secret output is `y || eph.x` where `y` is 176 | `0x02` when `eph.y` is even or `0x03` when `eph.y` is odd. 177 | 178 | ### Handshake Implementation Considerations 179 | 180 | Since a handshake may happen at any time, UDP packets may be reordered by transmitting 181 | networking equipment, implementations must deal with certain subtleties regarding the 182 | handshake. 183 | 184 | In general, implementations should keep a reference to all sent request packets until the 185 | request either times out, is answered by the corresponding response packet or answered by 186 | WHOAREYOU. If WHOAREYOU is received as the answer to a request, the request must be 187 | re-sent as a handshake packet. 188 | 189 | If an implementation supports sending concurrent requests, multiple responses may be 190 | pending when WHOAREYOU is received, as in the following example: 191 | 192 | A -> B FINDNODE 193 | A -> B PING 194 | A -> B TOPICQUERY 195 | A <- B WHOAREYOU (nonce references PING) 196 | 197 | When this happens, all buffered requests can be considered invalid (the remote end cannot 198 | decrypt them) and the packet referenced by the WHOAREYOU `nonce` (in this example: PING) 199 | must be re-sent as a handshake. When the response to the re-sent is received, the new 200 | session is established and other pending requests (example: FINDNODE, TOPICQUERY) may be 201 | re-sent. 202 | 203 | Note that WHOAREYOU is only ever valid as a response to a previously sent request. If 204 | WHOAREYOU is received but no requests are pending, the handshake attempt can be ignored. 205 | 206 | Another important issue is the processing of message packets while a challenge is 207 | received: consider the case where node A has sent a packet that B cannot decrypt, and B 208 | has responded with WHOAREYOU. 
209 | 210 | A -> B FINDNODE 211 | A <- B WHOAREYOU 212 | 213 | Node B is now waiting for a handshake message packet to complete the new session, but 214 | instead receives another ordinary message packet. 215 | 216 | A -> B ORDINARY MESSAGE PACKET 217 | 218 | In this case, implementations should respond with a new WHOAREYOU challenge referencing 219 | the message packet. 220 | 221 | ### Session Cache 222 | 223 | Nodes should store session keys for communication with other recently-seen nodes. Since 224 | sessions are ephemeral and can be re-established whenever necessary, it is sufficient to 225 | store a limited number of sessions in an in-memory LRU cache. 226 | 227 | To prevent IP spoofing attacks, implementations must ensure that session secrets and the 228 | handshake are tied to a specific UDP endpoint. This is simple to implement by using the 229 | node ID and IP/port as the 'key' into the in-memory session cache. When a node switches 230 | endpoints, e.g. when roaming between different wireless networks, sessions will have to be 231 | re-established by handshaking again. This requires no effort on behalf of the roaming node 232 | because the recipients of protocol messages will simply refuse to decrypt messages from 233 | the new endpoint and reply with WHOAREYOU. 234 | 235 | The number of messages which can be encrypted with a certain session key is limited 236 | because encryption of each message requires a unique nonce for AES-GCM. In addition to the 237 | keys, the session cache must also keep track of the count of outgoing messages to ensure 238 | the uniqueness of nonce values. Since the wire protocol uses 96 bit AES-GCM nonces, it is 239 | strongly recommended to generate them by encoding the current outgoing message count into 240 | the first 32 bits of the nonce and filling the remaining 64 bits with random data 241 | generated by a cryptographically secure random number generator. 
242 | 243 | ## Node Table 244 | 245 | Nodes keep information about other nodes in their neighborhood. Neighbor nodes are stored 246 | in a routing table consisting of 'k-buckets'. For each `0 ≤ i < 256`, every node keeps a 247 | k-bucket for nodes of `logdistance(self, n) == i`. The Node Discovery Protocol uses `k = 248 | 16`, i.e. every k-bucket contains up to 16 node entries. The entries are sorted by time 249 | last seen — least-recently seen node at the head, most-recently seen at the tail. 250 | 251 | Whenever a new node N₁ is encountered, it can be inserted into the corresponding bucket. 252 | If the bucket contains less than `k` entries N₁ can simply be added as the first entry. If 253 | the bucket already contains `k` entries, the liveness of the least recently seen node in 254 | the bucket, N₂, needs to be revalidated. If no reply is received from N₂ it is considered 255 | dead, removed and N₁ added to the front of the bucket. 256 | 257 | Neighbors of very low distance are unlikely to occur in practice. Implementations may omit 258 | k-buckets for low distances. 259 | 260 | ### Table Maintenance In Practice 261 | 262 | Nodes are expected to keep track of their close neighbors and regularly refresh their 263 | information. To do so, a lookup targeting the least recently refreshed bucket should be 264 | performed at regular intervals. 265 | 266 | Checking node liveness whenever a node is to be added to a bucket is impractical and 267 | creates a DoS vector. Implementations should perform liveness checks asynchronously with 268 | bucket addition and occasionally verify that a random node in a random bucket is live by 269 | sending [PING]. When the PONG response indicates that a new version of the node record is 270 | available, the liveness check should pull the new record and update it in the local table. 
271 | 272 | If a node's liveness has been verified many times, implementations may consider occasional 273 | non-responsiveness permissible and assume the node is live. 274 | 275 | When responding to FINDNODE, implementations must avoid relaying any nodes whose liveness 276 | has not been verified. This is easy to achieve by storing an additional flag per node in 277 | the table, tracking whether the node has ever successfully responded to a PING request. 278 | 279 | In order to keep all k-bucket positions occupied even when bucket members fail liveness 280 | checks, it is strongly recommended to maintain a 'replacement cache' alongside each 281 | bucket. This cache holds recently-seen nodes which would fall into the corresponding bucket 282 | but cannot become a member of the bucket because it is already at capacity. Once a bucket 283 | member becomes unresponsive, a replacement can be chosen from the cache. 284 | 285 | ### Lookup 286 | 287 | A 'lookup' locates the `k` closest nodes to a node ID. 288 | 289 | The lookup initiator starts by picking `α` closest nodes to the target it knows of from 290 | the local table. The initiator then sends [FINDNODE] requests to those nodes. `α` is an 291 | implementation-defined concurrency parameter, typically `3`. As NODES responses are 292 | received, the initiator resends FINDNODE to nodes it has learned about from previous 293 | queries. Of the `k` nodes the initiator has heard of closest to the target, it picks `α` 294 | that it has not yet queried and sends FINDNODE to them. The lookup terminates when the 295 | initiator has queried and gotten responses from the `k` closest nodes it has seen. 296 | 297 | To improve the resilience of lookups against adversarial nodes, the algorithm may be 298 | adapted to perform network traversal on multiple disjoint paths. Not only does this 299 | approach benefit security, it also improves effectiveness because more nodes are visited 300 | during a single lookup. 
The initial `k` closest nodes are partitioned into multiple 301 | independent 'path' buckets, and concurrent FINDNODE requests executed as described above, 302 | with one difference: results discovered on one path are not reused on another, i.e. each 303 | path attempts to reach the closest nodes to the lookup target independently without 304 | reusing intermediate results found on another path. Note that it is still necessary to 305 | track previously asked nodes across all paths to keep the paths disjoint. 306 | 307 | ### Lookup Protocol 308 | 309 | This section shows how the wire protocol messages can be used to perform a lookup 310 | interaction against a single node. 311 | 312 | Node `A` is looking for target `x`. It selects node `B` from the local table or 313 | intermediate lookup results. To query for nodes close to `x` on `B`, node `A` computes the 314 | query distance `d = logdistance(B, x)` and sends its request. 315 | 316 | A -> B FINDNODE [d] 317 | 318 | Node `B` responds with multiple nodes messages containing the nodes at the queried 319 | distance. 320 | 321 | A <- B NODES [N₁, N₂, N₃] 322 | A <- B NODES [N₄, N₅] 323 | 324 | Depending on the value of `d` and the content of `B`'s table, the response to the initial 325 | query might contain very few nodes or no nodes at all. Should this be the case, `A` varies 326 | the distance to retrieve more nodes from adjacent k-buckets on `B`: 327 | 328 | A -> B FINDNODE [d+1] 329 | 330 | `B` responds with more nodes: 331 | 332 | A <- B NODES [N₆, N₇] 333 | 334 | Node `A` now sorts all received nodes by distance to the lookup target and proceeds by 335 | repeating the lookup procedure on another, closer node. 336 | 337 | ## Topic Advertisement 338 | 339 | The topic advertisement subsystem indexes participants by their provided services. A 340 | node's provided services are identified by arbitrary strings called 'topics'.
A node 341 | providing a certain service is said to 'place an ad' for itself when it makes itself 342 | discoverable under that topic. Depending on the needs of the application, a node can 343 | advertise multiple topics or no topics at all. Every node participating in the discovery 344 | protocol acts as an advertisement medium, meaning that it accepts topic ads from other 345 | nodes and later returns them to nodes searching for the same topic. 346 | 347 | ### Topic Table 348 | 349 | Nodes store ads for any number of topics and a limited number of ads for each topic. The 350 | data structure holding advertisements is called the 'topic table'. The list of ads for a 351 | particular topic is called the 'topic queue' because it functions like a FIFO queue of 352 | limited length. The image below depicts a topic table containing three queues. The queue 353 | for topic `T₁` is at capacity. 354 | 355 | ![topic table](./img/topic-queue-diagram.png) 356 | 357 | The queue size limit is implementation-defined. Implementations should place a global 358 | limit on the number of ads in the topic table regardless of the topic queue which contains 359 | them. Reasonable limits are 100 ads per queue and 50000 ads across all queues. Since ENRs 360 | are at most 300 bytes in size, these limits ensure that a full topic table consumes 361 | approximately 15MB of memory. 362 | 363 | Any node may appear at most once in any topic queue, that is, registration of a node which 364 | is already registered for a given topic fails. Implementations may impose other 365 | restrictions on the table, such as restrictions on the number of IP-addresses in a certain 366 | range or number of occurrences of the same node across queues. 367 | 368 | ### Tickets 369 | 370 | Ads should remain in the queue for a constant amount of time, the `target-ad-lifetime`. 
To 371 | maintain this guarantee, new registrations are throttled and registrants must wait for a 372 | certain amount of time before they are admitted. When a node attempts to place an ad, it 373 | receives a 'ticket' which tells them how long they must wait before they will be accepted. 374 | It is up to the registrant node to keep the ticket and present it to the advertisement 375 | medium when the waiting time has elapsed. 376 | 377 | The waiting time constant is: 378 | 379 | target-ad-lifetime = 15min 380 | 381 | The assigned waiting time for any registration attempt is determined according to the 382 | following rules: 383 | 384 | - When the table is full, the waiting time is assigned based on the lifetime of the oldest 385 | ad across the whole table, i.e. the registrant must wait for a table slot to become 386 | available. 387 | - When the topic queue is full, the waiting time depends on the lifetime of the oldest ad 388 | in the queue. The assigned time is `target-ad-lifetime - oldest-ad-lifetime` in this 389 | case. 390 | - Otherwise the ad may be placed immediately. 391 | 392 | Tickets are opaque objects storing arbitrary information determined by the issuing node. 393 | While details of encoding and ticket validation are up to the implementation, tickets must 394 | contain enough information to verify that: 395 | 396 | - The node attempting to use the ticket is the node which requested it. 397 | - The ticket is valid for a single topic only. 398 | - The ticket can only be used within the registration window. 399 | - The ticket can't be used more than once. 400 | 401 | Implementations may choose to include arbitrary other information in the ticket, such as 402 | the cumulative wait time spent by the advertiser. 
A practical way to handle tickets is to 403 | encrypt and authenticate them with a dedicated secret key: 404 | 405 | ticket = aesgcm_encrypt(ticket-key, ticket-nonce, ticket-pt, '') 406 | ticket-pt = [src-node-id, src-ip, topic, req-time, wait-time, cum-wait-time] 407 | src-node-id = node ID that requested the ticket 408 | src-ip = IP address that requested the ticket 409 | topic = the topic that the ticket is valid for 410 | req-time = absolute time of REGTOPIC request 411 | wait-time = waiting time assigned when ticket was created 412 | cum-wait-time = cumulative waiting time of this node 413 | 414 | ### Registration Window 415 | 416 | The image below depicts a single ticket's validity over time. When the ticket is issued, 417 | the node keeping it must wait until the registration window opens. The length of the 418 | registration window is 10 seconds. The ticket becomes invalid after the registration 419 | window has passed. 420 | 421 | ![ticket validity over time](./img/ticket-validity.png) 422 | 423 | Since all ticket waiting times are assigned to expire when a slot in the queue opens, the 424 | advertisement medium may receive multiple valid tickets during the registration window and 425 | must choose one of them to be admitted in the topic queue. The winning node is notified 426 | using a [REGCONFIRMATION] response. 427 | 428 | Picking the winner can be achieved by keeping track of a single 'next ticket' per queue 429 | during the registration window. Whenever a new ticket is submitted, first determine its 430 | validity and compare it against the current 'next ticket' to determine which of the two is 431 | better according to an implementation-defined metric such as the cumulative wait time 432 | stored in the ticket. 433 | 434 | ### Advertisement Protocol 435 | 436 | This section explains how the topic-related protocol messages are used to place an ad. 437 | 438 | Let us assume that node `A` provides topic `T`.
It selects node `C` as advertisement 439 | medium and wants to register an ad, so that when node `B` (who is searching for topic `T`) 440 | asks `C`, `C` can return the registration entry of `A` to `B`. 441 | 442 | Node `A` first attempts to register without a ticket by sending [REGTOPIC] to `C`. 443 | 444 | A -> C REGTOPIC [T, ""] 445 | 446 | `C` replies with a ticket and waiting time. 447 | 448 | A <- C TICKET [ticket, wait-time] 449 | 450 | Node `A` now waits for the duration of the waiting time. When the wait is over, `A` sends 451 | another registration request including the ticket. `C` does not need to remember its 452 | issued tickets since the ticket is authenticated and contains enough information for `C` 453 | to determine its validity. 454 | 455 | A -> C REGTOPIC [T, ticket] 456 | 457 | Node `C` replies with another ticket. Node `A` must keep this ticket in place of the 458 | earlier one, and must also be prepared to handle a confirmation call in case registration 459 | was successful. 460 | 461 | A <- C TICKET [ticket, wait-time] 462 | 463 | Node `C` waits for the registration window to end on the queue and selects `A` as the node 464 | which is registered. Node `C` places `A` into the topic queue for `T` and sends a 465 | [REGCONFIRMATION] response. 466 | 467 | A <- C REGCONFIRMATION [T] 468 | 469 | ### Ad Placement And Topic Radius 470 | 471 | Since every node may act as an advertisement medium for any topic, advertisers and nodes 472 | looking for ads must agree on a scheme by which ads for a topic are distributed. When the 473 | number of nodes advertising a topic is at least a certain percentage of the whole 474 | discovery network (rough estimate: at least 1%), ads may simply be placed on random nodes 475 | because searching for the topic on randomly selected nodes will locate the ads quickly enough. 
476 | 477 | However, topic search should be fast even when the number of advertisers for a topic is 478 | much smaller than the number of all live nodes. Advertisers and searchers must agree on a 479 | subset of nodes to serve as advertisement media for the topic. This subset is simply a 480 | region of the node ID address space, consisting of nodes whose Kademlia address is within a 481 | certain distance to the topic hash `sha256(T)`. This distance is called the 'topic 482 | radius'. 483 | 484 | Example: for a topic `f3b2529e...` with a radius of 2^240, the subset covers all nodes 485 | whose IDs have prefix `f3b2...`. A radius of 2^256 means the entire network, in which case 486 | advertisements are distributed uniformly among all nodes. The diagram below depicts a 487 | region of the address space with topic hash `t` in the middle and several nodes close to 488 | `t` surrounding it. Dots above the nodes represent entries in the node's queue for the 489 | topic. 490 | 491 | ![diagram explaining the topic radius concept](./img/topic-radius-diagram.png) 492 | 493 | To place their ads, participants simply perform a random walk within the currently 494 | estimated radius and run the advertisement protocol by collecting tickets from all nodes 495 | encountered during the walk and using them when their waiting time is over. 496 | 497 | ### Topic Radius Estimation 498 | 499 | Advertisers must estimate the topic radius continuously in order to place their ads on 500 | nodes where they will be found. The radius mustn't fall below a certain size because 501 | restricting registration to too few nodes leaves the topic vulnerable to censorship and 502 | leads to long waiting times. If the radius were too large, searching nodes would take too 503 | long to find the ads. 504 | 505 | Estimating the radius uses the waiting time as an indicator of how many other nodes are 506 | attempting to place ads in a certain region. 
This is achieved by keeping track of the 507 | average time to successful registration within segments of the address space surrounding 508 | the topic hash. Advertisers initially assume the radius is 2^256, i.e. the entire network. 509 | As tickets are collected, the advertiser samples the time it takes to place an ad in each 510 | segment and adjusts the radius such that registration at the chosen distance takes 511 | approximately `target-ad-lifetime / 2` to complete. 512 | 513 | ## Topic Search 514 | 515 | Finding nodes that provide a certain topic is a continuous process which reads the content 516 | of topic queues inside the approximated topic radius. This is a much simpler process than 517 | topic advertisement because collecting tickets and waiting on them is not required. 518 | 519 | To find nodes for a topic, the searcher generates random node IDs inside the estimated 520 | topic radius and performs Kademlia lookups for these IDs. All (intermediate) nodes 521 | encountered during lookup are asked for topic queue entries using the [TOPICQUERY] packet. 522 | 523 | Radius estimation for topic search is similar to the estimation procedure for 524 | advertisement, but samples the average number of results from TOPICQUERY instead of 525 | average time to registration. The radius estimation value can be shared with the 526 | registration algorithm if the same topic is being registered and searched for. 
527 | 528 | [EIP-778]: ../enr.md 529 | [identity scheme]: ../enr.md#record-structure 530 | [handshake message packet]: ./discv5-wire.md#handshake-message-packet-flag--2 531 | [WHOAREYOU packet]: ./discv5-wire.md#whoareyou-packet-flag--1 532 | [PING]: ./discv5-wire.md#ping-request-0x01 533 | [PONG]: ./discv5-wire.md#pong-response-0x02 534 | [FINDNODE]: ./discv5-wire.md#findnode-request-0x03 535 | [REGTOPIC]: ./discv5-wire.md#regtopic-request-0x07 536 | [REGCONFIRMATION]: ./discv5-wire.md#regconfirmation-response-0x09 537 | [TOPICQUERY]: ./discv5-wire.md#topicquery-request-0x0a 538 | --------------------------------------------------------------------------------