├── sleep.pdf ├── dat-paper.pdf ├── source ├── readme.md ├── buildpapers.sh ├── dat-paper.bib ├── sleep.md ├── sleep.latex ├── dat-paper.md └── dat-paper.latex └── readme.md /sleep.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dat-ecosystem-archive/whitepaper/HEAD/sleep.pdf -------------------------------------------------------------------------------- /dat-paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dat-ecosystem-archive/whitepaper/HEAD/dat-paper.pdf -------------------------------------------------------------------------------- /source/readme.md: -------------------------------------------------------------------------------- 1 | # Creating + Generating Paper from Markdown 2 | 3 | [See this gist](https://gist.github.com/maxogden/97190db73ac19fc6c1d9beee1a6e4fc8) for more information on how the paper is created with a basic example. 4 | 5 | To generate the paper again, make sure you have `pandoc` and `pandoc-citeproc`: 6 | 7 | ``` 8 | brew install pandoc pandoc-citeproc 9 | ``` 10 | 11 | Then run the build script in `source`. 12 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | [![deprecated](http://badges.github.io/stability-badges/dist/deprecated.svg)](https://github.com/dat-ecosystem-archive/DEPs) See [DEPs](https://github.com/dat-ecosystem-archive/DEPs) for similar functionality. 2 | 3 | More info on active projects and modules at [dat-ecosystem.org](https://dat-ecosystem.org/) 4 | 5 | # Dat Whitepaper 6 | 7 | Dat whitepaper originally published *April 2017*. 
8 | 9 | *These papers are archived versions and may not reflect the latest Dat specification.* 10 | -------------------------------------------------------------------------------- /source/buildpapers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | pandoc --filter pandoc-citeproc --bibliography=dat-paper.bib --variable classoption=twocolumn --variable papersize=a4 -s dat-paper.md -t latex -o dat-paper.latex 4 | 5 | pandoc --filter pandoc-citeproc --bibliography=dat-paper.bib --variable classoption=twocolumn --variable papersize=a4 -s dat-paper.md -o dat-paper.pdf 6 | 7 | pandoc --filter pandoc-citeproc --bibliography=dat-paper.bib --variable classoption=twocolumn --variable papersize=a4 -s sleep.md -t latex -o sleep.latex 8 | 9 | pandoc --filter pandoc-citeproc --bibliography=dat-paper.bib --variable classoption=twocolumn --variable papersize=a4 -s sleep.md -o sleep.pdf 10 | -------------------------------------------------------------------------------- /source/dat-paper.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{sleep, 2 | title={SLEEP - The Dat Protocol On Disk Format}, 3 | author={Ogden, Maxwell and Buus, Mathias}, 4 | year={2017} 5 | } 6 | 7 | @inproceedings{aumasson2013blake2, 8 | title={BLAKE2: simpler, smaller, fast as MD5}, 9 | author={Aumasson, Jean-Philippe and Neves, Samuel and Wilcox-O’Hearn, Zooko and Winnerlein, Christian}, 10 | booktitle={International Conference on Applied Cryptography and Network Security}, 11 | pages={119--135}, 12 | year={2013}, 13 | organization={Springer} 14 | } 15 | 16 | @article{bernstein2012high, 17 | title={High-speed high-security signatures}, 18 | author={Bernstein, Daniel J and Duif, Niels and Lange, Tanja and Schwabe, Peter and Yang, Bo-Yin}, 19 | journal={Journal of Cryptographic Engineering}, 20 | pages={1--13}, 21 | year={2012}, 22 | publisher={Springer} 23 | } 24 | 25 
| @article{mykletun2003providing, 26 | title={Providing authentication and integrity in outsourced databases using Merkle hash trees}, 27 | author={Mykletun, Einar and Narasimha, Maithili and Tsudik, Gene}, 28 | journal={UCI-SCONCE Technical Report}, 29 | year={2003} 30 | } 31 | 32 | @inproceedings{rossi2010ledbat, 33 | title={LEDBAT: The New BitTorrent Congestion Control Protocol.}, 34 | author={Rossi, Dario and Testa, Claudio and Valenti, Silvio and Muscariello, Luca}, 35 | booktitle={ICCCN}, 36 | pages={1--6}, 37 | year={2010} 38 | } 39 | 40 | @article{varda2008protocol, 41 | title={Protocol buffers: Google’s data interchange format}, 42 | author={Varda, Kenton}, 43 | journal={Google Open Source Blog, Available at least as early as Jul}, 44 | year={2008} 45 | } 46 | 47 | @inproceedings{maymounkov2002kademlia, 48 | title={Kademlia: A peer-to-peer information system based on the xor metric}, 49 | author={Maymounkov, Petar and Mazieres, David}, 50 | booktitle={International Workshop on Peer-to-Peer Systems}, 51 | pages={53--65}, 52 | year={2002}, 53 | organization={Springer} 54 | } 55 | 56 | @techreport{bakker2015peer, 57 | title={Peer-to-peer streaming peer protocol (ppspp)}, 58 | author={Bakker, A and Petrocco, R and Grishchenko, V}, 59 | year={2015} 60 | } 61 | 62 | @techreport{laurie2013certificate, 63 | title={Certificate transparency}, 64 | author={Laurie, Ben and Langley, Adam and Kasper, Emilia}, 65 | year={2013} 66 | } -------------------------------------------------------------------------------- /source/sleep.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "SLEEP - Syncable Ledger of Exact Events Protocol" 3 | date: "August 2017" 4 | author: "Mathias Buus Madsen, Maxwell Ogden, Code for Science" 5 | --- 6 | 7 | ## SLEEP 8 | 9 | This document is a technical description of the SLEEP format intended for implementers. SLEEP is the on-disk format that Dat produces and uses. 
It is a set of 9 files that hold all of the metadata needed to list the contents of a Dat repository and verify the integrity of the data you receive. SLEEP is designed to work with REST, allowing servers to be plain HTTP file servers serving the static SLEEP files, meaning you can implement a Dat protocol client using HTTP with a static HTTP file server as the backend. 10 | 11 | SLEEP files contain metadata about the data inside a Dat repository, including cryptographic hashes, cryptographic signatures, filenames and file permissions. The SLEEP format is specifically designed to allow efficient access to subsets of the metadata and/or data in the repository, even on very large repositories, which enables Dat's peer to peer networking to be fast. 12 | 13 | The acronym SLEEP is a slumber related pun on REST and stands for Syncable Ledger of Exact Events Protocol. The Syncable part refers to how SLEEP files are append-only in nature, meaning they grow over time and new updates can be subscribed to as a realtime feed of events through the Dat protocol. 14 | 15 | The SLEEP version described here, used in Dat as of 2017 is SLEEP V2. SLEEP V1 is documented at http://specs.okfnlabs.org/sleep. 16 | 17 | ### SLEEP Files 18 | 19 | SLEEP is a set of 9 files that should be stored with the following names. In Dat, the files are stored in a folder called `.dat` in the top level of the repository. 20 | 21 | ``` 22 | metadata.key 23 | metadata.signatures 24 | metadata.bitfield 25 | metadata.tree 26 | metadata.data 27 | content.key 28 | content.signatures 29 | content.bitfield 30 | content.tree 31 | ``` 32 | 33 | The files prefixed with `content` store metadata about the primary data in a Dat repository, for example the raw binary contents of the files. The files prefixed with `metadata` store metadata about the files in the repository, for example the filenames, file sizes, and file permissions. 
The `content` and `metadata` files are both Hypercore registers, making SLEEP a set of two Hypercore registers. 34 | 35 | ### SLEEP File Headers 36 | 37 | The following structured binary format is used for `signatures`, `bitfield`, and `tree` files. The header contains metadata as well as information needed to decode the rest of the files after the header. SLEEP files are designed to be easy to append new data, easy to read arbitrary byte offsets in the middle, and are relatively flat, simple files that rely on the filesystem for the heavy lifting. 38 | 39 | SLEEP files are laid out like this: 40 | 41 | ``` 42 | <32 byte header> 43 | 44 | 45 | 46 | 47 | ```` 48 | 49 | - 32 byte header 50 | - 4 bytes Uint32BE ("Big-Endian") - magic byte (value varies depending on which file, used to quickly identify which file type it is) 51 | - 1 byte - version number of the file header protocol, current version is 0 52 | - 2 byte Uint16BE - entry size, describes how long each entry in the file is 53 | - 1 byte - length prefix for body 54 | - rest of 32 byte header - string describing key or hash algorithm. length of this string matches the length in the previous length prefix field. This string must fit within the 32 byte header limitation (24 bytes reserved for string). Unused bytes should be filled with zeroes. 55 | 56 | Possible values in the Dat implementation for the body field are: 57 | 58 | ``` 59 | Ed25519 60 | BLAKE2b 61 | ``` 62 | 63 | To calculate the offset of some entry position, first read the header and get the entry size, then do `32 + entrySize * entryIndex`. To calculate how many entries are in a file, you can use the entry size and the filesize on disk and do `(fileSize - 32) / entrySize`. 64 | 65 | As mentioned above, `signatures`, `bitfield` and `tree` are the three SLEEP files. There are two additional files, `key`, and `data`, which do not contain SLEEP file headers and store plain serialized data for easy access. 
`key` stores the public key that is described by the `signatures` file, and `data` stores the raw chunk data that the `tree` file contains the hashes and metadata for. 66 | 67 | ### File Descriptions 68 | 69 | #### key 70 | 71 | The public key used to verify the signatures in the `signatures` file, stored in binary as a single buffer written to disk. To find out what format of key is stored in this file, read the header of `signatures`. In Dat, it's always an Ed25519 public key, but other implementations can specify other key types using a string value in that header. 72 | 73 | #### tree 74 | 75 | A SLEEP formatted 32 byte header with data entries representing a serialized Merkle tree based on the data in the data storage layer. All the fixed size nodes are written in in-order tree notation. The header algorithm string for `tree` files is `BLAKE2b`. The entry size is 40 bytes. Entries are formatted like this: 76 | 77 | ``` 78 | <32 byte header> 79 | <4 byte magic string: 0x05025702> 80 | <1 byte version number: 0> 81 | <2 byte entry size: 40> 82 | <1 byte algorithm name length prefix: 7> 83 | <7 byte algorithm name: BLAKE2b> 84 | <17 zeroes> 85 | <40 byte entries> 86 | <32 byte BLAKE2b hash> 87 | <8 byte Uint64BE children leaf byte length> 88 | ``` 89 | 90 | The children leaf byte length is the byte size containing the sum byte length of all leaf nodes in the tree below this node. 91 | 92 | This file uses the in-order notation, meaning even entries are leaf nodes and odd entries are parent nodes (non-leaf). 
93 | 94 | To prevent pre-image attacks, all hashes start with a one byte type descriptor: 95 | 96 | ``` 97 | 0 - LEAF 98 | 1 - PARENT 99 | 2 - ROOT 100 | ``` 101 | 102 | To calculate leaf node entries (the hashes of the data entries) we hash this data: 103 | 104 | ``` 105 | BLAKE2b( 106 | <1 byte type> 107 | 0 108 | <8 bytes Uint64BE> 109 | length of entry data 110 | <entry data> 111 | ) 112 | ``` 113 | 114 | Then we take this 32 byte hash and write it to the tree as 40 bytes like this: 115 | 116 | ``` 117 | <32 bytes> 118 | BLAKE2b hash 119 | <8 bytes Uint64BE> 120 | length of data 121 | ``` 122 | 123 | Note that the Uint64 of length of data is included both in the hashed data and written at the end of the entry. This is to expose more metadata to Dat for advanced use cases such as verifying data length in sparse replication scenarios. 124 | 125 | To calculate parent node entries (the hashes of the leaf nodes) we hash this data: 126 | 127 | ``` 128 | BLAKE2b( 129 | <1 byte> 130 | 1 131 | <8 bytes Uint64BE> 132 | left child length + right child length 133 | <32 bytes> 134 | left child hash 135 | <32 bytes> 136 | right child hash 137 | ) 138 | ``` 139 | 140 | Then we take this 32 byte hash and write it to the tree as 40 bytes like this: 141 | 142 | ``` 143 | <32 bytes> 144 | BLAKE2b hash 145 | <8 bytes Uint64BE> 146 | left child length + right child length 147 | ``` 148 | 149 | The reason the tree entries contain data lengths is to allow for sparse mode replication. Encoding lengths (and including lengths in all hashes) means you can verify the Merkle subtrees independent of the rest of the tree, which happens during sparse replication scenarios. 150 | 151 | The tree file corresponds directly to the `data` file. 152 | 153 | #### data 154 | 155 | The `data` file is only included in the SLEEP format for the `metadata.*` prefixed files which contain filesystem metadata and not actual file data. 
For the `content.*` files, the data is stored externally (in Dat it is stored as normal files on the filesystem and not in a SLEEP file). However you can configure Dat to use a `content.data` file if you want and it will still work. If you want to store the full history of all versions of all files, using the `content.data` file would provide that guarantee, but would have the disadvantage of storing files as chunks merged into one huge file (not as user friendly). 156 | 157 | The `data` file does not contain a SLEEP file header. It just contains a bunch of concatenated data entries. Entries are written in the same order as they appear in the `tree` file. To read a `data` file, first decode the `tree` file and for every leaf in the `tree` file you can calculate a data offset for the data described by that leaf node in the `data` file. 158 | 159 | ##### Index Lookup 160 | 161 | For example, if we wanted to seek to a specific entry offset (say entry 42): 162 | 163 | - First, read the header of the `tree` file and get the entry size, then do `32 + entrySize * 42` to get the raw tree index: `32 + (40 * 42)` 164 | - Since we want the leaf entry (even node in the in-order layout), we multiply the entry index by 2: 165 | `32 + (40 * (42 * 2))` 166 | - Read the 40 bytes at that offset in the `tree` file to get the leaf node entry. 167 | - Read the last 8 bytes of the entry to get the length of the data entry 168 | - To calculate the offset of where in the `data` file your entry begins, you need to sum all the lengths of all the earlier entries in the tree. The most efficient way to do this is to sum all the previous parent node (non-leaf) entry lengths. You can also sum all leaf node lengths, but parent nodes contain the sum of their children's lengths so it's more efficient to use parents. During Dat replication, these nodes are fetched as part of the Merkle tree verification so you will already have them locally. This is a log(N) operation where N is the entry index. 
Entries are also small and therefore easily cacheable. 169 | - Once you get the offset, you use the length you decoded above and read N bytes (where N is the decoded length) at the offset in the `data` file. You can verify the data integrity using the 32 byte hash from the `tree` entry. 170 | 171 | ##### Byte Lookup 172 | 173 | The above method illustrates how to resolve a chunk position index to a byte offset. You can also do the reverse operation, resolving a byte offset to a chunk position index. This is used to stream arbitrary random access regions of files in sparse replication scenarios. 174 | 175 | - First, you start by calculating the current Merkle roots 176 | - Each node in the tree (including these root nodes) stores the aggregate file size of all byte sizes of the nodes below it. So the roots cumulatively will describe all possible byte ranges for this repository. 177 | - Find the root that contains the byte range of the offset you are looking for and get the node information for all of that node's children using the Index Lookup method, and recursively repeat this step until you find the lowest down child node that describes this byte range. 178 | - The chunk described by this child node will contain the byte range you are looking for. You can use the `byteOffset` field in the `Stat` metadata object to seek to the correct position in the content file for the start of this chunk. 179 | 180 | ##### Metadata Overhead 181 | 182 | Using this scheme, if you write 4GB of data using on average 64KB data chunks (note: chunks can be variable length and do not need to be the same size), your tree file will be around 5MB (0.125% overhead). 183 | 184 | #### signatures 185 | 186 | A SLEEP formatted 32 byte header with data entries being 64 byte signatures. 
187 | 188 | ``` 189 | <32 byte header> 190 | <4 byte magic string: 0x05025701> 191 | <1 byte version number: 0> 192 | <2 byte entry size: 64> 193 | <1 byte algorithm name length prefix: 7> 194 | <7 byte algorithm name: Ed25519> 195 | <17 zeroes> 196 | <64 byte entries> 197 | <64 byte Ed25519 signature> 198 | ``` 199 | 200 | Every time the tree is updated we sign the current roots of the Merkle tree, and append them to the signatures file. The signatures file starts with no entries. Each time a new leaf is appended to the `tree` file (aka whenever data is added to a Dat), we take all root hashes at the current state of the Merkle tree and hash and sign them, then append them as a new entry to the signatures file. 201 | 202 | ``` 203 | Ed25519 sign( 204 | BLAKE2b( 205 | <1 byte> 206 | 2 // root type 207 | for (every root node left-to-right) { 208 | <32 byte root hash> 209 | <8 byte Uint64BE root tree index> 210 | <8 byte Uint64BE child byte lengths> 211 | } 212 | ) 213 | ) 214 | ``` 215 | 216 | The reason we hash all the root nodes is that the BLAKE2b hash above is only calculable if you have all of the pieces of data required to generate all the intermediate hashes. This is the crux of Dat's data integrity guarantees. 217 | 218 | #### bitfield 219 | 220 | A SLEEP formatted 32 byte header followed by a series of 3328 byte long entries. 221 | 222 | ``` 223 | <32 byte header> 224 | <4 byte magic string: 0x05025700> 225 | <1 byte version number: 0> 226 | <2 byte entry size: 3328> 227 | <1 byte algorithm name length: 0> 228 | <1 byte algorithm name: 0> 229 | <24 zeroes> 230 | <3328 byte entries> // (2048 + 1024 + 256) 231 | ``` 232 | 233 | The bitfield describes which pieces of data you have, and which nodes in the `tree` file have been written. This file exists as an index of the `tree` and `data` to quickly figure out which pieces of data you have or are missing. This file can be regenerated if you delete it, so it is considered a materialized index. 
234 | 235 | The `bitfield` file actually contains three bitfields of different sizes. A bitfield (AKA bitmap) is defined as a set of bits where each bit (0 or 1) represents if you have or do not have a piece of data at that bit index. So if there is a dataset of 10 cat pictures, and you have pictures 1, 3, and 5 but are missing the rest, your bitfield would look like `1010100000`. 236 | 237 | Each entry contains three objects: 238 | 239 | - Data Bitfield (1024 bytes) - 1 bit for each data entry that you have synced (1 for every entry in `data`). 240 | - Tree Bitfield (2048 bytes) - 1 bit for every tree entry (all nodes in `tree`) 241 | - Bitfield Index (256 bytes) - This is an index of the Data Bitfield that makes it efficient to figure out which pieces of data are missing from the Data Bitfield without having to do a linear scan. 242 | 243 | The Data Bitfield is 1Kb somewhat arbitrarily, but the idea is that because most filesystems work in 4Kb chunk sizes, we can fit the Data, Tree and Index in less than 4Kb of data for efficient writes to the filesystem. The Tree and Index sizes are based on the Data size (the Tree has twice the entries as the Data, odd and even nodes vs just even nodes in `tree`, and Index is always 1/4th the size). 244 | 245 | To generate the Index, you take pairs of 2 bytes at a time from the Data Bitfield, check if all bits in the 2 bytes are the same, and generate 4 bits of Index metadata for every 2 bytes of Data (hence how 1024 bytes of Data ends up as 256 bytes of Index). 246 | 247 | First you generate a 2 bit tuple for the 2 bytes of Data: 248 | 249 | ``` 250 | if (data is all 1's) then [1,1] 251 | if (data is all 0's) then [0,0] 252 | if (data is not all the same) then [1, 0] 253 | ``` 254 | 255 | The Index itself is an in-order binary tree, not a traditional bitfield. 
To generate the tree, you take the tuples you generate above and then write them into a tree like the following example, where non-leaf nodes are generated using the above scheme by looking at the results of the relative even child tuples for each odd parent tuple: 256 | 257 | ``` 258 | // for e.g. 16 bytes (8 tuples) of 259 | // sparsely replicated data 260 | 0 - [00 00 00 00] 261 | 1 - [10 10 10 10] 262 | 2 - [11 11 11 11] 263 | ``` 264 | 265 | The tuples at entry `1` above are `[1,0]` because the relative child tuples are not uniform. In the following example, all non-leaf nodes are `[1,1]` because their relative children are all uniform (`[1,1]`) 266 | 267 | ``` 268 | // for e.g. 32 bytes (16 tuples) of 269 | // fully replicated data (all 1's) 270 | 0 - [11 11 11 11] 271 | 1 - [11 11 11 11] 272 | 2 - [11 11 11 11] 273 | 3 - [11 11 11 11] 274 | 4 - [11 11 11 11] 275 | 5 - [11 11 11 11] 276 | 6 - [11 11 11 11] 277 | ``` 278 | 279 | Using this scheme, it takes at most 8 bytes of Index to represent 32 bytes of data. In this example the Index can compress well because it consists of all one bits. Similarly, an empty bitfield is all zero bits. 280 | 281 | If you write 4GB of data using on average 64KB data chunk size, your bitfield will be at most 32KB. 282 | 283 | #### metadata.data 284 | 285 | This file is used to store content described by the rest of the `metadata.*` hypercore SLEEP files. Whereas the `content.*` SLEEP files describe the data stored in the actual data cloned in the Dat repository filesystem, the `metadata` data feed is stored inside the `.dat` folder along with the rest of the SLEEP files. 286 | 287 | The contents of this file is a series of versions of the Dat filesystem tree. As this is a hypercore data feed, it's just an append only log of binary data entries. The challenge is representing a tree in a one-dimensional way to make it representable as a Hypercore register. 
For example, imagine three files: 288 | 289 | ``` 290 | ~/dataset $ ls 291 | figures 292 | graph1.png 293 | graph2.png 294 | results.csv 295 | 296 | 1 directory, 3 files 297 | ``` 298 | 299 | We want to take this structure and map it to a serialized representation that gets written into an append only log in a way that still allows for efficient random access by file path. 300 | 301 | To do this, we convert the filesystem metadata into entries in a feed like this: 302 | 303 | ``` 304 | { 305 | "path": "/results.csv", 306 | trie: [[]], 307 | sequence: 0 308 | } 309 | { 310 | "path": "/figures/graph1.png", 311 | trie: [[0], []], 312 | sequence: 1 313 | } 314 | { 315 | "path": "/figures/graph2.png", 316 | trie: [[0], [1]], 317 | sequence: 2 318 | } 319 | ``` 320 | 321 | ##### Filename Resolution 322 | 323 | Each sequence represents adding one of the files to the register, so at sequence 0 the filesystem state only has a single file, `results.csv` in it. At sequence 1, there are only 2 files added to the register, and at sequence 2 all files are finally added. The `children` field represents a shorthand way of declaring which other files at every level of the directory hierarchy exist alongside the file being added at that revision. For example at the time of sequence 1, children is `[[0], []]`. The first sub-array, `[0]`, represents the first folder in the `path`, which is the root folder `/`. In this case `[0]` means the root folder at this point in time only has a single file, the file that is the subject of sequence `0`. The second subarray is empty `[]` because there are no other existing files in the second folder in the `path`, `figures`. 324 | 325 | To look up a file by filename, you fetch the latest entry in the log, then use the `children` metadata in that entry to look up the longest common ancestor based on the parent folders of the filename you are querying. 
You can then recursively repeat this operation until you find the `path` entry you are looking for (or you exhaust all options which means the file does not exist). This is a `O(number of slashes in your path)` operation. 326 | 327 | For example, if you wanted to look up `/results.csv` given the above register, you would start by grabbing the metadata at sequence 2. The longest common ancestor between `/results.csv` and `/figures/graph2` is `/`. You then grab the corresponding entry in the children array for `/`, which in this case is the first entry, `[0]`. You then repeat this with all of the children entries until you find a child that is closer to the entry you are looking for. In this example, the first entry happens to be the match we are looking for. 328 | 329 | You can also perform lookups relative to a point in time by starting from a specific sequence number in the register. For example to get the state of some file relative to an old sequence number, similar to checking out an old version of a repository in Git. 330 | 331 | ##### Data Serialization 332 | 333 | The format of the `metadata.data` file is as follows: 334 | 335 | ``` 336 |
337 | 338 | 339 | 340 | 341 | ``` 342 | 343 | Each entry in the file is encoded using Protocol Buffers [@varda2008protocol]. 344 | 345 | The first message we write to the file is of a type called Header which uses this schema: 346 | 347 | ``` 348 | message Header { 349 | required string type = 1; 350 | optional bytes content = 2; 351 | } 352 | ``` 353 | 354 | This is used to declare two pieces of metadata used by Dat. It includes a `type` string with the value `hyperdrive` and `content` binary value that holds the public key of the content register that this metadata register represents. When you share a Dat, the metadata key is the main key that gets used, and the content register key is linked from here in the metadata. 355 | 356 | After the header the file will contain many filesystem `Node` entries: 357 | 358 | ``` 359 | message Node { 360 | required string path = 1; 361 | optional Stat value = 2; 362 | optional bytes trie = 3; 363 | repeated Writer writers = 4; 364 | optional uint64 writersSequence = 5; 365 | } 366 | 367 | message Writer { 368 | required bytes publicKey = 1; 369 | optional string permission = 2; 370 | } 371 | ``` 372 | 373 | The `Node` object has five fields 374 | 375 | - `path` - the string of the absolute file path of this file. 376 | - `value` - a Stat encoded object representing the file metadata 377 | - `trie` - a compressed list of the sequence numbers as described earlier 378 | - `writers` - a list of the writers who are allowed to write to this dat 379 | - `writersSequence` - a reference to the last sequence where the writers array was modified. You can use this to quickly find the value of the writers keys. 380 | 381 | The `trie` value is encoded by starting with the nested array of sequence numbers, e.g. `[[[0, 3]], [[0, 2], [0, 1]]]`. Each entry is a tuple where the first item is the index of the feed in the `writers` array and the second value is the sequence number. Finally you prepend the trie value with a version number varint. 
382 | 383 | To write these subarrays we use variable width integers (varints), using a repeating pattern like this, one for each array: 384 | 385 | ``` 386 | 387 | 388 | 389 | 390 | 391 | 392 | ``` 393 | 394 | This encoding is designed for efficiency as it reduces the filesystem path + feed index metadata down to a series of small integers. 395 | 396 | The `Stat` objects use this encoding: 397 | 398 | ``` 399 | message Stat { 400 | required uint32 mode = 1; 401 | optional uint32 uid = 2; 402 | optional uint32 gid = 3; 403 | optional uint64 size = 4; 404 | optional uint64 blocks = 5; 405 | optional uint64 offset = 6; 406 | optional uint64 byteOffset = 7; 407 | optional uint64 mtime = 8; 408 | optional uint64 ctime = 9; 409 | } 410 | ``` 411 | 412 | These are the field definitions: 413 | 414 | - `mode` - POSIX file mode bitmask 415 | - `uid` - POSIX user id 416 | - `gid` - POSIX group id 417 | - `size` - file size in bytes 418 | - `blocks` - number of data chunks that make up this file 419 | - `offset` - the data feed entry index for the first chunk in this file 420 | - `byteOffset` - the data feed file byte offset for the first chunk in this file 421 | - `mtime` - POSIX modified_at time 422 | - `ctime` - POSIX created_at time 423 | 424 | ## References 425 | -------------------------------------------------------------------------------- /source/sleep.latex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paperpaper,twocolumn]{article} 2 | \usepackage{lmodern} 3 | \usepackage{amssymb,amsmath} 4 | \usepackage{ifxetex,ifluatex} 5 | \usepackage{fixltx2e} % provides \textsubscript 6 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex 7 | \usepackage[T1]{fontenc} 8 | \usepackage[utf8]{inputenc} 9 | \else % if luatex or xelatex 10 | \ifxetex 11 | \usepackage{mathspec} 12 | \else 13 | \usepackage{fontspec} 14 | \fi 15 | \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase} 16 | \fi 17 | % use upquote if available, for 
straight quotes in verbatim environments 18 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{} 19 | % use microtype if available 20 | \IfFileExists{microtype.sty}{% 21 | \usepackage{microtype} 22 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 23 | }{} 24 | \usepackage[unicode=true]{hyperref} 25 | \hypersetup{ 26 | pdftitle={SLEEP - Syncable Ledger of Exact Events Protocol}, 27 | pdfauthor={Mathias Buus Madsen, Maxwell Ogden, Code for Science}, 28 | pdfborder={0 0 0}, 29 | breaklinks=true} 30 | \urlstyle{same} % don't use monospace font for urls 31 | \IfFileExists{parskip.sty}{% 32 | \usepackage{parskip} 33 | }{% else 34 | \setlength{\parindent}{0pt} 35 | \setlength{\parskip}{6pt plus 2pt minus 1pt} 36 | } 37 | \setlength{\emergencystretch}{3em} % prevent overfull lines 38 | \providecommand{\tightlist}{% 39 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 40 | \setcounter{secnumdepth}{0} 41 | % Redefines (sub)paragraphs to behave more like sections 42 | \ifx\paragraph\undefined\else 43 | \let\oldparagraph\paragraph 44 | \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} 45 | \fi 46 | \ifx\subparagraph\undefined\else 47 | \let\oldsubparagraph\subparagraph 48 | \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} 49 | \fi 50 | 51 | \title{SLEEP - Syncable Ledger of Exact Events Protocol} 52 | \author{Mathias Buus Madsen, Maxwell Ogden, Code for Science} 53 | \date{August 2017} 54 | 55 | \begin{document} 56 | \maketitle 57 | 58 | \subsection{SLEEP}\label{sleep} 59 | 60 | This document is a technical description of the SLEEP format intended 61 | for implementers. SLEEP is the the on-disk format that Dat produces and 62 | uses. It is a set of 9 files that hold all of the metadata needed to 63 | list the contents of a Dat repository and verify the integrity of the 64 | data you receive. 
SLEEP is designed to work with REST, allowing servers 65 | to be plain HTTP file servers serving the static SLEEP files, meaning 66 | you can implement a Dat protocol client using HTTP with a static HTTP 67 | file server as the backend. 68 | 69 | SLEEP files contain metadata about the data inside a Dat repository, 70 | including cryptographic hashes, cryptographic signatures, filenames and 71 | file permissions. The SLEEP format is specifically designed to allow 72 | efficient access to subsets of the metadata and/or data in the 73 | repository, even on very large repositories, which enables Dat's peer to 74 | peer networking to be fast. 75 | 76 | The acronym SLEEP is a slumber related pun on REST and stands for 77 | Syncable Ledger of Exact Events Protocol. The Syncable part refers to 78 | how SLEEP files are append-only in nature, meaning they grow over time 79 | and new updates can be subscribed to as a realtime feed of events 80 | through the Dat protocol. 81 | 82 | The SLEEP version described here, used in Dat as of 2017 is SLEEP V2. 83 | SLEEP V1 is documented at http://specs.okfnlabs.org/sleep. 84 | 85 | \subsubsection{SLEEP Files}\label{sleep-files} 86 | 87 | SLEEP is a set of 9 files that should be stored with the following 88 | names. In Dat, the files are stored in a folder called \texttt{.dat} in 89 | the top level of the repository. 90 | 91 | \begin{verbatim} 92 | metadata.key 93 | metadata.signatures 94 | metadata.bitfield 95 | metadata.tree 96 | metadata.data 97 | content.key 98 | content.signatures 99 | content.bitfield 100 | content.tree 101 | \end{verbatim} 102 | 103 | The files prefixed with \texttt{content} store metadata about the 104 | primary data in a Dat repository, for example the raw binary contents of 105 | the files. The files prefixed with \texttt{metadata} store metadata 106 | about the files in the repository, for example the filenames, file 107 | sizes, and file permissions. 
The \texttt{content} and \texttt{metadata} 108 | files are both Hypercore registers, making SLEEP a set of two Hypercore 109 | registers. 110 | 111 | \subsubsection{SLEEP File Headers}\label{sleep-file-headers} 112 | 113 | The following structured binary format is used for \texttt{signatures}, 114 | \texttt{bitfield}, and \texttt{tree} files. The header contains metadata 115 | as well as information needed to decode the rest of the files after the 116 | header. SLEEP files are designed to be easy to append new data, easy to 117 | read arbitrary byte offsets in the middle, and are relatively flat, 118 | simple files that rely on the filesystem for the heavy lifting. 119 | 120 | SLEEP files are laid out like this: 121 | 122 | \begin{verbatim} 123 | <32 byte header> 124 | <fixed-size entry 1> 125 | <fixed-size entry 2> 126 | <fixed-size entry ...> 127 | <fixed-size entry n> 128 | \end{verbatim} 129 | 130 | \begin{itemize} 131 | \tightlist 132 | \item 133 | 32 byte header 134 | \item 135 | 4 bytes Uint32BE (``Big-Endian'') - magic number (value varies depending 136 | on which file, used to quickly identify which file type it is) 137 | \item 138 | 1 byte - version number of the file header protocol, current version 139 | is 0 140 | \item 141 | 2 byte Uint16BE - entry size, describes how long each entry in the 142 | file is 143 | \item 144 | 1 byte - length prefix for body (the algorithm name string that follows) 145 | \item 146 | rest of 32 byte header - string describing key or hash algorithm. The 147 | length of this string matches the length in the previous length prefix 148 | field. This string must fit within the 32 byte header limitation (24 149 | bytes reserved for string). Unused bytes should be filled with zeroes. 150 | \end{itemize} 151 | 152 | Possible values in the Dat implementation for the body field are: 153 | 154 | \begin{verbatim} 155 | Ed25519 156 | BLAKE2b 157 | \end{verbatim} 158 | 159 | To calculate the offset of some entry position, first read the header 160 | and get the entry size, then do 161 | \texttt{32\ +\ entrySize\ *\ entryIndex}. 
To calculate how many entries 162 | are in a file, you can use the entry size and the filesize on disk and 163 | do \texttt{(fileSize\ -\ 32)\ /\ entrySize}. 164 | 165 | As mentioned above, \texttt{signatures}, \texttt{bitfield} and 166 | \texttt{tree} are the three file types that carry a SLEEP header. There are two additional files, 167 | \texttt{key}, and \texttt{data}, which do not contain SLEEP file headers 168 | and store plain serialized data for easy access. \texttt{key} stores the 169 | public key that is described by the \texttt{signatures} file, and 170 | \texttt{data} stores the raw chunk data that the \texttt{tree} file 171 | contains the hashes and metadata for. 172 | 173 | \subsubsection{File Descriptions}\label{file-descriptions} 174 | 175 | \paragraph{key}\label{key} 176 | 177 | The public key used to verify the signatures in the \texttt{signatures} 178 | file, stored in binary as a single buffer written to disk. To find out 179 | what format of key is stored in this file, read the header of 180 | \texttt{signatures}. In Dat, it's always an Ed25519 public key, but other 181 | implementations can specify other key types using a string value in that 182 | header. 183 | 184 | \paragraph{tree}\label{tree} 185 | 186 | A SLEEP formatted 32 byte header with data entries representing a 187 | serialized Merkle tree based on the data in the data storage layer. All 188 | the fixed size nodes are written in in-order tree notation. The header 189 | algorithm string for \texttt{tree} files is \texttt{BLAKE2b}. The entry 190 | size is 40 bytes. 
Entries are formatted like this: 191 | 192 | \begin{verbatim} 193 | <32 byte header> 194 | <4 byte magic string: 0x05025702> 195 | <1 byte version number: 0> 196 | <2 byte entry size: 40> 197 | <1 byte algorithm name length prefix: 7> 198 | <7 byte algorithm name: BLAKE2b> 199 | <17 zeroes> 200 | <40 byte entries> 201 | <32 byte BLAKE2b hash> 202 | <8 byte Uint64BE children leaf byte length> 203 | \end{verbatim} 204 | 205 | The children leaf byte length is the sum of the byte lengths of all 206 | leaf nodes in the tree below this node. 207 | 208 | This file uses the in-order notation, meaning even entries are leaf 209 | nodes and odd entries are parent nodes (non-leaf). 210 | 211 | To prevent pre-image attacks, all hashes start with a one byte type 212 | descriptor: 213 | 214 | \begin{verbatim} 215 | 0 - LEAF 216 | 1 - PARENT 217 | 2 - ROOT 218 | \end{verbatim} 219 | 220 | To calculate leaf node entries (the hashes of the data entries) we hash 221 | this data: 222 | 223 | \begin{verbatim} 224 | BLAKE2b( 225 | <1 byte type> 226 | 0 227 | <8 bytes Uint64BE> 228 | length of entry data 229 | <entry data> 230 | ) 231 | \end{verbatim} 232 | 233 | Then we take this 32 byte hash and write it to the tree as 40 bytes like 234 | this: 235 | 236 | \begin{verbatim} 237 | <32 bytes> 238 | BLAKE2b hash 239 | <8 bytes Uint64BE> 240 | length of data 241 | \end{verbatim} 242 | 243 | Note that the Uint64 of length of data is included both in the hashed 244 | data and written at the end of the entry. This is to expose more 245 | metadata to Dat for advanced use cases such as verifying data length in 246 | sparse replication scenarios. 
247 | 248 | To calculate parent node entries (the hashes of the leaf nodes) we hash 249 | this data: 250 | 251 | \begin{verbatim} 252 | BLAKE2b( 253 | <1 byte> 254 | 1 255 | <8 bytes Uint64BE> 256 | left child length + right child length 257 | <32 bytes> 258 | left child hash 259 | <32 bytes> 260 | right child hash 261 | ) 262 | \end{verbatim} 263 | 264 | Then we take this 32 byte hash and write it to the tree as 40 bytes like 265 | this: 266 | 267 | \begin{verbatim} 268 | <32 bytes> 269 | BLAKE2b hash 270 | <8 bytes Uint64BE> 271 | left child length + right child length 272 | \end{verbatim} 273 | 274 | The reason the tree entries contain data lengths is to allow for sparse 275 | mode replication. Encoding lengths (and including lengths in all hashes) 276 | means you can verify the Merkle subtrees independent of the rest of the 277 | tree, which happens during sparse replication scenarios. 278 | 279 | The tree file corresponds directly to the \texttt{data} file. 280 | 281 | \paragraph{data}\label{data} 282 | 283 | The \texttt{data} file is only included in the SLEEP format for the 284 | \texttt{metadata.*} prefixed files which contains filesystem metadata 285 | and not actual file data. For the \texttt{content.*} files, the data is 286 | stored externally (in Dat it is stored as normal files on the filesystem 287 | and not in a SLEEP file). However you can configure Dat to use a 288 | \texttt{content.data} file if you want and it will still work. If you 289 | want to store the full history of all versions of all files, using the 290 | \texttt{content.data} file would provide that guarantee, but would have 291 | the disadvantage of storing files as chunks merged into one huge file 292 | (not as user friendly). 293 | 294 | The \texttt{data} file does not contain a SLEEP file header. It just 295 | contains a bunch of concatenated data entries. Entries are written in 296 | the same order as they appear in the \texttt{tree} file. 
To read a 297 | \texttt{data} file, first decode the \texttt{tree} file and for every 298 | leaf in the \texttt{tree} file you can calculate a data offset for the 299 | data described by that leaf node in the \texttt{data} file. 300 | 301 | \subparagraph{Index Lookup}\label{index-lookup} 302 | 303 | For example, if we wanted to seek to a specific entry offset (say entry 304 | 42): 305 | 306 | \begin{itemize} 307 | \tightlist 308 | \item 309 | First, read the header of the \texttt{tree} file and get the entry 310 | size, then do \texttt{32\ +\ entrySize\ *\ 42} to get the raw tree 311 | index: \texttt{32\ +\ (40\ *\ 42)} 312 | \item 313 | Since we want the leaf entry (even node in the in-order layout), we 314 | multiply the entry index by 2: \texttt{32\ +\ (40\ *\ (42\ *\ 2))} 315 | \item 316 | Read the 40 bytes at that offset in the \texttt{tree} file to get the 317 | leaf node entry. 318 | \item 319 | Read the last 8 bytes of the entry to get the length of the data entry 320 | \item 321 | To calculate the offset of where in the \texttt{data} file your entry 322 | begins, you need to sum all the lengths of all the earlier entries in 323 | the tree. The most efficient way to do this is to sum all the previous 324 | parent node (non-leaf) entry lengths. You can also sum all leaf node 325 | lengths, but parent nodes contain the sum of their children's lengths 326 | so it's more efficient to use parents. During Dat replication, these 327 | nodes are fetched as part of the Merkle tree verification so you will 328 | already have them locally. This is a log(N) operation where N is the 329 | entry index. Entries are also small and therefore easily cacheable. 330 | \item 331 | Once you get the offset, you use the length you decoded above and read 332 | N bytes (where N is the decoded length) at the offset in the 333 | \texttt{data} file. You can verify the data integrity using the 32 334 | byte hash from the \texttt{tree} entry. 
335 | \end{itemize} 336 | 337 | \subparagraph{Byte Lookup}\label{byte-lookup} 338 | 339 | The above method illustrates how to resolve a chunk position index to a 340 | byte offset. You can also do the reverse operation, resolving a byte 341 | offset to a chunk position index. This is used to stream arbitrary 342 | random access regions of files in sparse replication scenarios. 343 | 344 | \begin{itemize} 345 | \tightlist 346 | \item 347 | First, you start by calculating the current Merkle roots 348 | \item 349 | Each node in the tree (including these root nodes) stores the 350 | aggregate byte size of all the nodes below it. So the 351 | roots cumulatively will describe all possible byte ranges for this 352 | repository. 353 | \item 354 | Find the root that contains the byte range of the offset you are 355 | looking for and get the node information for all of that node's 356 | children using the Index Lookup method, and recursively repeat this 357 | step until you find the lowest down child node that describes this 358 | byte range. 359 | \item 360 | The chunk described by this child node will contain the byte range you 361 | are looking for. You can use the \texttt{byteOffset} field in the 362 | \texttt{Stat} metadata object to seek to the correct position in the 363 | content file for the start of this chunk. 364 | \end{itemize} 365 | 366 | \subparagraph{Metadata Overhead}\label{metadata-overhead} 367 | 368 | Using this scheme, if you write 4GB of data using on average 64KB data 369 | chunks (note: chunks can be variable length and do not need to be the 370 | same size), your tree file will be around 5MB (roughly 0.125\% overhead). 371 | 372 | \paragraph{signatures}\label{signatures} 373 | 374 | A SLEEP formatted 32 byte header with data entries being 64 byte 375 | signatures. 
376 | 377 | \begin{verbatim} 378 | <32 byte header> 379 | <4 byte magic string: 0x05025701> 380 | <1 byte version number: 0> 381 | <2 byte entry size: 64> 382 | <1 byte algorithm name length prefix: 7> 383 | <7 byte algorithm name: Ed25519> 384 | <17 zeroes> 385 | <64 byte entries> 386 | <64 byte Ed25519 signature> 387 | \end{verbatim} 388 | 389 | Every time the tree is updated we sign the current roots of the Merkle 390 | tree, and append the signature to the signatures file. The signatures file starts 391 | with no entries. Each time a new leaf is appended to the \texttt{tree} 392 | file (aka whenever data is added to a Dat), we take all root hashes at 393 | the current state of the Merkle tree and hash and sign them, then append 394 | the resulting signature as a new entry to the signatures file. 395 | 396 | \begin{verbatim} 397 | Ed25519 sign( 398 | BLAKE2b( 399 | <1 byte> 400 | 2 // root type 401 | for (every root node left-to-right) { 402 | <32 byte root hash> 403 | <8 byte Uint64BE root tree index> 404 | <8 byte Uint64BE child byte lengths> 405 | } 406 | ) 407 | ) 408 | \end{verbatim} 409 | 410 | The reason we hash all the root nodes is that the BLAKE2b hash above is 411 | only calculable if you have all of the pieces of data required to 412 | generate all the intermediate hashes. This is the crux of Dat's data 413 | integrity guarantees. 414 | 415 | \paragraph{bitfield}\label{bitfield} 416 | 417 | A SLEEP formatted 32 byte header followed by a series of 3328 byte long 418 | entries. 419 | 420 | \begin{verbatim} 421 | <32 byte header> 422 | <4 byte magic string: 0x05025700> 423 | <1 byte version number: 0> 424 | <2 byte entry size: 3328> 425 | <1 byte algorithm name length: 0> 426 | <0 byte algorithm name: (empty)> 427 | <24 zeroes> 428 | <3328 byte entries> // (2048 + 1024 + 256) 429 | \end{verbatim} 430 | 431 | The bitfield describes which pieces of data you have, and which nodes in 432 | the \texttt{tree} file have been written. 
This file exists as an index 433 | of the \texttt{tree} and \texttt{data} to quickly figure out which 434 | pieces of data you have or are missing. This file can be regenerated if 435 | you delete it, so it is considered a materialized index. 436 | 437 | The \texttt{bitfield} file actually contains three bitfields of 438 | different sizes. A bitfield (AKA bitmap) is defined as a set of bits 439 | where each bit (0 or 1) represents whether you have or do not have a piece of 440 | data at that bit index. So if there is a dataset of 10 cat pictures, and 441 | you have pictures 1, 3, and 5 but are missing the rest, your bitfield 442 | would look like \texttt{1010100000}. 443 | 444 | Each entry contains three objects: 445 | 446 | \begin{itemize} 447 | \tightlist 448 | \item 449 | Data Bitfield (1024 bytes) - 1 bit for each data entry that you 450 | have synced (1 for every entry in \texttt{data}). 451 | \item 452 | Tree Bitfield (2048 bytes) - 1 bit for every tree entry (all nodes in 453 | \texttt{tree}) 454 | \item 455 | Bitfield Index (256 bytes) - This is an index of the Data Bitfield 456 | that makes it efficient to figure out which pieces of data are missing 457 | from the Data Bitfield without having to do a linear scan. 458 | \end{itemize} 459 | 460 | The Data Bitfield is 1KB somewhat arbitrarily, but the idea is that 461 | because most filesystems work in 4KB chunk sizes, we can fit the Data, 462 | Tree and Index in less than 4KB of data for efficient writes to the 463 | filesystem. The Tree and Index sizes are based on the Data size (the 464 | Tree has twice the entries as the Data, odd and even nodes vs just even 465 | nodes in \texttt{tree}, and Index is always 1/4th the size). 466 | 467 | To generate the Index, you take pairs of 2 bytes at a time from the Data 468 | Bitfield, check if all bits in the 2 bytes are the same, and generate 4 469 | bits of Index metadata~for every 2 bytes of Data (hence how 1024 bytes 470 | of Data ends up as 256 bytes of Index). 
471 | 472 | First you generate a 2 bit tuple for the 2 bytes of Data: 473 | 474 | \begin{verbatim} 475 | if (data is all 1's) then [1,1] 476 | if (data is all 0's) then [0,0] 477 | if (data is not all the same) then [1, 0] 478 | \end{verbatim} 479 | 480 | The Index itself is an in-order binary tree, not a traditional bitfield. 481 | To generate the tree, you take the tuples you generated above and then 482 | write them into a tree like the following example, where non-leaf nodes 483 | are generated using the above scheme by looking at the results of the 484 | relative even child tuples for each odd parent tuple: 485 | 486 | \begin{verbatim} 487 | // for e.g. 16 bytes (8 tuples) of 488 | // sparsely replicated data 489 | 0 - [00 00 00 00] 490 | 1 - [10 10 10 10] 491 | 2 - [11 11 11 11] 492 | \end{verbatim} 493 | 494 | The tuples at entry \texttt{1} above are \texttt{{[}1,0{]}} because the 495 | relative child tuples are not uniform. In the following example, all 496 | non-leaf nodes are \texttt{{[}1,1{]}} because their relative children 497 | are all uniform (\texttt{{[}1,1{]}}): 498 | 499 | \begin{verbatim} 500 | // for e.g. 32 bytes (16 tuples) of 501 | // fully replicated data (all 1's) 502 | 0 - [11 11 11 11] 503 | 1 - [11 11 11 11] 504 | 2 - [11 11 11 11] 505 | 3 - [11 11 11 11] 506 | 4 - [11 11 11 11] 507 | 5 - [11 11 11 11] 508 | 6 - [11 11 11 11] 509 | \end{verbatim} 510 | 511 | Using this scheme, it takes at most 8 bytes of Index to represent 32 512 | bytes of data. In this example the Index compresses well because it 513 | consists of all one bits. Similarly, an empty bitfield is all zero bits. 514 | 515 | If you write 4GB of data using on average 64KB data chunk size, your 516 | bitfield will be at most 32KB. 517 | 518 | \paragraph{metadata.data}\label{metadata.data} 519 | 520 | This file is used to store content described by the rest of the 521 | \texttt{metadata.*} hypercore SLEEP files. 
Whereas the 522 | \texttt{content.*} SLEEP files describe the data stored in the actual 523 | data cloned in the Dat repository filesystem, the \texttt{metadata} data 524 | feed is stored inside the \texttt{.dat} folder along with the rest of 525 | the SLEEP files. 526 | 527 | The contents of this file are a series of versions of the Dat filesystem 528 | tree. As this is a hypercore data feed, it's just an append only log of 529 | binary data entries. The challenge is representing a tree in a 530 | one-dimensional way to make it representable as a Hypercore register. 531 | For example, imagine three files: 532 | 533 | \begin{verbatim} 534 | ~/dataset $ ls 535 | figures 536 | graph1.png 537 | graph2.png 538 | results.csv 539 | 540 | 1 directory, 3 files 541 | \end{verbatim} 542 | 543 | We want to take this structure and map it to a serialized representation 544 | that gets written into an append only log in a way that still allows for 545 | efficient random access by file path. 546 | 547 | To do this, we convert the filesystem metadata into entries in a feed 548 | like this: 549 | 550 | \begin{verbatim} 551 | { 552 | "path": "/results.csv", 553 | trie: [[]], 554 | sequence: 0 555 | } 556 | { 557 | "path": "/figures/graph1.png", 558 | trie: [[0], []], 559 | sequence: 1 560 | } 561 | { 562 | "path": "/figures/graph2.png", 563 | trie: [[0], [1]], 564 | sequence: 2 565 | } 566 | \end{verbatim} 567 | 568 | \subparagraph{Filename Resolution}\label{filename-resolution} 569 | 570 | Each sequence represents adding one of the files to the register, so at 571 | sequence 0 the filesystem state only has a single file, 572 | \texttt{results.csv} in it. At sequence 1, there are only 2 files added 573 | to the register, and at sequence 2 all files are finally added. The 574 | \texttt{trie} field represents a shorthand way of declaring which 575 | other files at every level of the directory hierarchy exist alongside 576 | the file being added at that revision. 
For example at the time of 577 | sequence 1, \texttt{trie} is \texttt{{[}{[}0{]},\ {[}{]}{]}}. The first 578 | sub-array, \texttt{{[}0{]}}, represents the first folder in the 579 | \texttt{path}, which is the root folder \texttt{/}. In this case 580 | \texttt{{[}0{]}} means the root folder at this point in time only has a 581 | single file, the file that is the subject of sequence \texttt{0}. The 582 | second subarray is empty \texttt{{[}{]}} because there are no other 583 | existing files in the second folder in the \texttt{path}, 584 | \texttt{figures}. 585 | 586 | To look up a file by filename, you fetch the latest entry in the log, 587 | then use the \texttt{trie} metadata in that entry to look up the 588 | longest common ancestor based on the parent folders of the filename you 589 | are querying. You can then recursively repeat this operation until you 590 | find the \texttt{path} entry you are looking for (or you exhaust all 591 | options which means the file does not exist). This is an 592 | \texttt{O(number\ of\ slashes\ in\ your\ path)} operation. 593 | 594 | For example, if you wanted to look up \texttt{/results.csv} given the 595 | above register, you would start by grabbing the metadata at sequence 2. 596 | The longest common ancestor between \texttt{/results.csv} and 597 | \texttt{/figures/graph2.png} is \texttt{/}. You then grab the corresponding 598 | entry in the \texttt{trie} array for \texttt{/}, which in this case is the 599 | first entry, \texttt{{[}0{]}}. You then repeat this with all of the 600 | \texttt{trie} entries until you find an entry that is closer to the entry you 601 | are looking for. In this example, the first entry happens to be the 602 | match we are looking for. 603 | 604 | You can also perform lookups relative to a point in time by starting 605 | from a specific sequence number in the register. For example, you can get the 606 | state of some file as of an old sequence number, similar to 607 | checking out an old version of a repository in Git. 
608 | 609 | \subparagraph{Data Serialization}\label{data-serialization} 610 | 611 | The format of the \texttt{metadata.data} file is as follows: 612 | 613 | \begin{verbatim} 614 | <Header>
615 | <Node 1> 616 | <Node 2> 617 | <Node ...> 618 | <Node N> 619 | \end{verbatim} 620 | 621 | Each entry in the file is encoded using Protocol Buffers (Varda 2008). 622 | 623 | The first message we write to the file is of a type called Header which 624 | uses this schema: 625 | 626 | \begin{verbatim} 627 | message Header { 628 | required string type = 1; 629 | optional bytes content = 2; 630 | } 631 | \end{verbatim} 632 | 633 | This is used to declare two pieces of metadata used by Dat. It includes 634 | a \texttt{type} string with the value \texttt{hyperdrive} and a 635 | \texttt{content} binary value that holds the public key of the content 636 | register that this metadata register represents. When you share a Dat, 637 | the metadata key is the main key that gets used, and the content 638 | register key is linked from here in the metadata. 639 | 640 | After the header the file will contain many filesystem \texttt{Node} 641 | entries: 642 | 643 | \begin{verbatim} 644 | message Node { 645 | required string path = 1; 646 | optional Stat value = 2; 647 | optional bytes trie = 3; 648 | repeated Writer writers = 4; 649 | optional uint64 writersSequence = 5; 650 | } 651 | 652 | message Writer { 653 | required bytes publicKey = 1; 654 | optional string permission = 2; 655 | } 656 | \end{verbatim} 657 | 658 | The \texttt{Node} object has five fields: 659 | 660 | \begin{itemize} 661 | \tightlist 662 | \item 663 | \texttt{path} - the string of the absolute file path of this file. 664 | \item 665 | \texttt{value} - a \texttt{Stat} encoded object representing the file metadata 666 | \item 667 | \texttt{trie} - a compressed list of the sequence numbers as described 668 | earlier 669 | \item 670 | \texttt{writers} - a list of the writers who are allowed to write to 671 | this dat 672 | \item 673 | \texttt{writersSequence} - a reference to the last sequence where the 674 | writers array was modified. You can use this to quickly find the value 675 | of the writers keys. 
676 | \end{itemize} 677 | 678 | The \texttt{trie} value is encoded by starting with the nested array of 679 | sequence numbers, e.g. 680 | \texttt{{[}{[}{[}0,\ 3{]}{]},\ {[}{[}0,\ 2{]},\ {[}0,\ 1{]}{]}{]}}. Each 681 | entry is a tuple where the first item is the index of the feed in the 682 | \texttt{writers} array and the second value is the sequence number. 683 | Finally you prepend the trie value with a version number varint. 684 | 685 | To write these subarrays we use variable width integers (varints), using 686 | a repeating pattern like this, one for each array: 687 | 688 | \begin{verbatim} 689 | 690 | 691 | 692 | 693 | 694 | 695 | \end{verbatim} 696 | 697 | This encoding is designed for efficiency as it reduces the filesystem 698 | path + feed index metadata down to a series of small integers. 699 | 700 | The \texttt{Stat} objects use this encoding: 701 | 702 | \begin{verbatim} 703 | message Stat { 704 | required uint32 mode = 1; 705 | optional uint32 uid = 2; 706 | optional uint32 gid = 3; 707 | optional uint64 size = 4; 708 | optional uint64 blocks = 5; 709 | optional uint64 offset = 6; 710 | optional uint64 byteOffset = 7; 711 | optional uint64 mtime = 8; 712 | optional uint64 ctime = 9; 713 | } 714 | \end{verbatim} 715 | 716 | These are the field definitions: 717 | 718 | \begin{itemize} 719 | \tightlist 720 | \item 721 | \texttt{mode} - POSIX file mode bitmask 722 | \item 723 | \texttt{uid} - POSIX user id 724 | \item 725 | \texttt{gid} - POSIX group id 726 | \item 727 | \texttt{size} - file size in bytes 728 | \item 729 | \texttt{blocks} - number of data chunks that make up this file 730 | \item 731 | \texttt{offset} - the data feed entry index for the first chunk in 732 | this file 733 | \item 734 | \texttt{byteOffset} - the data feed file byte offset for the first 735 | chunk in this file 736 | \item 737 | \texttt{mtime} - POSIX modified\_at time 738 | \item 739 | \texttt{ctime} - POSIX created\_at time 740 | \end{itemize} 741 | 742 | 
\subsection*{References}\label{references} 743 | \addcontentsline{toc}{subsection}{References} 744 | 745 | \hypertarget{refs}{} 746 | \hypertarget{ref-varda2008protocol}{} 747 | Varda, Kenton. 2008. ``Protocol Buffers: Google's Data Interchange 748 | Format.'' \emph{Google Open Source Blog, Available at Least as Early as 749 | Jul}. 750 | 751 | \end{document} 752 | -------------------------------------------------------------------------------- /source/dat-paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Dat - Distributed Dataset Synchronization And Versioning" 3 | date: "May 2017 (last updated: Jan 2018)" 4 | author: "Maxwell Ogden, Karissa McKelvey, Mathias Buus Madsen, Code for Science" 5 | --- 6 | 7 | # Abstract 8 | 9 | Dat is a protocol designed for syncing folders of data, even if they are large or changing constantly. Dat uses a cryptographically secure register of changes to prove that the requested data version is distributed. A byte range of any file's version can be efficiently streamed from a Dat repository over a network connection. Consumers can choose to fully or partially replicate the contents of a remote Dat repository, and can also subscribe to live changes. To ensure writer and reader privacy, Dat uses public key cryptography to encrypt network traffic. A group of Dat clients can connect to each other to form a public or private decentralized network to exchange data between each other. A reference implementation is provided in JavaScript. 10 | 11 | # 1. Background 12 | 13 | Many datasets are shared online today using HTTP and FTP, which lack built in support for version control or content addressing of data. This results in link rot and content drift as files are moved, updated or deleted, leading to an alarming rate of disappearing data references in areas such as [published scientific literature](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115253). 
14 | 15 | Cloud storage services like S3 ensure availability of data, but they have a centralized hub-and-spoke networking model and are therefore limited by their bandwidth, meaning popular files can become very expensive to share. Services like Dropbox and Google Drive provide version control and synchronization on top of cloud storage services which fixes many issues with broken links but rely on proprietary code and services requiring users to store their data on centralized cloud infrastructure which has implications on cost, transfer speeds, vendor lock-in and user privacy. 16 | 17 | Distributed file sharing tools can become faster as files become more popular, removing the bandwidth bottleneck and making file distribution cheaper. They also use link resolution and discovery systems which can prevent broken links meaning if the original source goes offline other backup sources can be automatically discovered. However these file sharing tools today are not supported by Web browsers, do not have good privacy guarantees, and do not provide a mechanism for updating files without redistributing a new dataset which could mean entirely re-downloading data you already have. 18 | 19 | # 2. Dat 20 | 21 | Dat is a dataset synchronization protocol that does not assume a dataset is static or that the entire dataset will be downloaded. The main reference implementation is available from npm as `npm install dat -g`. 22 | 23 | The protocol is agnostic to the underlying transport e.g. you could implement Dat over carrier pigeon. Data is stored in a format called SLEEP [@sleep], described in its own paper. The key properties of the Dat design are explained in this section. 24 | 25 | - 2.1 **Content Integrity** - Data and publisher integrity is verified through use of signed hashes of the content. 26 | - 2.2 **Decentralized Mirroring** - Users sharing the same Dat automatically discover each other and exchange data in a swarm. 
27 | - 2.3 **Network Privacy** - Dat provides certain privacy guarantees including end-to-end encryption. 28 | - 2.4 **Incremental Versioning** - Datasets can be efficiently synced, even in real time, to other peers. 29 | - 2.5 **Random Access** - Huge file hierarchies can be efficiently traversed remotely. 30 | 31 | ## 2.1 Content Integrity 32 | 33 | Content integrity means being able to verify the data you received is the exact same version of the data that you expected. This is important in a distributed system as this mechanism will catch incorrect data sent by bad peers. It also has implications for reproducibility as it lets you refer to a specific version of a dataset. 34 | 35 | Link rot, when links online stop resolving, and content drift, when data changes but the link to the data remains the same, are two common issues in data analysis. For example, one day a file called data.zip might change, but a typical HTTP link to the file does not include a hash of the content, or provide a way to get updated metadata, so clients that only have the HTTP link have no way to check if the file changed without downloading the entire file again. Referring to a file by the hash of its content is called content addressability, and lets users not only verify that the data they receive is the version of the data they want, but also lets people cite specific versions of the data by referring to a specific hash. 36 | 37 | Dat uses BLAKE2b [@aumasson2013blake2] cryptographically secure hashes to address content. Hashes are arranged in a Merkle tree [@mykletun2003providing], a tree where each non-leaf node is the hash of all child nodes. Leaf nodes contain pieces of the dataset. Due to the properties of secure cryptographic hashes the top hash can only be produced if all data below it matches exactly. If two trees have matching top hashes then you know that all other nodes in the tree must match as well, and you can conclude that your dataset is synchronized. 
Trees are chosen as the primary data structure in Dat as they have a number of properties that allow for efficient access to subsets of the metadata, which allows Dat to work efficiently over a network connection. 38 | 39 | ### Dat Links 40 | 41 | Dat links are Ed25519 [@bernstein2012high] public keys which have a length of 32 bytes (64 characters when Hex encoded). You can represent your Dat link in the following ways and Dat clients will be able to understand them: 42 | 43 | - The standalone public key: 44 | 45 | `8e1c7189b1b2dbb5c4ec2693787884771201da9...` 46 | 47 | - Using the dat:// protocol: 48 | 49 | `dat://8e1c7189b1b2dbb5c4ec2693787884771...` 50 | 51 | - As part of an HTTP URL: 52 | 53 | `https://datproject.org/8e1c7189b1b2dbb5...` 54 | 55 | All messages in the Dat protocol are encrypted and signed using the public key during transport. This means that unless you know the public key (e.g. unless the Dat link was shared with you) then you will not be able to discover or communicate with any member of the swarm for that Dat. Anyone with the public key can verify that messages (such as entries in a Dat Stream) were created by a holder of the private key. 56 | 57 | Every Dat repository has a corresponding private key which is kept in your home folder and never shared. Dat never exposes either the public or private key over the network. During the discovery phase the BLAKE2b hash of the public key is used as the discovery key. This means that the original key is impossible to discover (unless it was shared publicly through a separate channel) since only the hash of the key is exposed publicly. 58 | 59 | Dat does not provide an authentication mechanism at this time. Instead it provides a capability system. Anyone with the Dat link is currently considered able to discover and access data. Do not share your Dat links publicly if you do not want them to be accessed. 
60 | 61 | ### Hypercore and Hyperdrive 62 | 63 | The Dat storage, content integrity, and networking protocols are implemented in a module called [Hypercore](https://npmjs.org/hypercore). Hypercore is agnostic to the format of the input data, it operates on any stream of binary data. For the Dat use case of synchronizing datasets we use a file system module on top of Hypercore called [Hyperdrive](https://npmjs.org/hyperdrive). 64 | 65 | Dat has a layered abstraction so that users can use Hypercore directly to have full control over how they model their data. Hyperdrive works well when your data can be represented as files on a filesystem, which is the main use case with Dat. 66 | 67 | ### Hypercore Registers 68 | 69 | Hypercore Registers are the core mechanism used in Dat. They are binary append-only streams whose contents are cryptographically hashed and signed and therefore can be verified by anyone with access to the public key of the writer. They are an implementation of the concept known as a register, a digital ledger you can trust. 70 | 71 | Dat uses two registers, `content` and `metadata`. The `content` register contains the files in your repository and `metadata` contains the metadata about the files including name, size, last modified time, etc. Dat replicates them both when synchronizing with another peer. 72 | 73 | When files are added to Dat, each file gets split up into some number of chunks, and the chunks are then arranged into a Merkle tree, which is used later for version control and replication processes. 74 | 75 | ## 2.2 Decentralized Mirroring 76 | 77 | Dat is a peer to peer protocol designed to exchange pieces of a dataset amongst a swarm of peers. As soon as a peer acquires their first piece of data in the dataset they can choose to become a partial mirror for the dataset. If someone else contacts them and needs the piece they have, they can choose to share it. 
This can happen simultaneously while the peer is still downloading the pieces they want from others. 78 | 79 | ### Source Discovery 80 | 81 | An important aspect of mirroring is source discovery, the techniques that peers use to find each other. Source discovery means finding the IP and port of data sources online that have a copy of the data you are looking for. You can then connect to them and begin exchanging data. By using source discovery techniques Dat is able to create a network where data can be discovered even if the original data source disappears. 82 | 83 | Source discovery can happen over many kinds of networks, as long as you can model the following actions: 84 | 85 | - `join(key, [port])` - Begin performing regular lookups on an interval for `key`. Specify `port` if you want to announce that you share `key` as well. 86 | - `leave(key, [port])` - Stop looking for `key`. Specify `port` to stop announcing that you share `key` as well. 87 | - `foundpeer(key, ip, port)` - Called when a peer is found by a lookup. 88 | 89 | In the Dat implementation we implement the above actions on top of three types of discovery networks: 90 | 91 | - DNS name servers - An Internet standard mechanism for resolving keys to addresses 92 | - Multicast DNS - Useful for discovering peers on local networks 93 | - Kademlia Mainline Distributed Hash Table - Fewer central points of failure, increases probability of Dat working even if DNS servers are unreachable 94 | 95 | Additional discovery networks can be implemented as needed. We chose the above three as a starting point to have a complementary mix of strategies to increase the probability of source discovery. Additionally you can specify a Dat via HTTPS link, which runs the Dat protocol in "single-source" mode, meaning the above discovery networks are not used, and instead only that one HTTPS server is used as the only peer.
96 | 97 | ### Peer Connections 98 | 99 | After the discovery phase, Dat should have a list of potential data sources to try and contact. Dat uses either TCP, HTTP or [UTP](https://en.wikipedia.org/wiki/Micro_Transport_Protocol) [@rossi2010ledbat]. UTP uses LEDBAT which is designed to not take up all available bandwidth on a network (e.g. so that other people sharing WiFi can still use the Internet), and is still based on UDP so works with NAT traversal techniques like UDP hole punching. HTTP is supported for compatibility with static file servers and web browser clients. Note that these are the protocols we support in the reference Dat implementation, but the Dat protocol itself is transport agnostic. 100 | 101 | If an HTTP source is specified Dat will prefer that one over other sources. Otherwise when Dat gets the IP and port for a potential TCP or UTP source it tries to connect using both protocols. If one connects first, Dat aborts the other one. If none connect, Dat will try again until it decides that source is offline or unavailable and then stops trying to connect to them. Sources Dat is able to connect to go into a list of known good sources, so that if/when the Internet connection goes down Dat can use that list to reconnect to known good sources again quickly. 102 | 103 | If Dat gets a lot of potential sources it picks a handful at random to try and connect to and keeps the rest around as additional sources to use later in case it decides it needs more sources. 104 | 105 | Once a duplex binary connection to a remote source is open Dat then layers on the Hypercore protocol, a message-based replication protocol that allows two peers to communicate over a stateless channel to request and exchange data. You open separate replication channels with many peers at once which allows clients to parallelize data requests across the entire pool of peers they have established connections with. 
106 | 107 | ## 2.3 Network Privacy 108 | 109 | On the Web today, with SSL, there is a guarantee that the traffic between your computer and the server is private. As long as you trust the server to not leak your logs, attackers who intercept your network traffic will not be able to read the HTTP traffic exchanged between you and the server. This is a fairly straightforward model as clients only have to trust a single server for some domain. 110 | 111 | There is an inherent tradeoff in peer to peer systems of source discovery vs. user privacy. The more sources you contact and ask for some data, the more sources you trust to keep what you asked for private. Our goal is to have Dat be configurable in respect to this tradeoff to allow application developers to meet their own privacy guidelines. 112 | 113 | It is up to client programs to make design decisions around which discovery networks they trust. For example if a Dat client decides to use the BitTorrent DHT to discover peers, and they are searching for a publicly shared Dat key (e.g. a key cited publicly in a published scientific paper) with known contents, then because of the privacy design of the BitTorrent DHT it becomes public knowledge what key that client is searching for. 114 | 115 | A client could choose to only use discovery networks with certain privacy guarantees. For example a client could only connect to an approved list of sources that they trust, similar to SSL. As long as they trust each source, the encryption built into the Dat network protocol will prevent the Dat key they are looking for from being leaked. 116 | 117 | ## 2.4 Incremental Versioning 118 | 119 | Given a stream of binary data, Dat splits the stream into chunks, hashes each chunk, and arranges the hashes in a specific type of Merkle tree that allows for certain replication properties. 120 | 121 | Dat is also able to fully or partially synchronize streams in a distributed setting even if the stream is being appended to. 
This is accomplished by using the messaging protocol to traverse the Merkle tree of remote sources and fetch a strategic set of nodes. Due to the low-level, message-oriented design of the replication protocol, different node traversal strategies can be implemented. 122 | 123 | There are two types of versioning performed automatically by Dat. Metadata is stored in a folder called `.dat` in the root folder of a repository, and data is stored as normal files in the root folder. 124 | 125 | ### Metadata Versioning 126 | 127 | Dat tries as much as possible to act as a one-to-one mirror of the state of a folder and all its contents. When importing files, Dat uses a sorted, depth-first recursion to list all the files in the tree. For each file it finds, it grabs the filesystem metadata (filename, Stat object, etc) and checks if there is already an entry for this filename with this exact metadata already represented in the Dat repository metadata. If the file with this metadata matches exactly the newest version of the file metadata stored in Dat, then this file will be skipped (no change). 128 | 129 | If the metadata differs from the current existing one (or there are no entries for this filename at all in the history), then this new metadata entry will be appended as the new 'latest' version for this file in the append-only SLEEP metadata content register (described below). 130 | 131 | ### Content Versioning 132 | 133 | In addition to storing a historical record of filesystem metadata, the content of the files themselves are also capable of being stored in a version controlled manner. The default storage system used in Dat stores the files as files. This has the advantage of being very straightforward for users to understand, but the downside of not storing old versions of content by default. 
134 | 135 | In contrast to other version control systems like Git, Dat by default only stores the current set of checked out files on disk in the repository folder, not old versions. It does store all previous metadata for old versions in `.dat`. Git for example stores all previous content versions and all previous metadata versions in the `.git` folder. Because Dat is designed for larger datasets, if it stored all previous file versions in `.dat`, then the `.dat` folder could easily fill up the user's hard drive inadvertently. Therefore Dat has multiple storage modes based on usage. 136 | 137 | Hypercore registers include an optional `data` file that stores all chunks of data. In Dat, only the `metadata.data` file is used, but the `content.data` file is not used. The default behavior is to store the current files only as normal files. If you want to run an 'archival' node that keeps all previous versions, you can configure Dat to use the `content.data` file instead. For example, on a shared server with lots of storage you probably want to store all versions. However on a workstation machine that is only accessing a subset of one version, the default mode of storing all metadata plus the current set of downloaded files is acceptable, because you know the server has the full history. 138 | 139 | ### Merkle Trees 140 | 141 | Registers in Dat use a specific method of encoding a Merkle tree where hashes are positioned by a scheme called binary in-order interval numbering or just "bin" numbering. This is just a specific, deterministic way of laying out the nodes in a tree. For example a tree with 7 nodes will always be arranged like this: 142 | 143 | ``` 144 | 0 145 | 1 146 | 2 147 | 3 148 | 4 149 | 5 150 | 6 151 | ``` 152 | 153 | In Dat, the hashes of the chunks of files are always even numbers, at the wide end of the tree.
So the above tree had four original values that become the even numbers: 154 | 155 | ``` 156 | chunk0 -> 0 157 | chunk1 -> 2 158 | chunk2 -> 4 159 | chunk3 -> 6 160 | ``` 161 | 162 | In the resulting Merkle tree, the even and odd nodes store different information: 163 | 164 | - Evens - List of data hashes [chunk0, chunk1, chunk2, ...] 165 | - Odds - List of Merkle hashes (hashes of child even nodes) [hash0, hash1, hash2, ...] 166 | 167 | These two lists get interleaved into a single register such that the indexes (position) in the register are the same as the bin numbers from the Merkle tree. 168 | 169 | All odd hashes are derived by hashing the two child nodes, e.g. given hash0 is `hash(chunk0)` and hash2 is `hash(chunk1)`, hash1 is `hash(hash0 + hash2)`. 170 | 171 | For example a register with two data entries would look something like this (pseudocode): 172 | 173 | ``` 174 | 0. hash(chunk0) 175 | 1. hash(hash(chunk0) + hash(chunk1)) 176 | 2. hash(chunk1) 177 | ``` 178 | 179 | It is possible for the in-order Merkle tree to have multiple roots at once. A root is defined as a parent node with a full set of child node slots filled below it. 180 | 181 | For example, this tree has 2 roots (1 and 4) 182 | 183 | ``` 184 | 0 185 | 1 186 | 2 187 | 188 | 4 189 | ``` 190 | 191 | This tree has one root (3): 192 | 193 | ``` 194 | 0 195 | 1 196 | 2 197 | 3 198 | 4 199 | 5 200 | 6 201 | ``` 202 | 203 | This one has one root (1): 204 | 205 | ``` 206 | 0 207 | 1 208 | 2 209 | ``` 210 | 211 | ### Replication Example 212 | 213 | This section describes in high level the replication flow of a Dat. Note that the low level details are available by reading the SLEEP section below. 
For the sake of illustrating how this works in practice in a networked replication scenario, consider a folder with two files: 214 | 215 | ``` 216 | bat.jpg 217 | cat.jpg 218 | ``` 219 | 220 | To send these files to another machine using Dat, you would first add them to a Dat repository by splitting them into chunks and constructing SLEEP files representing the chunks and filesystem metadata. 221 | 222 | Let's assume `bat.jpg` and `cat.jpg` both produce three chunks, each around 64KB. Dat stores data in a representation called SLEEP, but here we will show a pseudo-representation for the purposes of illustrating the replication process. The six chunks get sorted into a list like this: 223 | 224 | ``` 225 | bat-1 226 | bat-2 227 | bat-3 228 | cat-1 229 | cat-2 230 | cat-3 231 | ``` 232 | 233 | These chunks then each get hashed, and the hashes get arranged into a Merkle tree (the content register): 234 | 235 | ``` 236 | 0 - hash(bat-1) 237 | 1 - hash(0 + 2) 238 | 2 - hash(bat-2) 239 | 3 - hash(1 + 5) 240 | 4 - hash(bat-3) 241 | 5 - hash(4 + 6) 242 | 6 - hash(cat-1) 243 | 8 - hash(cat-2) 244 | 9 - hash(8 + 10) 245 | 10 - hash(cat-3) 246 | ``` 247 | 248 | Next we calculate the root hashes of our tree, in this case 3 and 9. We then hash them together, and cryptographically sign the hash. This signed hash now can be used to verify all nodes in the tree, and the signature proves it was produced by us, the holder of the private key for this Dat. 249 | 250 | This tree is for the hashes of the contents of the photos.
There is also a second Merkle tree that Dat generates that represents the list of files and their metadata and looks something like this (the metadata register): 251 | 252 | ``` 253 | 0 - hash({contentRegister: '9e29d624...'}) 254 | 1 - hash(0 + 2) 255 | 2 - hash({"bat.jpg", first: 0, length: 3}) 256 | 4 - hash({"cat.jpg", first: 3, length: 3}) 257 | ``` 258 | 259 | The first entry in this feed is a special metadata entry that tells Dat the address of the second feed (the content register). Note that node 3 is not included yet, because 3 is the hash of `1 + 5`, but 5 does not exist yet, so will be written at a later update. 260 | 261 | Now we're ready to send our metadata to the other peer. The first message is a `Register` message with the key that was shared for this Dat. Let's call ourselves Alice and the other peer Bob. Alice sends Bob a `Want` message that declares they want all nodes in the file list (the metadata register). Bob replies with a single `Have` message that indicates he has 2 nodes of data. Alice sends three `Request` messages, one for each leaf node (`0, 2, 4`). Bob sends back three `Data` messages. The first `Data` message contains the content register key, the hash of the sibling, in this case node `2`, the hash of the uncle root `4`, as well as a signature for the root hashes (in this case `1, 4`). Alice verifies the integrity of this first `Data` message by hashing the metadata received for the content register metadata to produce the hash for node `0`. They then hash the hash `0` with the hash `2` that was included to reproduce hash `1`, and hashes their `1` with the value for `4` they received, which they can use the received signature to verify it was the same data. When the next `Data` message is received, a similar process is performed to verify the content. 262 | 263 | Now Alice has the full list of files in the Dat, but decides they only want to download `cat.jpg`. Alice knows they want blocks 3 through 6 from the content register.
First Alice sends another `Register` message with the content key to open a new replication channel over the connection. Then Alice sends three `Request` messages, one for each of blocks `3, 4, 5`. Bob sends back three `Data` messages with the data for each block, as well as the hashes needed to verify the content in a way similar to the process described above for the metadata feed. 264 | 265 | ## 2.5 Random Access 266 | 267 | Dat pursues the following access capabilities: 268 | 269 | - Support large file hierarchies (millions of files in a single repository). 270 | - Support efficient traversal of the hierarchy (listing files in arbitrary folders efficiently). 271 | - Store all changes to all files (metadata and/or content). 272 | - List all changes made to any single file. 273 | - View the state of all files relative to any point in time. 274 | - Subscribe live to all changes (any file). 275 | - Subscribe live to changes to files under a specific path. 276 | - Efficiently access any byte range of any version of any file. 277 | - Allow all of the above to happen remotely, only syncing the minimum metadata necessary to perform any action. 278 | - Allow efficient comparison of remote and local repository state to request missing pieces during synchronization. 279 | - Allow entire remote archive to be synchronized, or just some subset of files and/or versions. 280 | 281 | The way Dat accomplishes these is through a combination of storing all changes in Hypercore feeds, but also using strategic metadata indexing strategies that support certain queries efficiently to be performed by traversing the Hypercore feeds. The protocol itself is specified in Section 3 (SLEEP), but a scenario based summary follows here.
282 | 283 | ### Scenario: Reading a file from a specific byte offset 284 | 285 | Alice has a dataset in Dat. Bob wants to access a 100MB CSV called `cat_dna.csv` stored in the remote repository, but only wants to access the 10MB range of the CSV spanning from 30MB - 40MB. 286 | 287 | Bob has never communicated with Alice before, and is starting fresh with no knowledge of this Dat repository other than that he knows he wants `cat_dna.csv` at a specific offset. 288 | 289 | First, Bob asks Alice through the Dat protocol for the metadata he needs to resolve `cat_dna.csv` to the correct metadata feed entry that represents the file he wants. Note: In this scenario we assume Bob wants the latest version of `cat_dna.csv`. It is also possible to do this for a specific older version. 290 | 291 | Bob first sends a `Request` message for the latest entry in the metadata feed. Alice responds. Bob looks at the `trie` value, and using the lookup algorithm described below sends another `Request` message for the metadata node that is closer to the filename he is looking for. This repeats until Alice sends Bob the matching metadata entry. This is the un-optimized resolution that uses `log(n)` round trips, though there are ways to optimize this by having Alice send additional sequence numbers to Bob that help him traverse in fewer round trips. 292 | 293 | In the metadata record Bob received for `cat_dna.csv` there is the byte offset to the beginning of the file in the data feed. Bob adds his +30MB offset to this value and starts requesting pieces of data starting at that byte offset using the SLEEP protocol as described below. 294 | 295 | This method tries to allow any byte range of any file to be accessed without the need to synchronize the full metadata for all files up front. 296 | 297 | ## 3.
Dat Network Protocol 298 | 299 | The SLEEP format is designed to allow for sparse replication, meaning you can efficiently download only the metadata and data required to resolve a single byte region of a single file, which makes Dat suitable for a wide variety of streaming, real time and large dataset use cases. 300 | 301 | To take advantage of this, Dat includes a network protocol. It is message-based and stateless, making it possible to implement on a variety of network transport protocols including UDP and TCP. Both metadata and content registers in SLEEP share the exact same replication protocol. 302 | 303 | Individual messages are encoded using Protocol Buffers and there are ten message types using the following schema: 304 | 305 | ### Wire Protocol 306 | 307 | Over the wire messages are packed in the following lightweight container format: 308 | 309 | ``` 310 | <varint - length of rest of message> 311 | <varint - header> 312 | <message> 313 | ``` 314 | 315 | The `header` value is a single varint that has two pieces of information: the integer `type` that declares a 4-bit message type (used below), and a channel identifier, `0` for metadata and `1` for content. 316 | 317 | To generate this varint, you bitshift the 4-bit type integer onto the end of the channel identifier, e.g. `channel << 4 | <4-bit-type>`. 318 | 319 | ### Feed 320 | 321 | Type 0. Should be the first message sent on a channel. 322 | 323 | - `discoveryKey` - A BLAKE2b keyed hash of the string 'hypercore' using the public key of the metadata register as the key. 324 | - `nonce` - 24 bytes (192 bits) of random binary data, used in our encryption scheme 325 | 326 | ``` 327 | message Feed { 328 | required bytes discoveryKey = 1; 329 | optional bytes nonce = 2; 330 | } 331 | ``` 332 | 333 | ### Handshake 334 | 335 | Type 1. Overall connection handshake. Should be sent just after the feed message on the first channel only (metadata).
336 | 337 | - `id` - 32 byte random data used as an identifier for this peer on the network, useful for checking if you are connected to yourself or another peer more than once 338 | - `live` - Whether or not you want to operate in live (continuous) replication mode or end after the initial sync 339 | - `userData` - User-specific metadata encoded as a byte sequence 340 | - `extensions` - List of extensions that are supported on this Feed 341 | 342 | ``` 343 | message Handshake { 344 | optional bytes id = 1; 345 | optional bool live = 2; 346 | optional bytes userData = 3; 347 | repeated string extensions = 4; 348 | } 349 | ``` 350 | 351 | ### Info 352 | 353 | Type 2. Message indicating state changes. Used to indicate whether you are uploading and/or downloading. 354 | 355 | Initial state for uploading/downloading is true. If both ends are not downloading and not live it is safe to consider the stream ended. 356 | 357 | ``` 358 | message Info { 359 | optional bool uploading = 1; 360 | optional bool downloading = 2; 361 | } 362 | ``` 363 | 364 | ### Have 365 | 366 | Type 3. How you tell the other peer what chunks of data you have or don't have. You should only send Have messages to peers who have expressed interest in this region with Want messages. 367 | 368 | - `start` - If you only specify `start`, it means you are telling the other side you only have 1 chunk at the position at the value in `start`. 369 | - `length` - If you specify length, you can describe a range of values that you have all of, starting from `start`. 370 | - `bitfield` - If you would like to send a range of sparse data about haves/don't haves via bitfield, relative to `start`. 371 | 372 | ``` 373 | message Have { 374 | required uint64 start = 1; 375 | optional uint64 length = 2 [default = 1]; 376 | optional bytes bitfield = 3; 377 | } 378 | ``` 379 | 380 | When sending bitfields you must run length encode them. The encoded bitfield is a series of compressed and uncompressed bit sequences.
All sequences start with a header that is a varint. 381 | 382 | If the last bit is set in the varint (it is an odd number) then a header represents a compressed bit sequence. 383 | 384 | ``` 385 | compressed-sequence = varint( 386 | byte-length-of-sequence 387 | << 2 | bit << 1 | 1 388 | ) 389 | ``` 390 | 391 | If the last bit is *not* set then a header represents a non-compressed sequence. 392 | 393 | ``` 394 | uncompressed-sequence = varint( 395 | byte-length-of-bitfield << 1 | 0 396 | ) + (bitfield) 397 | ``` 398 | 399 | ### Unhave 400 | 401 | Type 4. How you communicate that you deleted or removed a chunk you used to have. 402 | 403 | 404 | ``` 405 | message Unhave { 406 | required uint64 start = 1; 407 | optional uint64 length = 2 [default = 1]; 408 | } 409 | ``` 410 | 411 | ### Want 412 | 413 | Type 5. How you ask the other peer to subscribe you to Have messages for a region of chunks. The `length` value defaults to Infinity or feed.length (if not live). 414 | 415 | ``` 416 | message Want { 417 | required uint64 start = 1; 418 | optional uint64 length = 2; 419 | } 420 | ``` 421 | 422 | ### Unwant 423 | 424 | Type 6. How you ask to unsubscribe from Have messages for a region of chunks from the other peer. You should only Unwant previously Wanted regions, but if you do Unwant something that hasn't been Wanted it won't have any effect. The `length` value defaults to Infinity or feed.length (if not live). 425 | 426 | ``` 427 | message Unwant { 428 | required uint64 start = 1; 429 | optional uint64 length = 2; 430 | } 431 | ``` 432 | 433 | ### Request 434 | 435 | Type 7. Request a single chunk of data. 436 | 437 | - `index` - The chunk index for the chunk you want. You should only ask for indexes that you have received the Have messages for. 
438 | - `bytes` - You can also optimistically specify a byte offset, and in the case the remote is able to resolve the chunk for this byte offset depending on their Merkle tree state, they will ignore the `index` and send the chunk that resolves for this byte offset instead. But if they cannot resolve the byte request, `index` will be used. 439 | - `hash` - If you only want the hash of the chunk and not the chunk data itself. 440 | - `nodes` - A 64 bit long bitfield representing which parent nodes you have. 441 | 442 | The `nodes` bitfield is an optional optimization to reduce the amount of duplicate nodes exchanged during the replication lifecycle. It indicates which parents you have or don't have. You have a maximum of 64 parents you can specify. Because `uint64` in Protocol Buffers is implemented as a varint, over the wire this does not take up 64 bits in most cases. The first bit is reserved to signify whether or not you need a signature in response. The rest of the bits represent whether or not you have (`1`) or don't have (`0`) the information at this node already. The ordering is determined by walking parent, sibling up the tree all the way to the root. 443 | 444 | ``` 445 | message Request { 446 | required uint64 index = 1; 447 | optional uint64 bytes = 2; 448 | optional bool hash = 3; 449 | optional uint64 nodes = 4; 450 | } 451 | ``` 452 | 453 | ### Cancel 454 | 455 | Type 8. Cancel a previous Request message that you haven't received yet. 456 | 457 | ``` 458 | message Cancel { 459 | required uint64 index = 1; 460 | optional uint64 bytes = 2; 461 | optional bool hash = 3; 462 | } 463 | ``` 464 | 465 | ### Data 466 | 467 | Type 9. Sends a single chunk of data to the other peer. You can send it in response to a Request or unsolicited on its own as a friendly gift. The data includes all of the Merkle tree parent nodes needed to verify the hash chain all the way up to the Merkle roots for this chunk. 
Because you can produce the direct parents by hashing the chunk, only the roots and 'uncle' hashes are included (the siblings to all of the parent nodes). 468 | 469 | - `index` - The chunk position for this chunk. 470 | - `value` - The chunk binary data. Empty if you are sending only the hash. 471 | - `Node.index` - The index for this chunk in in-order notation 472 | - `Node.hash` - The hash of this chunk 473 | - `Node.size`- The aggregate chunk size for all children below this node (The sum of all chunk sizes of all children) 474 | - `signature` - If you are sending a root node, all root nodes must have the signature included. 475 | 476 | 477 | ``` 478 | message Data { 479 | required uint64 index = 1; 480 | optional bytes value = 2; 481 | repeated Node nodes = 3; 482 | optional bytes signature = 4; 483 | 484 | message Node { 485 | required uint64 index = 1; 486 | required bytes hash = 2; 487 | required uint64 size = 3; 488 | } 489 | } 490 | ``` 491 | 492 | # 4. Existing Work 493 | 494 | Dat is inspired by a number of features from existing systems. 495 | 496 | ## Git 497 | 498 | Git popularized the idea of a directed acyclic graph (DAG) combined with a Merkle tree, a way to represent changes to data where each change is addressed by the secure hash of the change plus all ancestor hashes in a graph. This provides a way to trust data integrity, as the only way a specific hash could be derived by another peer is if they have the same data and change history required to reproduce that hash. This is important for reproducibility as it lets you trust that a specific git commit hash refers to a specific source code state. 499 | 500 | Decentralized version control tools for source code like Git provide a protocol for efficiently downloading changes to a set of files, but are optimized for text files and have issues with large files. Solutions like Git-LFS solve this by using HTTP to download large files, rather than the Git protocol. 
GitHub offers Git-LFS hosting but charges repository owners for bandwidth on popular files. Building a distributed distribution layer for files in a Git repository is difficult due to the design of Git Packfiles, which are delta compressed repository states that do not easily support random access to byte ranges in previous file versions. 501 | 502 | ## BitTorrent 503 | 504 | BitTorrent implements a swarm based file sharing protocol for static datasets. Data is split into fixed sized chunks, hashed, and then that hash is used to discover peers that have the same data. An advantage of using BitTorrent for dataset transfers is that download bandwidth can be fully saturated. Since the file is split into pieces, and peers can efficiently discover which pieces each of the peers they are connected to have, it means one peer can download non-overlapping regions of the dataset from many peers at the same time in parallel, maximizing network throughput. 505 | 506 | Fixed sized chunking has drawbacks for data that changes. BitTorrent assumes all metadata will be transferred up front which makes it impractical for streaming or updating content. Most BitTorrent clients divide data into 1024 pieces meaning large datasets could have a very large chunk size which impacts random access performance (e.g. for streaming video). 507 | 508 | Another drawback of BitTorrent is due to the way clients advertise and discover other peers in the absence of any protocol level privacy or trust. From a user privacy standpoint, BitTorrent leaks what users are accessing or attempting to access, and does not provide the same browsing privacy functions as systems like SSL. 509 | 510 | ## Kademlia Distributed Hash Table 511 | 512 | Kademlia [@maymounkov2002kademlia] is a distributed hash table, a distributed key/value store that can serve a similar purpose to DNS servers but has no hard coded server addresses. All clients in Kademlia are also servers.
As long as you know at least one address of another peer in the network, you can ask them for the key you are trying to find and they will either have it or give you some other people to talk to that are more likely to have it. 513 | 514 | If you don't have an initial peer to talk to, most clients use a bootstrap server that randomly gives you a peer in the network to start with. If the bootstrap server goes down, the network still functions as long as other methods can be used to bootstrap new peers (such as sending them peer addresses through side channels like how .torrent files include tracker addresses to try in case Kademlia finds no peers). 515 | 516 | Kademlia is distinct from previous DHT designs due to its simplicity. It uses a very simple XOR operation between two keys as its "distance" metric to decide which peers are closer to the data being searched for. On paper it seems like it wouldn't work as it doesn't take into account things like ping speed or bandwidth. Instead its design is very simple on purpose to minimize the amount of control/gossip messages and to minimize the amount of complexity required to implement it. In practice Kademlia has been extremely successful and is widely deployed as the "Mainline DHT" for BitTorrent, with support in all popular BitTorrent clients today. 517 | 518 | Due to the simplicity in the original Kademlia design a number of attacks such as DDOS and/or sybil have been demonstrated. There are protocol extensions (BEPs) which in certain cases mitigate the effects of these attacks, such as BEP 44 which includes a DDOS mitigation technique. Nonetheless anyone using Kademlia should be aware of the limitations. 519 | 520 | ## Peer to Peer Streaming Peer Protocol (PPSPP) 521 | 522 | PPSPP ([IETF RFC 7574](https://datatracker.ietf.org/doc/rfc7574/?include_text=1), [@bakker2015peer]) is a protocol for live streaming content over a peer to peer network.
In it they define a specific type of Merkle Tree that allows for subsets of the hashes to be requested by a peer in order to reduce the time-till-playback for end users. BitTorrent for example transfers all hashes up front, which is not suitable for live streaming. 523 | 524 | Their Merkle trees are ordered using a scheme they call "bin numbering", which is a method for deterministically arranging an append-only log of leaf nodes into an in-order layout tree where non-leaf nodes are derived hashes. If you want to verify a specific node, you only need to request its sibling's hash and all its uncle hashes. PPSPP is very concerned with reducing round trip time and time-till-playback by allowing for many kinds of optimizations, such as to pack as many hashes into datagrams as possible when exchanging tree information with peers. 525 | 526 | Although PPSPP was designed with streaming video in mind, the ability to request a subset of metadata from a large and/or streaming dataset is very desirable for many other types of datasets. 527 | 528 | ## WebTorrent 529 | 530 | With WebRTC, browsers can now make peer to peer connections directly to other browsers. BitTorrent uses UDP sockets which aren't available to browser JavaScript, so can't be used as-is on the Web. 531 | 532 | WebTorrent implements the BitTorrent protocol in JavaScript using WebRTC as the transport. This includes the BitTorrent block exchange protocol as well as the tracker protocol implemented in a way that can enable hybrid nodes, talking simultaneously to both BitTorrent and WebTorrent swarms (if a client is capable of making both UDP sockets as well as WebRTC sockets, such as Node.js). Trackers are exposed to web clients over HTTP or WebSockets. 533 | 534 | ## InterPlanetary File System 535 | 536 | IPFS is a family of application and network protocols that have peer to peer file sharing and data permanence baked in. 
IPFS abstracts network protocols and naming systems to provide an alternative application delivery platform to today's Web. For example, instead of using HTTP and DNS directly, in IPFS you would use LibP2P streams and IPNS in order to gain access to the features of the IPFS platform. 537 | 538 | ## Certificate Transparency/Secure Registers 539 | 540 | The UK Government Digital Service have developed the concept of a register which they define as a digital public ledger you can trust. In the UK government registers are beginning to be piloted as a way to expose essential open data sets in a way where consumers can verify the data has not been tampered with, and allows the data publishers to update their data sets over time. 541 | 542 | The design of registers was inspired by the infrastructure backing the Certificate Transparency [@laurie2013certificate] project, initiated at Google, which provides a service on top of SSL certificates that enables service providers to write certificates to a distributed public ledger. Any client or service provider can verify if a certificate they received is in the ledger, which protects against so called "rogue certificates". 543 | 544 | # 5. Reference Implementation 545 | 546 | The connection logic is implemented in a module called [discovery-swarm](https://www.npmjs.com/package/discovery-swarm). This builds on discovery-channel and adds connection establishment, management and statistics. It provides statistics such as how many sources are currently connected, how many good and bad behaving sources have been talked to, and it automatically handles connecting and reconnecting to sources. UTP support is implemented in the module [utp-native](https://www.npmjs.com/package/utp-native). 547 | 548 | Our implementation of source discovery is called [discovery-channel](https://npmjs.org/discovery-channel). 
We also run a [custom DNS server](https://www.npmjs.com/package/dns-discovery) that Dat clients use (in addition to specifying their own if they need to), as well as a [DHT bootstrap](https://github.com/bittorrent/bootstrap-dht) server. These discovery servers are the only centralized infrastructure we need for Dat to work over the Internet, but they are redundant and interchangeable, they never see the actual data being shared, and anyone can run their own. Dat will still work even if they are all unavailable. If this happens discovery will just be manual (e.g. manually sharing IP/ports). 549 | 550 | # Acknowledgements 551 | 552 | This work was made possible through grants from the John S. and James L. Knight and Alfred P. Sloan Foundations. 553 | 554 | # References 555 | -------------------------------------------------------------------------------- /source/dat-paper.latex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,twocolumn]{article} 2 | \usepackage{lmodern} 3 | \usepackage{amssymb,amsmath} 4 | \usepackage{ifxetex,ifluatex} 5 | \usepackage{fixltx2e} % provides \textsubscript 6 | \ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex 7 | \usepackage[T1]{fontenc} 8 | \usepackage[utf8]{inputenc} 9 | \else % if luatex or xelatex 10 | \ifxetex 11 | \usepackage{mathspec} 12 | \else 13 | \usepackage{fontspec} 14 | \fi 15 | \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase} 16 | \fi 17 | % use upquote if available, for straight quotes in verbatim environments 18 | \IfFileExists{upquote.sty}{\usepackage{upquote}}{} 19 | % use microtype if available 20 | \IfFileExists{microtype.sty}{% 21 | \usepackage{microtype} 22 | \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 23 | }{} 24 | \usepackage[unicode=true]{hyperref} 25 | \hypersetup{ 26 | pdftitle={Dat - Distributed Dataset Synchronization And Versioning}, 27 | pdfauthor={Maxwell Ogden, Karissa McKelvey, Mathias Buus Madsen, Code for
Science}, 28 | pdfborder={0 0 0}, 29 | breaklinks=true} 30 | \urlstyle{same} % don't use monospace font for urls 31 | \IfFileExists{parskip.sty}{% 32 | \usepackage{parskip} 33 | }{% else 34 | \setlength{\parindent}{0pt} 35 | \setlength{\parskip}{6pt plus 2pt minus 1pt} 36 | } 37 | \setlength{\emergencystretch}{3em} % prevent overfull lines 38 | \providecommand{\tightlist}{% 39 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 40 | \setcounter{secnumdepth}{0} 41 | % Redefines (sub)paragraphs to behave more like sections 42 | \ifx\paragraph\undefined\else 43 | \let\oldparagraph\paragraph 44 | \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} 45 | \fi 46 | \ifx\subparagraph\undefined\else 47 | \let\oldsubparagraph\subparagraph 48 | \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} 49 | \fi 50 | 51 | \title{Dat - Distributed Dataset Synchronization And Versioning} 52 | \author{Maxwell Ogden, Karissa McKelvey, Mathias Buus Madsen, Code for Science} 53 | \date{May 2017 (last updated: Jan 2018)} 54 | 55 | \begin{document} 56 | \maketitle 57 | 58 | \section{Abstract}\label{abstract} 59 | 60 | Dat is a protocol designed for syncing folders of data, even if they are 61 | large or changing constantly. Dat uses a cryptographically secure 62 | register of changes to prove that the requested data version is 63 | distributed. A byte range of any file's version can be efficiently 64 | streamed from a Dat repository over a network connection. Consumers can 65 | choose to fully or partially replicate the contents of a remote Dat 66 | repository, and can also subscribe to live changes. To ensure writer and 67 | reader privacy, Dat uses public key cryptography to encrypt network 68 | traffic. A group of Dat clients can connect to each other to form a 69 | public or private decentralized network to exchange data between each 70 | other. A reference implementation is provided in JavaScript. 71 | 72 | \section{1. 
Background}\label{background} 73 | 74 | Many datasets are shared online today using HTTP and FTP, which lack 75 | built in support for version control or content addressing of data. This 76 | results in link rot and content drift as files are moved, updated or 77 | deleted, leading to an alarming rate of disappearing data references in 78 | areas such as 79 | \href{http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115253}{published 80 | scientific literature}. 81 | 82 | Cloud storage services like S3 ensure availability of data, but they 83 | have a centralized hub-and-spoke networking model and are therefore 84 | limited by their bandwidth, meaning popular files can become very 85 | expensive to share. Services like Dropbox and Google Drive provide 86 | version control and synchronization on top of cloud storage services 87 | which fixes many issues with broken links but rely on proprietary code 88 | and services requiring users to store their data on centralized cloud 89 | infrastructure which has implications on cost, transfer speeds, vendor 90 | lock-in and user privacy. 91 | 92 | Distributed file sharing tools can become faster as files become more 93 | popular, removing the bandwidth bottleneck and making file distribution 94 | cheaper. They also use link resolution and discovery systems which can 95 | prevent broken links meaning if the original source goes offline other 96 | backup sources can be automatically discovered. However these file 97 | sharing tools today are not supported by Web browsers, do not have good 98 | privacy guarantees, and do not provide a mechanism for updating files 99 | without redistributing a new dataset which could mean entirely 100 | re-downloading data you already have. 101 | 102 | \section{2. Dat}\label{dat} 103 | 104 | Dat is a dataset synchronization protocol that does not assume a dataset 105 | is static or that the entire dataset will be downloaded. 
The main 106 | reference implementation is available from npm as 107 | \texttt{npm\ install\ dat\ -g}. 108 | 109 | The protocol is agnostic to the underlying transport e.g.~you could 110 | implement Dat over carrier pigeon. Data is stored in a format called 111 | SLEEP (Ogden and Buus 2017), described in its own paper. The key 112 | properties of the Dat design are explained in this section. 113 | 114 | \begin{itemize} 115 | \tightlist 116 | \item 117 | 2.1 \textbf{Content Integrity} - Data and publisher integrity is 118 | verified through use of signed hashes of the content. 119 | \item 120 | 2.2 \textbf{Decentralized Mirroring} - Users sharing the same Dat 121 | automatically discover each other and exchange data in a swarm. 122 | \item 123 | 2.3 \textbf{Network Privacy} - Dat provides certain privacy guarantees 124 | including end-to-end encryption. 125 | \item 126 | 2.4 \textbf{Incremental Versioning} - Datasets can be efficiently 127 | synced, even in real time, to other peers. 128 | \item 129 | 2.5 \textbf{Random Access} - Huge file hierarchies can be efficiently 130 | traversed remotely. 131 | \end{itemize} 132 | 133 | \subsection{2.1 Content Integrity}\label{content-integrity} 134 | 135 | Content integrity means being able to verify the data you received is 136 | the exact same version of the data that you expected. This is important 137 | in a distributed system as this mechanism will catch incorrect data sent 138 | by bad peers. It also has implications for reproducibility as it lets 139 | you refer to a specific version of a dataset. 140 | 141 | Link rot, when links online stop resolving, and content drift, when data 142 | changes but the link to the data remains the same, are two common issues 143 | in data analysis. 
For example, one day a file called data.zip might 144 | change, but a typical HTTP link to the file does not include a hash of 145 | the content, or provide a way to get updated metadata, so clients that 146 | only have the HTTP link have no way to check if the file changed without 147 | downloading the entire file again. Referring to a file by the hash of 148 | its content is called content addressability, and lets users not only 149 | verify that the data they receive is the version of the data they want, 150 | but also lets people cite specific versions of the data by referring to 151 | a specific hash. 152 | 153 | Dat uses BLAKE2b (Aumasson et al. 2013) cryptographically secure hashes 154 | to address content. Hashes are arranged in a Merkle tree (Mykletun, 155 | Narasimha, and Tsudik 2003), a tree where each non-leaf node is the hash 156 | of all child nodes. Leaf nodes contain pieces of the dataset. Due to the 157 | properties of secure cryptographic hashes the top hash can only be 158 | produced if all data below it matches exactly. If two trees have 159 | matching top hashes then you know that all other nodes in the tree must 160 | match as well, and you can conclude that your dataset is synchronized. 161 | Trees are chosen as the primary data structure in Dat as they have a 162 | number of properties that allow for efficient access to subsets of the 163 | metadata, which allows Dat to work efficiently over a network 164 | connection. 165 | 166 | \subsubsection{Dat Links}\label{dat-links} 167 | 168 | Dat links are Ed25519 (Bernstein et al. 2012) public keys which have a 169 | length of 32 bytes (64 characters when Hex encoded). 
You can represent 170 | your Dat link in the following ways and Dat clients will be able to 171 | understand them: 172 | 173 | \begin{itemize} 174 | \tightlist 175 | \item 176 | The standalone public key: 177 | \end{itemize} 178 | 179 | \texttt{8e1c7189b1b2dbb5c4ec2693787884771201da9...} 180 | 181 | \begin{itemize} 182 | \tightlist 183 | \item 184 | Using the dat:// protocol: 185 | \end{itemize} 186 | 187 | \texttt{dat://8e1c7189b1b2dbb5c4ec2693787884771...} 188 | 189 | \begin{itemize} 190 | \tightlist 191 | \item 192 | As part of an HTTP URL: 193 | \end{itemize} 194 | 195 | \texttt{https://datproject.org/8e1c7189b1b2dbb5...} 196 | 197 | All messages in the Dat protocol are encrypted and signed using the 198 | public key during transport. This means that unless you know the public 199 | key (e.g.~unless the Dat link was shared with you) then you will not be 200 | able to discover or communicate with any member of the swarm for that 201 | Dat. Anyone with the public key can verify that messages (such as 202 | entries in a Dat Stream) were created by a holder of the private key. 203 | 204 | Every Dat repository has a corresponding private key which is kept in 205 | your home folder and never shared. Dat never exposes either the public 206 | or private key over the network. During the discovery phase the BLAKE2b 207 | hash of the public key is used as the discovery key. This means that the 208 | original key is impossible to discover (unless it was shared publicly 209 | through a separate channel) since only the hash of the key is exposed 210 | publicly. 211 | 212 | Dat does not provide an authentication mechanism at this time. Instead 213 | it provides a capability system. Anyone with the Dat link is currently 214 | considered able to discover and access data. Do not share your Dat links 215 | publicly if you do not want them to be accessed. 
216 | 217 | \subsubsection{Hypercore and Hyperdrive}\label{hypercore-and-hyperdrive} 218 | 219 | The Dat storage, content integrity, and networking protocols are 220 | implemented in a module called 221 | \href{https://npmjs.org/hypercore}{Hypercore}. Hypercore is agnostic to 222 | the format of the input data, it operates on any stream of binary data. 223 | For the Dat use case of synchronizing datasets we use a file system 224 | module on top of Hypercore called 225 | \href{https://npmjs.org/hyperdrive}{Hyperdrive}. 226 | 227 | Dat has a layered abstraction so that users can use Hypercore directly 228 | to have full control over how they model their data. Hyperdrive works 229 | well when your data can be represented as files on a filesystem, which 230 | is the main use case with Dat. 231 | 232 | \subsubsection{Hypercore Registers}\label{hypercore-registers} 233 | 234 | Hypercore Registers are the core mechanism used in Dat. They are binary 235 | append-only streams whose contents are cryptographically hashed and 236 | signed and therefore can be verified by anyone with access to the public 237 | key of the writer. They are an implementation of the concept known as a 238 | register, a digital ledger you can trust. 239 | 240 | Dat uses two registers, \texttt{content} and \texttt{metadata}. The 241 | \texttt{content} register contains the files in your repository and 242 | \texttt{metadata} contains the metadata about the files including name, 243 | size, last modified time, etc. Dat replicates them both when 244 | synchronizing with another peer. 245 | 246 | When files are added to Dat, each file gets split up into some number of 247 | chunks, and the chunks are then arranged into a Merkle tree, which is 248 | used later for version control and replication processes. 249 | 250 | \subsection{2.2 Decentralized Mirroring}\label{decentralized-mirroring} 251 | 252 | Dat is a peer to peer protocol designed to exchange pieces of a dataset 253 | amongst a swarm of peers. 
As soon as a peer acquires their first piece 254 | of data in the dataset they can choose to become a partial mirror for 255 | the dataset. If someone else contacts them and needs the piece they 256 | have, they can choose to share it. This can happen simultaneously while 257 | the peer is still downloading the pieces they want from others. 258 | 259 | \subsubsection{Source Discovery}\label{source-discovery} 260 | 261 | An important aspect of mirroring is source discovery, the techniques 262 | that peers use to find each other. Source discovery means finding the IP 263 | and port of data sources online that have a copy of that data you are 264 | looking for. You can then connect to them and begin exchanging data. By 265 | using source discovery techniques Dat is able to create a network where 266 | data can be discovered even if the original data source disappears. 267 | 268 | Source discovery can happen over many kinds of networks, as long as you 269 | can model the following actions: 270 | 271 | \begin{itemize} 272 | \tightlist 273 | \item 274 | \texttt{join(key,\ {[}port{]})} - Begin performing regular lookups on 275 | an interval for \texttt{key}. Specify \texttt{port} if you want to 276 | announce that you share \texttt{key} as well. 277 | \item 278 | \texttt{leave(key,\ {[}port{]})} - Stop looking for \texttt{key}. 279 | Specify \texttt{port} to stop announcing that you share \texttt{key} 280 | as well. 281 | \item 282 | \texttt{foundpeer(key,\ ip,\ port)} - Called when a peer is found by a 283 | lookup. 
284 | \end{itemize} 285 | 286 | In the Dat implementation we implement the above actions on top of three 287 | types of discovery networks: 288 | 289 | \begin{itemize} 290 | \tightlist 291 | \item 292 | DNS name servers - An Internet standard mechanism for resolving keys 293 | to addresses 294 | \item 295 | Multicast DNS - Useful for discovering peers on local networks 296 | \item 297 | Kademlia Mainline Distributed Hash Table - Less central points of 298 | failure, increases probability of Dat working even if DNS servers are 299 | unreachable 300 | \end{itemize} 301 | 302 | Additional discovery networks can be implemented as needed. We chose the 303 | above three as a starting point to have a complementary mix of 304 | strategies to increase the probability of source discovery. Additionally 305 | you can specify a Dat via HTTPS link, which runs the Dat protocol in 306 | ``single-source'' mode, meaning the above discovery networks are not 307 | used, and instead only that one HTTPS server is used as the only peer. 308 | 309 | \subsubsection{Peer Connections}\label{peer-connections} 310 | 311 | After the discovery phase, Dat should have a list of potential data 312 | sources to try and contact. Dat uses either TCP, HTTP or 313 | \href{https://en.wikipedia.org/wiki/Micro_Transport_Protocol}{UTP} 314 | (Rossi et al. 2010). UTP uses LEDBAT which is designed to not take up 315 | all available bandwidth on a network (e.g.~so that other people sharing 316 | WiFi can still use the Internet), and is still based on UDP so works 317 | with NAT traversal techniques like UDP hole punching. HTTP is supported 318 | for compatibility with static file servers and web browser clients. Note 319 | that these are the protocols we support in the reference Dat 320 | implementation, but the Dat protocol itself is transport agnostic. 321 | 322 | If an HTTP source is specified Dat will prefer that one over other 323 | sources. 
Otherwise when Dat gets the IP and port for a potential TCP or 324 | UTP source it tries to connect using both protocols. If one connects 325 | first, Dat aborts the other one. If none connect, Dat will try again 326 | until it decides that source is offline or unavailable and then stops 327 | trying to connect to them. Sources Dat is able to connect to go into a 328 | list of known good sources, so that if/when the Internet connection goes 329 | down Dat can use that list to reconnect to known good sources again 330 | quickly. 331 | 332 | If Dat gets a lot of potential sources it picks a handful at random to 333 | try and connect to and keeps the rest around as additional sources to 334 | use later in case it decides it needs more sources. 335 | 336 | Once a duplex binary connection to a remote source is open Dat then 337 | layers on the Hypercore protocol, a message-based replication protocol 338 | that allows two peers to communicate over a stateless channel to request 339 | and exchange data. You open separate replication channels with many 340 | peers at once which allows clients to parallelize data requests across 341 | the entire pool of peers they have established connections with. 342 | 343 | \subsection{2.3 Network Privacy}\label{network-privacy} 344 | 345 | On the Web today, with SSL, there is a guarantee that the traffic 346 | between your computer and the server is private. As long as you trust 347 | the server to not leak your logs, attackers who intercept your network 348 | traffic will not be able to read the HTTP traffic exchanged between you 349 | and the server. This is a fairly straightforward model as clients only 350 | have to trust a single server for some domain. 351 | 352 | There is an inherent tradeoff in peer to peer systems of source 353 | discovery vs.~user privacy. The more sources you contact and ask for 354 | some data, the more sources you trust to keep what you asked for 355 | private. 
Our goal is to have Dat be configurable in respect to this 356 | tradeoff to allow application developers to meet their own privacy 357 | guidelines. 358 | 359 | It is up to client programs to make design decisions around which 360 | discovery networks they trust. For example if a Dat client decides to 361 | use the BitTorrent DHT to discover peers, and they are searching for a 362 | publicly shared Dat key (e.g.~a key cited publicly in a published 363 | scientific paper) with known contents, then because of the privacy 364 | design of the BitTorrent DHT it becomes public knowledge what key that 365 | client is searching for. 366 | 367 | A client could choose to only use discovery networks with certain 368 | privacy guarantees. For example a client could only connect to an 369 | approved list of sources that they trust, similar to SSL. As long as 370 | they trust each source, the encryption built into the Dat network 371 | protocol will prevent the Dat key they are looking for from being 372 | leaked. 373 | 374 | \subsection{2.4 Incremental Versioning}\label{incremental-versioning} 375 | 376 | Given a stream of binary data, Dat splits the stream into chunks, hashes 377 | each chunk, and arranges the hashes in a specific type of Merkle tree 378 | that allows for certain replication properties. 379 | 380 | Dat is also able to fully or partially synchronize streams in a 381 | distributed setting even if the stream is being appended to. This is 382 | accomplished by using the messaging protocol to traverse the Merkle tree 383 | of remote sources and fetch a strategic set of nodes. Due to the 384 | low-level, message-oriented design of the replication protocol, 385 | different node traversal strategies can be implemented. 386 | 387 | There are two types of versioning performed automatically by Dat. 388 | Metadata is stored in a folder called \texttt{.dat} in the root folder 389 | of a repository, and data is stored as normal files in the root folder. 
390 | 391 | \subsubsection{Metadata Versioning}\label{metadata-versioning} 392 | 393 | Dat tries as much as possible to act as a one-to-one mirror of the state 394 | of a folder and all its contents. When importing files, Dat uses a 395 | sorted, depth-first recursion to list all the files in the tree. For 396 | each file it finds, it grabs the filesystem metadata (filename, Stat 397 | object, etc) and checks if there is already an entry for this filename 398 | with this exact metadata already represented in the Dat repository 399 | metadata. If the file with this metadata matches exactly the newest 400 | version of the file metadata stored in Dat, then this file will be 401 | skipped (no change). 402 | 403 | If the metadata differs from the current existing one (or there are no 404 | entries for this filename at all in the history), then this new metadata 405 | entry will be appended as the new `latest' version for this file in the 406 | append-only SLEEP metadata content register (described below). 407 | 408 | \subsubsection{Content Versioning}\label{content-versioning} 409 | 410 | In addition to storing a historical record of filesystem metadata, the 411 | content of the files themselves are also capable of being stored in a 412 | version controlled manner. The default storage system used in Dat stores 413 | the files as files. This has the advantage of being very straightforward 414 | for users to understand, but the downside of not storing old versions of 415 | content by default. 416 | 417 | In contrast to other version control systems like Git, Dat by default 418 | only stores the current set of checked out files on disk in the 419 | repository folder, not old versions. It does store all previous metadata 420 | for old versions in \texttt{.dat}. Git for example stores all previous 421 | content versions and all previous metadata versions in the \texttt{.git} 422 | folder. 
Because Dat is designed for larger datasets, if it stored all 423 | previous file versions in \texttt{.dat}, then the \texttt{.dat} folder 424 | could easily fill up the user's hard drive inadvertently. Therefore Dat 425 | has multiple storage modes based on usage. 426 | 427 | Hypercore registers include an optional \texttt{data} file that stores 428 | all chunks of data. In Dat, only the \texttt{metadata.data} file is 429 | used, but the \texttt{content.data} file is not used. The default 430 | behavior is to store the current files only as normal files. If you want 431 | to run an `archival' node that keeps all previous versions, you can 432 | configure Dat to use the \texttt{content.data} file instead. For 433 | example, on a shared server with lots of storage you probably want to 434 | store all versions. However on a workstation machine that is only 435 | accessing a subset of one version, the default mode of storing all 436 | metadata plus the current set of downloaded files is acceptable, because 437 | you know the server has the full history. 438 | 439 | \subsubsection{Merkle Trees}\label{merkle-trees} 440 | 441 | Registers in Dat use a specific method of encoding a Merkle tree where 442 | hashes are positioned by a scheme called binary in-order interval 443 | numbering or just ``bin'' numbering. This is just a specific, 444 | deterministic way of laying out the nodes in a tree. For example a tree 445 | with 7 nodes will always be arranged like this: 446 | 447 | \begin{verbatim} 448 | 0 449 | 1 450 | 2 451 | 3 452 | 4 453 | 5 454 | 6 455 | \end{verbatim} 456 | 457 | In Dat, the hashes of the chunks of files are always even numbers, at 458 | the wide end of the tree.
So the above tree had four original values 459 | that become the even numbers: 460 | 461 | \begin{verbatim} 462 | chunk0 -> 0 463 | chunk1 -> 2 464 | chunk2 -> 4 465 | chunk3 -> 6 466 | \end{verbatim} 467 | 468 | In the resulting Merkle tree, the even and odd nodes store different 469 | information: 470 | 471 | \begin{itemize} 472 | \tightlist 473 | \item 474 | Evens - List of data hashes {[}chunk0, chunk1, chunk2, \ldots{}{]} 475 | \item 476 | Odds - List of Merkle hashes (hashes of child even nodes) {[}hash0, 477 | hash1, hash2, \ldots{}{]} 478 | \end{itemize} 479 | 480 | These two lists get interleaved into a single register such that the 481 | indexes (position) in the register are the same as the bin numbers from 482 | the Merkle tree. 483 | 484 | All odd hashes are derived by hashing the two child nodes, e.g.~given 485 | hash0 is \texttt{hash(chunk0)} and hash2 is \texttt{hash(chunk1)}, hash1 486 | is \texttt{hash(hash0\ +\ hash2)}. 487 | 488 | For example a register with two data entries would look something like 489 | this (pseudocode): 490 | 491 | \begin{verbatim} 492 | 0. hash(chunk0) 493 | 1. hash(hash(chunk0) + hash(chunk1)) 494 | 2. hash(chunk1) 495 | \end{verbatim} 496 | 497 | It is possible for the in-order Merkle tree to have multiple roots at 498 | once. A root is defined as a parent node with a full set of child node 499 | slots filled below it. 500 | 501 | For example, this tree has 2 roots (1 and 4) 502 | 503 | \begin{verbatim} 504 | 0 505 | 1 506 | 2 507 | 508 | 4 509 | \end{verbatim} 510 | 511 | This tree has one root (3): 512 | 513 | \begin{verbatim} 514 | 0 515 | 1 516 | 2 517 | 3 518 | 4 519 | 5 520 | 6 521 | \end{verbatim} 522 | 523 | This one has one root (1): 524 | 525 | \begin{verbatim} 526 | 0 527 | 1 528 | 2 529 | \end{verbatim} 530 | 531 | \subsubsection{Replication Example}\label{replication-example} 532 | 533 | This section describes in high level the replication flow of a Dat. 
Note 534 | that the low level details are available by reading the SLEEP section 535 | below. For the sake of illustrating how this works in practice in a 536 | networked replication scenario, consider a folder with two files: 537 | 538 | \begin{verbatim} 539 | bat.jpg 540 | cat.jpg 541 | \end{verbatim} 542 | 543 | To send these files to another machine using Dat, you would first add 544 | them to a Dat repository by splitting them into chunks and constructing 545 | SLEEP files representing the chunks and filesystem metadata. 546 | 547 | Let's assume \texttt{bat.jpg} and \texttt{cat.jpg} both produce three 548 | chunks, each around 64KB. Dat stores data in a representation called SLEEP, 549 | but here we will show a pseudo-representation for the purposes of 550 | illustrating the replication process. The six chunks get sorted into a 551 | list like this: 552 | 553 | \begin{verbatim} 554 | bat-1 555 | bat-2 556 | bat-3 557 | cat-1 558 | cat-2 559 | cat-3 560 | \end{verbatim} 561 | 562 | These chunks then each get hashed, and the hashes get arranged into a 563 | Merkle tree (the content register): 564 | 565 | \begin{verbatim} 566 | 0 - hash(bat-1) 567 | 1 - hash(0 + 2) 568 | 2 - hash(bat-2) 569 | 3 - hash(1 + 5) 570 | 4 - hash(bat-3) 571 | 5 - hash(4 + 6) 572 | 6 - hash(cat-1) 573 | 8 - hash(cat-2) 574 | 9 - hash(8 + 10) 575 | 10 - hash(cat-3) 576 | \end{verbatim} 577 | 578 | Next we calculate the root hashes of our tree, in this case 3 and 9. We 579 | then hash them together, and cryptographically sign the hash. This 580 | signed hash now can be used to verify all nodes in the tree, and the 581 | signature proves it was produced by us, the holder of the private key 582 | for this Dat. 583 | 584 | This tree is for the hashes of the contents of the photos.
There is also 585 | a second Merkle tree that Dat generates that represents the list of 586 | files and their metadata and looks something like this (the metadata 587 | register): 588 | 589 | \begin{verbatim} 590 | 0 - hash({contentRegister: '9e29d624...'}) 591 | 1 - hash(0 + 2) 592 | 2 - hash({"bat.jpg", first: 0, length: 3}) 593 | 4 - hash({"cat.jpg", first: 3, length: 3}) 594 | \end{verbatim} 595 | 596 | The first entry in this feed is a special metadata entry that tells Dat 597 | the address of the second feed (the content register). Note that node 3 598 | is not included yet, because 3 is the hash of \texttt{1\ +\ 5}, but 5 599 | does not exist yet, so will be written at a later update. 600 | 601 | Now we're ready to send our metadata to the other peer. The first 602 | message is a \texttt{Register} message with the key that was shared for 603 | this Dat. Let's call ourselves Alice and the other peer Bob. Alice sends 604 | Bob a \texttt{Want} message that declares they want all nodes in the 605 | file list (the metadata register). Bob replies with a single 606 | \texttt{Have} message that indicates he has 2 nodes of data. Alice sends 607 | three \texttt{Request} messages, one for each leaf node 608 | (\texttt{0,\ 2,\ 4}). Bob sends back three \texttt{Data} messages. The 609 | first \texttt{Data} message contains the content register key, the hash 610 | of the sibling, in this case node \texttt{2}, the hash of the uncle root 611 | \texttt{4}, as well as a signature for the root hashes (in this case 612 | \texttt{1,\ 4}). Alice verifies the integrity of this first 613 | \texttt{Data} message by hashing the metadata received for the content 614 | register metadata to produce the hash for node \texttt{0}. 
They then 615 | hash the hash \texttt{0} with the hash \texttt{2} that was included to 616 | reproduce hash \texttt{1}, and hash their \texttt{1} with the value 617 | for \texttt{4} they received. They can then use the received signature 618 | to verify it was the same data. When the next \texttt{Data} message is 619 | received, a similar process is performed to verify the content. 620 | 621 | Now Alice has the full list of files in the Dat, but decides they only 622 | want to download \texttt{cat.jpg}. Alice knows they want blocks 3 623 | through 5 from the content register. First Alice sends another 624 | \texttt{Register} message with the content key to open a new replication 625 | channel over the connection. Then Alice sends three \texttt{Request} 626 | messages, one for each of blocks \texttt{3,\ 4,\ 5}. Bob sends back 627 | three \texttt{Data} messages with the data for each block, as well as 628 | the hashes needed to verify the content in a way similar to the process 629 | described above for the metadata feed. 630 | 631 | \subsection{2.5 Random Access}\label{random-access} 632 | 633 | Dat pursues the following access capabilities: 634 | 635 | \begin{itemize} 636 | \tightlist 637 | \item 638 | Support large file hierarchies (millions of files in a single 639 | repository). 640 | \item 641 | Support efficient traversal of the hierarchy (listing files in 642 | arbitrary folders efficiently). 643 | \item 644 | Store all changes to all files (metadata and/or content). 645 | \item 646 | List all changes made to any single file. 647 | \item 648 | View the state of all files relative to any point in time. 649 | \item 650 | Subscribe live to all changes (any file). 651 | \item 652 | Subscribe live to changes to files under a specific path. 653 | \item 654 | Efficiently access any byte range of any version of any file. 655 | \item 656 | Allow all of the above to happen remotely, only syncing the minimum 657 | metadata necessary to perform any action.
658 | \item 659 | Allow efficient comparison of remote and local repository state to 660 | request missing pieces during synchronization. 661 | \item 662 | Allow entire remote archive to be synchronized, or just some subset of 663 | files and/or versions. 664 | \end{itemize} 665 | 666 | The way Dat accomplishes these is through a combination of storing all 667 | changes in Hypercore feeds, but also using strategic metadata indexing 668 | strategies that support certain queries efficiently to be performed by 669 | traversing the Hypercore feeds. The protocol itself is specified in 670 | Section 3 (SLEEP), but a scenario based summary follows here. 671 | 672 | \subsubsection{Scenario: Reading a file from a specific byte 673 | offset}\label{scenario-reading-a-file-from-a-specific-byte-offset} 674 | 675 | Alice has a dataset in Dat, Bob wants to access a 100MB CSV called 676 | \texttt{cat\_dna.csv} stored in the remote repository, but only wants to 677 | access the 10MB range of the CSV spanning from 30MB - 40MB. 678 | 679 | Bob has never communicated with Alice before, and is starting fresh with 680 | no knowledge of this Dat repository other than that he knows he wants 681 | \texttt{cat\_dna.csv} at a specific offset. 682 | 683 | First, Bob asks Alice through the Dat protocol for the metadata he needs 684 | to resolve \texttt{cat\_dna.csv} to the correct metadata feed entry that 685 | represents the file he wants. Note: In this scenario we assume Bob wants 686 | the latest version of \texttt{cat\_dna.csv}. It is also possible to do 687 | this for a specific older version. 688 | 689 | Bob first sends a \texttt{Request} message for the latest entry in the 690 | metadata feed. Alice responds. Bob looks at the \texttt{trie} value, and 691 | using the lookup algorithm described below sends another 692 | \texttt{Request} message for the metadata node that is closer to the 693 | filename he is looking for. 
This repeats until Alice sends Bob the 694 | matching metadata entry. This is the un-optimized resolution that uses 695 | \texttt{log(n)} round trips, though there are ways to optimize this by 696 | having Alice send additional sequence numbers to Bob that help him 697 | traverse in fewer round trips. 698 | 699 | In the metadata record Bob received for \texttt{cat\_dna.csv} there is 700 | the byte offset to the beginning of the file in the data feed. Bob adds 701 | his +30MB offset to this value and starts requesting pieces of data 702 | starting at that byte offset using the SLEEP protocol as described 703 | below. 704 | 705 | This method tries to allow any byte range of any file to be accessed 706 | without the need to synchronize the full metadata for all files up 707 | front. 708 | 709 | \subsection{3. Dat Network Protocol}\label{dat-network-protocol} 710 | 711 | The SLEEP format is designed to allow for sparse replication, meaning 712 | you can efficiently download only the metadata and data required to 713 | resolve a single byte region of a single file, which makes Dat suitable 714 | for a wide variety of streaming, real time and large dataset use cases. 715 | 716 | To take advantage of this, Dat includes a network protocol. It is 717 | message-based and stateless, making it possible to implement on a 718 | variety of network transport protocols including UDP and TCP. Both 719 | metadata and content registers in SLEEP share the exact same replication 720 | protocol.
721 | 722 | Individual messages are encoded using Protocol Buffers and there are ten 723 | message types using the following schema: 724 | 725 | \subsubsection{Wire Protocol}\label{wire-protocol} 726 | 727 | Over the wire messages are packed in the following lightweight container 728 | format: 729 | 730 | \begin{verbatim} 731 | <varint - length of rest of message> 732 | <varint - header> 733 | <message> 734 | \end{verbatim} 735 | 736 | The \texttt{header} value is a single varint that has two pieces of 737 | information: the integer \texttt{type} that declares a 4-bit message 738 | type (used below), and a channel identifier, \texttt{0} for metadata and 739 | \texttt{1} for content. 740 | 741 | To generate this varint, you bitshift the 4-bit type integer onto the 742 | end of the channel identifier, e.g. 743 | \texttt{channel\ \textless{}\textless{}\ 4\ \textbar{}\ \textless{}4-bit-type\textgreater{}}. 744 | 745 | \subsubsection{Feed}\label{feed} 746 | 747 | Type 0. Should be the first message sent on a channel. 748 | 749 | \begin{itemize} 750 | \tightlist 751 | \item 752 | \texttt{discoveryKey} - A BLAKE2b keyed hash of the string `hypercore' 753 | using the public key of the metadata register as the key. 754 | \item 755 | \texttt{nonce} - 24 bytes (192 bits) of random binary data, used in 756 | our encryption scheme 757 | \end{itemize} 758 | 759 | \begin{verbatim} 760 | message Feed { 761 | required bytes discoveryKey = 1; 762 | optional bytes nonce = 2; 763 | } 764 | \end{verbatim} 765 | 766 | \subsubsection{Handshake}\label{handshake} 767 | 768 | Type 1. Overall connection handshake. Should be sent just after the feed 769 | message on the first channel only (metadata).
770 | 771 | \begin{itemize} 772 | \tightlist 773 | \item 774 | \texttt{id} - 32 byte random data used as an identifier for this peer 775 | on the network, useful for checking if you are connected to yourself 776 | or another peer more than once 777 | \item 778 | \texttt{live} - Whether or not you want to operate in live 779 | (continuous) replication mode or end after the initial sync 780 | \item 781 | \texttt{userData} - User-specific metadata encoded as a byte sequence 782 | \item 783 | \texttt{extensions} - List of extensions that are supported on this 784 | Feed 785 | \end{itemize} 786 | 787 | \begin{verbatim} 788 | message Handshake { 789 | optional bytes id = 1; 790 | optional bool live = 2; 791 | optional bytes userData = 3; 792 | repeated string extensions = 4; 793 | } 794 | \end{verbatim} 795 | 796 | \subsubsection{Info}\label{info} 797 | 798 | Type 2. Message indicating state changes. Used to indicate whether you 799 | are uploading and/or downloading. 800 | 801 | Initial state for uploading/downloading is true. If both ends are not 802 | downloading and not live it is safe to consider the stream ended. 803 | 804 | \begin{verbatim} 805 | message Info { 806 | optional bool uploading = 1; 807 | optional bool downloading = 2; 808 | } 809 | \end{verbatim} 810 | 811 | \subsubsection{Have}\label{have} 812 | 813 | Type 3. How you tell the other peer what chunks of data you have or 814 | don't have. You should only send Have messages to peers who have 815 | expressed interest in this region with Want messages. 816 | 817 | \begin{itemize} 818 | \tightlist 819 | \item 820 | \texttt{start} - If you only specify \texttt{start}, it means you are 821 | telling the other side you only have 1 chunk at the position at the 822 | value in \texttt{start}. 823 | \item 824 | \texttt{length} - If you specify length, you can describe a range of 825 | values that you have all of, starting from \texttt{start}.
826 | \item 827 | \texttt{bitfield} - If you would like to send a range of sparse data 828 | about haves/don't haves via bitfield, relative to \texttt{start}. 829 | \end{itemize} 830 | 831 | \begin{verbatim} 832 | message Have { 833 | required uint64 start = 1; 834 | optional uint64 length = 2 [default = 1]; 835 | optional bytes bitfield = 3; 836 | } 837 | \end{verbatim} 838 | 839 | When sending bitfields you must run length encode them. The encoded 840 | bitfield is a series of compressed and uncompressed bit sequences. All 841 | sequences start with a header that is a varint. 842 | 843 | If the last bit is set in the varint (it is an odd number) then a header 844 | represents a compressed bit sequence. 845 | 846 | \begin{verbatim} 847 | compressed-sequence = varint( 848 | byte-length-of-sequence 849 | << 2 | bit << 1 | 1 850 | ) 851 | \end{verbatim} 852 | 853 | If the last bit is \emph{not} set then a header represents a 854 | non-compressed sequence. 855 | 856 | \begin{verbatim} 857 | uncompressed-sequence = varint( 858 | byte-length-of-bitfield << 1 | 0 859 | ) + (bitfield) 860 | \end{verbatim} 861 | 862 | \subsubsection{Unhave}\label{unhave} 863 | 864 | Type 4. How you communicate that you deleted or removed a chunk you used 865 | to have. 866 | 867 | \begin{verbatim} 868 | message Unhave { 869 | required uint64 start = 1; 870 | optional uint64 length = 2 [default = 1]; 871 | } 872 | \end{verbatim} 873 | 874 | \subsubsection{Want}\label{want} 875 | 876 | Type 5. How you ask the other peer to subscribe you to Have messages for 877 | a region of chunks. The \texttt{length} value defaults to Infinity or 878 | feed.length (if not live). 879 | 880 | \begin{verbatim} 881 | message Want { 882 | required uint64 start = 1; 883 | optional uint64 length = 2; 884 | } 885 | \end{verbatim} 886 | 887 | \subsubsection{Unwant}\label{unwant} 888 | 889 | Type 6. How you ask to unsubscribe from Have messages for a region of 890 | chunks from the other peer. 
You should only Unwant previously Wanted 891 | regions, but if you do Unwant something that hasn't been Wanted it won't 892 | have any effect. The \texttt{length} value defaults to Infinity or 893 | feed.length (if not live). 894 | 895 | \begin{verbatim} 896 | message Unwant { 897 | required uint64 start = 1; 898 | optional uint64 length = 2; 899 | } 900 | \end{verbatim} 901 | 902 | \subsubsection{Request}\label{request} 903 | 904 | Type 7. Request a single chunk of data. 905 | 906 | \begin{itemize} 907 | \tightlist 908 | \item 909 | \texttt{index} - The chunk index for the chunk you want. You should 910 | only ask for indexes that you have received the Have messages for. 911 | \item 912 | \texttt{bytes} - You can also optimistically specify a byte offset, 913 | and in the case the remote is able to resolve the chunk for this byte 914 | offset depending on their Merkle tree state, they will ignore the 915 | \texttt{index} and send the chunk that resolves for this byte offset 916 | instead. But if they cannot resolve the byte request, \texttt{index} 917 | will be used. 918 | \item 919 | \texttt{hash} - If you only want the hash of the chunk and not the 920 | chunk data itself. 921 | \item 922 | \texttt{nodes} - A 64 bit long bitfield representing which parent 923 | nodes you have. 924 | \end{itemize} 925 | 926 | The \texttt{nodes} bitfield is an optional optimization to reduce the 927 | amount of duplicate nodes exchanged during the replication lifecycle. It 928 | indicates which parents you have or don't have. You have a maximum of 64 929 | parents you can specify. Because \texttt{uint64} in Protocol Buffers is 930 | implemented as a varint, over the wire this does not take up 64 bits in 931 | most cases. The first bit is reserved to signify whether or not you need 932 | a signature in response. The rest of the bits represent whether or not 933 | you have (\texttt{1}) or don't have (\texttt{0}) the information at this 934 | node already. 
The ordering is determined by walking parent, sibling up 935 | the tree all the way to the root. 936 | 937 | \begin{verbatim} 938 | message Request { 939 | required uint64 index = 1; 940 | optional uint64 bytes = 2; 941 | optional bool hash = 3; 942 | optional uint64 nodes = 4; 943 | } 944 | \end{verbatim} 945 | 946 | \subsubsection{Cancel}\label{cancel} 947 | 948 | Type 8. Cancel a previous Request message that you haven't received yet. 949 | 950 | \begin{verbatim} 951 | message Cancel { 952 | required uint64 index = 1; 953 | optional uint64 bytes = 2; 954 | optional bool hash = 3; 955 | } 956 | \end{verbatim} 957 | 958 | \subsubsection{Data}\label{data} 959 | 960 | Type 9. Sends a single chunk of data to the other peer. You can send it 961 | in response to a Request or unsolicited on its own as a friendly gift. 962 | The data includes all of the Merkle tree parent nodes needed to verify 963 | the hash chain all the way up to the Merkle roots for this chunk. 964 | Because you can produce the direct parents by hashing the chunk, only 965 | the roots and `uncle' hashes are included (the siblings to all of the 966 | parent nodes). 967 | 968 | \begin{itemize} 969 | \tightlist 970 | \item 971 | \texttt{index} - The chunk position for this chunk. 972 | \item 973 | \texttt{value} - The chunk binary data. Empty if you are sending only 974 | the hash. 975 | \item 976 | \texttt{Node.index} - The index for this chunk in in-order notation 977 | \item 978 | \texttt{Node.hash} - The hash of this chunk 979 | \item 980 | \texttt{Node.size} - The aggregate chunk size for all children below 981 | this node (The sum of all chunk sizes of all children) 982 | \item 983 | \texttt{signature} - If you are sending a root node, all root nodes 984 | must have the signature included.
985 | \end{itemize} 986 | 987 | \begin{verbatim} 988 | message Data { 989 | required uint64 index = 1; 990 | optional bytes value = 2; 991 | repeated Node nodes = 3; 992 | optional bytes signature = 4; 993 | 994 | message Node { 995 | required uint64 index = 1; 996 | required bytes hash = 2; 997 | required uint64 size = 3; 998 | } 999 | } 1000 | \end{verbatim} 1001 | 1002 | \section{4. Existing Work}\label{existing-work} 1003 | 1004 | Dat is inspired by a number of features from existing systems. 1005 | 1006 | \subsection{Git}\label{git} 1007 | 1008 | Git popularized the idea of a directed acyclic graph (DAG) combined with 1009 | a Merkle tree, a way to represent changes to data where each change is 1010 | addressed by the secure hash of the change plus all ancestor hashes in a 1011 | graph. This provides a way to trust data integrity, as the only way a 1012 | specific hash could be derived by another peer is if they have the same 1013 | data and change history required to reproduce that hash. This is 1014 | important for reproducibility as it lets you trust that a specific git 1015 | commit hash refers to a specific source code state. 1016 | 1017 | Decentralized version control tools for source code like Git provide a 1018 | protocol for efficiently downloading changes to a set of files, but are 1019 | optimized for text files and have issues with large files. Solutions 1020 | like Git-LFS solve this by using HTTP to download large files, rather 1021 | than the Git protocol. GitHub offers Git-LFS hosting but charges 1022 | repository owners for bandwidth on popular files. Building a distributed 1023 | distribution layer for files in a Git repository is difficult due to 1024 | design of Git Packfiles which are delta compressed repository states 1025 | that do not easily support random access to byte ranges in previous file 1026 | versions. 
1027 | 1028 | \subsection{BitTorrent}\label{bittorrent} 1029 | 1030 | BitTorrent implements a swarm based file sharing protocol for static 1031 | datasets. Data is split into fixed sized chunks, hashed, and then that 1032 | hash is used to discover peers that have the same data. An advantage of 1033 | using BitTorrent for dataset transfers is that download bandwidth can be 1034 | fully saturated. Since the file is split into pieces, and peers can 1035 | efficiently discover which pieces each of the peers they are connected 1036 | to have, it means one peer can download non-overlapping regions of the 1037 | dataset from many peers at the same time in parallel, maximizing network 1038 | throughput. 1039 | 1040 | Fixed sized chunking has drawbacks for data that changes. BitTorrent 1041 | assumes all metadata will be transferred up front which makes it 1042 | impractical for streaming or updating content. Most BitTorrent clients 1043 | divide data into 1024 pieces meaning large datasets could have a very 1044 | large chunk size which impacts random access performance (e.g.~for 1045 | streaming video). 1046 | 1047 | Another drawback of BitTorrent is due to the way clients advertise and 1048 | discover other peers in absence of any protocol level privacy or trust. 1049 | From a user privacy standpoint, BitTorrent leaks what users are 1050 | accessing or attempting to access, and does not provide the same 1051 | browsing privacy functions as systems like SSL. 1052 | 1053 | \subsection{Kademlia Distributed Hash 1054 | Table}\label{kademlia-distributed-hash-table} 1055 | 1056 | Kademlia (Maymounkov and Mazieres 2002) is a distributed hash table, a 1057 | distributed key/value store that can serve a similar purpose to DNS 1058 | servers but has no hard coded server addresses. All clients in Kademlia 1059 | are also servers. 
As long as you know at least one address of another 1060 | peer in the network, you can ask them for the key you are trying to find 1061 | and they will either have it or give you some other people to talk to 1062 | that are more likely to have it. 1063 | 1064 | If you don't have an initial peer to talk to, most clients use a 1065 | bootstrap server that randomly gives you a peer in the network to start 1066 | with. If the bootstrap server goes down, the network still functions as 1067 | long as other methods can be used to bootstrap new peers (such as 1068 | sending them peer addresses through side channels like how .torrent 1069 | files include tracker addresses to try in case Kademlia finds no peers). 1070 | 1071 | Kademlia is distinct from previous DHT designs due to its simplicity. It 1072 | uses a very simple XOR operation between two keys as its ``distance'' 1073 | metric to decide which peers are closer to the data being searched for. 1074 | On paper it seems like it wouldn't work as it doesn't take into account 1075 | things like ping speed or bandwidth. Instead its design is very simple 1076 | on purpose to minimize the amount of control/gossip messages and to 1077 | minimize the amount of complexity required to implement it. In practice 1078 | Kademlia has been extremely successful and is widely deployed as the 1079 | ``Mainline DHT'' for BitTorrent, with support in all popular BitTorrent 1080 | clients today. 1081 | 1082 | Due to the simplicity in the original Kademlia design a number of 1083 | attacks such as DDOS and/or sybil have been demonstrated. There are 1084 | protocol extensions (BEPs) which in certain cases mitigate the effects 1085 | of these attacks, such as BEP 44 which includes a DDOS mitigation 1086 | technique. Nonetheless anyone using Kademlia should be aware of the 1087 | limitations.
1088 | 1089 | \subsection{Peer to Peer Streaming Peer Protocol 1090 | (PPSPP)}\label{peer-to-peer-streaming-peer-protocol-ppspp} 1091 | 1092 | PPSPP 1093 | (\href{https://datatracker.ietf.org/doc/rfc7574/?include_text=1}{IETF 1094 | RFC 7574}, (Bakker, Petrocco, and Grishchenko 2015)) is a protocol for 1095 | live streaming content over a peer to peer network. In it they define a 1096 | specific type of Merkle Tree that allows for subsets of the hashes to be 1097 | requested by a peer in order to reduce the time-till-playback for end 1098 | users. BitTorrent for example transfers all hashes up front, which is 1099 | not suitable for live streaming. 1100 | 1101 | Their Merkle trees are ordered using a scheme they call ``bin 1102 | numbering'', which is a method for deterministically arranging an 1103 | append-only log of leaf nodes into an in-order layout tree where 1104 | non-leaf nodes are derived hashes. If you want to verify a specific 1105 | node, you only need to request its sibling's hash and all its uncle 1106 | hashes. PPSPP is very concerned with reducing round trip time and 1107 | time-till-playback by allowing for many kinds of optimizations, such as 1108 | to pack as many hashes into datagrams as possible when exchanging tree 1109 | information with peers. 1110 | 1111 | Although PPSPP was designed with streaming video in mind, the ability to 1112 | request a subset of metadata from a large and/or streaming dataset is 1113 | very desirable for many other types of datasets. 1114 | 1115 | \subsection{WebTorrent}\label{webtorrent} 1116 | 1117 | With WebRTC, browsers can now make peer to peer connections directly to 1118 | other browsers. BitTorrent uses UDP sockets which aren't available to 1119 | browser JavaScript, so can't be used as-is on the Web. 1120 | 1121 | WebTorrent implements the BitTorrent protocol in JavaScript using WebRTC 1122 | as the transport. 
This includes the BitTorrent block exchange protocol 1123 | as well as the tracker protocol implemented in a way that can enable 1124 | hybrid nodes, talking simultaneously to both BitTorrent and WebTorrent 1125 | swarms (if a client is capable of making both UDP sockets as well as 1126 | WebRTC sockets, such as Node.js). Trackers are exposed to web clients 1127 | over HTTP or WebSockets. 1128 | 1129 | \subsection{InterPlanetary File 1130 | System}\label{interplanetary-file-system} 1131 | 1132 | IPFS is a family of application and network protocols that have peer to 1133 | peer file sharing and data permanence baked in. IPFS abstracts network 1134 | protocols and naming systems to provide an alternative application 1135 | delivery platform to today's Web. For example, instead of using HTTP and 1136 | DNS directly, in IPFS you would use LibP2P streams and IPNS in order to 1137 | gain access to the features of the IPFS platform. 1138 | 1139 | \subsection{Certificate Transparency/Secure 1140 | Registers}\label{certificate-transparencysecure-registers} 1141 | 1142 | The UK Government Digital Service have developed the concept of a 1143 | register which they define as a digital public ledger you can trust. In 1144 | the UK government registers are beginning to be piloted as a way to 1145 | expose essential open data sets in a way where consumers can verify the 1146 | data has not been tampered with, and allows the data publishers to 1147 | update their data sets over time. 1148 | 1149 | The design of registers was inspired by the infrastructure backing the 1150 | Certificate Transparency (Laurie, Langley, and Kasper 2013) project, 1151 | initiated at Google, which provides a service on top of SSL certificates 1152 | that enables service providers to write certificates to a distributed 1153 | public ledger. Any client or service provider can verify if a 1154 | certificate they received is in the ledger, which protects against so 1155 | called ``rogue certificates''. 
1156 | 1157 | \section{5. Reference Implementation}\label{reference-implementation} 1158 | 1159 | The connection logic is implemented in a module called 1160 | \href{https://www.npmjs.com/package/discovery-swarm}{discovery-swarm}. 1161 | This builds on discovery-channel and adds connection establishment, 1162 | management and statistics. It provides statistics such as how many 1163 | sources are currently connected, how many good and bad behaving sources 1164 | have been talked to, and it automatically handles connecting and 1165 | reconnecting to sources. UTP support is implemented in the module 1166 | \href{https://www.npmjs.com/package/utp-native}{utp-native}. 1167 | 1168 | Our implementation of source discovery is called 1169 | \href{https://npmjs.org/discovery-channel}{discovery-channel}. We also 1170 | run a \href{https://www.npmjs.com/package/dns-discovery}{custom DNS 1171 | server} that Dat clients use (in addition to specifying their own if 1172 | they need to), as well as a 1173 | \href{https://github.com/bittorrent/bootstrap-dht}{DHT bootstrap} 1174 | server. These discovery servers are the only centralized infrastructure 1175 | we need for Dat to work over the Internet, but they are redundant, 1176 | interchangeable, never see the actual data being shared, anyone can run 1177 | their own and Dat will still work even if they all are unavailable. If 1178 | this happens discovery will just be manual (e.g.~manually sharing 1179 | IP/ports). 1180 | 1181 | \section{Acknowledgements}\label{acknowledgements} 1182 | 1183 | This work was made possible through grants from the John S. and James L. 1184 | Knight and Alfred P. Sloan Foundations. 1185 | 1186 | \section*{References}\label{references} 1187 | \addcontentsline{toc}{section}{References} 1188 | 1189 | \hypertarget{refs}{} 1190 | \hypertarget{ref-aumasson2013blake2}{} 1191 | Aumasson, Jean-Philippe, Samuel Neves, Zooko Wilcox-O'Hearn, and 1192 | Christian Winnerlein. 2013. 
``BLAKE2: Simpler, Smaller, Fast as MD5.'' 1193 | In \emph{International Conference on Applied Cryptography and Network 1194 | Security}, 119--35. Springer. 1195 | 1196 | \hypertarget{ref-bakker2015peer}{} 1197 | Bakker, A, R Petrocco, and V Grishchenko. 2015. ``Peer-to-Peer Streaming 1198 | Peer Protocol (PPSPP).'' 1199 | 1200 | \hypertarget{ref-bernstein2012high}{} 1201 | Bernstein, Daniel J, Niels Duif, Tanja Lange, Peter Schwabe, and Bo-Yin 1202 | Yang. 2012. ``High-Speed High-Security Signatures.'' \emph{Journal of 1203 | Cryptographic Engineering}. Springer, 1--13. 1204 | 1205 | \hypertarget{ref-laurie2013certificate}{} 1206 | Laurie, Ben, Adam Langley, and Emilia Kasper. 2013. ``Certificate 1207 | Transparency.'' 1208 | 1209 | \hypertarget{ref-maymounkov2002kademlia}{} 1210 | Maymounkov, Petar, and David Mazieres. 2002. ``Kademlia: A Peer-to-Peer 1211 | Information System Based on the XOR Metric.'' In \emph{International 1212 | Workshop on Peer-to-Peer Systems}, 53--65. Springer. 1213 | 1214 | \hypertarget{ref-mykletun2003providing}{} 1215 | Mykletun, Einar, Maithili Narasimha, and Gene Tsudik. 2003. ``Providing 1216 | Authentication and Integrity in Outsourced Databases Using Merkle Hash 1217 | Trees.'' \emph{UCI-SCONCE Technical Report}. 1218 | 1219 | \hypertarget{ref-sleep}{} 1220 | Ogden, Maxwell, and Mathias Buus. 2017. ``SLEEP - the Dat Protocol on 1221 | Disk Format.'' 1222 | 1223 | \hypertarget{ref-rossi2010ledbat}{} 1224 | Rossi, Dario, Claudio Testa, Silvio Valenti, and Luca Muscariello. 2010. 1225 | ``LEDBAT: The New BitTorrent Congestion Control Protocol.'' In 1226 | \emph{ICCCN}, 1--6. 1227 | 1228 | \end{document} 1229 | --------------------------------------------------------------------------------