├── .gitignore ├── README.md ├── doc └── intro.md ├── project.clj ├── src └── merkle │ ├── kv │ ├── fixed.clj │ └── linear.clj │ └── range.clj └── test └── merkle ├── kv ├── fixed_test.clj └── linear_test.clj └── range_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | ~* 2 | *.swp 3 | /target 4 | /lib 5 | /classes 6 | /checkouts 7 | pom.xml 8 | pom.xml.asc 9 | *.jar 10 | *.class 11 | .lein-deps-sum 12 | .lein-failures 13 | .lein-plugins 14 | .lein-repl-history 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Merkle 2 | 3 | A Clojure library for computing and comparing hash trees over sorted kv 4 | collections. Allows you to efficiently find the differing pairs between two 5 | such collections without exchanging the collections themselves. Useful in the 6 | synchronization of distributed systems. 7 | 8 | ## Installation 9 | 10 | Via clojars: https://clojars.org/merkle 11 | 12 | # kv.linear 13 | 14 | `merkle.kv.linear` provides merkle trees over sorted collections of key-value 15 | pairs, which could be sorted maps, lists, results from a database; anything 16 | which exposes a sorted sequence of objects. You can pass arbitrary key and 17 | value extractor fns to `(tree)`; the defaults are the clojure builtin `key` and 18 | `val`. `kv.linear` is oriented towards a particular case of synchronization for 19 | k-ordered keys in distributed databases; see the namespace comments for gory 20 | details. 21 | 22 | Note: kv.linear is not as efficient as it could be at identifying identical 23 | regions. 24 | 25 | Note: kv.linear has no way to limit the depth of the trees it produces right 26 | now. 27 | 28 | Note: kv.linear rarely identifies regions as identical which are not actually 29 | so; might be an issue with hash collisions over small-cardinality values like 30 | bytes. If values are bytes, deltas between hashes on the order of 10-1000 31 | entries may miss differences around 5% of the time. If values are ~10-20 32 | character strings, diffs are incomplete less than 1 in 10000 tries. 33 | 34 | ## Usage 35 | 36 | ```clj 37 | (use 'merkle.kv.linear) 38 | 39 | ; Set up two maps with some differences 40 | (def map1 (sorted-map :a 1 :b 2 :c 3 :d 4)) 41 | (def map2 (sorted-map :a 1 :b 2 :c 0)) 42 | 43 | ; Compute a merkle tree of each 44 | (def t1 (tree map1)) 45 | (def t2 (tree map2)) 46 | 47 | ; Find pairs of map1 which, if applied to map2, would make it a superset of 48 | ; map1: 49 | (def d1 (diff map1 t1 t2)) 50 | ; => ([:c 3] [:d 4]) 51 | 52 | ; And the inverse: 53 | user=> (def d2 (diff map2 t2 t1)) 54 | ; => ([:c 0]) 55 | 56 | ; We can merge map2's differences back into map1: 57 | (into map1 d2) 58 | ; => {:a 1, :b 2, :c 0, :d 4} 59 | 60 | ; And merge map1's differences into map2: 61 | (into map2 d1) 62 | ; => {:a 1, :b 2, :c 3, :d 4} 63 | 64 | ; Provided a commutative merge function, exchanging diffs is monotonically 65 | ; convergent: 66 | (merge-with max map1 d2) 67 | ; => {:a 1, :b 2, :c 3, :d 4} 68 | (merge-with max map2 d1) 69 | ; => {:a 1, :b 2, :c 3, :d 4} 70 | ``` 71 | 72 | ## License 73 | 74 | Copyright © 2013 Kyle Kingsbury 75 | 76 | Distributed under the Eclipse Public License, the same as Clojure. 77 | -------------------------------------------------------------------------------- /doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to merkle 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/great-documentation/what-to-write/) 4 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject merkle "0.1.1-SNAPSHOT" 2 | :description "Merkle trees for constructing efficient diffs over collections." 3 | :url "http://github.com/aphyr/merkle" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.5.1"] 7 | [primitive-math "0.1.3"]] 8 | :global-vars {*warn-on-reflection* true} 9 | :profiles {:dev {:dependencies [[org.clojure/data.generators "0.1.0"] 10 | [reiddraper/simple-check "0.4.1"]]}} 11 | :jvm-opts ^:replace ["-server"]) 12 | -------------------------------------------------------------------------------- /src/merkle/kv/fixed.clj: -------------------------------------------------------------------------------- 1 | (ns merkle.kv.fixed 2 | "This merkle tree implementation operates over a fixed-size sequence of 3 | 32-bit integers, presumably the hashes of some other data structure. All 4 | participants must agree on the sequence size in advance. Trees are of fixed 5 | size, and identify the indices (starting with 0) of differing or identical 6 | segments." 7 | (:require 8 | [merkle.range :as range] 9 | [primitive-math :as p]) 10 | (:import 11 | [java.util.zip 12 | CRC32] 13 | [java.util 14 | ArrayList])) 15 | 16 | ;;; 17 | 18 | (defrecord Node 19 | [^long hash 20 | ^long min-segment 21 | ^long max-segment 22 | left 23 | right]) 24 | 25 | (defn node->map 26 | "Converts a Node to a standard Clojure map; useful for serialization." 27 | [^Node node] 28 | (when node 29 | {:hash (.hash node) 30 | :min-segment (.min-segment node) 31 | :max-segment (.max-segment node) 32 | :left (node->map (.left node)) 33 | :right (node->map (.right node))})) 34 | 35 | (defn map->node 36 | "Converts a map to a Node; useful for serialization." 37 | [m] 38 | (when m 39 | (Node. 40 | (:hash m) 41 | (:min-segment m) 42 | (:max-segment m) 43 | (map->node (:left m)) 44 | (map->node (:right m))))) 45 | 46 | ;;; 47 | 48 | (defn log2 ^long [^long n] 49 | (long (/ (Math/log n) (Math/log 2)))) 50 | 51 | (defn crc-update-int! 52 | "Updates a CRC with an integer. Returns the (mutated) CRC." 53 | [^CRC32 crc ^int i] 54 | (let [ 55 | (.update crc (unchecked-int (p/>>> l 24))) 56 | (.update crc (unchecked-int (p/>>> l 16))) 57 | (.update crc (unchecked-int (p/>>> l 8))) 58 | (.update crc (unchecked-int l)) 59 | crc) 60 | 61 | (defn crc-update-crc! 62 | "Updates a CRC with the value of a second CRC. Returns the (mutated) first 63 | CRC." 64 | [^CRC32 crc ^CRC32 crc2] 65 | ; maybe avoid an object allocation??? vooooodooooo.... 66 | (let [l (unchecked-long (.getValue crc2))] 67 | ; CRCs contain a 32-bit unsigned integer packed into a long, so we want to 68 | ; extract bits 31-24, 23-16, 15-8, and 7-0. Since CRC update(int) *only* 69 | ; uses bits 0-7 of that int, we'll repeatedly right-shift to create ints 70 | ; containing values from 0-255. 71 | ; 72 | ; Fuck Java. 73 | (.update crc (unchecked-int (p/>>> l 24))) 74 | (.update crc (unchecked-int (p/>>> l 16))) 75 | (.update crc (unchecked-int (p/>>> l 8))) 76 | (.update crc (unchecked-int l)) 77 | crc)) 78 | 79 | (defn hash-levels 80 | "Returns a seq of seqs, with `output-level` elements. Each represents a 81 | level of the hash-tree, from bottom to top. 82 | 83 | The `segment-seq` represents the input hashes of the underlying segments. 84 | Elements in the hash-seq may be `nil`, denoting no elements within that 85 | segment." 86 | [^long output-levels ^long num-segments segment-seq] 87 | (assert (pos? output-levels)) 88 | (let [levels (if (zero? num-segments) 89 | 0 90 | (long (p/inc (log2 num-segments)))) 91 | emit-all? (= output-levels levels) 92 | ^objects crcs (object-array levels) 93 | get-crc (fn [^long idx] 94 | (if-let [crc (aget crcs idx)] 95 | crc 96 | (let [crc (CRC32.)] 97 | (aset crcs idx crc) 98 | crc))) 99 | lists (object-array 100 | (repeatedly output-levels 101 | #(ArrayList.)))] 102 | (loop [idx 0, s segment-seq] 103 | (when-not (empty? s) 104 | 105 | (let [x (first s)] 106 | 107 | ;; update the level-0 hash 108 | (when x 109 | (let [^CRC32 c (get-crc 0)] 110 | (.update c (unchecked-int x)))) 111 | 112 | (when emit-all? 113 | (.add ^ArrayList (aget lists 0) x)) 114 | 115 | ;; ascend the levels as appropriate 116 | (loop [idx idx, level 0] 117 | (when (== 1 (p/bit-and 1 idx)) 118 | (let [^CRC32 crc (aget crcs level)] 119 | 120 | ;; if there's a crc, propagate it upwards 121 | (when crc 122 | (.update ^CRC32 (get-crc (p/inc level)) (.getValue crc)) 123 | (aset crcs level nil)) 124 | 125 | ;; if we're above the threshold for the output tree, write to it 126 | (let [output-level (p/+ output-levels 1 (p/- level levels))] 127 | (when (<= 0 output-level) 128 | (.add ^ArrayList (aget lists output-level) (when crc (.getValue crc)))))) 129 | (recur (p/>> idx 1) (p/inc level))))) 130 | 131 | (recur (p/inc idx) (rest s)))) 132 | 133 | (map seq lists))) 134 | 135 | (defn hash-levels->tree 136 | "Takes tiered sequences from `hash-levels`, and returns the root `Node` of a 137 | tree." 138 | [hash-levels num-segments] 139 | (let [k (long (/ num-segments (Math/pow 2 (dec (count hash-levels)))))] 140 | (if (zero? num-segments) 141 | (Node. 0 0 0 nil nil) 142 | (first 143 | (reduce 144 | (fn [nodes hashes] 145 | (map 146 | (fn [hash [^Node l ^Node r]] 147 | (Node. hash (.min-segment l) (.max-segment r) l r)) 148 | hashes 149 | (partition 2 nodes))) 150 | (map-indexed 151 | (fn [idx hash] 152 | (Node. hash (* idx k) (* (inc idx) k) nil nil)) 153 | (first hash-levels)) 154 | (rest hash-levels)))))) 155 | 156 | (defn tree 157 | "Returns the root `Node` of a hash-tree with `depth` levels. The input 158 | `segment-seq` is a list of hashes for a discrete number of segments, which 159 | must have a cardinality which is a power of two." 160 | ([depth segment-seq] 161 | (tree depth (count segment-seq) segment-seq)) 162 | ([depth num-segments segment-seq] 163 | (hash-levels->tree 164 | (hash-levels depth num-segments segment-seq) 165 | num-segments))) 166 | 167 | ;;; 168 | 169 | (defn- merge-ranges [a b] 170 | "Merges contiguous ranges. Returns a list of one or two ranges, depending on 171 | whether `a` and `b` are contiguous." 172 | (if (and a b) 173 | (let [prefix (butlast a) 174 | suffix (rest b) 175 | a' (last a) 176 | b' (first b)] 177 | (concat 178 | prefix 179 | (if (= (last a') (first b')) 180 | [[(first a') (last b')]] 181 | [a' b']) 182 | suffix)) 183 | (or a b))) 184 | 185 | (defn identical-ranges 186 | "Returns a list of [min,max] tuples describing the segment ranges for which 187 | the two nodes are identical." 188 | [^Node n1 ^Node n2] 189 | (when (and n1 n2) 190 | (if (== (.hash n1) (.hash n2)) 191 | [[(.min-segment n1) (.max-segment n1)]] 192 | (merge-ranges 193 | (identical-ranges (.left n1) (.left n2)) 194 | (identical-ranges (.right n1) (.right n2)))))) 195 | 196 | (defn diff-ranges 197 | "Returns a list of [min,max] tuples describing the segment ranges for which 198 | the two nodes are different." 199 | [^Node n1 ^Node n2] 200 | (assert (instance? Node n1)) 201 | (assert (instance? Node n2)) 202 | (let [min (.min-segment n1) 203 | max (.max-segment n1) 204 | ranges (identical-ranges n1 n2)] 205 | (if (empty? ranges) 206 | ; No identical ranges. 207 | [[min max]] 208 | 209 | ; *some* region is identical. 210 | (concat 211 | (when-not (= min (ffirst ranges)) 212 | [[0 (ffirst ranges)]]) 213 | (->> ranges 214 | (partition 2 1) 215 | (map 216 | (fn [[[_ l] [u _]]] 217 | [l u]))) 218 | (when-not (= max (last (last ranges))) 219 | [[(last (last ranges)) max]]))))) 220 | 221 | (defn diffs 222 | "Returns a sequence of segment indices for which two nodes are different." 223 | [n1 n2] 224 | (mapcat (partial apply range) 225 | (diff-ranges n1 n2))) 226 | -------------------------------------------------------------------------------- /src/merkle/kv/linear.clj: -------------------------------------------------------------------------------- 1 | (ns merkle.kv.linear 2 | "Merkle trees are a data structure used to efficiently localize differences 3 | between two copies of an ordered collection. Each element in the collection 4 | is hashed into a leaf node. Every n leaf nodes have their hashes concatenated 5 | and hashed to produce a supervening node. Those nodes are then combined and 6 | hashed again, until a single root node has hashed the entire collection. 7 | Given two of these trees, one can find differences between two copies of the 8 | collection by finding nodes with differing hashes. 9 | 10 | This particular implementation is aimed at a specific sub-problem: comparing 11 | two key-value collections where: 12 | 13 | 1. The key set could vary between copies. 14 | 2. The collections could be much larger than RAM. 15 | 3. Keys are in sorted order. 16 | 4. The keyspace and probability distribution of keys is unknown. 17 | 5. New keys are usually added at the end. 18 | 19 | Because the key set varies, we need to encode some information about the keys 20 | in the tree nodes. Each of our nodes keeps the minimum and maximum key 21 | covered by its hash. We can then consider nodes equal when they have the 22 | same hash and the same start and end key, which makes it possible to compare 23 | collections with different keys. 24 | 25 | Since the collection is much larger than RAM (for instance, a table on disk), 26 | but the distribution of the keyspace is unknown, our algorithm forgoes having 27 | a well-balanced tree in favor of a single linear scan over the collection, 28 | accruing at most log(n) nodes in RAM. 29 | 30 | Because the key are in sorted order, we can efficiently construct this tree 31 | in a single linear pass, at the cost of having a much smaller right-hand 32 | subtree. The best-case scenario is a fully balanced tree, and the worst-case 33 | scenario is a root node with a fully balanced subtree on the left, and a 34 | single leaf node on the right. The worst-case cost for this structure, as 35 | compared to a balanced btree, is one additional layer of depth. 36 | 37 | Since new keys are added mostly to the end, two collections which are 38 | periodically synchronized will tend to agree on their 39 | first/smallest/left-hand keys and nodes. If two collections agree on the 40 | first N keys, their hash trees will have an identical node->key distribution 41 | over those N keys. In a periodically synchronized collection, this means 42 | we'll only differ on a few keys towards the end, and can efficiently ignore 43 | the bulk of the dataset. In the worst-case scenario, a single key is added to 44 | the beginning of the collection, and none of the hashes are aligned, forcing 45 | a full resync." 46 | (:require [merkle.range :as range])) 47 | 48 | (defrecord Node [^long hash min-key max-key left right]) 49 | 50 | (defn node->map 51 | "Converts a Node to a standard Clojure map; useful for serialization." 52 | [^Node node] 53 | (when node 54 | {:hash (.hash node) 55 | :min-key (.min-key node) 56 | :max-key (.max-key node) 57 | :left (node->map (.left node)) 58 | :right (node->map (.right node))})) 59 | 60 | (defn map->node 61 | "Converts a map to a Node; useful for serialization." 62 | [m] 63 | (when m 64 | (Node. (:hash m) 65 | (:min-key m) 66 | (:max-key m) 67 | (map->node (:left m)) 68 | (map->node (:right m))))) 69 | (defn key-range 70 | "The inclusive range of keys a Node covers." 71 | [^Node node] 72 | (when node 73 | (list (.min-key node) 74 | (.max-key node)))) 75 | 76 | (defn aligned? 77 | "Are two nodes comparable--i.e. do they have the same min-key and max-key?" 78 | [^Node n1 ^Node n2] 79 | (if (nil? n1) 80 | (nil? n2) 81 | (and n2 82 | (= (.min-key n1) (.min-key n2)) 83 | (= (.max-key n1) (.max-key n2))))) 84 | 85 | (defn same? 86 | "Are two nodes equivalent; i.e. do they have the same hash, min-key, and 87 | max-key?" 88 | [^Node n1 ^Node n2] 89 | (if (nil? n1) 90 | (nil? n2) 91 | (and n2 92 | (= (.hash n1) (.hash n2)) 93 | (= (.min-key n1) (.min-key n2)) 94 | (= (.max-key n1) (.max-key n2))))) 95 | 96 | (defn hash-nodes 97 | "Combines two nodes together." 98 | [^Node left ^Node right] 99 | (cond 100 | (nil? left) right 101 | (nil? right) left 102 | :else (Node. (hash (list (.hash left) 103 | (.hash right))) 104 | (.min-key left) 105 | (.max-key right) 106 | left 107 | right))) 108 | 109 | (defn assoc-or-conj! 110 | "Associates, and conj's on OutOfBoundsException." 111 | [v i value] 112 | (try 113 | (assoc! v i value) 114 | (catch IndexOutOfBoundsException e 115 | (conj! v value)))) 116 | 117 | (defn percolate 118 | "Takes a vector of nodes awaiting merging with their neighbors, and a new 119 | node to merge into position i. Returns a new vector of nodes awaiting merging 120 | with their neighbors." 121 | [levels i right] 122 | (let [left (try (nth levels i) 123 | (catch IndexOutOfBoundsException e 124 | :out-of-bounds))] 125 | (let [x (persistent! levels) 126 | levels (transient x)] 127 | (condp = left 128 | ; Fill in an empty space above 129 | nil (assoc! levels i right) 130 | 131 | ; Add a new slot to the top 132 | :out-of-bounds (conj! levels right) 133 | 134 | ; Merge with the left node and recur upwards. 135 | (recur (assoc! levels i nil) 136 | (inc i) 137 | (hash-nodes left right)))))) 138 | 139 | (defn transient-seq 140 | "A sequence over a transient collection. Definitely not safe unless you 141 | realize it immediately." 142 | [levels] 143 | (->> levels 144 | count 145 | range 146 | (map (partial nth levels)))) 147 | 148 | (defn compact? 149 | "Is the given level merged into a single node--i.e. does it look like [nil 150 | nil nil nil node]. Levels are transient so we have to use nth." 151 | [levels] 152 | (->> levels 153 | transient-seq 154 | (remove nil?) 155 | (count) 156 | (<= 1))) 157 | 158 | (defn compact 159 | "Takes a vector of nodes awaiting merging with their neighbors, and merges 160 | them all into a single node. This handles the (usual) case where the right 161 | side of the tree has fewer entries than the right, leaving us with dangling 162 | levels." 163 | [levels] 164 | (let [levels (persistent! levels)] 165 | (if (empty? levels) 166 | nil 167 | (->> levels reverse (remove nil?) (reduce hash-nodes))))) 168 | 169 | (defn tree 170 | "Computes a merkle tree for a sorted kv collection. Does not retain the head. 171 | 172 | Can take two optional functions: keyfn and valfn, which extract the key and 173 | the value from a given element of the collection; if not given, these default 174 | to key and val, respectively." 175 | ([coll] 176 | (tree coll key val)) 177 | ([coll keyfn valfn] 178 | ; We keep a temporary vector containing uncombined nodes in the tree as we 179 | ; move through the collection. The 0th entry of this vector is a node for a 180 | ; single element. The last entry is the Node at the highest level yet 181 | ; reached. Nodes percolate upwards through this vector until we have a 182 | ; single node at the top containing the full tree. 183 | (->> coll 184 | (reduce (fn [levels element] 185 | ; Construct base-level node 186 | (let [k (keyfn element) 187 | h (hash (valfn element)) 188 | node (Node. h k k nil nil)] 189 | (percolate levels 0 node))) 190 | (transient [])) 191 | compact))) 192 | 193 | (defn pr-node 194 | [^Node node] 195 | (if node 196 | (str (pr-str (.hash node)) " (" (.min-key node) " " (.max-key node) ")") 197 | "nil")) 198 | 199 | (defn identical-ranges 200 | "Returns an ordered lazy sequence of (start-key end-key) pairs for which two 201 | trees are known to be identical." 202 | [^Node t1 ^Node t2] 203 | ; (println "compare" (pr-node t1) (pr-node t2)) 204 | 205 | (let [r1 (key-range t1) 206 | r2 (key-range t2)] 207 | (cond 208 | ; One node is nil 209 | (nil? t1) (list) 210 | (nil? t2) (list) 211 | 212 | ; Nodes cover the same key range. 213 | (= r1 r2) 214 | (if (= (.hash t1) (.hash t2)) 215 | ; Nodes are identical. 216 | (list (list (.min-key t1) (.max-key t1))) 217 | 218 | ; We *know* these keys differ. 219 | (lazy-cat (identical-ranges (.left t1) (.left t2)) 220 | (identical-ranges (.right t1) (.right t2)))) 221 | 222 | ; t1 and t2 are totally disjoint; nothing here for us. 223 | (range/disjoint? r1 r2) 224 | (list) 225 | 226 | ; t1 is a proper subset of t2; descend into t2 looking for t1. 227 | (range/subset? r1 r2) 228 | (lazy-cat (identical-ranges t1 (.left t2)) 229 | (identical-ranges t1 (.right t2))) 230 | 231 | ; t2 is a proper subset of t1; descend into t1 looking for it. 232 | (range/subset? r2 r1) 233 | (lazy-cat (identical-ranges (.left t1) t2) 234 | (identical-ranges (.right t1) t2)) 235 | 236 | ; t1 and t2 intersect, but neither is a subset of the other. Expand to 237 | ; all branches. 238 | (range/intersect? r1 r2) 239 | (lazy-cat (identical-ranges (.left t1) (.left t2)) 240 | (identical-ranges (.left t1) (.right t2)) 241 | (identical-ranges (.right t1) (.left t2)) 242 | (identical-ranges (.right t1) (.right t2))) 243 | 244 | :else (throw (IllegalStateException. "Shouldn't get here."))))) 245 | 246 | (defn diff-helper 247 | "Given a sorted sequence of identical ranges, and a sequence of elements from 248 | a local collection, yields elements which differ by riffling through both 249 | sequences." 250 | [ranges coll keyfn] 251 | ; [range 1] [range 2] [range 3] 252 | ; x x o x o o o 253 | (when-let [element (first coll)] 254 | (let [k (keyfn element) 255 | range (first ranges)] 256 | (cond 257 | ; We're out of ranges; return entire remaining collection. 258 | (nil? range) 259 | coll 260 | 261 | ; Element lies before the range and should be included. 262 | (< (compare k (first range)) 0) 263 | (cons element 264 | (lazy-seq (diff-helper ranges (next coll) keyfn))) 265 | 266 | ; Element lies within the range 267 | (<= (compare k (second range)) 0) 268 | (lazy-seq (diff-helper ranges (next coll) keyfn)) 269 | 270 | ; Element lies after the range. 271 | :else 272 | (lazy-seq (diff-helper (next ranges) coll keyfn)))))) 273 | 274 | (defn diff 275 | "Given a local collection, a local tree, and a remote tree, yields a lazy 276 | sequence of elements from the local collection which are not known to be 277 | identical based on the two trees." 278 | ([coll local-tree remote-tree] 279 | (diff coll local-tree remote-tree key)) 280 | ([coll local-tree remote-tree keyfn] 281 | (diff-helper (identical-ranges local-tree remote-tree) coll keyfn))) 282 | -------------------------------------------------------------------------------- /src/merkle/range.clj: -------------------------------------------------------------------------------- 1 | (ns merkle.range 2 | "Functions over inclusive ranges, which are just pairs of comparable objects 3 | like [1 2] or (:a :b)." 4 | (:refer-clojure :exclude [< <= contains?])) 5 | 6 | ; For concision, override comparison here to work with all comparables. 7 | (defn- <= [a b] (clojure.core/< (compare a b) 1)) 8 | (defn- < [a b] (clojure.core/< (compare a b) 0)) 9 | 10 | (defn subset? 11 | "Is range1 a subset of range2?" 12 | [range1 range2] 13 | (and (<= (first range2) 14 | (first range1)) 15 | (<= (second range1) 16 | (second range2)))) 17 | 18 | (defn superset? 19 | "Is range1 a superset of set2?" 20 | [r1 r2] 21 | (and (<= (first r1) 22 | (first r2)) 23 | (<= (second r2) 24 | (second r1)))) 25 | 26 | (defn disjoint? 27 | "Are r1 and r2 totally disjoint ranges?" 28 | [r1 r2] 29 | (or 30 | ; [r1 r1] [r2 r2] 31 | (< (second r1) 32 | (first r2)) 33 | ; [r2 r2] [r1 r1] 34 | (< (second r2) 35 | (first r1)))) 36 | 37 | (defn intersect? 38 | "Do r1 and r2 intersect at all?" 39 | [r1 r2] 40 | (not (disjoint? r1 r2))) 41 | 42 | (defn contains? 43 | "Does the range contain the given value?" 44 | [range element] 45 | (and (<= (first range) element) 46 | (<= element (second range)))) 47 | -------------------------------------------------------------------------------- /test/merkle/kv/fixed_test.clj: -------------------------------------------------------------------------------- 1 | (ns merkle.kv.fixed-test 2 | (:require [clojure.test :refer :all] 3 | [merkle.kv.fixed :refer :all] 4 | [clojure.pprint :refer :all] 5 | [simple-check.clojure-test :refer [defspec]] 6 | [simple-check.core :as sc] 7 | [simple-check.generators :as gen] 8 | [simple-check.properties :as prop])) 9 | 10 | (defn power-of-two? 11 | "Is x a positive integer power of 2?" 12 | [x] 13 | (if-not (pos? x) 14 | false 15 | (loop [x x] 16 | (if (= 1 x) 17 | true 18 | (let [x (/ x 2)] 19 | (if-not (integer? x) 20 | false 21 | (recur x))))))) 22 | 23 | (extend-protocol gen/Shrink 24 | nil 25 | (shrink [_] nil)) 26 | 27 | (deftest power-of-two-test 28 | (is (power-of-two? 1)) 29 | (is (power-of-two? 2)) 30 | (is (power-of-two? 4)) 31 | (is (not (power-of-two? 0))) 32 | (is (not (power-of-two? 3)))) 33 | 34 | (defn repair-diffs 35 | "Takes a merge function, a set of diffs, and two segments, v1 and v2. Returns 36 | v1 with updates from v2." 37 | [merge-fn diffs v1 v2] 38 | (map-indexed (fn [i segment] 39 | (if (diffs i) 40 | (merge-fn segment (nth v2 i)) 41 | segment)) 42 | v1)) 43 | 44 | (defn repair 45 | "Takes two segments and repairs differences using their merkle trees. Merges 46 | conflicting elements with (merge-fn). Returns a vector like [(repaired v1) 47 | (repaired v2)]." 48 | [merge-fn depth [v1 v2]] 49 | (let [t1 (tree depth v1) 50 | t2 (tree depth v2) 51 | ; _ (prn :t1 t1) 52 | ; _ (prn :t2 t2) 53 | diffs (set (diffs t1 t2))] 54 | ; (prn :diff-fraction (try (float (/ (count diffs) (count v1))) 55 | ; (catch ArithmeticException e :nan))) 56 | [(repair-diffs merge-fn diffs v1 v2) 57 | (repair-diffs merge-fn diffs v2 v1)])) 58 | 59 | (defn fixed 60 | "Calls (f arg), then feeds the return value into f again, until a fixed point 61 | is found. (f (f (f arg))) and so on." 62 | [f arg] 63 | (let [value (f arg)] 64 | (if (= value arg) 65 | value 66 | (recur f value)))) 67 | 68 | (prefer-method clojure.core/print-method 69 | clojure.lang.ISeq 70 | clojure.lang.IPersistentVector) 71 | (prefer-method clojure.pprint/simple-dispatch 72 | clojure.lang.ISeq 73 | clojure.lang.IPersistentVector) 74 | 75 | (def depth-and-segments 76 | (gen/bind (gen/choose -1 5) 77 | (fn [power] 78 | (if (= -1 power) 79 | ; Special case: empty vectors 80 | (gen/return [1 []]) 81 | 82 | (gen/bind (gen/choose 0 3) 83 | (fn [card] 84 | (let [element (gen/choose (- card) card) 85 | ; Construct a vector 2x as big as we need; 86 | ; we'll split it in half to be the two 87 | ; divergent copies. 88 | size (* 2 (int (Math/pow 2 power)))] 89 | (gen/tuple (gen/choose 1 (inc power)) 90 | (gen/vector element size))))))))) 91 | 92 | (defn segments 93 | "Takes a vector and returns two segment vectors from it." 94 | [v] 95 | (let [n (/ (count v) 2)] 96 | (assert (integer? n)) 97 | (assert (or (zero? n) 98 | (power-of-two? n))) 99 | [(subvec v 0 n) 100 | (subvec v n)])) 101 | 102 | (defspec randomized-resolve-test 103 | ; Generates a couple sequences, then brings them into sync by copying the 104 | ; differing parts. 105 | 100000 106 | (prop/for-all [ds depth-and-segments] 107 | (let [[d v] ds ; Split apart depth and big vector 108 | versions (segments v) ; Split vector into 2 109 | d (max d 1)] ; simple-check will shrink to 0. :( 110 | ;(prn :d d) 111 | ;(prn :v1 (first versions)) 112 | ;(prn :v2 (second versions)) 113 | (assert (apply = (map count versions))) 114 | (let [[r1 r2] (fixed (partial repair max d) versions)] 115 | ; (prn "repaired" r1 r2) 116 | ; (prn "succcess?" (= r1 r2)) 117 | (or 118 | ; Either the repaired pieces are actually equal.... 119 | (= r1 r2) 120 | ; Or there was a hash collision and their trees are now 121 | ; equal 122 | (= (:hash (tree d r1)) 123 | (:hash (tree d r2)))))))) 124 | -------------------------------------------------------------------------------- /test/merkle/kv/linear_test.clj: -------------------------------------------------------------------------------- 1 | (ns merkle.kv.linear-test 2 | (:require [clojure.test :refer :all] 3 | [merkle.kv.linear :refer :all] 4 | clojure.pprint 5 | [clojure.data.generators :as gen])) 6 | 7 | (deftest empty-test 8 | (is (= nil (tree nil))) 9 | (is (= nil (tree {})))) 10 | 11 | (deftest single-test 12 | (is (= (tree {:foo 1}) 13 | (->Node 1 :foo :foo nil nil))) 14 | (is (= (tree {:foo "hi there"}) 15 | (->Node 588714629 :foo :foo nil nil)))) 16 | 17 | (deftest two-test 18 | (is (= (tree (sorted-map :foo 1 :bar 2)) 19 | (->Node 1024 :bar :foo 20 | (->Node 2 :bar :bar nil nil) 21 | (->Node 1 :foo :foo nil nil))))) 22 | 23 | (deftest three-test 24 | (is (= (tree (sorted-map :foo 1 :bar 2 :baz 3)) 25 | (->Node 32768 :bar :foo 26 | (->Node 1026 :bar :baz 27 | (->Node 2 :bar :bar nil nil) 28 | (->Node 3 :baz :baz nil nil)) 29 | (->Node 1 :foo :foo nil nil))))) 30 | 31 | (deftest five-test 32 | (is (= (tree (sorted-map :foo 1 :bar 2 :baz 3 :xyzzy 4 :frob 5)) 33 | (->Node 1047649 :bar :xyzzy 34 | (->Node 33764 :bar :frob 35 | (->Node 1026 :bar :baz 36 | (->Node 2 :bar :bar nil nil) 37 | (->Node 3 :baz :baz nil nil)) 38 | (->Node 997 :foo :frob 39 | (->Node 1 :foo :foo nil nil) 40 | (->Node 5 :frob :frob nil nil))) 41 | (->Node 4 :xyzzy :xyzzy nil nil))))) 42 | 43 | (defn random-map 44 | ([fk fv] (random-map fk fv gen/default-sizer)) 45 | ([fk fv sizer] 46 | (into (sorted-map) 47 | (zipmap (gen/reps sizer fk) 48 | (gen/reps sizer fv))))) 49 | 50 | (deftest identity-test 51 | (dotimes [i 10] 52 | (let [m (random-map gen/string gen/long (gen/geometric 0.001))] 53 | (let [t1 (tree m) 54 | t2 (tree m)] 55 | (= t1 t2))))) 56 | 57 | (deftest different-test 58 | (->> (repeatedly #(random-map gen/int gen/int (gen/geometric 0.01))) 59 | (take 10) 60 | (map tree) 61 | (partition 2 1) 62 | (map (fn [[t1 t2]] (is (not= t1 t2)))) 63 | dorun)) 64 | 65 | (deftest identical-range-test 66 | (testing "One empty" 67 | (is (= (identical-ranges (tree (sorted-map :a 1)) 68 | nil) 69 | []))) 70 | 71 | (testing "Both empty" 72 | (is (= (identical-ranges nil nil) 73 | []))) 74 | 75 | (testing "identical" 76 | (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4)) 77 | (tree (sorted-map :a 1 :b 2 :c 3 :d 4))) 78 | [[:a :d]]))) 79 | 80 | (testing "one difference" 81 | (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4)) 82 | (tree (sorted-map :a 1 :b 0 :c 3 :d 4))) 83 | [[:a :a] [:c :d]]))) 84 | 85 | (testing "one branch different" 86 | (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4)) 87 | (tree (sorted-map :a 0 :b 0 :c 3 :d 4))) 88 | [[:c :d]]))) 89 | 90 | (testing "All values different." 91 | (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4)) 92 | (tree (sorted-map :a 2 :b 1 :c 4 :d 3))) 93 | []))) 94 | 95 | (testing "One different key at the end." 96 | (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4)) 97 | (tree (sorted-map :a 1 :b 2 :c 3 :e 4))) 98 | [[:a :b] [:c :c]]))) 99 | 100 | (testing "One new key at the end." 101 | (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4)) 102 | (tree (sorted-map :a 1 :b 2 :c 3 :d 4 :e 5))) 103 | [[:a :d]]))) 104 | 105 | (testing "One new key at the beginning." 106 | (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4 :e 5)) 107 | (tree (sorted-map :b 2 :c 3 :d 4 :e 5)))) 108 | [[:b :e]]))) 109 | 110 | (deftest randomized-resolve-test 111 | "Generate a couple maps, then bring them into sync by copying the different 112 | parts." 113 | (dotimes [i 1000] 114 | ; Build a random map 115 | (let [seed (random-map gen/byte gen/string (gen/geometric 0.05)) 116 | ; Perturb it into two variant copies 117 | m1 (merge seed (random-map gen/byte gen/string (gen/geometric 0.1))) 118 | m2 (merge seed (random-map gen/byte gen/string (gen/geometric 0.1))) 119 | ; Compute trees 120 | t1 (tree m1) 121 | t2 (tree m2) 122 | ; And delta 123 | identical (identical-ranges t1 t2) 124 | ; And diffs 125 | d1 (diff m1 t1 t2) 126 | d2 (diff m2 t2 t1)] 127 | ; (prn :map1 m1) 128 | ; (prn :map2 m2) 129 | ; (prn :diff1 (doall d1)) 130 | ; (prn :diff2 (doall d2)) 131 | 132 | ; Check that the updates force the maps to converge: 133 | (is (= m2 (select-keys (into m1 d2) (keys m2)))) 134 | (is (= m1 (select-keys (into m2 d1) (keys m1))))))) 135 | 136 | (deftest roundtrip-test 137 | (dotimes [i 10] 138 | (let [m (random-map gen/string gen/long (gen/geometric 0.05)) 139 | t (tree m)] 140 | (is (= t (-> t node->map map->node)))))) 141 | -------------------------------------------------------------------------------- /test/merkle/range_test.clj: -------------------------------------------------------------------------------- 1 | (ns merkle.range-test 2 | (:use clojure.test 3 | [merkle.range :only [disjoint? subset? superset?]])) 4 | 5 | (deftest disjoint-test 6 | (are [a b] (disjoint? a b) 7 | [0 0] [1 1] 8 | [1 1] [0 0] 9 | [1 2] [3 4] 10 | [3 4] [1 2]) 11 | 12 | (are [a b] (not (disjoint? a b)) 13 | [0 0] [0 0] 14 | [0 0] [0 1] 15 | [0 1] [0 0] 16 | [1 2] [1 3] 17 | [1 3] [1 2] 18 | [1 1] [0 2] 19 | [0 2] [1 1])) 20 | 21 | (deftest subset-test 22 | (are [a b] (subset? a b) 23 | [1 1] [1 1] 24 | [1 1] [1 2] 25 | [1 1] [0 1] 26 | [1 1] [0 2] 27 | [1 2] [1 2] 28 | [1 2] [0 2] 29 | [1 2] [1 3] 30 | [1 2] [0 3]) 31 | 32 | (are [a b] (not (subset? a b)) 33 | [1 2] [5 6] 34 | [1 2] [2 3] 35 | [1 4] [2 6])) 36 | --------------------------------------------------------------------------------