├── .gitignore
├── README.md
├── doc
    └── intro.md
├── project.clj
├── src
    └── merkle
    │   ├── kv
    │       ├── fixed.clj
    │       └── linear.clj
    │   └── range.clj
└── test
    └── merkle
        ├── kv
            ├── fixed_test.clj
            └── linear_test.clj
        └── range_test.clj


/.gitignore:
--------------------------------------------------------------------------------
 1 | ~*
 2 | *.swp
 3 | /target
 4 | /lib
 5 | /classes
 6 | /checkouts
 7 | pom.xml
 8 | pom.xml.asc
 9 | *.jar
10 | *.class
11 | .lein-deps-sum
12 | .lein-failures
13 | .lein-plugins
14 | .lein-repl-history
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Merkle
 2 | 
 3 | A Clojure library for computing and comparing hash trees over sorted kv
 4 | collections. Allows you to efficiently find the differing pairs between two
 5 | such collections without exchanging the collections themselves. Useful in the
 6 | synchronization of distributed systems.
 7 | 
 8 | ## Installation
 9 | 
10 | Via clojars: https://clojars.org/merkle
11 | 
12 | # kv.linear
13 | 
14 | `merkle.kv.linear` provides merkle trees over sorted collections of key-value
15 | pairs, which could be sorted maps, lists, results from a database; anything
16 | which exposes a sorted sequence of objects. You can pass arbitrary key and
17 | value extractor fns to `(tree)`; the defaults are the clojure builtin `key` and
18 | `val`. `kv.linear` is oriented towards a particular case of synchronization for
19 | k-ordered keys in distributed databases; see the namespace comments for gory
20 | details.
21 | 
22 | Note: kv.linear is not as efficient as it could be at identifying identical
23 | regions.
24 | 
25 | Note: kv.linear has no way to limit the depth of the trees it produces right
26 | now.
27 | 
28 | Note: kv.linear rarely identifies regions as identical which are not actually
29 | so; might be an issue with hash collisions over small-cardinality values like
30 | bytes. If values are bytes, deltas between hashes on the order of 10-1000
31 | entries may miss differences around 5% of the time. If values are ~10-20
32 | character strings, diffs are incomplete less than 1 in 10000 tries.
33 | 
34 | ## Usage
35 | 
36 | ```clj
37 | (use 'merkle.kv.linear)
38 | 
39 | ; Set up two maps with some differences
40 | (def map1 (sorted-map :a 1 :b 2 :c 3 :d 4))
41 | (def map2 (sorted-map :a 1 :b 2 :c 0))
42 | 
43 | ; Compute a merkle tree of each
44 | (def t1 (tree map1))
45 | (def t2 (tree map2))
46 | 
47 | ; Find pairs of map1 which, if applied to map2, would make it a superset of
48 | ; map1:
49 | (def d1 (diff map1 t1 t2))
50 | ; => ([:c 3] [:d 4])
51 | 
52 | ; And the inverse:
53 | user=> (def d2 (diff map2 t2 t1))
54 | ; => ([:c 0])
55 | 
56 | ; We can merge map2's differences back into map1:
57 | (into map1 d2)
58 | ; => {:a 1, :b 2, :c 0, :d 4}
59 | 
60 | ; And merge map1's differences into map2:
61 | (into map2 d1)
62 | ; => {:a 1, :b 2, :c 3, :d 4}
63 | 
64 | ; Provided a commutative merge function, exchanging diffs is monotonically
65 | ; convergent:
66 | (merge-with max map1 d2)
67 | ; => {:a 1, :b 2, :c 3, :d 4}
68 | (merge-with max map2 d1)
69 | ; => {:a 1, :b 2, :c 3, :d 4}
70 | ```
71 | 
72 | ## License
73 | 
74 | Copyright © 2013 Kyle Kingsbury
75 | 
76 | Distributed under the Eclipse Public License, the same as Clojure.
77 | 


--------------------------------------------------------------------------------
/doc/intro.md:
--------------------------------------------------------------------------------
1 | # Introduction to merkle
2 | 
3 | TODO: write [great documentation](http://jacobian.org/writing/great-documentation/what-to-write/)
4 | 


--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
 1 | (defproject merkle "0.1.1-SNAPSHOT"
 2 |   :description "Merkle trees for constructing efficient diffs over collections."
 3 |   :url "http://github.com/aphyr/merkle"
 4 |   :license {:name "Eclipse Public License"
 5 |             :url "http://www.eclipse.org/legal/epl-v10.html"}
 6 |   :dependencies [[org.clojure/clojure "1.5.1"]
 7 |                  [primitive-math "0.1.3"]]
 8 |   :global-vars {*warn-on-reflection* true}
 9 |   :profiles {:dev {:dependencies [[org.clojure/data.generators "0.1.0"]
10 |                                   [reiddraper/simple-check "0.4.1"]]}}
11 |   :jvm-opts ^:replace ["-server"])
12 | 


--------------------------------------------------------------------------------
/src/merkle/kv/fixed.clj:
--------------------------------------------------------------------------------
  1 | (ns merkle.kv.fixed
  2 |   "This merkle tree implementation operates over a fixed-size sequence of
  3 |   32-bit integers, presumably the hashes of some other data structure. All
  4 |   participants must agree on the sequence size in advance. Trees are of fixed
  5 |   size, and identify the indices (starting with 0) of differing or identical
  6 |   segments."
  7 |   (:require
  8 |     [merkle.range :as range]
  9 |     [primitive-math :as p])
 10 |   (:import
 11 |     [java.util.zip
 12 |      CRC32]
 13 |     [java.util
 14 |      ArrayList]))
 15 | 
 16 | ;;;
 17 | 
 18 | (defrecord Node
 19 |   [^long hash
 20 |    ^long min-segment
 21 |    ^long max-segment
 22 |    left
 23 |    right])
 24 | 
 25 | (defn node->map
 26 |   "Converts a Node to a standard Clojure map; useful for serialization."
 27 |   [^Node node]
 28 |   (when node
 29 |     {:hash    (.hash node)
 30 |      :min-segment (.min-segment node)
 31 |      :max-segment (.max-segment node)
 32 |      :left    (node->map (.left node))
 33 |      :right   (node->map (.right node))}))
 34 | 
 35 | (defn map->node
 36 |   "Converts a map to a Node; useful for serialization."
 37 |   [m]
 38 |   (when m
 39 |     (Node.
 40 |       (:hash m)
 41 |       (:min-segment m)
 42 |       (:max-segment m)
 43 |       (map->node (:left m))
 44 |       (map->node (:right m)))))
 45 | 
 46 | ;;;
 47 | 
 48 | (defn log2 ^long [^long n]
 49 |   (long (/ (Math/log n) (Math/log 2))))
 50 | 
 51 | (defn crc-update-int!
 52 |   "Updates a CRC with an integer. Returns the (mutated) CRC."
 53 |   [^CRC32 crc ^int i]
 54 |   (let [
 55 |   (.update crc (unchecked-int (p/>>> l 24)))
 56 |   (.update crc (unchecked-int (p/>>> l 16)))
 57 |   (.update crc (unchecked-int (p/>>> l 8)))
 58 |   (.update crc (unchecked-int l))
 59 |   crc)
 60 | 
 61 | (defn crc-update-crc!
 62 |   "Updates a CRC with the value of a second CRC. Returns the (mutated) first
 63 |   CRC."
 64 |   [^CRC32 crc ^CRC32 crc2]
 65 |   ; maybe avoid an object allocation??? vooooodooooo....
 66 |   (let [l (unchecked-long (.getValue crc2))]
 67 |     ; CRCs contain a 32-bit unsigned integer packed into a long, so we want to
 68 |     ; extract bits 31-24, 23-16, 15-8, and 7-0. Since CRC update(int) *only*
 69 |     ; uses bits 0-7 of that int, we'll repeatedly right-shift to create ints
 70 |     ; containing values from 0-255.
 71 |     ;
 72 |     ; Fuck Java.
 73 |     (.update crc (unchecked-int (p/>>> l 24)))
 74 |     (.update crc (unchecked-int (p/>>> l 16)))
 75 |     (.update crc (unchecked-int (p/>>> l 8)))
 76 |     (.update crc (unchecked-int l))
 77 |     crc))
 78 | 
 79 | (defn hash-levels
 80 |   "Returns a seq of seqs, with `output-level` elements. Each represents a
 81 |   level of the hash-tree, from bottom to top.
 82 | 
 83 |   The `segment-seq` represents the input hashes of the underlying segments.
 84 |   Elements in the hash-seq may be `nil`, denoting no elements within that
 85 |   segment."
 86 |   [^long output-levels ^long num-segments segment-seq]
 87 |   (assert (pos? output-levels))
 88 |   (let [levels (if (zero? num-segments)
 89 |                  0
 90 |                  (long (p/inc (log2 num-segments))))
 91 |         emit-all? (= output-levels levels)
 92 |         ^objects crcs (object-array levels)
 93 |         get-crc (fn [^long idx]
 94 |                   (if-let [crc (aget crcs idx)]
 95 |                     crc
 96 |                     (let [crc (CRC32.)]
 97 |                       (aset crcs idx crc)
 98 |                       crc)))
 99 |         lists (object-array
100 |                 (repeatedly output-levels
101 |                   #(ArrayList.)))]
102 |     (loop [idx 0, s segment-seq]
103 |       (when-not (empty? s)
104 | 
105 |         (let [x (first s)]
106 |          
107 |           ;; update the level-0 hash
108 |           (when x
109 |             (let [^CRC32 c (get-crc 0)]
110 |               (.update c (unchecked-int x))))
111 | 
112 |           (when emit-all?
113 |             (.add ^ArrayList (aget lists 0) x))
114 |           
115 |           ;; ascend the levels as appropriate
116 |           (loop [idx idx, level 0]
117 |             (when (== 1 (p/bit-and 1 idx))
118 |               (let [^CRC32 crc (aget crcs level)]
119 |                 
120 |                 ;; if there's a crc, propagate it upwards
121 |                 (when crc
122 |                   (.update ^CRC32 (get-crc (p/inc level)) (.getValue crc))
123 |                   (aset crcs level nil))
124 | 
125 |                 ;; if we're above the threshold for the output tree, write to it
126 |                 (let [output-level (p/+ output-levels 1 (p/- level levels))]
127 |                   (when (<= 0 output-level)
128 |                     (.add ^ArrayList (aget lists output-level) (when crc (.getValue crc))))))
129 |               (recur (p/>> idx 1) (p/inc level)))))
130 | 
131 |         (recur (p/inc idx) (rest s))))
132 | 
133 |     (map seq lists)))
134 | 
135 | (defn hash-levels->tree
136 |   "Takes tiered sequences from `hash-levels`, and returns the root `Node` of a
137 |   tree."
138 |   [hash-levels num-segments]
139 |   (let [k (long (/ num-segments (Math/pow 2 (dec (count hash-levels)))))]
140 |     (if (zero? num-segments)
141 |       (Node. 0 0 0 nil nil)
142 |       (first
143 |         (reduce
144 |           (fn [nodes hashes]
145 |             (map
146 |               (fn [hash [^Node l ^Node r]]
147 |                 (Node. hash (.min-segment l) (.max-segment r) l r))
148 |               hashes
149 |               (partition 2 nodes)))
150 |           (map-indexed
151 |             (fn [idx hash]
152 |               (Node. hash (* idx k) (* (inc idx) k) nil nil))
153 |             (first hash-levels))
154 |           (rest hash-levels))))))
155 | 
156 | (defn tree
157 |   "Returns the root `Node` of a hash-tree with `depth` levels.  The input
158 |   `segment-seq` is a list of hashes for a discrete number of segments, which
159 |   must have a cardinality which is a power of two."
160 |   ([depth segment-seq]
161 |    (tree depth (count segment-seq) segment-seq))
162 |   ([depth num-segments segment-seq]
163 |    (hash-levels->tree 
164 |      (hash-levels depth num-segments segment-seq)
165 |      num-segments)))
166 | 
167 | ;;;
168 | 
169 | (defn- merge-ranges [a b]
170 |   "Merges contiguous ranges.  Returns a list of one or two ranges, depending on
171 |   whether `a` and `b` are contiguous." 
172 |   (if (and a b)
173 |     (let [prefix (butlast a)
174 |           suffix (rest b)
175 |           a' (last a)
176 |           b' (first b)]
177 |       (concat
178 |         prefix
179 |         (if (= (last a') (first b'))
180 |           [[(first a') (last b')]]
181 |           [a' b'])
182 |         suffix))
183 |     (or a b)))
184 | 
185 | (defn identical-ranges
186 |   "Returns a list of [min,max] tuples describing the segment ranges for which
187 |   the two nodes are identical."
188 |   [^Node n1 ^Node n2]
189 |   (when (and n1 n2)
190 |     (if (== (.hash n1) (.hash n2))
191 |       [[(.min-segment n1) (.max-segment n1)]]
192 |       (merge-ranges
193 |         (identical-ranges (.left n1) (.left n2))
194 |         (identical-ranges (.right n1) (.right n2))))))
195 | 
196 | (defn diff-ranges
197 |   "Returns a list of [min,max] tuples describing the segment ranges for which
198 |   the two nodes are different."
199 |   [^Node n1 ^Node n2]
200 |   (assert (instance? Node n1))
201 |   (assert (instance? Node n2))
202 |   (let [min (.min-segment n1)
203 |         max (.max-segment n1)
204 |         ranges (identical-ranges n1 n2)]
205 |     (if (empty? ranges)
206 |       ; No identical ranges.
207 |       [[min max]]
208 | 
209 |       ; *some* region is identical.
210 |       (concat
211 |         (when-not (= min (ffirst ranges))
212 |           [[0 (ffirst ranges)]])
213 |         (->> ranges
214 |              (partition 2 1)
215 |              (map
216 |                (fn [[[_ l] [u _]]]
217 |                  [l u])))
218 |         (when-not (= max (last (last ranges)))
219 |           [[(last (last ranges)) max]])))))
220 | 
221 | (defn diffs
222 |   "Returns a sequence of segment indices for which two nodes are different."
223 |   [n1 n2]
224 |   (mapcat (partial apply range)
225 |           (diff-ranges n1 n2)))
226 | 


--------------------------------------------------------------------------------
/src/merkle/kv/linear.clj:
--------------------------------------------------------------------------------
  1 | (ns merkle.kv.linear
  2 |   "Merkle trees are a data structure used to efficiently localize differences
  3 |   between two copies of an ordered collection. Each element in the collection
  4 |   is hashed into a leaf node. Every n leaf nodes have their hashes concatenated
  5 |   and hashed to produce a supervening node. Those nodes are then combined and
  6 |   hashed again, until a single root node has hashed the entire collection.
  7 |   Given two of these trees, one can find differences between two copies of the
  8 |   collection by finding nodes with differing hashes.
  9 | 
 10 |   This particular implementation is aimed at a specific sub-problem: comparing
 11 |   two key-value collections where:
 12 | 
 13 |   1. The key set could vary between copies.
 14 |   2. The collections could be much larger than RAM.
 15 |   3. Keys are in sorted order.
 16 |   4. The keyspace and probability distribution of keys is unknown.
 17 |   5. New keys are usually added at the end.
 18 | 
 19 |   Because the key set varies, we need to encode some information about the keys
 20 |   in the tree nodes. Each of our nodes keeps the minimum and maximum key
 21 |   covered by its hash. We can then consider nodes equal when they have the
 22 |   same hash and the same start and end key, which makes it possible to compare
 23 |   collections with different keys.
 24 | 
 25 |   Since the collection is much larger than RAM (for instance, a table on disk),
 26 |   but the distribution of the keyspace is unknown, our algorithm forgoes having
 27 |   a well-balanced tree in favor of a single linear scan over the collection,
 28 |   accruing at most log(n) nodes in RAM.
 29 | 
 30 |   Because the key are in sorted order, we can efficiently construct this tree
 31 |   in a single linear pass, at the cost of having a much smaller right-hand
 32 |   subtree. The best-case scenario is a fully balanced tree, and the worst-case
 33 |   scenario is a root node with a fully balanced subtree on the left, and a
 34 |   single leaf node on the right. The worst-case cost for this structure, as
 35 |   compared to a balanced btree, is one additional layer of depth.
 36 | 
 37 |   Since new keys are added mostly to the end, two collections which are
 38 |   periodically synchronized will tend to agree on their
 39 |   first/smallest/left-hand keys and nodes. If two collections agree on the
 40 |   first N keys, their hash trees will have an identical node->key distribution
 41 |   over those N keys. In a periodically synchronized collection, this means
 42 |   we'll only differ on a few keys towards the end, and can efficiently ignore
 43 |   the bulk of the dataset. In the worst-case scenario, a single key is added to
 44 |   the beginning of the collection, and none of the hashes are aligned, forcing
 45 |   a full resync."
 46 |   (:require [merkle.range :as range]))
 47 | 
 48 | (defrecord Node [^long hash min-key max-key left right])
 49 | 
 50 | (defn node->map
 51 |   "Converts a Node to a standard Clojure map; useful for serialization."
 52 |   [^Node node]
 53 |   (when node
 54 |     {:hash (.hash node)
 55 |      :min-key (.min-key node)
 56 |      :max-key (.max-key node)
 57 |      :left    (node->map (.left node))
 58 |      :right   (node->map (.right node))}))
 59 | 
 60 | (defn map->node
 61 |   "Converts a map to a Node; useful for serialization."
 62 |   [m]
 63 |   (when m
 64 |     (Node. (:hash m)
 65 |            (:min-key m)
 66 |            (:max-key m)
 67 |            (map->node (:left m))
 68 |            (map->node (:right m))))) 
 69 | (defn key-range
 70 |   "The inclusive range of keys a Node covers."
 71 |   [^Node node]
 72 |   (when node
 73 |     (list (.min-key node)
 74 |           (.max-key node))))
 75 | 
 76 | (defn aligned?
 77 |   "Are two nodes comparable--i.e. do they have the same min-key and max-key?"
 78 |   [^Node n1 ^Node n2]
 79 |   (if (nil? n1)
 80 |     (nil? n2)
 81 |     (and n2 
 82 |          (= (.min-key n1) (.min-key n2))
 83 |          (= (.max-key n1) (.max-key n2)))))
 84 | 
 85 | (defn same?
 86 |   "Are two nodes equivalent; i.e. do they have the same hash, min-key, and
 87 |   max-key?"
 88 |   [^Node n1 ^Node n2]
 89 |   (if (nil? n1)
 90 |     (nil? n2)
 91 |     (and n2
 92 |          (= (.hash n1) (.hash n2))
 93 |          (= (.min-key n1) (.min-key n2))
 94 |          (= (.max-key n1) (.max-key n2)))))
 95 | 
 96 | (defn hash-nodes
 97 |   "Combines two nodes together."
 98 |   [^Node left ^Node right]
 99 |   (cond
100 |     (nil? left) right
101 |     (nil? right) left
102 |     :else (Node. (hash (list (.hash left)
103 |                              (.hash right)))
104 |                  (.min-key left)
105 |                  (.max-key right)
106 |                  left
107 |                  right)))
108 | 
109 | (defn assoc-or-conj!
110 |   "Associates, and conj's on OutOfBoundsException."
111 |   [v i value]
112 |   (try
113 |     (assoc! v i value)
114 |     (catch IndexOutOfBoundsException e
115 |       (conj! v value))))
116 | 
117 | (defn percolate
118 |   "Takes a vector of nodes awaiting merging with their neighbors, and a new
119 |   node to merge into position i. Returns a new vector of nodes awaiting merging
120 |   with their neighbors."
121 |   [levels i right]
122 |   (let [left (try (nth levels i)
123 |                   (catch IndexOutOfBoundsException e
124 |                     :out-of-bounds))]
125 |     (let [x (persistent! levels)
126 |           levels (transient x)]
127 |       (condp = left
128 |         ; Fill in an empty space above
129 |         nil            (assoc! levels i right)
130 | 
131 |         ; Add a new slot to the top
132 |         :out-of-bounds (conj! levels right)
133 | 
134 |         ; Merge with the left node and recur upwards.
135 |         (recur (assoc! levels i nil)
136 |                (inc i)
137 |                (hash-nodes left right))))))
138 | 
139 | (defn transient-seq
140 |   "A sequence over a transient collection. Definitely not safe unless you
141 |   realize it immediately."
142 |   [levels]
143 |   (->> levels
144 |        count
145 |        range
146 |        (map (partial nth levels))))
147 | 
148 | (defn compact?
149 |   "Is the given level merged into a single node--i.e. does it look like [nil
150 |   nil nil nil node]. Levels are transient so we have to use nth."
151 |   [levels]
152 |   (->> levels
153 |        transient-seq
154 |        (remove nil?)
155 |        (count)
156 |        (<= 1)))
157 | 
158 | (defn compact
159 |   "Takes a vector of nodes awaiting merging with their neighbors, and merges
160 |   them all into a single node. This handles the (usual) case where the right
161 |   side of the tree has fewer entries than the right, leaving us with dangling
162 |   levels."
163 |   [levels]
164 |   (let [levels (persistent! levels)]
165 |     (if (empty? levels)
166 |       nil
167 |       (->> levels reverse (remove nil?) (reduce hash-nodes)))))
168 | 
169 | (defn tree
170 |   "Computes a merkle tree for a sorted kv collection. Does not retain the head.
171 |   
172 |   Can take two optional functions: keyfn and valfn, which extract the key and
173 |   the value from a given element of the collection; if not given, these default
174 |   to key and val, respectively."
175 |   ([coll]
176 |    (tree coll key val))
177 |   ([coll keyfn valfn]
178 |    ; We keep a temporary vector containing uncombined nodes in the tree as we
179 |    ; move through the collection. The 0th entry of this vector is a node for a
180 |    ; single element. The last entry is the Node at the highest level yet
181 |    ; reached. Nodes percolate upwards through this vector until we have a
182 |    ; single node at the top containing the full tree.
183 |    (->> coll
184 |         (reduce (fn [levels element]
185 |                   ; Construct base-level node
186 |                   (let [k    (keyfn element)
187 |                         h    (hash (valfn element))
188 |                         node (Node. h k k nil nil)]
189 |                     (percolate levels 0 node)))
190 |                 (transient []))
191 |         compact)))
192 | 
193 | (defn pr-node
194 |   [^Node node]
195 |   (if node
196 |     (str (pr-str (.hash node)) " (" (.min-key node) " " (.max-key node) ")")
197 |     "nil"))
198 | 
199 | (defn identical-ranges
200 |   "Returns an ordered lazy sequence of (start-key end-key) pairs for which two
201 |   trees are known to be identical."
202 |   [^Node t1 ^Node t2]
203 | ;  (println "compare" (pr-node t1) (pr-node t2))
204 | 
205 |   (let [r1 (key-range t1)
206 |         r2 (key-range t2)]
207 |     (cond
208 |       ; One node is nil
209 |       (nil? t1) (list)
210 |       (nil? t2) (list)
211 | 
212 |       ; Nodes cover the same key range.
213 |       (= r1 r2)
214 |       (if (= (.hash t1) (.hash t2))
215 |         ; Nodes are identical.
216 |         (list (list (.min-key t1) (.max-key t1)))
217 | 
218 |         ; We *know* these keys differ.
219 |         (lazy-cat (identical-ranges (.left t1) (.left t2))
220 |                   (identical-ranges (.right t1) (.right t2))))
221 | 
222 |       ; t1 and t2 are totally disjoint; nothing here for us.
223 |       (range/disjoint? r1 r2)
224 |       (list)
225 | 
226 |       ; t1 is a proper subset of t2; descend into t2 looking for t1.
227 |       (range/subset? r1 r2)
228 |       (lazy-cat (identical-ranges t1 (.left t2))
229 |                 (identical-ranges t1 (.right t2)))
230 |       
231 |       ; t2 is a proper subset of t1; descend into t1 looking for it.
232 |       (range/subset? r2 r1)
233 |       (lazy-cat (identical-ranges (.left t1) t2)
234 |                 (identical-ranges (.right t1) t2))
235 | 
236 |       ; t1 and t2 intersect, but neither is a subset of the other. Expand to
237 |       ; all branches.
238 |       (range/intersect? r1 r2)
239 |       (lazy-cat (identical-ranges (.left  t1) (.left  t2))
240 |                 (identical-ranges (.left  t1) (.right t2))
241 |                 (identical-ranges (.right t1) (.left  t2))
242 |                 (identical-ranges (.right t1) (.right t2)))
243 |       
244 |       :else (throw (IllegalStateException. "Shouldn't get here.")))))
245 | 
246 | (defn diff-helper
247 |   "Given a sorted sequence of identical ranges, and a sequence of elements from
248 |   a local collection, yields elements which differ by riffling through both
249 |   sequences."
250 |   [ranges coll keyfn]
251 |   ; [range 1]     [range 2]     [range 3]
252 |   ;  x    x    o    x       o o           o
253 |   (when-let [element (first coll)]
254 |     (let [k (keyfn element)
255 |           range (first ranges)]
256 |         (cond
257 |           ; We're out of ranges; return entire remaining collection.
258 |           (nil? range)
259 |           coll
260 | 
261 |           ; Element lies before the range and should be included.
262 |           (< (compare k (first range)) 0)
263 |           (cons element
264 |                 (lazy-seq (diff-helper ranges (next coll) keyfn)))
265 | 
266 |           ; Element lies within the range
267 |           (<= (compare k (second range)) 0)
268 |           (lazy-seq (diff-helper ranges (next coll) keyfn))
269 | 
270 |           ; Element lies after the range.
271 |           :else
272 |           (lazy-seq (diff-helper (next ranges) coll keyfn))))))
273 | 
274 | (defn diff
275 |   "Given a local collection, a local tree, and a remote tree, yields a lazy
276 |   sequence of elements from the local collection which are not known to be
277 |   identical based on the two trees."
278 |   ([coll local-tree remote-tree]
279 |    (diff coll local-tree remote-tree key))
280 |   ([coll local-tree remote-tree keyfn]
281 |    (diff-helper (identical-ranges local-tree remote-tree) coll keyfn)))
282 | 


--------------------------------------------------------------------------------
/src/merkle/range.clj:
--------------------------------------------------------------------------------
 1 | (ns merkle.range
 2 |   "Functions over inclusive ranges, which are just pairs of comparable objects
 3 |   like [1 2] or (:a :b)."
 4 |   (:refer-clojure :exclude [< <= contains?]))
 5 | 
 6 | ; For concision, override comparison here to work with all comparables.
 7 | (defn- <= [a b] (clojure.core/< (compare a b) 1))
 8 | (defn- < [a b] (clojure.core/< (compare a b) 0))
 9 | 
10 | (defn subset?
11 |   "Is range1 a subset of range2?"
12 |   [range1 range2]
13 |   (and (<= (first range2)
14 |            (first range1))
15 |        (<= (second range1)
16 |            (second range2))))
17 | 
18 | (defn superset?
19 |   "Is range1 a superset of set2?"
20 |   [r1 r2]
21 |   (and (<= (first r1)
22 |            (first r2))
23 |        (<= (second r2)
24 |            (second r1))))
25 | 
26 | (defn disjoint?
27 |   "Are r1 and r2 totally disjoint ranges?"
28 |   [r1 r2]
29 |   (or
30 |     ; [r1 r1]  [r2 r2]
31 |     (< (second r1)
32 |        (first r2))
33 |     ; [r2 r2]  [r1 r1]
34 |     (< (second r2)
35 |        (first r1))))
36 | 
37 | (defn intersect?
38 |   "Do r1 and r2 intersect at all?"
39 |   [r1 r2]
40 |   (not (disjoint? r1 r2)))
41 | 
42 | (defn contains?
43 |   "Does the range contain the given value?"
44 |   [range element]
45 |   (and (<= (first range) element)
46 |        (<= element (second range))))
47 | 


--------------------------------------------------------------------------------
/test/merkle/kv/fixed_test.clj:
--------------------------------------------------------------------------------
  1 | (ns merkle.kv.fixed-test
  2 |   (:require [clojure.test :refer :all]
  3 |             [merkle.kv.fixed :refer :all]
  4 |             [clojure.pprint :refer :all]
  5 |             [simple-check.clojure-test :refer [defspec]]
  6 |             [simple-check.core :as sc]
  7 |             [simple-check.generators :as gen]
  8 |             [simple-check.properties :as prop]))
  9 | 
 10 | (defn power-of-two?
 11 |   "Is x a positive integer power of 2?"
 12 |   [x]
 13 |   (if-not (pos? x)
 14 |     false
 15 |     (loop [x x]
 16 |       (if (= 1 x)
 17 |         true
 18 |         (let [x (/ x 2)]
 19 |           (if-not (integer? x)
 20 |             false
 21 |             (recur x)))))))
 22 | 
 23 | (extend-protocol gen/Shrink
 24 |   nil
 25 |   (shrink [_] nil))
 26 | 
 27 | (deftest power-of-two-test
 28 |   (is (power-of-two? 1))
 29 |   (is (power-of-two? 2))
 30 |   (is (power-of-two? 4))
 31 |   (is (not (power-of-two? 0)))
 32 |   (is (not (power-of-two? 3))))
 33 | 
 34 | (defn repair-diffs
 35 |   "Takes a merge function, a set of diffs, and two segments, v1 and v2. Returns
 36 |   v1 with updates from v2."
 37 |   [merge-fn diffs v1 v2]
 38 |   (map-indexed (fn [i segment]
 39 |                  (if (diffs i)
 40 |                    (merge-fn segment (nth v2 i))
 41 |                    segment))
 42 |                v1))
 43 | 
 44 | (defn repair
 45 |   "Takes two segments and repairs differences using their merkle trees. Merges
 46 |   conflicting elements with (merge-fn). Returns a vector like [(repaired v1)
 47 |   (repaired v2)]."
 48 |   [merge-fn depth [v1 v2]]
 49 |   (let [t1 (tree depth v1)
 50 |         t2 (tree depth v2)
 51 | ;        _ (prn :t1 t1)
 52 | ;        _ (prn :t2 t2)
 53 |         diffs (set (diffs t1 t2))]
 54 | ;    (prn :diff-fraction (try (float (/ (count diffs) (count v1)))
 55 | ;                             (catch ArithmeticException e :nan)))
 56 |     [(repair-diffs merge-fn diffs v1 v2)
 57 |      (repair-diffs merge-fn diffs v2 v1)]))
 58 | 
 59 | (defn fixed
 60 |   "Calls (f arg), then feeds the return value into f again, until a fixed point
 61 |   is found. (f (f (f arg))) and so on."
 62 |   [f arg]
 63 |   (let [value (f arg)]
 64 |     (if (= value arg)
 65 |       value
 66 |       (recur f value))))
 67 | 
 68 | (prefer-method clojure.core/print-method
 69 |                clojure.lang.ISeq
 70 |                clojure.lang.IPersistentVector)
 71 | (prefer-method clojure.pprint/simple-dispatch
 72 |                clojure.lang.ISeq
 73 |                clojure.lang.IPersistentVector)
 74 | 
 75 | (def depth-and-segments
 76 |   (gen/bind (gen/choose -1 5)
 77 |             (fn [power]
 78 |               (if (= -1 power)
 79 |                 ; Special case: empty vectors
 80 |                 (gen/return [1 []])
 81 | 
 82 |                 (gen/bind (gen/choose 0 3)
 83 |                           (fn [card]
 84 |                             (let [element (gen/choose (- card) card)
 85 |                                   ; Construct a vector 2x as big as we need;
 86 |                                   ; we'll split it in half to be the two
 87 |                                   ; divergent copies.
 88 |                                   size    (* 2 (int (Math/pow 2 power)))]
 89 |                               (gen/tuple (gen/choose 1 (inc power))
 90 |                                          (gen/vector element size)))))))))
 91 | 
 92 | (defn segments
 93 |   "Takes a vector and returns two segment vectors from it."
 94 |   [v]
 95 |   (let [n (/ (count v) 2)]
 96 |     (assert (integer? n)) 
 97 |     (assert (or (zero? n)
 98 |                 (power-of-two? n)))
 99 |     [(subvec v 0 n)
100 |      (subvec v n)]))
101 | 
102 | (defspec randomized-resolve-test
103 |   ; Generates a couple sequences, then brings them into sync by copying the
104 |   ; differing parts.
105 |   100000
106 |   (prop/for-all [ds depth-and-segments]
107 |                 (let [[d v]    ds           ; Split apart depth and big vector
108 |                       versions (segments v) ; Split vector into 2
109 |                       d        (max d 1)]   ; simple-check will shrink to 0. :(
110 |                   ;(prn :d d)
111 |                   ;(prn :v1 (first versions))
112 |                   ;(prn :v2 (second versions))
113 |                   (assert (apply = (map count versions)))
114 |                   (let [[r1 r2] (fixed (partial repair max d) versions)]
115 | ;                    (prn "repaired" r1 r2)
116 | ;                    (prn "succcess?" (= r1 r2))
117 |                     (or
118 |                       ; Either the repaired pieces are actually equal....
119 |                       (= r1 r2)
120 |                       ; Or there was a hash collision and their trees are now
121 |                       ; equal
122 |                       (= (:hash (tree d r1))
123 |                          (:hash (tree d r2))))))))
124 | 


--------------------------------------------------------------------------------
/test/merkle/kv/linear_test.clj:
--------------------------------------------------------------------------------
  1 | (ns merkle.kv.linear-test
  2 |   (:require [clojure.test :refer :all]
  3 |             [merkle.kv.linear :refer :all]
  4 |             clojure.pprint
  5 |             [clojure.data.generators :as gen]))
  6 | 
  7 | (deftest empty-test
  8 |   (is (= nil (tree nil)))
  9 |   (is (= nil (tree {}))))
 10 | 
 11 | (deftest single-test
 12 |   (is (= (tree {:foo 1})
 13 |          (->Node 1 :foo :foo nil nil)))
 14 |   (is (= (tree {:foo "hi there"})
 15 |          (->Node 588714629 :foo :foo nil nil))))
 16 | 
 17 | (deftest two-test
 18 |   (is (= (tree (sorted-map :foo 1 :bar 2))
 19 |          (->Node 1024 :bar :foo
 20 |                  (->Node 2 :bar :bar nil nil)
 21 |                  (->Node 1 :foo :foo nil nil)))))
 22 | 
 23 | (deftest three-test
 24 |   (is (= (tree (sorted-map :foo 1 :bar 2 :baz 3))
 25 |          (->Node 32768 :bar :foo
 26 |                  (->Node 1026 :bar :baz
 27 |                          (->Node 2 :bar :bar nil nil)
 28 |                          (->Node 3 :baz :baz nil nil))
 29 |                  (->Node 1 :foo :foo nil nil)))))
 30 | 
 31 | (deftest five-test
 32 |   (is (= (tree (sorted-map :foo 1 :bar 2 :baz 3 :xyzzy 4 :frob 5))
 33 |          (->Node 1047649 :bar :xyzzy
 34 |                  (->Node 33764 :bar :frob
 35 |                          (->Node 1026 :bar :baz
 36 |                                  (->Node 2 :bar :bar nil nil)
 37 |                                  (->Node 3 :baz :baz nil nil))
 38 |                          (->Node 997 :foo :frob
 39 |                                  (->Node 1 :foo :foo nil nil)
 40 |                                  (->Node 5 :frob :frob nil nil)))
 41 |                  (->Node 4 :xyzzy :xyzzy nil nil)))))
 42 | 
 43 | (defn random-map
 44 |   ([fk fv] (random-map fk fv gen/default-sizer))
 45 |   ([fk fv sizer]
 46 |    (into (sorted-map)
 47 |          (zipmap (gen/reps sizer fk)
 48 |                  (gen/reps sizer fv)))))
 49 | 
 50 | (deftest identity-test
 51 |   (dotimes [i 10]
 52 |     (let [m (random-map gen/string gen/long (gen/geometric 0.001))]
 53 |       (let [t1 (tree m)
 54 |             t2 (tree m)]
 55 |         (= t1 t2)))))
 56 | 
 57 | (deftest different-test
 58 |   (->> (repeatedly #(random-map gen/int gen/int (gen/geometric 0.01)))
 59 |        (take 10)
 60 |        (map tree)
 61 |        (partition 2 1)
 62 |        (map (fn [[t1 t2]] (is (not= t1 t2))))
 63 |        dorun))
 64 | 
 65 | (deftest identical-range-test
 66 |   (testing "One empty"
 67 |     (is (= (identical-ranges (tree (sorted-map :a 1))
 68 |                              nil)
 69 |            [])))
 70 | 
 71 |   (testing "Both empty"
 72 |     (is (= (identical-ranges nil nil)
 73 |            [])))
 74 | 
 75 |   (testing "identical"
 76 |     (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4))
 77 |                              (tree (sorted-map :a 1 :b 2 :c 3 :d 4)))
 78 |            [[:a :d]])))
 79 | 
 80 |   (testing "one difference"
 81 |     (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4))
 82 |                              (tree (sorted-map :a 1 :b 0 :c 3 :d 4)))
 83 |            [[:a :a] [:c :d]])))
 84 | 
 85 |   (testing "one branch different"
 86 |     (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4))
 87 |                              (tree (sorted-map :a 0 :b 0 :c 3 :d 4)))
 88 |            [[:c :d]])))
 89 | 
 90 |   (testing "All values different."
 91 |     (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4))
 92 |                              (tree (sorted-map :a 2 :b 1 :c 4 :d 3)))
 93 |            [])))
 94 | 
 95 |   (testing "One different key at the end."
 96 |     (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4))
 97 |                              (tree (sorted-map :a 1 :b 2 :c 3 :e 4)))
 98 |            [[:a :b] [:c :c]])))
 99 | 
100 |   (testing "One new key at the end."
101 |     (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4))
102 |                              (tree (sorted-map :a 1 :b 2 :c 3 :d 4 :e 5)))
103 |            [[:a :d]])))
104 |   
105 |   (testing "One new key at the beginning."
106 |     (is (= (identical-ranges (tree (sorted-map :a 1 :b 2 :c 3 :d 4 :e 5))
107 |                              (tree (sorted-map      :b 2 :c 3 :d 4 :e 5))))
108 |         [[:b :e]])))
109 | 
110 | (deftest randomized-resolve-test
111 |   "Generate a couple maps, then bring them into sync by copying the different
112 |   parts."
113 |   (dotimes [i 1000]
114 |     ; Build a random map
115 |     (let [seed (random-map gen/byte gen/string (gen/geometric 0.05))
116 |           ; Perturb it into two variant copies
117 |           m1 (merge seed (random-map gen/byte gen/string (gen/geometric 0.1)))
118 |           m2 (merge seed (random-map gen/byte gen/string (gen/geometric 0.1)))
119 |           ; Compute trees
120 |           t1 (tree m1)
121 |           t2 (tree m2)
122 |           ; And delta
123 |           identical (identical-ranges t1 t2)
124 |           ; And diffs
125 |           d1 (diff m1 t1 t2)
126 |           d2 (diff m2 t2 t1)]
127 | ;      (prn :map1 m1)
128 | ;      (prn :map2 m2)
129 | ;      (prn :diff1 (doall d1))
130 | ;      (prn :diff2 (doall d2))
131 | 
132 |       ; Check that the updates force the maps to converge:
133 |       (is (= m2 (select-keys (into m1 d2) (keys m2))))
134 |       (is (= m1 (select-keys (into m2 d1) (keys m1)))))))
135 | 
136 | (deftest roundtrip-test
137 |   (dotimes [i 10]
138 |     (let [m (random-map gen/string gen/long (gen/geometric 0.05))
139 |           t (tree m)]
140 |       (is (= t (-> t node->map map->node))))))
141 | 


--------------------------------------------------------------------------------
/test/merkle/range_test.clj:
--------------------------------------------------------------------------------
 1 | (ns merkle.range-test
 2 |   (:use clojure.test
 3 |         [merkle.range :only [disjoint? subset? superset?]]))
 4 | 
 5 | (deftest disjoint-test
 6 |   (are [a b] (disjoint? a b)
 7 |        [0 0] [1 1]
 8 |        [1 1] [0 0]
 9 |        [1 2] [3 4]
10 |        [3 4] [1 2])
11 | 
12 |   (are [a b] (not (disjoint? a b))
13 |        [0 0] [0 0]
14 |        [0 0] [0 1]
15 |        [0 1] [0 0]
16 |        [1 2] [1 3]
17 |        [1 3] [1 2]
18 |        [1 1] [0 2]
19 |        [0 2] [1 1]))
20 | 
21 | (deftest subset-test
22 |   (are [a b] (subset? a b)
23 |        [1 1] [1 1]
24 |        [1 1] [1 2]
25 |        [1 1] [0 1]
26 |        [1 1] [0 2]
27 |        [1 2] [1 2]
28 |        [1 2] [0 2]
29 |        [1 2] [1 3]
30 |        [1 2] [0 3])
31 | 
32 |   (are [a b] (not (subset? a b))
33 |        [1 2] [5 6]
34 |        [1 2] [2 3]
35 |        [1 4] [2 6]))
36 | 


--------------------------------------------------------------------------------