├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── project.clj ├── src └── diffit │ ├── map.cljc │ └── vec.cljc └── test └── diffit ├── debug.clj ├── map_test.clj └── vec_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | /doc 5 | pom.xml 6 | pom.xml.asc 7 | *.jar 8 | *.class 9 | /.lein-* 10 | /.nrepl-port 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: clojure 2 | lein: lein2 3 | script: lein2 all test 4 | jdk: 5 | - openjdk7 6 | - oraclejdk7 7 | - oraclejdk8 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 
25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. 
Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 
97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. 
In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. 
DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. 
However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 
215 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # diffit 2 | 3 | A Clojure(Script) diff and patch implementation for vectors and 4 | maps. 5 | 6 | [![Build Status](https://travis-ci.org/friemen/diffit.png?branch=master)](https://travis-ci.org/friemen/diffit) 7 | 8 | [![Clojars Project](https://clojars.org/diffit/latest-version.svg)](https://clojars.org/diffit) 9 | 10 | Include the dependency as shown above in your project.clj 11 | 12 | [API docs](https://friemen.github.io/diffit) 13 | 14 | Diff calculates the *edit-distance* and a minimal *edit-script* to 15 | transform the first into the second collection via patch. 16 | The vector based diff implementation follows 17 | [An O(NP) Sequence Comparison Algorithm](https://publications.mpi-cbg.de/Wu_1990_6334.pdf) 18 | by Wu, Manber, Myers and Miller. 19 | 20 | 21 | ## Why 22 | 23 | The obvious application of the vector based implementation is diff and 24 | patch of text files represented by seqs of strings. 25 | 26 | Another could be the desire to synchronize the final result of a data 27 | transformation (composed of side-effect free functions) with state 28 | wrapped by an expensive mutation-based API, for example a bound JavaFX 29 | ObservableList. A fast diff with a configurable patch is one way to 30 | tackle this. 31 | 32 | A 3rd scenario could be the requirement to transmit a minimal amount 33 | of data changes to a remote process for further processing. 34 | 35 | 36 | There is `clojure.data/diff` but it does not do exactly what I had 37 | in mind, as the following REPL output shows: 38 | 39 | ```clojure 40 | (clojure.data/diff [1 2 3 4 5 6 7 8] [1 3 4 5 6 9]) 41 | ;= [[nil 2 3 4 5 6 7 8] [nil 3 4 5 6 9] [1]] 42 | ``` 43 | 44 | The docstring says that the resulting triplet contains 45 | `[things-only-in-a things-only-in-b things-in-both]`. 
However, the 46 | common sequence `[3 4 5 6]` is part of the first two, so the result is 47 | not a minimal edit-script. 48 | 49 | 50 | Next stop was [clj-diff](https://github.com/brentonashworth/clj-diff), 51 | but unfortunately the good work never made it into a non-snapshot 52 | release. My implementation is from scratch, but - of course - contains 53 | insights from existing open source work such as `clj-diff` and others. 54 | 55 | 56 | ## Usage 57 | 58 | There are two namespaces: `diffit.vec` and `diffit.map`. Each contains 59 | a `diff` and a `patch` function. 60 | 61 | `diffit.vec/diff` and `diffit.vec/patch` work for sequential things 62 | which will be treated as vectors. 63 | 64 | `diffit.map/diff` and `diffit.map/patch` work for associative things. 65 | 66 | Let `x`, `y` both be sequential or associative, respectively: 67 | * `(diff x y)` produces a *diff-result* which is a pair of 68 | `[edit-distance edit-script]`. 69 | * `(patch x diff-result)` applies the edit-script in diff-result to 70 | `x` in order to reproduce `y`. 71 | * It holds that `(= y (patch x (diff x y)))`. 72 | 73 | 74 | In the REPL: 75 | 76 | ```clojure 77 | (require '[diffit.vec :as v]) 78 | ;= nil 79 | (v/diff [1 2 3 4] [1 2 7 8 4]) 80 | ;= [3 [[:+ 2 [7 8]] [:- 4 1]]] 81 | ``` 82 | 83 | The diff-result is a pair: the first part is the 84 | [edit-distance](https://en.wikipedia.org/wiki/Edit_distance), the 85 | second part is a sequence of edits (called an *edit-script*) that is 86 | needed to produce the second input collection from the first. 87 | 88 | The edit-script for vectors allows sequential processing in the 89 | `patch` function with insert and remove operations, where insert takes 90 | a sequence, a position and a sequence to insert, and remove takes a 91 | sequence, a position and a number of items to remove. Positions are 92 | zero-based. 
93 | 94 | The edit-script in the output above can be read like this: 95 | 96 | * First, add the sequence [7 8] at position 2 (this will shift the items 3 4 to the right). 97 | * Then, remove 1 item at position 4. 98 | 99 | Once you have the result from `diff` you can apply its edit-script with `patch`: 100 | 101 | ```clojure 102 | (v/patch [1 2 3 4] diff-result) 103 | ;= [1 2 7 8 4] 104 | ``` 105 | 106 | The `patch` function can be used with two or three, respectively, additional 107 | arguments to hand in functions that actually do the insert, remove, 108 | assoc, dissoc or replace operation. This lets you adapt `patch` to an 109 | API based on mutation. 110 | 111 | 112 | For a Java `ArrayList` here's a corresponding snippet (the type hint and the `int` cast make `.remove` resolve to `remove(int index)` instead of `remove(Object o)`): 113 | 114 | ```clojure 115 | (patch (fn [l index item] 116 | (.addAll l index item) 117 | l) 118 | (fn [l index n] 119 | (dotimes [_ n] (.remove ^java.util.List l (int index))) 120 | l) 121 | mutable-list diff-result) 122 | ``` 123 | 124 | For a Java `HashMap` it looks like this: 125 | 126 | ```clojure 127 | (patch (fn [m k v] 128 | (.put m k v) 129 | m) 130 | (fn [m k] 131 | (.remove m k) 132 | m) 133 | (fn [m k v] 134 | (.put m k v) 135 | m) 136 | mutable-map diff-result) 137 | ``` 138 | 139 | 140 | 141 | ## Performance for vector based diff 142 | 143 | `diffit.vec/diff` is a slight bit better than the 144 | [clj-diff](https://github.com/brentonashworth/clj-diff) implementation 145 | which uses the same algorithm and was also built with performance as 146 | a key requirement. Both clearly outperform the 1.3.0 version of an open source 147 | Java library called 148 | [diffutils](https://code.google.com/p/java-diff-utils/). 149 | 150 | (So far, I did no extensive research for other Java alternatives. If 151 | you find a better candidate that handles Java lists, and not only 152 | text, drop me a mail.) 153 | 154 | 155 | ### Setup 156 | 157 | I tested it for 2000 items where the sink `bs` was derived from the source 158 | `as` by randomly adding or removing items. 
Add/remove took place 159 | with 10% probability each, so 80% of the sink is the same as the 160 | source. 161 | 162 | This snippet defines source and sink sequences: 163 | 164 | ```clojure 165 | (do (def as (range 2000)) 166 | (def bs (rand-alter 80 10 10 as))) 167 | ``` 168 | 169 | Here's the piece of code that creates the sink from the source vector: 170 | 171 | ```clojure 172 | (defn rand-alter 173 | [pass-prob remove-prob add-prob xs] 174 | (let [ops (vec (concat (repeat pass-prob :=) 175 | (repeat remove-prob :-) 176 | (repeat add-prob :+)))] 177 | (reduce (fn [xs x] 178 | (case (rand-nth ops) 179 | :+ (conj xs x "-") 180 | :- xs 181 | := (conj xs x))) 182 | [] 183 | xs))) 184 | ``` 185 | 186 | I used [criterium](https://github.com/hugoduncan/criterium) `bench` to 187 | gather times on a JDK 1.8.0_5 with Clojure 1.6.0 and 4 cores of 188 | `Intel(R) Core(TM) i5 CPU M 560 @ 2.67GHz`. Here are the results: 189 | 190 | ### diffit 191 | 192 | ``` 193 | diffit.vec-test> (>bench (diff as bs)) 194 | Evaluation count : 2340 in 60 samples of 39 calls. 195 | Execution time mean : 28.204187 ms 196 | Execution time std-deviation : 1.357085 ms 197 | Execution time lower quantile : 26.592218 ms ( 2.5%) 198 | Execution time upper quantile : 30.952525 ms (97.5%) 199 | Overhead used : 2.049385 ns 200 | 201 | Found 2 outliers in 60 samples (3.3333 %) 202 | low-severe 1 (1.6667 %) 203 | low-mild 1 (1.6667 %) 204 | Variance from outliers : 33.6221 % Variance is moderately inflated by outliers 205 | ``` 206 | 207 | ### clj-diff 208 | 209 | ``` 210 | diffit.vec-test> (>bench (clj-diff.core/diff as bs)) 211 | Evaluation count : 1860 in 60 samples of 31 calls. 
212 | Execution time mean : 33.078036 ms 213 | Execution time std-deviation : 1.514409 ms 214 | Execution time lower quantile : 31.606479 ms ( 2.5%) 215 | Execution time upper quantile : 35.715329 ms (97.5%) 216 | Overhead used : 2.049385 ns 217 | 218 | Found 3 outliers in 60 samples (5.0000 %) 219 | low-severe 2 (3.3333 %) 220 | low-mild 1 (1.6667 %) 221 | Variance from outliers : 31.9529 % Variance is moderately inflated by outliers 222 | ``` 223 | 224 | ### java-diff-utils 225 | 226 | ``` 227 | diffit.vec-test> (>bench (DiffUtils/diff as bs)) 228 | Evaluation count : 120 in 60 samples of 2 calls. 229 | Execution time mean : 1.014824 sec 230 | Execution time std-deviation : 16.119216 ms 231 | Execution time lower quantile : 990.193018 ms ( 2.5%) 232 | Execution time upper quantile : 1.054244 sec (97.5%) 233 | Overhead used : 2.049385 ns 234 | 235 | Found 4 outliers in 60 samples (6.6667 %) 236 | low-severe 4 (6.6667 %) 237 | Variance from outliers : 1.6389 % Variance is slightly inflated by outliers 238 | ``` 239 | 240 | ## Performance for map based diff 241 | 242 | Unsurprisingly, the `diff` for maps is an *order of magnitude faster* due 243 | to the performance characteristics of the underlying datastructures. 244 | 245 | I did no comparison between different libraries (are there any?) or 246 | implementations. 247 | 248 | 249 | ## License 250 | 251 | Copyright © 2014-2025 F.Riemenschneider 252 | 253 | Distributed under the Eclipse Public License version 1.0. 254 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject diffit "1.0.1-SNAPSHOT" 2 | :description 3 | "Clojure(Script) diff and patch implementations for vector and map." 
4 | 5 | :url 6 | "https://github.com/friemen/diffit" 7 | 8 | :license 9 | {:name "Eclipse Public License" 10 | :url "http://www.eclipse.org/legal/epl-v10.html"} 11 | 12 | :dependencies 13 | [[org.clojure/clojure "1.12.1"]] 14 | 15 | :plugins 16 | [[lein-codox "0.10.8"]] 17 | 18 | :codox 19 | {:language :clojure 20 | :source-paths ["src"] 21 | :namespaces [#"^diffit"] 22 | :source-uri "https://github.com/friemen/diffit/blob/master/{filepath}#L{line}"} 23 | 24 | :scm 25 | {:name "git" 26 | :url "https://github.com/friemen/diffit"} 27 | 28 | :repositories 29 | [["clojars" {:url "https://clojars.org/repo" 30 | :creds :gpg}]] 31 | 32 | :aliases 33 | {"deploy" ["do" "clean," "deploy" "clojars"]}) 34 | -------------------------------------------------------------------------------- /src/diffit/map.cljc: -------------------------------------------------------------------------------- 1 | (ns diffit.map 2 | "Diff and patch on maps.") 3 | 4 | 5 | ;; --------------------------------------------------------------------------- 6 | ;; diff 7 | 8 | (defn diff 9 | "Returns a pair [edit-distance edit-script] as the result of comparing 10 | two maps am and bm. 11 | 12 | The edit-distance is the number of additions, removals and 13 | replacements of entries necessary to reproduce bm from am. 14 | 15 | The edit-script is a map with three entries: 16 | :+ sequence of k-v pairs that are only present in bm 17 | :- sequence of keys removed in bm 18 | :r sequence of k-v pairs that have changed in bm. 19 | 20 | The edit-script can be used with patch to create bm from am." 21 | [am bm] 22 | (let [[adds dels reps] 23 | (->> (keys am) 24 | (concat (keys bm)) 25 | (into #{}) 26 | (reduce (fn [[adds dels reps] k] 27 | (let [a (get am k ::none) 28 | b (get bm k ::none) 29 | a? (not= a ::none) 30 | b? (not= b ::none)] 31 | (cond 32 | (and a? b? (not= a b)) [adds dels (conj! reps [k b])] 33 | (and a? b?) [adds dels reps] 34 | a? [adds (conj! dels k) reps] 35 | b? [(conj! 
adds [k b]) dels reps]))) 36 | [(transient []) (transient []) (transient [])]))] 37 | [(+ (count adds) (count dels) (count reps)) 38 | {:+ (persistent! adds) 39 | :- (persistent! dels) 40 | :r (persistent! reps)}])) 41 | 42 | 43 | 44 | ;; --------------------------------------------------------------------------- 45 | ;; patch 46 | 47 | (defn patch 48 | "Takes a map am and the result as produced by diff and returns a map 49 | with all deletions, replacements and additions applied to the input 50 | map. When assoc-f, dissoc-f and replace-f are given, the edits are applied by calling these functions (deletions first, then replacements, then additions) instead of the default map operations." 51 | ([assoc-f dissoc-f replace-f am [d {adds :+ dels :- reps :r}]] 52 | (let [m' (reduce (fn [m k] 53 | (dissoc-f m k)) am dels) 54 | m' (reduce (fn [m [k v]] 55 | (replace-f m k v)) m' reps)] 56 | (reduce (fn [m [k v]] 57 | (assoc-f m k v)) m' adds))) 58 | ([am [d {adds :+ dels :- reps :r}]] 59 | (into (reduce dissoc am dels) (concat reps adds)))) 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/diffit/vec.cljc: -------------------------------------------------------------------------------- 1 | (ns diffit.vec 2 | "O(NP) sequence diff and corresponding patch on vectors.") 3 | 4 | 5 | ;; Concepts 6 | ;; 7 | ;; fp is a map {k -> [d edits]} from diagonal k to a pair where 8 | ;; d is the furthest distance and 9 | ;; edits is a vector of edit operations. 10 | ;; 11 | ;; as, bs are sequences of arbitrary items that support equals (=) 12 | ;; av, bv are vector versions that have better count and nth performance 13 | ;; 14 | 15 | 16 | ;; --------------------------------------------------------------------------- 17 | ;; diff 18 | 19 | (defn- edits 20 | [fp k] 21 | (nth (get fp k [nil []]) 1)) 22 | 23 | 24 | (defn- distance 25 | [fp k] 26 | (nth (get fp k [-1]) 0)) 27 | 28 | 29 | (defn- snake 30 | "Advances x on the diagonal k as long as corresponding items in av 31 | and bv match. Returns a pair of the furthest x reached and the edit operations accumulated on the way." 
32 | [av bv fp k] 33 | (let [n (count av) 34 | m (count bv) 35 | #?(:clj ^long k+1 :cljs k+1) (inc k) 36 | #?(:clj ^long k-1 :cljs k-1) (dec k) 37 | i (inc (distance fp k-1)) 38 | #?(:clj ^long j :cljs j) (distance fp k+1) 39 | #?(:clj ^long x :cljs x) (max i j) 40 | #?(:clj ^long y :cljs y) (- x k) 41 | ;; search for the maximum x on diagonal 42 | fx (loop [#?(:clj ^long x :cljs x) x 43 | #?(:clj ^long y :cljs y) y] 44 | (if (and (< x n) (< y m) (= (nth av x) (nth bv y)) ) 45 | (recur (inc x) (inc y)) 46 | x))] 47 | [fx 48 | ;; add edit operation symbols 49 | (let [es (if (> i j) 50 | (conj (edits fp k-1) :-) 51 | (conj (edits fp k+1) :+))] 52 | (if (> fx x) 53 | (conj es (- fx x)) 54 | es))])) 55 | 56 | 57 | (defn- step 58 | "Returns the next pair of [fp p] of furthest distances." 59 | [av bv delta [fp p]] 60 | (let [p (inc p) 61 | fpt (transient fp) 62 | fpt (loop [k (* -1 p) fpt fpt] 63 | (if (< k delta) 64 | (recur (inc k) (assoc! fpt k (snake av bv fpt k))) 65 | fpt)) 66 | fpt (loop [k (+ delta p) fpt fpt] 67 | (if (< delta k) 68 | (recur (dec k) (assoc! fpt k (snake av bv fpt k))) 69 | fpt)) 70 | fp (persistent! (assoc! fpt delta (snake av bv fpt delta)))] 71 | [fp p])) 72 | 73 | 74 | (defn- diff* 75 | "Returns a pair [edit-distance edits] for the vectors av and bv. Assumes that (count av) >= (count bv)." 76 | [av bv] 77 | (let [delta (- (count av) (count bv)) 78 | [fp p] (->> [{} -1] 79 | (iterate (partial step av bv delta)) 80 | (drop-while (fn [[fp _]] 81 | (not= (distance fp delta) (count av)))) 82 | (first))] 83 | [(+ delta (* 2 p)) (->> (get fp delta) 84 | (second) 85 | (drop 1))])) 86 | 87 | 88 | (defn- swap-insdels 89 | "Swaps edit operation symbols :+ <-> :-" 90 | [[d edits]] 91 | [d (map (fn [op] (case op :+ :- :- :+ op)) edits)]) 92 | 93 | 94 | (defn- editscript 95 | "Produces an edit script from the edits issued by diff*." 
96 | [av bv edits] 97 | ;; the groups are seqs of :+'s or :-'s or one number 98 | (loop [groups (partition-by identity edits) 99 | x 0 100 | y 0 101 | script []] 102 | (if-let [[op & ops] (first groups)] 103 | (let [n (inc (count ops))] 104 | (case op 105 | :- (recur (rest groups) 106 | x y 107 | (conj script [:- x n])) 108 | :+ (recur (rest groups) 109 | (+ x n) (+ y n) 110 | (conj script [:+ y (subvec bv y (+ y n))])) 111 | (recur (rest groups) 112 | (+ x op) (+ y op) ; op is the number of items to skip 113 | script))) 114 | script))) 115 | 116 | 117 | (defn diff 118 | "Returns a pair [edit-distance edit-script] as the result of comparing 119 | sequences as and bs. 120 | 121 | Edit-distance is an integer. 122 | 123 | The edit-script is a sequence of vectors starting with an insert or 124 | delete operation symbol :+ or :-. 125 | 126 | An insert is [:+ position items]. 127 | A delete is [:- position number-of-items]. 128 | 129 | The edit-script is made for sequential processing with operations 130 | like insert-at: [xs pos items -> xs'] and remove-at: [xs pos n -> xs']." 131 | [as bs] 132 | (cond 133 | (and (empty? as) (empty? bs)) 134 | [0 []] 135 | (empty? as) 136 | [(count bs) [[:+ 0 bs]]] 137 | (empty? bs) 138 | [(count as) [[:- 0 (count as)]]] 139 | :else 140 | (let [av (vec as) 141 | bv (vec bs) 142 | [d edits] (if (< (count av) (count bv)) 143 | (swap-insdels (diff* bv av)) 144 | (diff* av bv))] 145 | [d (editscript av bv edits)]))) 146 | 147 | 148 | ;; --------------------------------------------------------------------------- 149 | ;; patch 150 | 151 | 152 | (defn insert-at 153 | "Insert sequence ys at position i into xs." 154 | [xs i ys] 155 | (let [xv (vec xs)] 156 | (concat (subvec xv 0 i) ys (subvec xv i)))) 157 | 158 | 159 | (defn remove-at 160 | "Remove n items at position i from xs." 
161 | ([xs i] 162 | (remove-at xs i 1)) 163 | ([xs i n] 164 | (let [xv (vec xs)] 165 | (concat (subvec xv 0 i) (subvec xv (+ i n)))))) 166 | 167 | 168 | (defn patch 169 | "Applies the edit-script (as contained in the result of diff) to 170 | sequence as, using by default insert-at and remove-at as implemented 171 | in this namespace. Returns a vector." 172 | ([as diff-result] 173 | (patch insert-at remove-at (vec as) diff-result)) 174 | ([insert-f remove-f as [d es]] 175 | (vec (reduce (fn [bs [op & params]] 176 | (case op 177 | :+ (insert-f bs (first params) (second params)) 178 | :- (remove-f bs (first params) (second params)))) 179 | as 180 | es)))) 181 | -------------------------------------------------------------------------------- /test/diffit/debug.clj: -------------------------------------------------------------------------------- 1 | (ns diffit.debug 2 | (:require [diffit.vec :as v] 3 | [diffit.vec-test :refer [rand-alter]])) 4 | 5 | 6 | #_ (do (def as (range 2000)) 7 | (def bs (rand-alter 80 10 10 as))) 8 | 9 | #_ (>bench (v/diff as bs)) 10 | 11 | #_ ( require '[clj-diff.core :as d]) 12 | #_ (>bench (d/diff as bs)) 13 | 14 | #_ (import '[difflib DiffUtils]) 15 | #_ (>bench (DiffUtils/diff as bs)) 16 | 17 | 18 | 19 | ;; --------------------------------------------------------------------------- 20 | ;; stuff to debug and tune performance 21 | 22 | #_ (defn- dump 23 | [fp] 24 | (doseq [[k [d edits]] (sort-by first fp)] 25 | (println (format "%4d" k) (format "%4d" d) " -> " edits)) 26 | (println (apply str (repeat 40 "-")))) 27 | 28 | 29 | #_ (defmacro ^:private with-time 30 | [time-atom & exprs] 31 | `(let [start# (System/nanoTime) 32 | result# ~@exprs 33 | stop# (System/nanoTime)] 34 | (swap! 
~time-atom + (- stop# start#)) 35 | result#)) 36 | 37 | #_ (def t (atom 0)) 38 | 39 | 40 | #_ (defn diffpatch 41 | [as bs] 42 | (println as) 43 | (println bs) 44 | (let [diffres (v/diff as bs) 45 | patched (v/patch as diffres)] 46 | (println (second diffres)) 47 | (println "expected" (vec bs)) 48 | (println "actual " patched) 49 | (assert (= (vec bs) patched)))) 50 | 51 | #_ (do (reset! diffit.vec/t 0) 52 | (v/diff as bs) 53 | (println (float (/ @diffit.vec/t 1e6)))) 54 | 55 | -------------------------------------------------------------------------------- /test/diffit/map_test.clj: -------------------------------------------------------------------------------- 1 | (ns diffit.map-test 2 | (:require [clojure.test :refer :all] 3 | [diffit.map :refer [diff patch]]) 4 | (:import [java.util HashMap])) 5 | 6 | 7 | (deftest diff-tests 8 | (are [am bm d es] (= [d es] (diff am bm)) 9 | {} {} 0 {:+ [] :- [] :r []} 10 | {:a 1} {} 1 {:+ [] :- [:a] :r []} 11 | {:a 1} {:a 2} 1 {:+ [] :- [] :r [[:a 2]]} 12 | {} {:b 2} 1 {:+ [[:b 2]] :- [] :r []} 13 | {:a 1 :b 2} {:a 2 :c 4} 3 {:+ [[:c 4]] :- [:b] :r [[:a 2]]})) 14 | 15 | 16 | (deftest diffpatch-tests 17 | (are [am bm] (= bm (patch am (diff am bm))) 18 | {} {} 19 | {:a 1} {} 20 | {:a 1} {:a 2} 21 | {} {:b 2} 22 | {:a 1 :b 2} {:a 2 :b 3 :c 4})) 23 | 24 | 25 | (deftest patch-javamap-test 26 | (let [am {:a 1 :b 2 :c 3 :d 4} 27 | bm {:a 2 :c 3 :d 5 :e 6} 28 | dr (diff am bm) 29 | mm (let [mm (HashMap.)] 30 | (doseq [[k v] am] (.put mm k v)) 31 | mm)] 32 | (patch (fn [m k v] 33 | (.put m k v) 34 | m) 35 | (fn [m k] 36 | (.remove m k) 37 | m) 38 | (fn [m k v] 39 | (.put m k v) 40 | m) 41 | mm 42 | dr) 43 | (is (= mm bm)))) 44 | 45 | 46 | #_(def am (->> (repeatedly #(rand-nth [:foo :bar :baz])) 47 | (map vector (range 1e5)) 48 | (into {}))) 49 | #_(def bm (->> am 50 | (filter (fn [_] (rand-nth [true true false]))) 51 | (into {}))) 52 | -------------------------------------------------------------------------------- 
;; /test/diffit/vec_test.clj
(ns diffit.vec-test
  "Tests for diff and patch on sequential collections."
  (:require [clojure.test :refer :all]
            [diffit.vec :refer [diff patch]])
  (:import [java.util ArrayList]))


(deftest diffpatch-tests
  ;; Round trip on strings: patching `as` with (diff as bs) must yield
  ;; the characters of `bs`.
  (are [as bs] (= bs (apply str (patch as (diff as bs))))
    ""       ""
    ""       "abc"
    "ab"     ""
    "ab"     "ABCab"
    "ab"     "abXYZ"
    "ab"     "ABCabXYZ"
    "ABC"    "abc"
    "ABCDEF" "ADEF"
    "ABCDEF" "ABCdefXYZEFABCDEF"))


(deftest patch-javalist-test
  ;; Applies an edit-script to a mutable java.util.ArrayList through
  ;; user-supplied insert and remove functions.
  (let [as [:a :b :c :d :e :a :b :d]
        bs [:a :c :d :a :b :d]
        dr (diff as bs)
        ml (let [ml (ArrayList.)]
             (doseq [a as]
               (.add ml a))
             ml)]
    (patch (fn [^ArrayList l index items]
             (.addAll l (int index) items)
             l)
           (fn [^ArrayList l index n]
             ;; The type hint plus the int cast select remove(int index).
             ;; An unhinted call with a boxed Long resolves to
             ;; remove(Object) and would remove nothing.
             (dotimes [_ n] (.remove l (int index)))
             l)
           ml
           dr)
    ;; The comparison must be wrapped in `is`, otherwise the test
    ;; asserts nothing.
    (is (= ml bs))))


(defn rand-alter
  "Returns a vector derived from xs by randomly altering it item by item.
  pass-prob, remove-prob and add-prob are integer weights: an item is
  kept (:=), dropped (:-), or kept and followed by the string \"-\" (:+)
  with a probability proportional to the respective weight."
  [pass-prob remove-prob add-prob xs]
  (let [ops (vec (concat (repeat pass-prob :=)
                         (repeat remove-prob :-)
                         (repeat add-prob :+)))]
    (reduce (fn [xs x]
              (case (rand-nth ops)
                :+ (conj xs x "-")
                :- xs
                := (conj xs x)))
            []
            xs)))


(deftest random-tests
  ;; Round trip on randomly altered ranges of increasing size; prints the
  ;; time taken by diff for each size.
  (are [n] (let [as      (range n)
                 bs      (rand-alter 90 5 5 as)
                 _       (println n "items")
                 diffres (time (diff as bs))
                 patched (patch as diffres)]
             (= bs patched))
    10 100 1000 2000))