├── .cljrc.clj ├── .gitignore ├── COPYING ├── README.textile ├── project.clj ├── script ├── clean ├── javac └── push ├── src ├── clj │ └── clj_diff │ │ ├── core.clj │ │ ├── miller.clj │ │ └── optimizations.clj └── jvm │ └── clj_diff │ └── FastStringOps.java └── test └── clj_diff └── test ├── core.clj ├── miller.clj └── optimizations.clj /.cljrc.clj: -------------------------------------------------------------------------------- 1 | (set! *print-length* 103) 2 | (set! *print-level* 15) 3 | #_(set! *warn-on-reflection* true) 4 | 5 | (defn exit [] (. System exit 0)) 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pom.xml 2 | *jar 3 | lib 4 | classes 5 | *.png 6 | files 7 | docs/ 8 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Eclipse Public License - v 1.0 2 | 3 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE 4 | PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF 5 | THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 6 | 7 | 1. DEFINITIONS 8 | 9 | "Contribution" means: 10 | 11 | a) in the case of the initial Contributor, the initial code and 12 | documentation distributed under this Agreement, and 13 | 14 | b) in the case of each subsequent Contributor: 15 | 16 | i) changes to the Program, and 17 | 18 | ii) additions to the Program; 19 | 20 | where such changes and/or additions to the Program originate from and 21 | are distributed by that particular Contributor. A Contribution 22 | 'originates' from a Contributor if it was added to the Program by such 23 | Contributor itself or anyone acting on such Contributor's 24 | behalf. 
Contributions do not include additions to the Program which: 25 | (i) are separate modules of software distributed in conjunction with 26 | the Program under their own license agreement, and (ii) are not 27 | derivative works of the Program. 28 | 29 | "Contributor" means any person or entity that distributes the Program. 30 | 31 | "Licensed Patents" mean patent claims licensable by a Contributor 32 | which are necessarily infringed by the use or sale of its Contribution 33 | alone or when combined with the Program. 34 | 35 | "Program" means the Contributions distributed in accordance with this 36 | Agreement. 37 | 38 | "Recipient" means anyone who receives the Program under this 39 | Agreement, including all Contributors. 40 | 41 | 2. GRANT OF RIGHTS 42 | 43 | a) Subject to the terms of this Agreement, each Contributor hereby 44 | grants Recipient a non-exclusive, worldwide, royalty-free copyright 45 | license to reproduce, prepare derivative works of, publicly display, 46 | publicly perform, distribute and sublicense the Contribution of such 47 | Contributor, if any, and such derivative works, in source code and 48 | object code form. 49 | 50 | b) Subject to the terms of this Agreement, each Contributor hereby 51 | grants Recipient a non-exclusive, worldwide, royalty-free patent 52 | license under Licensed Patents to make, use, sell, offer to sell, 53 | import and otherwise transfer the Contribution of such Contributor, if 54 | any, in source code and object code form. This patent license shall 55 | apply to the combination of the Contribution and the Program if, at 56 | the time the Contribution is added by the Contributor, such addition 57 | of the Contribution causes such combination to be covered by the 58 | Licensed Patents. The patent license shall not apply to any other 59 | combinations which include the Contribution. No hardware per se is 60 | licensed hereunder. 
61 | 62 | c) Recipient understands that although each Contributor grants the 63 | licenses to its Contributions set forth herein, no assurances are 64 | provided by any Contributor that the Program does not infringe the 65 | patent or other intellectual property rights of any other entity. Each 66 | Contributor disclaims any liability to Recipient for claims brought by 67 | any other entity based on infringement of intellectual property rights 68 | or otherwise. As a condition to exercising the rights and licenses 69 | granted hereunder, each Recipient hereby assumes sole responsibility 70 | to secure any other intellectual property rights needed, if any. For 71 | example, if a third party patent license is required to allow 72 | Recipient to distribute the Program, it is Recipient's responsibility 73 | to acquire that license before distributing the Program. 74 | 75 | d) Each Contributor represents that to its knowledge it has sufficient 76 | copyright rights in its Contribution, if any, to grant the copyright 77 | license set forth in this Agreement. 78 | 79 | 3. 
REQUIREMENTS 80 | 81 | A Contributor may choose to distribute the Program in object code form 82 | under its own license agreement, provided that: 83 | 84 | a) it complies with the terms and conditions of this Agreement; and 85 | 86 | b) its license agreement: 87 | 88 | i) effectively disclaims on behalf of all Contributors all warranties 89 | and conditions, express and implied, including warranties or 90 | conditions of title and non-infringement, and implied warranties or 91 | conditions of merchantability and fitness for a particular purpose; 92 | 93 | ii) effectively excludes on behalf of all Contributors all liability 94 | for damages, including direct, indirect, special, incidental and 95 | consequential damages, such as lost profits; 96 | 97 | iii) states that any provisions which differ from this Agreement are 98 | offered by that Contributor alone and not by any other party; and 99 | 100 | iv) states that source code for the Program is available from such 101 | Contributor, and informs licensees how to obtain it in a reasonable 102 | manner on or through a medium customarily used for software exchange. 103 | 104 | When the Program is made available in source code form: 105 | 106 | a) it must be made available under this Agreement; and 107 | 108 | b) a copy of this Agreement must be included with each copy of the Program. 109 | 110 | Contributors may not remove or alter any copyright notices contained 111 | within the Program. 112 | 113 | Each Contributor must identify itself as the originator of its 114 | Contribution, if any, in a manner that reasonably allows subsequent 115 | Recipients to identify the originator of the Contribution. 116 | 117 | 4. COMMERCIAL DISTRIBUTION 118 | 119 | Commercial distributors of software may accept certain 120 | responsibilities with respect to end users, business partners and the 121 | like. 
While this license is intended to facilitate the commercial use 122 | of the Program, the Contributor who includes the Program in a 123 | commercial product offering should do so in a manner which does not 124 | create potential liability for other Contributors. Therefore, if a 125 | Contributor includes the Program in a commercial product offering, 126 | such Contributor ("Commercial Contributor") hereby agrees to defend 127 | and indemnify every other Contributor ("Indemnified Contributor") 128 | against any losses, damages and costs (collectively "Losses") arising 129 | from claims, lawsuits and other legal actions brought by a third party 130 | against the Indemnified Contributor to the extent caused by the acts 131 | or omissions of such Commercial Contributor in connection with its 132 | distribution of the Program in a commercial product offering. The 133 | obligations in this section do not apply to any claims or Losses 134 | relating to any actual or alleged intellectual property 135 | infringement. In order to qualify, an Indemnified Contributor must: a) 136 | promptly notify the Commercial Contributor in writing of such claim, 137 | and b) allow the Commercial Contributor to control, and cooperate with 138 | the Commercial Contributor in, the defense and any related settlement 139 | negotiations. The Indemnified Contributor may participate in any such 140 | claim at its own expense. 141 | 142 | For example, a Contributor might include the Program in a commercial 143 | product offering, Product X. That Contributor is then a Commercial 144 | Contributor. If that Commercial Contributor then makes performance 145 | claims, or offers warranties related to Product X, those performance 146 | claims and warranties are such Commercial Contributor's responsibility 147 | alone.
Under this section, the Commercial Contributor would have to 148 | defend claims against the other Contributors related to those 149 | performance claims and warranties, and if a court requires any other 150 | Contributor to pay any damages as a result, the Commercial Contributor 151 | must pay those damages. 152 | 153 | 5. NO WARRANTY 154 | 155 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS 156 | PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 157 | KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY 158 | WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY 159 | OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely 160 | responsible for determining the appropriateness of using and 161 | distributing the Program and assumes all risks associated with its 162 | exercise of rights under this Agreement , including but not limited to 163 | the risks and costs of program errors, compliance with applicable 164 | laws, damage to or loss of data, programs or equipment, and 165 | unavailability or interruption of operations. 166 | 167 | 6. DISCLAIMER OF LIABILITY 168 | 169 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR 170 | ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, 171 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING 172 | WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF 173 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 174 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR 175 | DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED 176 | HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 177 | 178 | 7. 
GENERAL 179 | 180 | If any provision of this Agreement is invalid or unenforceable under 181 | applicable law, it shall not affect the validity or enforceability of 182 | the remainder of the terms of this Agreement, and without further 183 | action by the parties hereto, such provision shall be reformed to the 184 | minimum extent necessary to make such provision valid and enforceable. 185 | 186 | If Recipient institutes patent litigation against any entity 187 | (including a cross-claim or counterclaim in a lawsuit) alleging that 188 | the Program itself (excluding combinations of the Program with other 189 | software or hardware) infringes such Recipient's patent(s), then such 190 | Recipient's rights granted under Section 2(b) shall terminate as of 191 | the date such litigation is filed. 192 | 193 | All Recipient's rights under this Agreement shall terminate if it 194 | fails to comply with any of the material terms or conditions of this 195 | Agreement and does not cure such failure in a reasonable period of 196 | time after becoming aware of such noncompliance. If all Recipient's 197 | rights under this Agreement terminate, Recipient agrees to cease use 198 | and distribution of the Program as soon as reasonably 199 | practicable. However, Recipient's obligations under this Agreement and 200 | any licenses granted by Recipient relating to the Program shall 201 | continue and survive. 202 | 203 | Everyone is permitted to copy and distribute copies of this Agreement, 204 | but in order to avoid inconsistency the Agreement is copyrighted and 205 | may only be modified in the following manner. The Agreement Steward 206 | reserves the right to publish new versions (including revisions) of 207 | this Agreement from time to time. No one other than the Agreement 208 | Steward has the right to modify this Agreement. The Eclipse Foundation 209 | is the initial Agreement Steward. 
The Eclipse Foundation may assign 210 | the responsibility to serve as the Agreement Steward to a suitable 211 | separate entity. Each new version of the Agreement will be given a 212 | distinguishing version number. The Program (including Contributions) 213 | may always be distributed subject to the version of the Agreement 214 | under which it was received. In addition, after a new version of the 215 | Agreement is published, Contributor may elect to distribute the 216 | Program (including its Contributions) under the new version. Except as 217 | expressly stated in Sections 2(a) and 2(b) above, Recipient receives 218 | no rights or licenses to the intellectual property of any Contributor 219 | under this Agreement, whether expressly, by implication, estoppel or 220 | otherwise. All rights in the Program not expressly granted under this 221 | Agreement are reserved. 222 | 223 | This Agreement is governed by the laws of the State of California and 224 | the intellectual property laws of the United States of America. No 225 | party to this Agreement will bring a legal action under this Agreement 226 | more than one year after the cause of action arose. Each party waives 227 | its rights to a jury trial in any resulting litigation. -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | h1. clj-diff 2 | 3 | Provides diff and patch functions for Clojure sequences where (diff a b) -> x and (patch a x) -> b. Also provides edit-distance and levenshtein-distance functions for calculating the difference between two sequences. 4 | 5 | h2. Usage 6 | 7 |
user=> (use 'clj-diff.core)
 8 | user=> (diff "John went to the movies." "John was in the movies.")
 9 | {:+ [[5 \a \s \space \i]], :- [6 8 9 10 11]}
10 | user=> (patch "John went to the movies." *1)
11 | "John was in the movies."
12 | user=> (edit-distance "John went to the movies." "John was in the movies.")
13 | 9
14 | user=> (levenshtein-distance "John went to the movies." "John was in the movies.")
15 | 8
16 | 
17 | 18 | There is already a "Java library":http://code.google.com/p/google-diff-match-patch/ which does this well. Why create a Clojure version? So that we can do this: 19 | 20 |
user=> (def a [{:a 1} {:a 2} {:a 3} {:a 4} {:a 5} {:a 6} {:a 7}])
21 | user=> (def b [{:a 2} {:a 3} {:a 4} {:a 5} {:a 6} {:a 7} {:a 1}])
22 | user=> (diff a b)
23 | {:+ [[6 {:a 1}]], :- [0]}
24 | user=> (patch a *1)
25 | ({:a 2} {:a 3} {:a 4} {:a 5} {:a 6} {:a 7} {:a 1})
26 | user=> (edit-distance a b)
27 | 2
28 | 
29 | 30 | h2. Notes 31 | 32 | The current diff algorithm comes from the paper "An O(NP) Sequence Comparison Algorithm":http://portal.acm.org/citation.cfm?id=96223 by Sun Wu, Udi Manber, Gene Myers and Webb Miller. It is fast and memory efficient. It also makes use of the pre-diff optimizations mentioned in Neil Fraser's "Diff Strategies":http://neil.fraser.name/writing/diff/. The worst-case running time of the algorithm is dependent only on the length of the longest sequence (N) and the number of deletions (P). This is much better than the "Myers":http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.4.6927&rep=rep1&type=pdf algorithm which is an O(ND) algorithm where N is the sum of the length of the two sequences and D is the edit distance. 33 | 34 | I have created a "separate project":http://github.com/brentonashworth/clj-diff-performance for comparing the performance of different diff algorithms. The main performance goal of this project is to create a Clojure diff that can outperform Fraser's Java implementation. One of the most interesting results is shown below. 35 | 36 | 37 | 38 | This chart shows the most interesting range of comparison between the three algorithms. The current algorithm being used by clj-diff is labeled "Miller". For larger sequences, the Miller algorithm outperforms Fraser. To summarize all of the performance test results: the Miller algorithm is never more than 50 milliseconds slower than the Fraser algorithm but the Fraser algorithm can be multiple seconds slower, and will run out of memory more quickly, for large sequences. For more information about performance see the "clj-diff-performance":http://github.com/brentonashworth/clj-diff-performance project. 39 | 40 | h2. Installation 41 | 42 | h3. Leiningen 43 | 44 | Add [clj-diff "1.0.0-SNAPSHOT"] to your :dependencies in project.clj. 45 | 46 | h3. Maven 47 | 48 | Add the following dependency: 49 | 50 |

51 |   clj-diff
52 |   clj-diff
53 |   1.0.0-SNAPSHOT
54 | 
55 | 56 | ...which comes from Clojars... 57 | 58 |

59 |   clojars.org
60 |   http://clojars.org/repo
61 | 
62 | 63 | h2. References 64 | 65 | * "An O(ND) Difference Algorithm and Its Variations by Eugene W. Myers":http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.4.6927&rep=rep1&type=pdf 66 | * "An O(NP) Sequence Comparison Algorithm by Sun Wu, Udi Manber, Gene Myers and Webb Miller":http://portal.acm.org/citation.cfm?id=96223 67 | 68 | In order to understand the above two titles, you will need to know what N, D and P stand for. Let A and B be two sequences with lengths Q and M where Q >= M. Let D be the length of the minimum edit script for A and B. In the title of the first paper N = Q + M. In the second paper N = Q and P = (1/2)D - (1/2)(Q - M). Therefore, the O(NP) version is faster which is visualized in the charts below. 69 | 70 | * "Diff Strategies by Neil Fraser":http://neil.fraser.name/writing/diff/ 71 | 72 | h2. Roadmap 73 | 74 | h3. Version 1.0 75 | 76 | # Text diff visualization. 77 | # HTML diff visualization. 78 | # Semantic cleanup. 79 | # Sequence chunking. Allow line based diff for strings. 80 | 81 | h3. Version 2.0 82 | 83 | # Improve performance by searching the edit graph from both ends at the same time. 84 | # Arbitrary Clojure form diff/Nested diffs. Integrate with clojure.data/diff in Clojure 1.3 85 | # Set the maximum time to look for an optimal diff. 86 | # Add a fast and correct levenshtein-distance function. 87 | 88 | 89 | h2. License 90 | 91 | Copyright (C) 2010-2011 Brenton Ashworth 92 | 93 | Distributed under the Eclipse Public License, the same as Clojure uses. See the file COPYING. 94 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject clj-diff "1.0.0-SNAPSHOT" 2 | :description "Sequential diff in Clojure." 
3 | :url "http://github.com/brentonashworth/clj-diff" 4 | :source-path "src/clj" 5 | :java-source-path "src/jvm" 6 | :java-fork "true" 7 | :java-debug "true" 8 | :hooks [leiningen.hooks.javac 9 | leiningen.hooks.difftest] 10 | :dev-dependencies [[org.clojure/clojure "1.2.0"] 11 | [lein-javac "1.2.1-SNAPSHOT"] 12 | [marginalia "0.5.0"] 13 | [lein-difftest "1.3.2-SNAPSHOT"]]) 14 | -------------------------------------------------------------------------------- /script/clean: -------------------------------------------------------------------------------- 1 | rm clj-diff-* 2 | rm -rf classes 3 | -------------------------------------------------------------------------------- /script/javac: -------------------------------------------------------------------------------- 1 | rm -rf classes 2 | cd src/jvm 3 | javac clj_diff/FastStringOps.java 4 | cd ../../ 5 | mkdir classes 6 | mkdir classes/clj_diff 7 | mv src/jvm/clj_diff/FastStringOps.class classes/clj_diff 8 | -------------------------------------------------------------------------------- /script/push: -------------------------------------------------------------------------------- 1 | lein jar 2 | lein pom 3 | scp pom.xml clj-diff-* clojars@clojars.org: 4 | -------------------------------------------------------------------------------- /src/clj/clj_diff/core.clj: -------------------------------------------------------------------------------- 1 | (ns clj-diff.core 2 | "Diff, patch and related functions for Clojure sequences." 3 | (:require [clj-diff [miller :as miller]])) 4 | 5 | (defn diff 6 | "Create the edit script for transforming sequance a into sequence b. 7 | An edit script is a map with keys :+ and :- for additions and deletions. 8 | Additions are represented as a sequence of vectors. The first item in each 9 | vector is the index where the rest of the items in the vector are to be 10 | inserted. For example [3 b c] means to insert b an c after whatever is 11 | in index 3. 
Deletions are represented as a sequence of indexes to delete. 12 | 13 | For example: the diff of 'abcabba' and 'cbabac' would generate the edit 14 | script below. 15 | 16 | {:+ [[2 b] [6 c]], :- [0 1 5]} 17 | 18 | An index of -1 may appear in additions and is a special case which means to 19 | add the elements at the beginning of the sequence." 20 | [a b] 21 | (miller/diff a b)) 22 | 23 | (defn- merge-patch 24 | [s edit-script delete-symbol] 25 | (let [s (vec s) 26 | additions (:+ edit-script) 27 | deletions (:- edit-script) 28 | s (reduce (fn [a b] 29 | (assoc a b delete-symbol)) 30 | s 31 | deletions) 32 | s (reduce (fn [a b] 33 | (let [index (first b) 34 | items (rest b)] 35 | (if (= index -1) 36 | (assoc a 0 (conj (vec items) (get a 0))) 37 | (assoc a index (conj items (get a index)))))) 38 | s 39 | additions)] 40 | (flatten s))) 41 | 42 | (defn patch* 43 | [s edit-script] 44 | (filter #(not (nil? %)) (merge-patch s edit-script nil))) 45 | 46 | (defmulti ^{:arglists '([s edit-script])} patch 47 | "Use the instructions in the edit script to transform the sequence s into 48 | a new sequence. If the edit script was created by using diff on a and b then 49 | patch will use the edit script to transform a into b. 50 | 51 | (diff a b) -> x, (patch a x) -> b." 52 | (fn [s _] (class s))) 53 | 54 | (defmethod patch :default 55 | [s edit-script] 56 | (patch* s edit-script)) 57 | 58 | (defmethod patch String 59 | [s edit-script] 60 | (apply str (patch* s edit-script))) 61 | 62 | (defn edit-distance 63 | "Returns the edit distance between the two passed sequences. May also be 64 | passed an edit script. The edit distance is the minimum number of insertions 65 | and deletions required to transform one sequence into another." 66 | ([a b] 67 | (miller/edit-distance a b)) 68 | ([edit-script] 69 | (+ (count (:- edit-script)) 70 | (reduce + (map #(count (drop 1 %)) (:+ edit-script)))))) 71 | 72 | (defn- max-or-zero [coll] 73 | (if (and (coll? coll) 74 | (not (empty? 
coll))) 75 | (apply max coll) 76 | 0)) 77 | 78 | (defn levenshtein-distance 79 | "Returns the Levenshtein distance between two sequences. May either be passed 80 | the two sequences or a diff of the two sequences. 81 | 82 | From [Wikipedia](http://en.wikipedia.org/wiki/Levenshtein_distance): 83 | The Levenshtein distance between two strings is the minimum number of edits 84 | needed to transform one string into the other, with the allowable edit 85 | operations being insertion, deletion and substitution of a single character. 86 | 87 | This function works not only with strings but with any Clojure sequence. 88 | 89 | Warning! Technically this function is estimating the Levenshtein distance 90 | from a computed diff. Most of the time, it is the same as the real Levenshtein 91 | distance but in same cases it may be larger. The reason for this is that 92 | there may be multiple paths through an edit graph with the same edit 93 | distance but with differing Levenshtein distance. A future improvement to 94 | the diff algorithm whould be to find all paths and prefer the one with the 95 | minimum Levenshtein distance." 
96 | ([a b] 97 | (levenshtein-distance (diff a b))) 98 | ([edit-script] 99 | (let [additions (map #(let [index (first %) 100 | items (rest %)] 101 | (apply vector index (repeat (count items) :a))) 102 | (:+ edit-script)) 103 | max-index (max (max-or-zero (map first additions)) 104 | (max-or-zero (:- edit-script))) 105 | v (vec (repeat max-index :e)) 106 | patched (merge-patch v (merge edit-script {:+ additions}) :d) 107 | edit-groups (filter #(not= :e (first %)) 108 | (partition-by #(if (= % :e) :e :edit) 109 | patched))] 110 | (reduce + (map (fn [group] 111 | (max (count (filter #(= % :a) group)) 112 | (count (filter #(= % :d) group)))) 113 | edit-groups))))) 114 | 115 | (defn longest-common-subseq [a b] 116 | (miller/longest-common-subseq a b)) 117 | -------------------------------------------------------------------------------- /src/clj/clj_diff/miller.clj: -------------------------------------------------------------------------------- 1 | (ns clj-diff.miller 2 | "Algorithm from 'An O(NP) Sequence Comparison Algorithm' by 3 | Sun Wu, Udi Manber, Gene Myers and Web Miller. 4 | 5 | Please refer to the above paper while reading this code." 6 | (:require [clj-diff [optimizations :as opt]])) 7 | 8 | (defn- next-x 9 | "Get the next farthest x value by looking at previous farthest values on the 10 | diagonal above and below diagonal k. Choose the greater of the farthest x on 11 | the above diagonal and the farthest x on the diagonal below plus one. fp is 12 | a map of diagonals => farthest points." 13 | [k fp] 14 | (max (inc (get fp (dec k) -1)) 15 | (get fp (inc k) -1))) 16 | 17 | (defn- snake 18 | "Starting at the farthest point on diagonal k, return the x value of the 19 | point at the end of the longest snake on this diagonal. A snake is a 20 | sequence of diagonal moves connecting match points on the edit graph." 21 | [a b n m k fp] 22 | {:pre [(and (vector? a) (vector? 
b))]} 23 | (let [x (next-x k fp) 24 | y (- x k)] 25 | (loop [x x 26 | y y] 27 | (if (and (< x n) (< y m) (= (get a (inc x)) (get b (inc y)))) 28 | (recur (inc x) (inc y)) 29 | x)))) 30 | 31 | (defn- p-band-diagonals 32 | "Given a p value and a delta, return all diagonals in this p-band." 33 | [p delta] 34 | (concat (range (* -1 p) delta) 35 | (reverse (range (inc delta) (+ delta (inc p)))) 36 | [delta])) 37 | 38 | (defn- search-p-band 39 | "Given a p value, search all diagonals in the p-band for the furthest 40 | reaching endpoints. Record the furthest reaching endpoint for each p value 41 | in the map fp. Returns an updated fp map for p. a and b are the two 42 | sequences and n and m are their lengths respectively. delta is the 43 | diagonal of the sink and is equal to n - m." 44 | [a b n m delta p fp] 45 | (reduce (fn [fp next-k] 46 | (assoc fp next-k (snake a b n m next-k fp))) 47 | fp 48 | (p-band-diagonals p delta))) 49 | 50 | (defn ses 51 | "Find the size of the shortest edit script (ses). Returns a 3-tuple of the 52 | size of the ses, the delta value (which is the diagonal of the sink) 53 | and the fp map. The optimal path from source to sink can be constructed from 54 | this information." 55 | [a b] 56 | {:pre [(>= (count a) (count b))]} 57 | (let [n (dec (count a)) 58 | m (dec (count b)) 59 | delta (- n m)] 60 | (loop [p 0 61 | fp {}] 62 | (if (= (-> (get fp (dec p) {}) 63 | (get delta)) 64 | n) 65 | [(dec p) delta fp] 66 | (recur (inc p) 67 | (assoc fp p 68 | (search-p-band a b n m delta p (get fp (dec p) {})))))))) 69 | 70 | ;; 71 | ;; Build the edit script from the map of farthest endpoints. 72 | ;; 73 | 74 | (defn edit-dist 75 | "Given a delta, p and k value, calculate the edit distance." 76 | [delta p k] 77 | (if (> k delta) 78 | (+ (* 2 (- p (- k delta))) k) 79 | (+ (* 2 p) k))) 80 | 81 | (defn- p-value-up 82 | "Calculate the p value that will be used to look up the farthest reaching 83 | end point for the diagonal above k." 
84 | [delta p k] 85 | (if (> (inc k) delta) p (dec p))) 86 | 87 | (defn- p-value-left 88 | "Calculate the p value that will be used to look up the farthest reaching 89 | end point for the diagonal below k." 90 | [delta p k] 91 | (if (< (dec k) delta) p (dec p))) 92 | 93 | (defn- look-up 94 | "Get information about the vertex above the one at x on k. If this vertex 95 | is chosen, it will represent an insertion." 96 | [graph delta p x k] 97 | (when (> (- x k) 0) 98 | (let [up-k (inc k) 99 | up-p (p-value-up delta p k) 100 | x* (-> graph 101 | (get up-p {}) 102 | (get up-k -1))] 103 | (when (and (>= x* 0) (= x x*)) 104 | {:edit :insert 105 | :x x* 106 | :p up-p 107 | :k up-k 108 | :d (edit-dist delta up-p up-k)})))) 109 | 110 | (defn- look-left 111 | "Get information about the vertex to the left of the one at x on k. If this 112 | vertex is chosen, it will represent an deletion." 113 | [graph delta p x k] 114 | (when (> x 0) 115 | (let [left-k (dec k) 116 | left-p (p-value-left delta p k) 117 | x* (-> graph 118 | (get left-p {}) 119 | (get left-k -1))] 120 | (when (and (>= x* 0) (= (dec x) x*)) 121 | {:edit :delete 122 | :x x* 123 | :p left-p 124 | :k left-k 125 | :d (edit-dist delta left-p left-k)})))) 126 | 127 | (defn- backtrack-snake 128 | "Find the x value at the head of the longest snake ending at (x, y)." 129 | [a b x y] 130 | {:pre [(and (>= x 0) (>= y 0))]} 131 | (loop [x x 132 | y y] 133 | (if (or (= x y 0) (not (= (get a x) (get b y)))) 134 | x 135 | (recur (dec x) (dec y))))) 136 | 137 | ;; See the paper for an example of how there are multiple shortest 138 | ;; paths through an edit graph. 139 | 140 | (defn- next-edit 141 | "Find the next move through the edit graph which will decrease the 142 | edit distance by 1." 
143 | [a b graph delta p x k] 144 | {:post [(= (dec (edit-dist delta p k)) (:d %))]} 145 | (let [d (edit-dist delta p k) 146 | head-x (backtrack-snake a b x (- x k))] 147 | (loop [head-x head-x] 148 | (let [move (first (filter #(and (not (nil? %)) ;; <<<=== 149 | (= (:d %) (dec d))) 150 | (map #(% graph delta p head-x k) 151 | [look-left look-up])))] 152 | (if (and (< head-x x) (nil? move)) 153 | (recur (inc head-x)) 154 | move))))) 155 | 156 | (defn- edits 157 | "Calculate the sequence of edits from the map of farthest reaching end 158 | points." 159 | [a b p delta graph] 160 | (let [next-fn (partial next-edit a b graph delta)] 161 | (loop [edits '() 162 | prev {:x (count a) :p p :k delta 163 | :d (edit-dist delta p delta)}] 164 | (if (= (:d prev) 0) 165 | edits 166 | (let [next (next-fn (:p prev) (:x prev) (:k prev))] 167 | (recur (conj edits next) next)))))) 168 | 169 | (defn- transpose 170 | "If a is shorter than b, then the diff is calculated from b to a and this 171 | function is used to transpose the results into a diff from a to b." 172 | [edit] 173 | (-> edit 174 | (assoc :edit (if (= :insert (:edit edit)) :delete :insert)) 175 | (assoc :x (- (:x edit) (:k edit))) 176 | (assoc :k (- (:k edit))))) 177 | 178 | (defn- edits->script 179 | "Convert a sequence of edits into an edit script." 
180 | [b edits f] 181 | (reduce (fn [script edit] 182 | (let [{:keys [edit x k]} (f edit) 183 | y (inc (- x k)) 184 | insertions (:+ script) 185 | last-insert (last insertions)] 186 | (if (= edit :delete) 187 | (assoc script :- (conj (:- script) x)) 188 | (assoc script :+ (let [index (dec x)] 189 | (if (= index (first last-insert)) 190 | (conj (vec (butlast insertions)) 191 | (conj last-insert (get b y))) 192 | (conj insertions [(dec x) (get b y)]))))))) 193 | {:+ [] 194 | :- []} 195 | edits)) 196 | 197 | (defn vectorize [& more] 198 | (map #(vec (cons nil %)) more)) 199 | 200 | (defn order->ses 201 | [a b] 202 | (let [[a* b*] (if (> (count b) (count a)) [b a] [a b])] 203 | [(ses a* b*) a* b*])) 204 | 205 | (defn seq-diff 206 | [a b] 207 | (let [[a b] (vectorize a b) 208 | [es a* b*] (order->ses a b) 209 | edits (apply edits a* b* es)] 210 | (edits->script b edits (if (= a* a) identity transpose)))) 211 | 212 | (defn string-dispatch [a b] 213 | (when (and (string? a) (string? b)) :string)) 214 | 215 | (defmulti ^{:arglists '([a b])} diff 216 | "Create an edit script that may be used to transform a into b. See doc string 217 | for clj-diff.core/diff. This function will ensure that diff* is called with 218 | arguments a and b where a >= b. If the passed values of a and b need to be 219 | swapped then the resulting path with will transposed." 
string-dispatch)

;; Anything that is not a pair of strings is diffed as a generic sequence.
(defmethod diff :default
  [a b]
  (seq-diff a b))

;; Two strings go through the string optimizations, which fall back to
;; seq-diff when no shortcut applies.
(defmethod diff :string
  [a b]
  (opt/diff a b seq-diff))

(defn seq-edit-dist
  "Return the edit distance between sequences a and b, computed from the p
  value returned by ses as 2p plus the difference in length between the
  longer and the shorter input."
  [a b]
  (let [[a b] (vectorize a b)
        [[p] a* b*] (order->ses a b)]
    (+ (* 2 p) (- (count a*) (count b*)))))

(defmulti edit-distance string-dispatch)

(defmethod edit-distance :default
  [a b]
  (seq-edit-dist a b))

;; TODO - Modify optimizations so that it can be used here and with
;; longest-common-subseq
(defmethod edit-distance :string
  [a b]
  (seq-edit-dist a b))

(defn seq-lcs
  "Return a lazy seq of the elements of a that are not deleted by the diff
  from a to b, i.e. a longest common subsequence of a and b.

  Elements are dropped by index, so every value of a that survives the diff is
  kept, whatever that value is."
  [a b]
  (let [deleted (set (:- (seq-diff a b)))]
    (map second
         (remove (fn [[index _]] (contains? deleted index))
                 (map-indexed vector a)))))

(defmulti longest-common-subseq string-dispatch)

(defmethod longest-common-subseq :default
  [a b]
  (seq-lcs a b))

;; For strings the common elements are joined back into a string.
(defmethod longest-common-subseq :string
  [a b]
  (apply str (seq-lcs a b)))

--------------------------------------------------------------------------------
/src/clj/clj_diff/optimizations.clj:
--------------------------------------------------------------------------------
(ns clj-diff.optimizations
  "String optimizations for diff algorithms.
  See http://neil.fraser.name/writing/diff/."
  (:import clj_diff.FastStringOps))

(defn common-prefix
  "Return [n a-rest b-rest] where n is the length of the prefix shared by a
  and b (as reported by FastStringOps/commonPrefix) and a-rest and b-rest are
  a and b with that prefix removed."
  [^String a ^String b]
  (let [i (FastStringOps/commonPrefix a b)]
    [i (.substring a i) (.substring b i)]))

(defn common-suffix
  "Return [n a-rest b-rest] where n is the value reported by
  FastStringOps/commonSuffix for a and b, and a-rest and b-rest are a and b
  with their last n characters removed."
  [^String a ^String b]
  (let [i (FastStringOps/commonSuffix a b)]
    [i
     (.substring a 0 (- (.length a) i))
     (.substring b 0 (- (.length b) i))]))

(defn- short-within-long
  "Return a diff if the shorter sequence exists in the longer one. No need to
  use the expensive diff algorithm for this."
[^String a ^String b ^Integer ca ^Integer cb]
  (let [[short long] (if (> ca cb) [b a] [a b])
        i (int (.indexOf long short))]
    (cond
      ;; the shorter string does not occur inside the longer one
      (= i -1)
      nil

      ;; a sits inside b: only insertions are needed, before and/or after a
      (= short a)
      (let [end (+ i ca)
            leading (when (> i 0)
                      (into [-1] (.substring b 0 i)))
            ;; NOTE(review): the position used here is i + ca - 1, which
            ;; includes the offset of a inside b; diff*-test expects exactly
            ;; this value - confirm it is what patch expects.
            trailing (when (< end cb)
                       (into [(dec end)] (.substring b end)))]
        {:+ (remove nil? [leading trailing])
         :- []})

      ;; b sits inside a: delete everything in a around the occurrence of b
      :else
      {:+ []
       :- (vec (concat (range 0 i)
                       (range (+ i cb) ca)))})))

(defn- half-match*
  "Look for a long common substring of long and short, seeded at index i of
  long.

  The seed is the quarter-length substring of long starting at i. For every
  occurrence of the seed in short, the match is extended to the right (common
  prefix of what follows) and to the left (common suffix of what precedes),
  and the longest extension seen is kept. Returns
  [common long-prefix long-suffix short-prefix short-suffix] when the best
  common substring is at least half as long as long, and nil otherwise."
  [^String long ^String short ^Integer i]
  (let [long-count (count long)
        target (.substring long i (+ i (quot long-count 4)))]
    (loop [j (.indexOf short target 0)
           best []]
      (if (= j -1)
        ;; no more occurrences of the seed: accept the best match only if it
        ;; covers at least half of the longer string
        (when (>= (count (or (first best) ""))
                  (quot long-count 2))
          best)
        (let [[prefix-length] (common-prefix (.substring long i)
                                             (.substring short j))
              [suffix-length] (common-suffix (.substring long 0 i)
                                             (.substring short 0 j))
              best-length (count (or (first best) ""))
              short-start (- j suffix-length)
              short-end (+ j prefix-length)]
          (recur (.indexOf short target (inc j))
                 (if (< best-length (+ prefix-length suffix-length))
                   [(.substring short short-start short-end)
                    (.substring long 0 (- i suffix-length))
                    (.substring long (+ i prefix-length))
                    (.substring short 0 short-start)
                    (.substring short short-end)]
                   best)))))))

(defn- half-match
  "Find a substring shared by both sequences which is at least half as long
  as the longer sequence. Return a vector of five elements if one is found
  and nil if not. The five elements are: the common sequence, the prefix
  of sequence a, the suffix of sequence a, the prefix of sequence b and the
  suffix of sequence b."
[^String a ^String b]
  (let [[short long] (if (> (count a) (count b)) [b a] [a b])
        short-count (count short)
        long-count (count long)]
    ;; Not worth trying when the longer string has fewer than 4 characters or
    ;; the shorter one is less than half its length.
    (when-not (or (< long-count 4)
                  (< (* short-count 2) long-count))
      (let [;; seed the search in the second and in the third quarter of long
            hm-second-q (half-match* long short (quot (+ long-count 3) 4))
            hm-third-q (half-match* long short (quot (+ long-count 1) 2))
            best (if (and hm-second-q hm-third-q)
                   ;; both seeds matched: keep the longer common substring
                   (if (> (count (first hm-second-q))
                          (count (first hm-third-q)))
                     hm-second-q
                     hm-third-q)
                   (or hm-second-q hm-third-q))]
        (cond
          (nil? best) nil
          ;; half-match* reports the longer string first, which is already
          ;; the a-then-b order when a is the longer one
          (= a long) best
          ;; otherwise swap the two prefix/suffix pairs so a comes first
          :else (let [[common long-prefix long-suffix
                       short-prefix short-suffix] best]
                  [common
                   short-prefix
                   short-suffix
                   long-prefix
                   long-suffix]))))))

(defn- offset-diffs
  "Return the edit script diffs with offset added to every position: the
  first element of each insertion vector in :+ and every index in :-. Both
  values of the returned map are vectors."
  [diffs offset]
  (let [shift #(+ offset %)]
    {:+ (vec (for [[position & items] (:+ diffs)]
               (into [(shift position)] items)))
     :- (vec (map shift (:- diffs)))}))

(declare diff)

(defn- diff*
  "Calculate the diff using the function f only after ensuring that this
  algorithm is required. At this point we know that a and b are different at
  both ends. A diff can be calculated manually if the length of a or b is 0
  or if the smaller of the two sequences is contained within the longer."
[^String a ^String b f]
  (let [ca (count a)
        cb (count b)
        shortcut (cond
                   ;; a is empty: insert all of b in front
                   (= ca 0)
                   {:+ [(into [-1] b)]
                    :- []}

                   ;; b is empty: delete all of a
                   (= cb 0)
                   {:+ []
                    :- (vec (range 0 ca))}

                   :else
                   (or (short-within-long a b ca cb)
                       ;; split around a large common middle and diff the two
                       ;; sides independently
                       (when-let [[common a-prefix a-suffix b-prefix b-suffix]
                                  (half-match a b)]
                         (let [diff-a (diff a-prefix b-prefix f)
                               diff-b (diff a-suffix b-suffix f)]
                           (merge-with concat
                                       diff-a
                                       ;; positions in the right-hand diff are
                                       ;; relative to a-suffix; shift them past
                                       ;; a-prefix and the common part
                                       (offset-diffs diff-b
                                                     (+ (count common)
                                                        (count a-prefix))))))))]
    ;; fall back to the supplied diff function when no shortcut applied
    (or shortcut
        (f a b))))

(defn diff
  "Return the diff of a and b, wrapping the diff function f in pre and post
  optimizations.

  Throws IllegalArgumentException when a or b is nil and returns an empty
  script when they are equal. Otherwise the common prefix and suffix are
  removed, the remainders are diffed with diff*, and the resulting positions
  are shifted by the length of the removed prefix."
  [^String a ^String b f]
  (cond
    (or (nil? a) (nil? b))
    (throw (IllegalArgumentException. "Cannot diff nil."))

    (= a b)
    {:+ [] :- []}

    :else
    (let [[prefix a b] (common-prefix a b)
          [_ a b] (common-suffix a b)
          diffs (diff* a b f)]
      (if (> prefix 0)
        (offset-diffs diffs prefix)
        diffs))))

--------------------------------------------------------------------------------
/src/jvm/clj_diff/FastStringOps.java:
--------------------------------------------------------------------------------
package clj_diff;

/**
 * Fast string operations for clj-diff.
*/
public class FastStringOps {

    /**
     * @return the number of common prefix characters for strings a and b
     */
    public static int commonPrefix(String a, String b) {

        int n = Math.min(a.length(), b.length());
        for(int i=0; i "aba"
    "aba" "ada" => "aa"
    "abca" "aca" => "aca"
    "abma" "aca" => "aa"
    "kitten" "sitting" => "ittn"
    "Saturday" "Sunday" => "Suday"
    "gumbo" "gambol" => "gmbo"
    "nBP8GaFHVls2dI8h9aK1FWdRgevf43" "925BCPcYhT5hs8L9T3K2T5C7U3Lz5v" =>
    "BPs89Kv"))

;; The inputs are passed as character seqs instead of strings, so the expected
;; result is a vector of characters rather than a string.
(deftest longest-common-subseq-seq-test
  (are [a b _ d] (= (longest-common-subseq (seq a) (seq b)) d)
    "kitten" "sitting" => [\i \t \t \n]
    "Saturday" "Sunday" => [\S \u \d \a \y]
    "gumbo" "gambol" => [\g \m \b \o]))

;; Longest common subsequence over non-character elements: keywords and maps.
(deftest longest-common-subseq-clojure-test
  (are [a b _ d] (= (longest-common-subseq (seq a) (seq b)) d)
    [:k :i :t :t :e :n] [:s :i :t :t :i :n :g] => [:i :t :t :n]
    [:s :a :t :u :r :d :a :y] [:s :u :n :d :a :y] => [:s :u :d :a :y]
    [{:x 1 :y 3} {:x 2 :y 7} {:x 3 :y 2} {:x 8 :y 3}]
    [{:x 5 :y 3} {:x 1 :y 3} {:x 3 :y 2} {:x 2 :y 8} {:x 8 :y 3}] =>
    [{:x 1 :y 3} {:x 3 :y 2} {:x 8 :y 3}]))

--------------------------------------------------------------------------------
/test/clj_diff/test/miller.clj:
--------------------------------------------------------------------------------
(ns clj-diff.test.miller
  (:use [clj-diff.miller]
        [clojure.test])
  (:require [clj-diff [core :as core]]))

;; Fixture inputs. Each one is a vector holding nil at index 0 followed by the
;; characters of the string.
(def a1 (vec (cons nil "acebdabbabed")))
(def b1 (vec (cons nil "acbdeacbed")))

(def a2 (vec (cons nil "abc")))
(def b2 (vec (cons nil "xyz")))

(def a3 (vec (cons nil (seq "Udghi4"))))
(def b3 (vec (cons nil (seq "Udhki4"))))

(def a4 (vec (cons nil (seq "IFvclxMax1"))))
(def b4 (vec (cons nil (seq "IF2qvWMa21"))))

(def a5 (vec (cons nil (seq "sobibm"))))
(def b5 (vec (cons
nil (seq "sobmgc"))))

;; Calls next-x through its var (#') with hand-written inputs and checks the
;; returned number.
(deftest next-x-test
  (let [t #'clj-diff.miller/next-x]
    (is (= (t 0 { }) 0))
    (is (= (t 1 {0 2 }) 3))
    (is (= (t 2 {0 2, 1 5 }) 6))
    (is (= (t -1 {0 2, 1 5, 2 6}) 2))

    (is (= (t 0 { 0 0 }) 0))
    (is (= (t -1 { 0 0 }) 0))
    (is (= (t 0 { -1 0, 0 1 }) 1))
    (is (= (t 0 { -1 0, 0 1 }) 1))
    (is (= (t -2 { -1 0, 0 1 }) 0))
    (is (= (t -1 {-2 0, -1 0, 0 1 }) 1))
    (is (= (t 1 {-2 0, -1 1, 0 1 }) 2))
    (is (= (t 0 {-2 0, -1 1, 0 1, 1 2}) 2))))

;; For each fixture pair, n and m are the vector lengths without the leading
;; nil, and t fixes them so that every case only supplies k and fp. In the
;; first group each expected value is merged into the fp map of the following
;; case; the commented maps appear to record fp at the end of each p round.
(deftest snake-test
  (let [n (dec (count a1))
        m (dec (count b1))
        t (fn [k fp] (#'clj-diff.miller/snake a1 b1 n m k fp))]
    ;; p = 0
    (is (= (t 0 { }) 2))
    (is (= (t 1 { 0 2 }) 5))
    (is (= (t 2 { 0 2, 1 5 }) 6))
    ;; { 0 2, 1 5, 2 6 }
    ;; p = 1
    (is (= (t -1 { 0 2, 1 5, 2 6 }) 2))
    (is (= (t 0 { -1 2, 0 2, 1 5, 2 6 }) 6))
    (is (= (t 1 { -1 2, 0 6, 1 5, 2 6 }) 7))
    (is (= (t 2 { -1 2, 0 6, 1 7, 2 6 }) 8))
    ;; { -1 2, 0 6, 1 7, 2 8 }
    ;; p = 2
    (is (= (t -2 { -1 2, 0 6, 1 7, 2 8 }) 3))
    (is (= (t -1 {-2 3, -1 2, 0 6, 1 7, 2 8 }) 7))
    (is (= (t 0 {-2 3, -1 7, 0 6, 1 7, 2 8 }) 8))
    (is (= (t 1 {-2 3, -1 7, 0 8, 1 7, 2 8 }) 9))
    (is (= (t 3 {-2 3, -1 7, 0 8, 1 9, 2 8 }) 9))
    (is (= (t 2 {-2 3, -1 7, 0 8, 1 9, 2 8, 3 9}) 12))
    ;; {-2 3, -1 7, 0 8, 1 9, 2 12, 3 9}
    )
  (let [n (dec (count a2))
        m (dec (count b2))
        t (fn [k fp] (#'clj-diff.miller/snake a2 b2 n m k fp))]
    (is (= (t 0 {0 0}) 0))
    (is (= (t -2 {-1 0, 0 1}) 0))
    (is (= (t -1 {-2 0, -1 0, 0 1}) 1)))
  (let [n (dec (count a3))
        m (dec (count b3))
        t (fn [k fp] (#'clj-diff.miller/snake a3 b3 n m k fp))]
    (is (= (t 0 { }) 2))
    (is (= (t -1 { 0 2 }) 2))
    (is (= (t 0 { -1 2, 0 2 }) 3))
    (is (= (t -2 { -1 2, 0 3 }) 2))
    (is (= (t -1 {-2 2, -1 2, 0 3 }) 3))
    (is (= (t 1 {-2 2, -1 3, 0 3 }) 4))
    (is (= (t 0 {-2 2, -1 3, 0 3, 1 4}) 6)))
(let [n (dec (count a4))
        m (dec (count b4))
        t (fn [k fp] (#'clj-diff.miller/snake a4 b4 n m k fp))]
    (is (= (t 0 { }) 2))
    (is (= (t -1 { 0 2}) 2))
    (is (= (t 0 {-1 2, 0 2}) 3))
    (is (= (t -2 {-1 2, 0 3}) 3))))

(deftest p-band-diagonals-test
  (let [t #'clj-diff.miller/p-band-diagonals]
    (is (= (t 0 2) [0 1 2]))))

;; Each group steps p upwards from 0 for one fixture pair: the map expected
;; for one p is the fp argument of the call for the next p.
(deftest search-p-band-test
  (let [n (dec (count a1))
        m (dec (count b1))
        t (fn [p fp] (#'clj-diff.miller/search-p-band a1 b1 n m 2 p fp))]
    (is (= (t 0 {})
           {0 2, 1 5, 2 6}))
    (is (= (t 1 {0 2, 1 5, 2 6})
           {-1 2, 0 6, 1 7, 2 8, 3 7}))
    (is (= (t 2 {-1 2, 0 6, 1 7, 2 8, 3 7})
           {-2 3, -1 7, 0 8, 1 9, 2 12, 3 9, 4 8})))
  (let [n (dec (count a2))
        m (dec (count b2))
        t (fn [p fp] (#'clj-diff.miller/search-p-band a2 b2 n m 0 p fp))]
    (is (= (t 0 {})
           {0 0}))
    (is (= (t 1 {0 0})
           {-1 0, 0 1, 1 1}))
    (is (= (t 2 {-1 0, 0 1, 1 1})
           {-2 0, -1 1, 0 2, 1 2, 2 2}))
    (is (= (t 3 {-2 0, -1 1, 0 2, 1 2, 2 2})
           {-3 0, -2 1, -1 2, 0 3, 1 3, 2 3, 3 3})))
  (let [n (dec (count a3))
        m (dec (count b3))
        t (fn [p fp] (#'clj-diff.miller/search-p-band a3 b3 n m 0 p fp))]
    (is (= (t 0 {})
           {0 2}))
    (is (= (t 1 {0 2})
           {-1 2, 0 6, 1 4})))
  (let [n (dec (count a4))
        m (dec (count b4))
        t (fn [p fp] (#'clj-diff.miller/search-p-band a4 b4 n m 0 p fp))]
    (is (= (t 0 {})
           {0 2}))
    (is (= (t 1 {0 2})
           {-1 2, 0 3, 1 3}))
    (is (= (t 2 {-1 2, 0 3, 1 3})
           {-2 3, -1 4, 0 5, 1 4, 2 4}))
    (is (= (t 3 {-2 3, -1 4, 0 5, 1 4, 2 4})
           {-3 3, -2 4, -1 5, 0 8, 1 6, 2 5, 3 5}))
    (is (= (t 4 {-3 3, -2 4, -1 5, 0 8, 1 6, 2 5, 3 5})
           {-4 3, -3 4, -2 5, -1 8, 0 10, 1 9, 2 7, 3 6, 4 6}))))

;; Expected graphs. Each one maps a p value to the map that
;; search-p-band-test expects for that p on the same fixture pair.
(def a1-graph {0 { 0 2, 1 5, 2 6 }
               1 { -1 2, 0 6, 1 7, 2 8, 3 7 }
               2 {-2 3, -1 7, 0 8, 1 9, 2 12, 3 9, 4 8}})

(def a2-graph {0 { 0 0 }
               1 {
-1 0, 0 1, 1 1 }
               2 { -2 0, -1 1, 0 2, 1 2, 2 2 }
               3 {-3 0, -2 1, -1 2, 0 3, 1 3, 2 3, 3 3}})

(def a3-graph {0 { 0 2 }
               1 {-1 2, 0 6, 1 4}})

(def a4-graph {0 { 0 2 }
               1 { -1 2, 0 3, 1 3 }
               2 { -2 3, -1 4, 0 5, 1 4, 2 4 }
               3 { -3 3, -2 4, -1 5, 0 8, 1 6, 2 5, 3 5 }
               4 {-4 3, -3 4, -2 5, -1 8, 0 10, 1 9, 2 7, 3 6, 4 6}})

(def a5-graph {0 {0 3}
               1 {1 4, -1 3, 0 4}
               2 {2 6, -2 3, 1 6, -1 4, 0 6}})

;; ses returns a three-element vector whose last element is the graph; the
;; library passes the three elements on as the p, delta and graph arguments
;; of edits.
(deftest ses-test
  (is (= (ses a1 b1)
         [2 2 a1-graph]))
  (is (= (ses a2 b2)
         [3 0 a2-graph]))
  (is (= (ses a3 b3)
         [1 0 a3-graph]))
  (is (= (ses a4 b4)
         [4 0 a4-graph]))
  (is (= (ses a5 b5)
         [2 0 a5-graph])))

;; edit-dist is called as (edit-dist delta p k); t fixes delta to 0 in the
;; first group and to 2 in the second.
(deftest edit-dist-test
  (let [t (fn [p k] (#'clj-diff.miller/edit-dist 0 p k))]
    (is (= (t 4 1) 7))
    (is (= (t 4 0) 8))
    (is (= (t 4 -1) 7))
    (is (= (t 2 0) 4))
    (is (= (t 2 1) 3))
    (is (= (t 2 2) 2))
    (is (= (t 1 1) 1))
    (is (= (t 0 0) 0)))
  (let [t (fn [p k] (#'clj-diff.miller/edit-dist 2 p k))]
    (is (= (t 2 3) 5))
    (is (= (t 2 1) 5))
    (is (= (t 2 -1) 3))))

;; Table rows are: p, k, expected result. The first argument of p-value-up is
;; fixed to 0 in the first table and to 2 in the second.
(deftest p-value-up-test
  (let [t (fn [p k] (#'clj-diff.miller/p-value-up 0 p k))]
    (are [p k p*] (= (t p k) p*)
      1 -1 0
      1 0 1
      2 1 2
      3 2 3
      2 -2 1
      2 -1 1
      2 0 2
      3 1 3))
  (let [t (fn [p k] (#'clj-diff.miller/p-value-up 2 p k))]
    (are [p k p*] (= (t p k) p*)
      1 -1 0
      1 0 0
      1 1 0
      1 2 1
      2 3 2
      3 4 3)))

;; Same table layout as p-value-up-test: p, k, expected result.
(deftest p-value-left-test
  (let [t (fn [p k] (#'clj-diff.miller/p-value-left 0 p k))]
    (are [p k p*] (= (t p k) p*)
      1 1 0
      1 0 1
      2 -1 2
      3 -2 3
      2 2 1
      2 1 1
      2 0 2
      3 -1 3))
  (let [t (fn [p k] (#'clj-diff.miller/p-value-left 2 p k))]
    (are [p k p*] (= (t p k) p*)
      0 1 0
      1 0 1
      2 -1 2
      0 2 0
      1 1 1
2 0 2
      3 -1 3
      1 3 0
      1 2 1
      2 1 2
      3 0 3
      4 -1 4)))

;; look-up is called as (look-up graph delta p x k); each t fixes the graph
;; and delta so that a case only supplies p, x and k. The result is either an
;; edit map or nil.
(deftest look-up-test
  (let [t (fn [p x k] (#'clj-diff.miller/look-up a1-graph 2 p x k))]
    (is (= (t 2 9 2) {:edit :insert :x 9 :p 2 :k 3 :d 5}))
    (is (nil? (t 2 9 3)))
    (is (= (t 1 5 0) {:edit :insert :x 5 :p 0 :k 1 :d 1}))
    (is (nil? (t 1 7 1))))
  (let [t (partial #'clj-diff.miller/look-up a4-graph 0)]
    (is (= (t 4 9 0) {:edit :insert :x 9 :p 4 :k 1 :d 7}))
    (is (= (t 3 6 0) {:edit :insert :x 6 :p 3 :k 1 :d 5}))
    (is (nil? (t 2 6 1)))
    (is (= (t 3 5 -1) {:edit :insert :x 5 :p 2 :k 0 :d 4}))
    (is (= (t 1 2 -1) {:edit :insert :x 2 :p 0 :k 0 :d 0}))
    (is (nil? (t 2 3 -2))))
  (let [t (partial #'clj-diff.miller/look-up a3-graph 0)]
    (is (= (t 1 4 0) {:edit :insert :x 4 :p 1 :k 1 :d 1})))
  (let [t (partial #'clj-diff.miller/look-up a2-graph 0)]
    (is (= (t 3 0 -3) {:edit :insert :x 0 :p 2 :k -2 :d 2})))
  (let [t (partial #'clj-diff.miller/look-up a5-graph 0)]
    (is (= (t 2 6 0) {:edit :insert :x 6 :p 2 :k 1 :d 3}))
    (is (= (t 2 6 1) {:edit :insert :x 6 :p 2 :k 2 :d 2}))))

;; Same call shape as look-up-test: (look-left graph delta p x k), returning
;; an edit map or nil.
(deftest look-left-test
  (let [t (partial #'clj-diff.miller/look-left a1-graph 2)]
    (is (nil? (t 2 9 2)))
    (is (= (t 2 9 3) {:edit :delete :x 8 :p 1 :k 2 :d 4}))
    (is (= (t 1 8 2) {:edit :delete :x 7 :p 1 :k 1 :d 3}))
    (is (nil? (t 1 5 0))))
  (let [t (partial #'clj-diff.miller/look-left a4-graph 0)]
    (is (= (t 4 9 0) {:edit :delete :x 8 :p 4 :k -1 :d 7}))
    (is (= (t 3 6 0) {:edit :delete :x 5 :p 3 :k -1 :d 5}))
    (is (= (t 3 5 -1) {:edit :delete :x 4 :p 3 :k -2 :d 4}))
    (is (nil? (t 3 3 -3))))
  (let [t (partial #'clj-diff.miller/look-left a2-graph 0)]
    (is (nil?
(t 3 0 -3)))))

;; next-edit is called as (next-edit a b graph delta p x k). In each group the
;; :p, :x and :k of one expected edit are the arguments of the next call, so
;; the cases walk one path backwards down to the edit with :d 0.
(deftest next-edit-test
  (let [t (fn [p x k] (#'clj-diff.miller/next-edit a1 b1 a1-graph 2 p x k))]
    (is (= (t 2 12 2)
           {:edit :insert :x 9 :p 2 :k 3 :d 5}))
    (is (= (t 2 9 3)
           {:edit :insert :x 8 :p 2 :k 4 :d 4}))
    (is (= (t 2 8 4)
           {:edit :delete :x 7 :p 1 :k 3 :d 3}))
    (is (= (t 1 7 3)
           {:edit :delete :x 6 :p 0 :k 2 :d 2}))
    (is (= (t 0 6 2)
           {:edit :delete :x 5 :p 0 :k 1 :d 1}))
    (is (= (t 0 5 1)
           {:edit :delete :x 2 :p 0 :k 0 :d 0})))
  (let [t (fn [p x k] (#'clj-diff.miller/next-edit a5 b5 a5-graph 0 p x k))]
    (is (= (t 2 6 0)
           {:edit :insert :x 6 :p 2 :k 1 :d 3}))
    (is (= (t 2 6 1)
           {:edit :insert :x 6 :p 2 :k 2 :d 2}))
    (is (= (t 2 6 2)
           {:edit :delete :x 4 :p 1 :k 1 :d 1}))
    (is (= (t 1 4 1)
           {:edit :delete :x 3 :p 0 :k 0 :d 0}))))

;; Expected edit sequences for the fixture pairs, ordered by increasing :d.
(def a1-edits [{:edit :delete :x 2 :p 0 :k 0 :d 0}
               {:edit :delete :x 5 :p 0 :k 1 :d 1}
               {:edit :delete :x 6 :p 0 :k 2 :d 2}
               {:edit :delete :x 7 :p 1 :k 3 :d 3}
               {:edit :insert :x 8 :p 2 :k 4 :d 4}
               {:edit :insert :x 9 :p 2 :k 3 :d 5}])

(def a2-edits [{:edit :insert :x 0 :p 0 :k 0 :d 0}
               {:edit :insert :x 0 :p 1 :k -1 :d 1}
               {:edit :insert :x 0 :p 2 :k -2 :d 2}
               {:edit :delete :x 0 :p 3 :k -3 :d 3}
               {:edit :delete :x 1 :p 3 :k -2 :d 4}
               {:edit :delete :x 2 :p 3 :k -1 :d 5}])

(def a3-edits [{:edit :delete :x 2 :p 0 :k 0 :d 0}
               {:edit :insert :x 4 :p 1 :k 1 :d 1}])

(def a4-edits [{:edit :insert :x 2 :p 0 :k 0 :d 0}
               {:edit :insert :x 2 :p 1 :k -1 :d 1}
               {:edit :insert :x 3 :p 2 :k -2 :d 2}
               {:edit :delete :x 3 :p 3 :k -3 :d 3}
               {:edit :delete :x 4 :p 3 :k -2 :d 4}
               {:edit :delete :x 5 :p 3 :k -1 :d 5}
               {:edit :insert :x 8 :p 3 :k 0 :d 6}
               {:edit :delete :x 8 :p 4 :k -1 :d 7}])

;; edits is called as (edits a b p delta graph).
(deftest edits-test
  (let [t #'clj-diff.miller/edits]
    (is (=
(t a1 b1 2 2 a1-graph) a1-edits))
    (is (= (t a2 b2 3 0 a2-graph) a2-edits))
    (is (= (t a3 b3 1 0 a3-graph) a3-edits))
    (is (= (t a4 b4 4 0 a4-graph) a4-edits))))

;; transpose swaps :insert and :delete, subtracts :k from :x and negates :k.
(deftest transpose-test
  (let [t #'clj-diff.miller/transpose]
    (is (= (t {:edit :delete :x 2 :p 0 :k 0 :d 0})
           {:edit :insert :x 2 :p 0 :k 0 :d 0 }))
    (is (= (t {:edit :insert :x 5 :p 0 :k 1 :d 1})
           {:edit :delete :x 4 :p 0 :k -1 :d 1 }))))

;; The same edits converted to a script twice: once as they are (identity),
;; once with every edit transposed first.
(deftest edits->script
  (let [t #'clj-diff.miller/edits->script]
    (is (= (t b1 a1-edits identity)
           {:+ [[7 \e] [8 \c]] :- [2 5 6 7]}))
    (is (= (t a1 a1-edits #'clj-diff.miller/transpose)
           {:+ [[1 \e] [3 \a \b \b]] :- [4 6]}))))

;; Checks only the size of each diff: the script returned by diff is handed
;; to core/edit-distance and compared with the expected number.
(deftest diff-test
  (let [t (fn [a b] (core/edit-distance (diff a b)))]
    (are [a b _ d] (= (t a b) d)
      "acebdabbabed" "acbdeacbed" :=> 6
      "acbdeacbed" "acebdabbabed" :=> 6
      "sobibm" "sobmgc" :=> 4
      [1 2 3 4 3 2 3 2 1 2 3] [2 3 1 2 3 4 5 4 3] :=> 10
      "abcab" "cbab" :=> 3
      "abcabba" "cbabac" :=> 5
      "abc" "xyz" :=> 6
      "IFvclxMax1" "IF2qvWMa21" :=> 8
      "Udghi4" "Udhki4" :=> 2)))

;; Patching a with the diff from a to b must give back b.
(deftest roundtrip
  (are [a b]
    (= b (core/patch a (diff a b)))

    "aba" "aca"
    "abcabba" "cbabac"
    "acebdabbabed" "acbdeacbed"
    "FWdRgevf43" "T5C7U3Lz5v"
    "s2dI8h9aK1FWdRgevf43" "5hs8L9T3K2T5C7U3Lz5v"
    "nBP8GaFHVls2dI8h9aK1FWdRgevf43" "925BCPcYhT5hs8L9T3K2T5C7U3Lz5v"
    "aba" "aca"
    "Udghi4" "Udhki4"
    "sobibm" "sobmgc"))

--------------------------------------------------------------------------------
/test/clj_diff/test/optimizations.clj:
--------------------------------------------------------------------------------
(ns clj-diff.test.optimizations
  (:use [clj-diff.optimizations])
  (:use [clojure.test]))

;; common-prefix returns [shared-length a-rest b-rest].
(deftest common-prefix-test
  (is (= (common-prefix "abcdef" "abcxyz")
         [3 "def" "xyz"]))
  (is (= (common-prefix "xy"
"ab")
         [0 "xy" "ab"]))
  (is (= (common-prefix "ab" "ab")
         [2 "" ""])))

;; common-suffix returns [shared-length a-rest b-rest] with the shared suffix
;; removed from both strings.
(deftest common-suffix-test
  (is (= (common-suffix "defabc" "xyzabc")
         [3 "def" "xyz"]))
  (is (= (common-suffix "xy" "ab")
         [0 "xy" "ab"]))
  (is (= (common-suffix "ab" "ab")
         [2 "" ""])))

;; The fallback diff function is (constantly nil), so a nil result means that
;; none of the shortcuts in diff* applied.
(deftest diff*-test
  (let [t (fn [a b] (#'clj-diff.optimizations/diff* a b (constantly nil)))]
    (is (= (t "" "abc")
           {:+ [[-1 \a \b \c]]
            :- []}))
    (is (= (t "abc" "")
           {:+ []
            :- [0 1 2]}))
    (is (= (t "abc" "xyzabcmnop")
           {:+ [[-1 \x \y \z] [5 \m \n \o \p]]
            :- []}))
    (is (= (t "abc" "abcm")
           {:+ [[2 \m]]
            :- []}))
    (is (= (t "abcm" "abc")
           {:+ []
            :- [3]}))
    (is (= (t "abc" "mabc")
           {:+ [[-1 \m]]
            :- []}))
    (is (= (t "mabc" "abc")
           {:+ []
            :- [0]}))
    (is (= (t "mabc" "abc")
           {:+ []
            :- [0]}))
    (is (nil? (t "abcac" "cbab")))))

;; A match is [common a-prefix a-suffix b-prefix b-suffix]; nil means no
;; common substring of at least half the longer string's length was found.
(deftest half-match-test
  (let [t #'clj-diff.optimizations/half-match]
    (is (= (t "a" "b")
           nil))
    (is (= (t "bb" "bbg")
           nil))
    (is (= (t "ahgt" "bhahgtgbh")
           nil))
    (is (= (t "aaapppppb" "cpppppdddd")
           ["ppppp" "aaa" "b" "c" "dddd"]))
    (is (= (t "apppppaab" "pppppcdddd")
           ["ppppp" "a" "aab" "" "cdddd"]))
    (is (= (t "apppppaab" "cddddppppp")
           ["ppppp" "a" "aab" "cdddd" ""]))
    (is (= (t "aaappppbb" "cppppddddd")
           nil))))

--------------------------------------------------------------------------------