├── .cljrc.clj
├── .gitignore
├── COPYING
├── README.textile
├── project.clj
├── script
├── clean
├── javac
└── push
├── src
├── clj
│ └── clj_diff
│ │ ├── core.clj
│ │ ├── miller.clj
│ │ └── optimizations.clj
└── jvm
│ └── clj_diff
│ └── FastStringOps.java
└── test
└── clj_diff
└── test
├── core.clj
├── miller.clj
└── optimizations.clj
/.cljrc.clj:
--------------------------------------------------------------------------------
1 | (set! *print-length* 103) ; cap on the number of items printed per collection
2 | (set! *print-level* 15) ; cap on the nesting depth that gets printed
3 | #_(set! *warn-on-reflection* true) ; switched off by the #_ reader macro
4 |
5 | (defn exit "Terminate the JVM with exit status 0." [] (System/exit 0))
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | pom.xml
2 | *jar
3 | lib
4 | classes
5 | *.png
6 | files
7 | docs/
8 |
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | Eclipse Public License - v 1.0
2 |
3 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE
4 | PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF
5 | THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
6 |
7 | 1. DEFINITIONS
8 |
9 | "Contribution" means:
10 |
11 | a) in the case of the initial Contributor, the initial code and
12 | documentation distributed under this Agreement, and
13 |
14 | b) in the case of each subsequent Contributor:
15 |
16 | i) changes to the Program, and
17 |
18 | ii) additions to the Program;
19 |
20 | where such changes and/or additions to the Program originate from and
21 | are distributed by that particular Contributor. A Contribution
22 | 'originates' from a Contributor if it was added to the Program by such
23 | Contributor itself or anyone acting on such Contributor's
24 | behalf. Contributions do not include additions to the Program which:
25 | (i) are separate modules of software distributed in conjunction with
26 | the Program under their own license agreement, and (ii) are not
27 | derivative works of the Program.
28 |
29 | "Contributor" means any person or entity that distributes the Program.
30 |
31 | "Licensed Patents" mean patent claims licensable by a Contributor
32 | which are necessarily infringed by the use or sale of its Contribution
33 | alone or when combined with the Program.
34 |
35 | "Program" means the Contributions distributed in accordance with this
36 | Agreement.
37 |
38 | "Recipient" means anyone who receives the Program under this
39 | Agreement, including all Contributors.
40 |
41 | 2. GRANT OF RIGHTS
42 |
43 | a) Subject to the terms of this Agreement, each Contributor hereby
44 | grants Recipient a non-exclusive, worldwide, royalty-free copyright
45 | license to reproduce, prepare derivative works of, publicly display,
46 | publicly perform, distribute and sublicense the Contribution of such
47 | Contributor, if any, and such derivative works, in source code and
48 | object code form.
49 |
50 | b) Subject to the terms of this Agreement, each Contributor hereby
51 | grants Recipient a non-exclusive, worldwide, royalty-free patent
52 | license under Licensed Patents to make, use, sell, offer to sell,
53 | import and otherwise transfer the Contribution of such Contributor, if
54 | any, in source code and object code form. This patent license shall
55 | apply to the combination of the Contribution and the Program if, at
56 | the time the Contribution is added by the Contributor, such addition
57 | of the Contribution causes such combination to be covered by the
58 | Licensed Patents. The patent license shall not apply to any other
59 | combinations which include the Contribution. No hardware per se is
60 | licensed hereunder.
61 |
62 | c) Recipient understands that although each Contributor grants the
63 | licenses to its Contributions set forth herein, no assurances are
64 | provided by any Contributor that the Program does not infringe the
65 | patent or other intellectual property rights of any other entity. Each
66 | Contributor disclaims any liability to Recipient for claims brought by
67 | any other entity based on infringement of intellectual property rights
68 | or otherwise. As a condition to exercising the rights and licenses
69 | granted hereunder, each Recipient hereby assumes sole responsibility
70 | to secure any other intellectual property rights needed, if any. For
71 | example, if a third party patent license is required to allow
72 | Recipient to distribute the Program, it is Recipient's responsibility
73 | to acquire that license before distributing the Program.
74 |
75 | d) Each Contributor represents that to its knowledge it has sufficient
76 | copyright rights in its Contribution, if any, to grant the copyright
77 | license set forth in this Agreement.
78 |
79 | 3. REQUIREMENTS
80 |
81 | A Contributor may choose to distribute the Program in object code form
82 | under its own license agreement, provided that:
83 |
84 | a) it complies with the terms and conditions of this Agreement; and
85 |
86 | b) its license agreement:
87 |
88 | i) effectively disclaims on behalf of all Contributors all warranties
89 | and conditions, express and implied, including warranties or
90 | conditions of title and non-infringement, and implied warranties or
91 | conditions of merchantability and fitness for a particular purpose;
92 |
93 | ii) effectively excludes on behalf of all Contributors all liability
94 | for damages, including direct, indirect, special, incidental and
95 | consequential damages, such as lost profits;
96 |
97 | iii) states that any provisions which differ from this Agreement are
98 | offered by that Contributor alone and not by any other party; and
99 |
100 | iv) states that source code for the Program is available from such
101 | Contributor, and informs licensees how to obtain it in a reasonable
102 | manner on or through a medium customarily used for software exchange.
103 |
104 | When the Program is made available in source code form:
105 |
106 | a) it must be made available under this Agreement; and
107 |
108 | b) a copy of this Agreement must be included with each copy of the Program.
109 |
110 | Contributors may not remove or alter any copyright notices contained
111 | within the Program.
112 |
113 | Each Contributor must identify itself as the originator of its
114 | Contribution, if any, in a manner that reasonably allows subsequent
115 | Recipients to identify the originator of the Contribution.
116 |
117 | 4. COMMERCIAL DISTRIBUTION
118 |
119 | Commercial distributors of software may accept certain
120 | responsibilities with respect to end users, business partners and the
121 | like. While this license is intended to facilitate the commercial use
122 | of the Program, the Contributor who includes the Program in a
123 | commercial product offering should do so in a manner which does not
124 | create potential liability for other Contributors. Therefore, if a
125 | Contributor includes the Program in a commercial product offering,
126 | such Contributor ("Commercial Contributor") hereby agrees to defend
127 | and indemnify every other Contributor ("Indemnified Contributor")
128 | against any losses, damages and costs (collectively "Losses") arising
129 | from claims, lawsuits and other legal actions brought by a third party
130 | against the Indemnified Contributor to the extent caused by the acts
131 | or omissions of such Commercial Contributor in connection with its
132 | distribution of the Program in a commercial product offering. The
133 | obligations in this section do not apply to any claims or Losses
134 | relating to any actual or alleged intellectual property
135 | infringement. In order to qualify, an Indemnified Contributor must: a)
136 | promptly notify the Commercial Contributor in writing of such claim,
137 | and b) allow the Commercial Contributor tocontrol, and cooperate with
138 | the Commercial Contributor in, the defense and any related settlement
139 | negotiations. The Indemnified Contributor may participate in any such
140 | claim at its own expense.
141 |
142 | For example, a Contributor might include the Program in a commercial
143 | product offering, Product X. That Contributor is then a Commercial
144 | Contributor. If that Commercial Contributor then makes performance
145 | claims, or offers warranties related to Product X, those performance
146 | claims and warranties are such Commercial Contributor's responsibility
147 | alone. Under this section, the Commercial Contributor would have to
148 | defend claims against the other Contributors related to those
149 | performance claims and warranties, and if a court requires any other
150 | Contributor to pay any damages as a result, the Commercial Contributor
151 | must pay those damages.
152 |
153 | 5. NO WARRANTY
154 |
155 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS
156 | PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
157 | KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY
158 | WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY
159 | OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely
160 | responsible for determining the appropriateness of using and
161 | distributing the Program and assumes all risks associated with its
162 | exercise of rights under this Agreement , including but not limited to
163 | the risks and costs of program errors, compliance with applicable
164 | laws, damage to or loss of data, programs or equipment, and
165 | unavailability or interruption of operations.
166 |
167 | 6. DISCLAIMER OF LIABILITY
168 |
169 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR
170 | ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT,
171 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING
172 | WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF
173 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
174 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR
175 | DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED
176 | HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
177 |
178 | 7. GENERAL
179 |
180 | If any provision of this Agreement is invalid or unenforceable under
181 | applicable law, it shall not affect the validity or enforceability of
182 | the remainder of the terms of this Agreement, and without further
183 | action by the parties hereto, such provision shall be reformed to the
184 | minimum extent necessary to make such provision valid and enforceable.
185 |
186 | If Recipient institutes patent litigation against any entity
187 | (including a cross-claim or counterclaim in a lawsuit) alleging that
188 | the Program itself (excluding combinations of the Program with other
189 | software or hardware) infringes such Recipient's patent(s), then such
190 | Recipient's rights granted under Section 2(b) shall terminate as of
191 | the date such litigation is filed.
192 |
193 | All Recipient's rights under this Agreement shall terminate if it
194 | fails to comply with any of the material terms or conditions of this
195 | Agreement and does not cure such failure in a reasonable period of
196 | time after becoming aware of such noncompliance. If all Recipient's
197 | rights under this Agreement terminate, Recipient agrees to cease use
198 | and distribution of the Program as soon as reasonably
199 | practicable. However, Recipient's obligations under this Agreement and
200 | any licenses granted by Recipient relating to the Program shall
201 | continue and survive.
202 |
203 | Everyone is permitted to copy and distribute copies of this Agreement,
204 | but in order to avoid inconsistency the Agreement is copyrighted and
205 | may only be modified in the following manner. The Agreement Steward
206 | reserves the right to publish new versions (including revisions) of
207 | this Agreement from time to time. No one other than the Agreement
208 | Steward has the right to modify this Agreement. The Eclipse Foundation
209 | is the initial Agreement Steward. The Eclipse Foundation may assign
210 | the responsibility to serve as the Agreement Steward to a suitable
211 | separate entity. Each new version of the Agreement will be given a
212 | distinguishing version number. The Program (including Contributions)
213 | may always be distributed subject to the version of the Agreement
214 | under which it was received. In addition, after a new version of the
215 | Agreement is published, Contributor may elect to distribute the
216 | Program (including its Contributions) under the new version. Except as
217 | expressly stated in Sections 2(a) and 2(b) above, Recipient receives
218 | no rights or licenses to the intellectual property of any Contributor
219 | under this Agreement, whether expressly, by implication, estoppel or
220 | otherwise. All rights in the Program not expressly granted under this
221 | Agreement are reserved.
222 |
223 | This Agreement is governed by the laws of the State of California and
224 | the intellectual property laws of the United States of America. No
225 | party to this Agreement will bring a legal action under this Agreement
226 | more than one year after the cause of action arose. Each party waives
227 | its rights to a jury trial in any resulting litigation.
--------------------------------------------------------------------------------
/README.textile:
--------------------------------------------------------------------------------
1 | h1. clj-diff
2 |
3 | Provides diff and patch functions for Clojure sequences where (diff a b) -> x and (patch a x) -> b. Also provides edit-distance and levenshtein-distance functions for calculating the difference between two sequences.
4 |
5 | h2. Usage
6 |
7 |
user=> (use 'clj-diff.core)
8 | user=> (diff "John went to the movies." "John was in the movies.")
9 | {:+ [[5 \a \s \space \i]], :- [6 8 9 10 11]}
10 | user=> (patch "John went to the movies." *1)
11 | "John was in the movies."
12 | user=> (edit-distance "John went to the movies." "John was in the movies.")
13 | 9
14 | user=> (levenshtein-distance "John went to the movies." "John was in the movies.")
15 | 8
16 |
17 |
18 | There is already a "Java library":http://code.google.com/p/google-diff-match-patch/ which does this well. Why create a Clojure version? So that we can do this:
19 |
20 | user=> (def a [{:a 1} {:a 2} {:a 3} {:a 4} {:a 5} {:a 6} {:a 7}])
21 | user=> (def b [{:a 2} {:a 3} {:a 4} {:a 5} {:a 6} {:a 7} {:a 1}])
22 | user=> (diff a b)
23 | {:+ [[6 {:a 1}]], :- [0]}
24 | user=> (patch a *1)
25 | ({:a 2} {:a 3} {:a 4} {:a 5} {:a 6} {:a 7} {:a 1})
26 | user=> (edit-distance a b)
27 | 2
28 |
29 |
30 | h2. Notes
31 |
32 | The current diff algorithm comes from the paper "An O(NP) Sequence Comparison Algorithm":http://portal.acm.org/citation.cfm?id=96223 by Sun Wu, Udi Manber, Gene Myers and Webb Miller. It is fast and memory efficient. It also makes use of the pre-diff optimizations mentioned in Neil Fraser's "Diff Strategies":http://neil.fraser.name/writing/diff/. The worst-case running time of the algorithm is dependent only on the length of the longest sequence (N) and the number of deletions (P). This is much better than the "Myers":http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.4.6927&rep=rep1&type=pdf algorithm which is an O(ND) algorithm where N is the sum of the length of the two sequences and D is the edit distance.
33 |
34 | I have created a "separate project":http://github.com/brentonashworth/clj-diff-performance for comparing the performance of different diff algorithms. The main performance goal of this project is to create a Clojure diff that can outperform Fraser's Java implementation. One of the most interesting results is shown below.
35 |
36 | Add [clj-diff "1.0.0-SNAPSHOT"] to your :dependencies in project.clj.
45 |
46 | h3. Maven
47 |
48 | Add the following dependency:
49 |
50 |
51 | clj-diff
52 | clj-diff
53 | 1.0.0-SNAPSHOT
54 |
55 |
56 | ...which comes from Clojars...
57 |
58 |
59 | clojars.org
60 | http://clojars.org/repo
61 |
62 |
63 | h2. References
64 |
65 | * "An O(ND) Difference Algorithm and Its Variations by Eugene W. Myers":http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.4.6927&rep=rep1&type=pdf
66 | * "An O(NP) Sequence Comparison Algorithm by Sun Wu, Udi Manber, Gene Myers and Webb Miller":http://portal.acm.org/citation.cfm?id=96223
67 |
68 | In order to understand the above two titles, you will need to know what N, D and P stand for. Let A and B be two sequences with lengths Q and M where Q >= M. Let D be the length of the minimum edit script for A and B. In the title of the first paper N = Q + M. In the second paper N = Q and P = (1/2)D - (1/2)(Q - M). Therefore, the O(NP) version is faster which is visualized in the charts below.
69 |
70 | * "Diff Strategies by Neil Fraser":http://neil.fraser.name/writing/diff/
71 |
72 | h2. Roadmap
73 |
74 | h3. Version 1.0
75 |
76 | # Text diff visualization.
77 | # HTML diff visualization.
78 | # Semantic cleanup.
79 | # Sequence chunking. Allow line based diff for strings.
80 |
81 | h3. Version 2.0
82 |
83 | # Improve performance by searching the edit graph from both ends at the same time.
84 | # Arbitrary Clojure form diff/Nested diffs. Integrate with clojure.data/diff in Clojure 1.3
85 | # Set the maximum time to look for an optimal diff.
86 | # Add a fast and correct levenshtein-distance function.
87 |
88 |
89 | h2. License
90 |
91 | Copyright (C) 2010-2011 Brenton Ashworth
92 |
93 | Distributed under the Eclipse Public License, the same as Clojure uses. See the file COPYING.
94 |
--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
1 | (defproject clj-diff "1.0.0-SNAPSHOT"
2 | :description "Sequential diff in Clojure."
3 | :url "http://github.com/brentonashworth/clj-diff"
4 | :source-path "src/clj" ; Clojure sources live here
5 | :java-source-path "src/jvm" ; Java sources (FastStringOps.java) live here
6 | :java-fork "true" ; NOTE(review): presumably options read by the lein-javac plugin -- confirm
7 | :java-debug "true"
8 | :hooks [leiningen.hooks.javac ; presumably provided by the lein-javac dev dependency -- confirm
9 | leiningen.hooks.difftest] ; presumably provided by the lein-difftest dev dependency -- confirm
10 | :dev-dependencies [[org.clojure/clojure "1.2.0"]
11 | [lein-javac "1.2.1-SNAPSHOT"]
12 | [marginalia "0.5.0"]
13 | [lein-difftest "1.3.2-SNAPSHOT"]])
14 |
--------------------------------------------------------------------------------
/script/clean:
--------------------------------------------------------------------------------
1 | # Remove build output (files named clj-diff-* and the classes directory); run from the project root.
2 | rm -f clj-diff-*   # -f: do not report an error when no such file exists yet
3 | rm -rf classes
--------------------------------------------------------------------------------
/script/javac:
--------------------------------------------------------------------------------
1 | # Compile the Java helper and stage its class file under classes/; run from the project root.
2 | set -e   # stop at the first failing step instead of carrying on in the wrong state
3 | rm -rf classes
4 | mkdir -p classes/clj_diff
5 | # The subshell keeps the working directory unchanged even when cd or javac fails.
6 | (cd src/jvm && javac clj_diff/FastStringOps.java)
7 | mv src/jvm/clj_diff/FastStringOps.class classes/clj_diff
8 |
--------------------------------------------------------------------------------
/script/push:
--------------------------------------------------------------------------------
1 | # Build the jar and the pom, then upload both to Clojars; run from the project root.
2 | set -e   # never upload after a failed build step
3 | lein jar
4 | lein pom && scp pom.xml clj-diff-* clojars@clojars.org:
--------------------------------------------------------------------------------
/src/clj/clj_diff/core.clj:
--------------------------------------------------------------------------------
1 | (ns clj-diff.core
2 | "Diff, patch and related functions for Clojure sequences."
3 | (:require [clj-diff [miller :as miller]]))
4 |
5 | (defn diff
6 | "Create the edit script for transforming sequence a into sequence b.
7 | An edit script is a map with keys :+ and :- for additions and deletions.
8 | Additions are represented as a sequence of vectors. The first item in each
9 | vector is the index where the rest of the items in the vector are to be
10 | inserted. For example [3 b c] means to insert b and c after whatever is
11 | in index 3. Deletions are represented as a sequence of indexes to delete.
12 |
13 | For example: the diff of 'abcabba' and 'cbabac' would generate the edit
14 | script below.
15 |
16 | {:+ [[2 b] [6 c]], :- [0 1 5]}
17 |
18 | An index of -1 may appear in additions and is a special case which means to
19 | add the elements at the beginning of the sequence."
20 | [a b]
21 | (miller/diff a b)) ; all of the work is done by clj-diff.miller/diff
22 |
23 | (defn- merge-patch
24 |   "Apply edit-script to s, putting delete-symbol at every deleted index, and
25 |   return the resulting items. Each index owns a vector of items and the vectors
26 |   are concatenated one level deep, so elements that are themselves sequential
27 |   are kept intact (flatten used to splice them into the result)."
28 |   [s edit-script delete-symbol]
29 |   (let [slots (vec (map vector s))
30 |         slots (reduce #(assoc %1 %2 [delete-symbol]) slots (:- edit-script))
31 |         ;; index -1 inserts before the first item; any other index appends after
32 |         ;; the item at that index. fnil covers an index one past the end.
33 |         slots (reduce (fn [acc [index & items]]
34 |                         (if (= index -1)
35 |                           (update-in acc [0] #(into (vec items) %))
36 |                           (update-in acc [index] (fnil into []) items)))
37 |                       slots (:+ edit-script))]
38 |     (apply concat slots)))
39 |
40 | (defn patch*
41 |   "Apply edit-script to s. Deleted positions are marked with a private sentinel
42 |   object rather than nil, so nil elements of s are no longer dropped."
43 |   [s edit-script]
44 |   (let [gone (Object.)] (remove #(identical? gone %) (merge-patch s edit-script gone))))
45 |
46 | (defmulti ^{:arglists '([s edit-script])} patch
47 | "Use the instructions in the edit script to transform the sequence s into
48 | a new sequence. If the edit script was created by using diff on a and b then
49 | patch will use the edit script to transform a into b.
50 |
51 | (diff a b) -> x, (patch a x) -> b."
52 | (fn [s _] (class s))) ; dispatch on the class of s; the edit script plays no part
53 |
54 | (defmethod patch :default ; any s that is not a String: return the seq built by patch*
55 | [s edit-script]
56 | (patch* s edit-script))
57 |
58 | (defmethod patch String ; a String s: join the patched items back into a String
59 | [s edit-script]
60 | (apply str (patch* s edit-script)))
61 |
62 | (defn edit-distance
63 |   "Edit distance: the minimum number of insertions and deletions required to
64 |   transform one sequence into another. Given two sequences the work is handed to
65 |   clj-diff.miller/edit-distance; given an edit script the result is the number
66 |   of deleted indexes plus the number of inserted items."
67 |   ([a b] (miller/edit-distance a b))
68 |   ([edit-script]
69 |      (let [inserted (map (comp count rest) (:+ edit-script))]
70 |        (apply + (count (:- edit-script)) inserted))))
71 |
72 | (defn- max-or-zero
73 |   "Largest value in coll, or 0 when coll is empty or is not a collection."
74 |   [coll]
75 |   (if-not (and (coll? coll) (seq coll))
76 |     0 (apply max coll)))
77 |
78 | (defn levenshtein-distance
79 | "Returns the Levenshtein distance between two sequences. May either be passed
80 | the two sequences or a diff of the two sequences.
81 |
82 | From [Wikipedia](http://en.wikipedia.org/wiki/Levenshtein_distance):
83 | The Levenshtein distance between two strings is the minimum number of edits
84 | needed to transform one string into the other, with the allowable edit
85 | operations being insertion, deletion and substitution of a single character.
86 |
87 | This function works not only with strings but with any Clojure sequence.
88 |
89 | Warning! Technically this function is estimating the Levenshtein distance
90 | from a computed diff. Most of the time, it is the same as the real Levenshtein
91 | distance but in some cases it may be larger. The reason for this is that
92 | there may be multiple paths through an edit graph with the same edit
93 | distance but with differing Levenshtein distance. A future improvement to
94 | the diff algorithm would be to find all paths and prefer the one with the
95 | minimum Levenshtein distance."
96 | ([a b]
97 | (levenshtein-distance (diff a b)))
98 | ([edit-script]
99 | (let [additions (map #(let [index (first %) ; keep each insertion's index ...
100 | items (rest %)]
101 | (apply vector index (repeat (count items) :a))) ; ... but replace every inserted item with :a
102 | (:+ edit-script))
103 | max-index (max (max-or-zero (map first additions))
104 | (max-or-zero (:- edit-script))) ; largest index touched by the script, 0 if none
105 | v (vec (repeat max-index :e)) ; stand-in sequence made of :e (unchanged) markers
106 | patched (merge-patch v (merge edit-script {:+ additions}) :d) ; apply the script, marking deletions with :d
107 | edit-groups (filter #(not= :e (first %)) ; keep only the runs of edits between unchanged items
108 | (partition-by #(if (= % :e) :e :edit)
109 | patched))]
110 | (reduce + (map (fn [group] ; a run costs max(#:a, #:d), so an insert paired with a delete counts once
111 | (max (count (filter #(= % :a) group))
112 | (count (filter #(= % :d) group))))
113 | edit-groups)))))
114 |
115 | (defn longest-common-subseq [a b] ; delegates to clj-diff.miller/longest-common-subseq:
116 | (miller/longest-common-subseq a b)) ; a String for two strings, otherwise a seq of the items of a kept by the diff
117 |
--------------------------------------------------------------------------------
/src/clj/clj_diff/miller.clj:
--------------------------------------------------------------------------------
1 | (ns clj-diff.miller
2 | "Algorithm from 'An O(NP) Sequence Comparison Algorithm' by
3 | Sun Wu, Udi Manber, Gene Myers and Webb Miller.
4 |
5 | Please refer to the above paper while reading this code."
6 | (:require [clj-diff [optimizations :as opt]]))
7 |
8 | (defn- next-x
9 |   "Pick the x value from which the snake on diagonal k starts: the larger of
10 |   (farthest x on diagonal k - 1) + 1 and the farthest x on diagonal k + 1. fp
11 |   maps each diagonal to its farthest x so far; a diagonal with no entry counts
12 |   as -1."
13 |   [k fp]
14 |   (let [farthest #(get fp % -1)]
15 |     (max (inc (farthest (dec k))) (farthest (inc k)))))
16 |
17 | (defn- snake
18 |   "Follow the snake on diagonal k as far as it goes and return the x value where
19 |   it ends. The walk starts at the x chosen by next-x and advances while x < n,
20 |   y < m and the next items of a and b are equal."
21 |   [a b n m k fp]
22 |   {:pre [(and (vector? a) (vector? b))]}
23 |   (loop [x (next-x k fp)]
24 |     ;; y is always x - k on diagonal k, so only x needs to be carried.
25 |     (let [y (- x k)
26 |           more? (and (< x n) (< y m) (= (get a (inc x)) (get b (inc y))))]
27 |       (if more?
28 |         (recur (inc x))
29 |         x))))
30 |
31 | (defn- p-band-diagonals
32 |   "List the diagonals of the p-band for sink diagonal delta, in search order:
33 |   -p up to delta - 1, then delta + p down to delta + 1, and finally delta."
34 |   [p delta]
35 |   (let [below (range (- p) delta), above (range (+ delta p) delta -1)]
36 |     (concat below above [delta])))
37 |
38 | (defn- search-p-band
39 |   "Run snake on every diagonal of the p-band, in the order given by
40 |   p-band-diagonals, recording each diagonal's farthest x in fp as it goes (so
41 |   later diagonals see the values just computed). Returns the updated fp. a and
42 |   b are the two sequences, n and m the limits passed through to snake, and
43 |   delta the diagonal of the sink."
44 |   [a b n m delta p fp]
45 |   (let [record (fn [farthest k] (assoc farthest k (snake a b n m k farthest)))
46 |         band   (p-band-diagonals p delta)]
47 |     (reduce record
48 |             fp band)))
49 |
50 | (defn ses
51 | "Search p = 0, 1, 2, ... until the farthest point on diagonal delta reaches
52 | n. Returns the 3-tuple [p delta fp]: the p value at which that happened, the
53 | delta value (the diagonal of the sink) and the fp map of p => {diagonal =>
54 | farthest x}. The optimal path from source to sink can be constructed from this information."
55 | [a b]
56 | {:pre [(>= (count a) (count b))]} ; a must be at least as long as b
57 | (let [n (dec (count a)) ; last index of a
58 | m (dec (count b)) ; last index of b
59 | delta (- n m)]
60 | (loop [p 0
61 | fp {}]
62 | (if (= (-> (get fp (dec p) {}) ; result of the previous band, if any
63 | (get delta))
64 | n)
65 | [(dec p) delta fp]
66 | (recur (inc p)
67 | (assoc fp p
68 | (search-p-band a b n m delta p (get fp (dec p) {})))))))) ; each band starts from the previous band's map
69 |
70 | ;;
71 | ;; Build the edit script from the map of farthest endpoints.
72 | ;;
73 |
74 | (defn edit-dist
75 |   "Edit distance of a point on diagonal k in band p, given the sink diagonal
76 |   delta: 2p + k, where p is first reduced by (k - delta) for diagonals above delta."
77 |   [delta p k]
78 |   (let [overshoot (if (> k delta) (- k delta) 0)]
79 |     (+ (* 2 (- p overshoot)) k)))
80 |
81 | (defn- p-value-up
82 |   "Return the p value under which the farthest reaching end point of diagonal
83 |   k + 1 is looked up: p itself when k + 1 lies beyond delta, otherwise p - 1."
84 |   [delta p k]
85 |   (if (<= (inc k) delta) (dec p) p))
86 |
87 | (defn- p-value-left
88 |   "Return the p value under which the farthest reaching end point of diagonal
89 |   k - 1 is looked up: p itself when k - 1 lies below delta, otherwise p - 1."
90 |   [delta p k]
91 |   (if (>= (dec k) delta) (dec p) p))
92 |
93 | (defn- look-up
94 |   "Look at diagonal k + 1 for a recorded farthest point with the same x as the
95 |   vertex at x on k, i.e. the vertex directly above it. Returns a map describing
96 |   that vertex as an :insert edit, or nil when y = x - k is not positive or no
97 |   such point was recorded."
98 |   [graph delta p x k]
99 |   (let [y (- x k)]
100 |     (when (pos? y)
101 |       (let [up-k (inc k)
102 |             up-p (p-value-up delta p k)
103 |             found (get (get graph up-p {}) up-k -1)]
104 |         (when (and (<= 0 found) (= x found))
105 |           {:edit :insert
106 |            :x found
107 |            :p up-p, :k up-k
108 |            :d (edit-dist delta up-p up-k)})))))
109 |
110 | (defn- look-left
111 |   "Look at diagonal k - 1 for a recorded farthest point whose x is one less than
112 |   the vertex at x on k, i.e. the vertex directly to its left. Returns a map
113 |   describing that vertex as a :delete edit, or nil when x is not positive or
114 |   no such point was recorded."
115 |   [graph delta p x k]
116 |   (when (pos? x)
117 |     (let [left-k (dec k)
118 |           left-p (p-value-left delta p k)
119 |           found  (get (get graph left-p {}) left-k -1)]
120 |       (when (and (<= 0 found) (= (dec x) found))
121 |         {:edit :delete
122 |          :x found
123 |          :p left-p
124 |          :k left-k
125 |          :d (edit-dist delta left-p left-k)}))))
126 |
127 | (defn- backtrack-snake
128 |   "Walk back along the diagonal from (x, y) while the items a[x] and b[y] are
129 |   equal, stopping at (0, 0), and return the x value where the walk stops."
130 |   [a b x y]
131 |   {:pre [(and (>= x 0) (>= y 0))]}
132 |   (loop [i x, j y]
133 |     (if (and (not (= i j 0)) (= (get a i) (get b j)))
134 |       (recur (dec i) (dec j))
135 |       i)))
136 |
137 | ;; See the paper for an example of how there are multiple shortest
138 | ;; paths through an edit graph.
139 |
140 | (defn- next-edit
141 | "Find the next move through the edit graph which will decrease the
142 | edit distance by 1."
143 | [a b graph delta p x k]
144 | {:post [(= (dec (edit-dist delta p k)) (:d %))]} ; the chosen move must have distance d - 1
145 | (let [d (edit-dist delta p k)
146 | head-x (backtrack-snake a b x (- x k))] ; x at the start of the snake that ends at (x, x - k)
147 | (loop [head-x head-x]
148 | (let [move (first (filter #(and (not (nil? %)) ;; drop directions that found no vertex
149 | (= (:d %) (dec d)))
150 | (map #(% graph delta p head-x k) ; try look-left first, then look-up
151 | [look-left look-up])))]
152 | (if (and (< head-x x) (nil? move)) ; nothing here: retry one step further along the snake
153 | (recur (inc head-x))
154 | move)))))
155 |
156 | (defn- edits
157 | "Calculate the sequence of edits from the map of farthest reaching end points:
158 | starting on diagonal delta, call next-edit until the distance is 0. The last edit found comes first in the result."
159 | [a b p delta graph]
160 | (let [next-fn (partial next-edit a b graph delta)]
161 | (loop [edits '() ; a list, so conj prepends
162 | prev {:x (count a) :p p :k delta ; NOTE(review): x starts at (count a), one more than the n used by ses -- confirm intended
163 | :d (edit-dist delta p delta)}]
164 | (if (= (:d prev) 0)
165 | edits
166 | (let [next (next-fn (:p prev) (:x prev) (:k prev))]
167 | (recur (conj edits next) next))))))
168 |
169 | (defn- transpose
170 |   "Convert an edit computed on the swapped pair (b, a) into the matching edit
171 |   for (a, b): inserts and deletes trade places, x becomes x - k and the
172 |   diagonal k changes sign. Used when a is shorter than b."
173 |   [edit]
174 |   (let [{:keys [x k]} edit
175 |         flipped (if (= :insert (:edit edit)) :delete :insert)]
176 |     (assoc edit :edit flipped, :x (- x k), :k (- k))))
177 |
178 | (defn- edits->script
179 | "Convert a sequence of edits into an edit script. f is applied to every edit first (seq-diff passes identity or transpose)."
180 | [b edits f]
181 | (reduce (fn [script edit]
182 | (let [{:keys [edit x k]} (f edit)
183 | y (inc (- x k)) ; index into b of the item being inserted
184 | insertions (:+ script)
185 | last-insert (last insertions)]
186 | (if (= edit :delete)
187 | (assoc script :- (conj (:- script) x)) ; deletions are recorded as the edit's x
188 | (assoc script :+ (let [index (dec x)]
189 | (if (= index (first last-insert)) ; same index as the previous insert: extend that vector
190 | (conj (vec (butlast insertions))
191 | (conj last-insert (get b y)))
192 | (conj insertions [(dec x) (get b y)]))))))) ; otherwise start a new [index item] vector
193 | {:+ []
194 | :- []}
195 | edits))
196 |
197 | (defn vectorize [& more] ; each arg becomes a vector with nil at index 0, so its items start at index 1
198 |   (for [coll more] (into [nil] coll)))
199 |
200 | (defn order->ses
201 |   [a b] ; => [(ses longer shorter) longer shorter]; a counts as the longer one on ties
202 |   (let [[longer shorter] (if (<= (count b) (count a)) [a b] [b a])]
203 |     [(ses longer shorter) longer shorter]))
204 |
205 | (defn seq-diff
206 |   [a b] ; => edit script {:+ [...] :- [...]} turning a into b
207 |   (let [[va vb] (vectorize a b)
208 |         [[p delta graph] longer shorter] (order->ses va vb)
209 |         path (edits longer shorter p delta graph)]
210 |     (edits->script vb path (if (= longer va) identity transpose))))
211 |
212 | (defn string-dispatch [a b] ; dispatch fn: :string when both args are strings, otherwise nil
213 |   (if (every? string? [a b]) :string nil))
214 |
215 | (defmulti ^{:arglists '([a b])} diff
216 | "Create an edit script that may be used to transform a into b. See doc string
217 | for clj-diff.core/diff. The underlying search is always run with the longer
218 | sequence first; if a and b had to be swapped for that, the resulting edits are
219 | transposed back into a diff from a to b."
220 | string-dispatch) ; :string for two strings, nil (handled by :default) otherwise
221 |
222 | (defmethod diff :default
223 | [a b]
224 | (seq-diff a b))
225 |
226 | (defmethod diff :string ; strings go through clj-diff.optimizations/diff, which is handed seq-diff (presumably as the fallback -- confirm)
227 | [a b]
228 | (opt/diff a b seq-diff))
229 |
230 | (defn seq-edit-dist
231 |   [a b] ; => 2p + (length difference), p being the first item of the tuple returned by ses
232 |   (let [[va vb] (vectorize a b)
233 |         [[p] longer shorter] (order->ses va vb)]
234 |     (+ (* 2 p) (- (count longer) (count shorter)))))
235 |
236 | (defmulti edit-distance string-dispatch) ; dispatch value is :string for two strings, nil otherwise
237 |
238 | (defmethod edit-distance :default
239 | [a b]
240 | (seq-edit-dist a b))
241 |
242 | ;; TODO - Modify optimizations so that it can be used here and with
243 | ;; longest-common-subseq
244 | (defmethod edit-distance :string ; currently identical to :default (see the TODO above)
245 | [a b]
246 | (seq-edit-dist a b))
247 |
(defn seq-lcs
  "Longest common subsequence of a and b as a lazy seq: the elements of a
  that remain once every index listed under :- in (seq-diff a b) has been
  removed."
  [a b]
  (let [deleted (:- (seq-diff a b))
        ;; Overwrite each deleted slot with a sentinel so the positions of
        ;; the remaining elements do not shift while marking.
        marked  (reduce #(assoc %1 %2 ::d)
                        (vec (seq a))
                        deleted)]
    (remove #(= % ::d) marked)))
257 |
(defmulti longest-common-subseq
  "Longest common subsequence of a and b. Dispatches through
  string-dispatch; two strings yield a string, anything else yields the seq
  returned by seq-lcs."
  string-dispatch)

(defmethod longest-common-subseq :default
  [a b]
  (seq-lcs a b))

;; seq-lcs yields characters for string input; join them back into a string.
(defmethod longest-common-subseq :string
  [a b]
  (apply str (seq-lcs a b)))
267 |
--------------------------------------------------------------------------------
/src/clj/clj_diff/optimizations.clj:
--------------------------------------------------------------------------------
1 | (ns clj-diff.optimizations
2 | "String optimizations for diff algorithms.
3 | See http://neil.fraser.name/writing/diff/."
4 | (:import clj_diff.FastStringOps))
5 |
(defn common-prefix
  "Return [n a-rest b-rest] where n is the value reported by
  FastStringOps/commonPrefix for a and b, and a-rest / b-rest are a and b
  with their first n characters removed."
  [^String a ^String b]
  (let [n      (FastStringOps/commonPrefix a b)
        a-rest (subs a n)
        b-rest (subs b n)]
    [n a-rest b-rest]))
9 |
(defn common-suffix
  "Return [n a-head b-head] where n is the value reported by
  FastStringOps/commonSuffix for a and b, and a-head / b-head are a and b
  with their last n characters removed."
  [^String a ^String b]
  (let [n      (FastStringOps/commonSuffix a b)
        a-keep (- (.length a) n)
        b-keep (- (.length b) n)]
    [n
     (subs a 0 a-keep)
     (subs b 0 b-keep)]))
15 |
(defn- short-within-long
  "Return a diff if the shorter sequence exists in the longer one. No need to
  use the expensive diff algorithm for this.

  a and b are the strings to compare, ca and cb their lengths. Returns nil
  when the shorter string does not occur in the longer one. Otherwise
  returns an edit script whose indices all refer to positions in a:

    - a is the shorter (or equally long) string: the parts of b around the
      match are inserted, the leading part at index -1 (before a) and the
      trailing part after the last element of a;
    - b is the shorter string: the parts of a around the match are deleted."
  [^String a ^String b ^Integer ca ^Integer cb]
  (let [a-short? (<= ca cb)
        [^String shorter ^String longer] (if a-short? [a b] [b a])
        i (int (.indexOf longer shorter))]
    (when-not (= i -1)
      (if a-short?
        {:+ (vec
             (remove nil?
                     [(when (> i 0)
                        (vec (cons -1 (.substring b 0 i))))
                      (when (< (+ i ca) cb)
                        ;; Insertions are anchored on positions in a, so the
                        ;; text following the match goes after a's last
                        ;; element (ca - 1). The position where the match
                        ;; ends in b (i + ca - 1) lies outside a when i > 0.
                        (vec (cons (dec ca) (.substring b (+ i ca)))))]))
         :- []}
        {:+ []
         :- (vec (concat (range 0 i)
                         (range (+ i cb) ca)))}))))
35 |
(defn- half-match*
  "Look for a substring shared by long and short, seeded with the quarter
  length slice of long that starts at index i.

  Every occurrence of the seed in short is extended to the right with
  common-prefix and to the left with common-suffix; the longest extension
  wins (the first one found wins ties). Returns
  [common long-prefix long-suffix short-prefix short-suffix] when the best
  common string is at least (quot (count long) 2) characters long, and nil
  otherwise."
  [^String long ^String short ^Integer i]
  (let [long-count (count long)
        target     (.substring long i (+ i (quot long-count 4)))
        threshold  (quot long-count 2)
        ;; The two sides of long around i are the same for every candidate.
        long-head  (.substring long 0 i)
        long-tail  (.substring long i)]
    (loop [j    (.indexOf short target 0)
           best []]
      (if (not= j -1)
        (let [prefix-length (first (common-prefix long-tail
                                                  (.substring short j)))
              suffix-length (first (common-suffix long-head
                                                  (.substring short 0 j)))
              best-length   (count (or (first best) ""))
              match-start   (- j suffix-length)
              match-end     (+ j prefix-length)]
          (recur (.indexOf short target (inc j))
                 (if (< best-length (+ prefix-length suffix-length))
                   [(str (.substring short match-start j)
                         (.substring short j match-end))
                    (.substring long 0 (- i suffix-length))
                    (.substring long (+ i prefix-length))
                    (.substring short 0 match-start)
                    (.substring short match-end)]
                   best)))
        ;; No further occurrence of the seed: keep the best candidate only
        ;; if it is long enough.
        (when (>= (count (or (first best) "")) threshold)
          best)))))
59 |
(defn- half-match
  "Find a substring shared by both sequences which is at least half as long
  as the longer sequence. Return a vector of five elements if one is found
  and nil if not. The five elements are: the common sequence, the prefix of
  sequence a, the suffix of sequence a, the prefix of sequence b and the
  suffix of sequence b.

  No search is attempted when the longer sequence has fewer than four
  elements or when it is more than twice as long as the shorter one."
  [^String a ^String b]
  (let [[short long] (if (> (count a) (count b)) [b a] [a b])
        short-count  (count short)
        long-count   (count long)]
    (when-not (or (< long-count 4)
                  (< (* short-count 2) long-count))
      (let [;; Seed the search from the second and from the third quarter
            ;; of the longer sequence.
            from-second-q (half-match* long short (quot (+ long-count 3) 4))
            from-third-q  (half-match* long short (quot (+ long-count 1) 2))
            best (if (and from-second-q from-third-q)
                   (if (> (count (first from-second-q))
                          (count (first from-third-q)))
                     from-second-q
                     from-third-q)
                   (or from-second-q from-third-q))]
        (when best
          (if (= a long)
            best
            ;; half-match* reports the long sequence first; a is the short
            ;; one here, so swap the two prefix/suffix pairs.
            (let [[common long-pre long-suf short-pre short-suf] best]
              [common short-pre short-suf long-pre long-suf])))))))
88 |
(defn- offset-diffs
  "Shift every index in the edit script diffs by offset. Each insertion is
  a sequence whose first element is an index and whose remaining elements
  are the inserted items; deletions are plain indices. Both :+ and :- are
  returned as vectors."
  [diffs offset]
  (let [shift (fn [index] (+ offset index))]
    {:+ (vec (for [[index & items] (:+ diffs)]
               (into [(shift index)] items)))
     :- (vec (for [index (:- diffs)]
               (shift index)))}))
93 |
94 | (declare diff)
95 |
(defn- diff*
  "Calculate the diff using the function f only after ensuring that this
  algorithm is required. At this point we know that a and b are different at
  both ends. A diff can be calculated manually if the length of a or b is 0
  or if the smaller of the two sequences is contained within the longer.

  Failing that, a half match splits the problem in two: the parts before
  and after the common string are diffed separately through diff and the
  second result is shifted past a's prefix and the common string. f is only
  called when none of these shortcuts apply."
  [^String a ^String b f]
  (let [ca (count a)
        cb (count b)]
    (cond
      ;; Nothing in a: everything in b is inserted at the front.
      (zero? ca) {:+ [(vec (concat [-1] (seq b)))]
                  :- []}
      ;; Nothing in b: every element of a is deleted.
      (zero? cb) {:+ []
                  :- (vec (range 0 ca))}
      :else
      (or (short-within-long a b ca cb)
          (when-let [[common a-prefix a-suffix b-prefix b-suffix]
                     (half-match a b)]
            (merge-with concat
                        (diff a-prefix b-prefix f)
                        (offset-diffs (diff a-suffix b-suffix f)
                                      (+ (count common)
                                         (count a-prefix)))))
          (f a b)))))
125 |
(defn diff
  "Return the diff of a and b. Wrap the diff function f in pre and post
  optimizations. Check for nil and equality. Remove common prefix and suffix.

  Throws IllegalArgumentException when a or b is nil. Equal inputs yield an
  empty edit script. Otherwise the common prefix and suffix are stripped,
  the remainder is handed to diff*, and the resulting indices are shifted
  back by the length of the removed prefix."
  [^String a ^String b f]
  (when (or (nil? a) (nil? b))
    (throw (IllegalArgumentException. "Cannot diff nil.")))
  (if (= a b)
    {:+ [] :- []}
    (let [[prefix a b] (common-prefix a b)
          [_ a b]      (common-suffix a b)
          diffs        (diff* a b f)]
      ;; Indices from diff* are relative to the stripped a; a removed suffix
      ;; does not move them, a removed prefix does.
      (if (pos? prefix)
        (offset-diffs diffs prefix)
        diffs))))
143 |
--------------------------------------------------------------------------------
/src/jvm/clj_diff/FastStringOps.java:
--------------------------------------------------------------------------------
1 | package clj_diff;
2 |
3 | /**
4 | * Fast string operations for clj-diff.
5 | */
6 | public class FastStringOps {
7 |
8 | /**
9 | * @return the number of common prefix characters for strings a and b
10 | */
11 | public static int commonPrefix(String a, String b) {
12 |
13 | int n = Math.min(a.length(), b.length());
14 | for(int i=0; i