├── .gitignore ├── COPYING ├── README ├── chunk-splitting.lisp ├── com.gigamonkeys.prose-diff.asd ├── diff.css ├── diff.js ├── diff.lisp ├── html.lisp ├── jquery-1.4.4.js ├── lcs.lisp ├── notes.txt ├── packages.lisp ├── prose-diff.lisp ├── test-edited.txt ├── test-original.txt ├── tests.lisp ├── text.lisp ├── tokenize.lisp └── utilities.lisp /.gitignore: -------------------------------------------------------------------------------- 1 | publish.sh 2 | *.html 3 | test-files/ 4 | tmp/ -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010, Peter Seibel. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | 15 | * Neither the name of Gigamonkeys Consulting nor the names of its 16 | contributors may be used to endorse or promote products derived 17 | from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | NOTE: this code is a work in progress. However the goal is to create a 2 | diff program that works better on text files containing prose than the 3 | traditional Unix line-based diff. 4 | 5 | Update 2014-02-14: I used this code when I was running the ill fated Code 6 | Quarterly. It was incredibly useful to be able to take a text file from an 7 | author, edit it to my heart's content, and then generate a diff that was 8 | even better than what Word track changes could show since it shows sections 9 | that are moved and also edited in a sane way. 10 | 11 | The algorithm is not speedy as it does a N^2 comparison of all the paragraphs 12 | that differ at all between the two versions so it's not great for realtime 13 | but for generating a diff offline and then putting on the web for an author 14 | to look at, it was quite nice. 15 | -------------------------------------------------------------------------------- /chunk-splitting.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; 4 | ;;; Code for splitting chunks into parts corresponding to a bunch of 5 | ;;; other chunks derived from them. 
This could, in theory, be used to 6 | ;;; further refine diffs and recognize more moved text. At the moment, 7 | ;;; however, it is not used. 8 | ;;; 9 | 10 | (defun split-positions (one-chunk parts) 11 | "Find the positions where one-chunk should be split to get pieces 12 | corresponding to the given parts, which are derived from one-chunk. 13 | For instance, if one-chunk is a piece of text that was split and the 14 | pieces inserted at various places in the new document, it will show 15 | up in the diff as a single deletion and multiple additions. Or, 16 | conversely, if a bunch of separate pieces (from different 17 | paragraphs) in the original document were combined into contiguous 18 | text in the edited document, we would have a single addition and 19 | multiple deletions." 20 | (multiple-value-bind (one-chunk-lcs-indices combined-lcs-indices) 21 | (lcs-positions one-chunk (concatenate-vectors parts)) 22 | (loop for s in (part-starts parts) 23 | for prev-x = 0 then x 24 | for x = (position-if (lambda (x) (>= x s)) combined-lcs-indices :start prev-x) 25 | collect (aref one-chunk-lcs-indices x)))) 26 | 27 | (defun split-vector (one-chunk parts) 28 | (loop for (start end) on (split-positions one-chunk parts) 29 | collect (subseq one-chunk start end))) 30 | 31 | (defun part-starts (parts) 32 | (loop for (p . rest) on parts 33 | summing (length p) into total 34 | when rest collect total into starts 35 | finally (return (cons 0 starts)))) 36 | 37 | 38 | -------------------------------------------------------------------------------- /com.gigamonkeys.prose-diff.asd: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) 2010, Peter Seibel. All rights reserved. 
3 | ;; 4 | 5 | (defsystem com.gigamonkeys.prose-diff 6 | :name "com.gigamonkeys.prose-diff" 7 | :components 8 | ((:file "packages") 9 | (:file "lcs" :depends-on ("packages")) 10 | (:file "tokenize" :depends-on ("packages")) 11 | (:file "text" :depends-on ("packages")) 12 | (:file "utilities" :depends-on ("packages")) 13 | (:file "diff" :depends-on ("packages")) 14 | (:file "html" :depends-on ("packages"))) 15 | :depends-on (:cl-ppcre 16 | :com.gigamonkeys.pathnames 17 | :com.gigamonkeys.utilities 18 | :com.gigamonkeys.macro-utilities 19 | :com.gigamonkeys.markup 20 | :monkeylib-markup-html)) 21 | -------------------------------------------------------------------------------- /diff.css: -------------------------------------------------------------------------------- 1 | body { margin: 1.5in; } 2 | 3 | #buttons { position: fixed; top: 6px; left: 6px; width: 1in; list-style-type: none; padding: 0; } 4 | #buttons li button { padding-left: 3px; width: 90%; background: #eee; } 5 | 6 | /* Logical styles */ 7 | 8 | .add { } 9 | 10 | .delete { } 11 | 12 | .moved-to { } 13 | 14 | .moved-from { } 15 | 16 | /* Actual display styles */ 17 | 18 | .hidden { display: none; } 19 | 20 | .delete.diff { color: red; text-decoration: line-through; } 21 | 22 | .add.diff { color: blue; } 23 | 24 | .moved-to.diff { background-color: #ffc; } 25 | 26 | .moved-from.diff { background-color: #ccc; } 27 | 28 | /* Button style */ 29 | 30 | .selected { font-weight: 900; background: #888 !important; } -------------------------------------------------------------------------------- /diff.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 3 | function applyStyles (displayState, button) { 4 | var styles = { 5 | 'original': { 6 | 'delete' : '', 7 | 'add' : 'hidden', 8 | 'moved-from' : '', 9 | 'moved-to' : 'hidden' 10 | }, 11 | 12 | 'new': { 13 | 'delete' : 'hidden', 14 | 'add' : '', 15 | 'moved-from' : 'hidden', 16 | 'moved-to' : '' 17 | }, 18 | 
19 | 'diff': { 20 | 'delete' : 'diff', 21 | 'add' : 'diff', 22 | 'moved-from' : 'diff', 23 | 'moved-to' : 'diff' 24 | }, 25 | }[displayState]; 26 | 27 | for (var style in styles) { 28 | $('.' + style).removeClass().addClass(style).addClass(styles[style]); 29 | } 30 | 31 | $('.selected').toggleClass('selected'); 32 | $(button).toggleClass('selected'); 33 | } 34 | 35 | $(document).ready(function () { 36 | $('#show_diff').bind('click', function () { applyStyles('diff', this); }).click(); 37 | $('#show_original').bind('click', function () { applyStyles('original', this); }); 38 | $('#show_new').bind('click', function () { applyStyles('new', this); }); 39 | }); 40 | 41 | })(); -------------------------------------------------------------------------------- /diff.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; 4 | ;;; Generic functions for diffing vectors of objects. 5 | ;;; 6 | 7 | (defun diff-vectors (old new &optional (lcs-frobber #'identity)) 8 | "Diff two vectors returning a vector with the elements of old and 9 | new wrapped in conses whose CAR is either :LCS, :DELETE, or :ADD. 10 | Optionally frob the computed LCS before computing the diff." 
11 | (loop with output = (make-array (length new) :adjustable t :fill-pointer 0) 12 | with old-i = 0 13 | with old-length = (length old) 14 | with new-i = 0 15 | with new-length = (length new) 16 | for next-lcs across (funcall lcs-frobber (lcs old new)) 17 | do 18 | (setf old-i (emit-diffs next-lcs old old-i old-length :delete output)) 19 | (setf new-i (emit-diffs next-lcs new new-i new-length :add output)) 20 | (vector-push-extend (cons :lcs next-lcs) output) 21 | 22 | finally 23 | (emit-diffs (cons nil nil) old old-i old-length :delete output) 24 | (emit-diffs (cons nil nil) new new-i new-length :add output) 25 | (return output))) 26 | 27 | (defun emit-diffs (next-lcs v i max-i marker output) 28 | (cond 29 | ((< i max-i) 30 | (let ((idx (or (position next-lcs v :start i) max-i))) 31 | (cond 32 | ((> idx i) 33 | (loop for j from i below idx do (vector-push-extend (cons marker (aref v j)) output)) 34 | (1+ idx)) 35 | (t 36 | (1+ i))))) 37 | (t i))) 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /html.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; 4 | ;;; HTML generation -- given two Markup files generate an HTML file of the diff. 
5 | ;;; 6 | 7 | (defparameter *retagger* 8 | (make-retagger 9 | (mapcar (lambda (x) (cons x 'wrap-add-delete)) '(:add :delete :moved-to :moved-from)))) 10 | 11 | (defun diff-to-html (original-file edited-file output &key (css-dir "") (js-dir "")) 12 | (let ((original (remove-comments (parse-file original-file :parse-links-p nil))) 13 | (edited (remove-comments (parse-file edited-file :parse-links-p nil)))) 14 | (with-output-to-file (out output) 15 | (render-sexps-to-stream 16 | `(:body 17 | ,@(diff-to-markup original edited) 18 | ((:ul :id "buttons") 19 | (:li ((:button :id "show_diff") "Diff")) 20 | (:li ((:button :id "show_original") "Original")) 21 | (:li ((:button :id "show_new") "New")))) 22 | out 23 | :stylesheets (loop for x in '("diff.css") collect (format nil "~a~a" css-dir x)) 24 | :scripts (loop for x in '("jquery-1.4.4.js" "diff.js") collect (format nil "~a~a" js-dir x)) 25 | :rewriter (compose (lambda (x) (footnotes :note x)) *retagger*))))) 26 | 27 | (defun extract-comments (sexp) 28 | (let ((num 0) 29 | (comments ())) 30 | (labels ((walker (x) 31 | (cond 32 | ((stringp x) x) 33 | ((numberp x) x) 34 | ((symbolp x) x) 35 | ((eql (car x) :comment) 36 | (push x comments) 37 | `(:commentref ,(princ-to-string (incf num)))) 38 | (t `(,(car x) ,@(mapcar #'walker (cdr x))))))) 39 | (let ((walked (walker sexp))) 40 | (values walked (nreverse comments)))))) 41 | 42 | (defun remove-comments (sexp) 43 | (labels ((walker (x) 44 | (cond 45 | ((stringp x) (list x)) 46 | ((numberp x) (list x)) 47 | ((symbolp x) (list x)) 48 | ((eql (car x) :comment) nil) 49 | (t 50 | (let ((body (mapcan #'walker (cdr x)))) 51 | (if body `((,(car x) ,@body)) nil)))))) 52 | 53 | (first (walker sexp)))) 54 | 55 | (defun commentref->html (sexp) 56 | (destructuring-bind (tag string) sexp 57 | (assert (eql tag :commentref)) 58 | (let ((num (parse-integer string))) 59 | `((:span :href (:format "~(#comment_~d~)" ,num) 60 | :id (:format "commentref_~d" ,num) 61 | :class "comment_ref") 62 | 
(:character 9997))))) 63 | 64 | 65 | (defun comments->html (sexp) 66 | (destructuring-bind (tag &rest comments) sexp 67 | (assert (eql tag :comments)) 68 | `(:progn ,@(loop for comment in comments 69 | for num from 1 70 | collect 71 | (destructuring-bind (tag &rest body) comment 72 | (assert (eql tag :comment)) 73 | `((:div :id (:format "~(~a_~d~)" ,tag ,num) :class "comment") ,@body)))))) 74 | 75 | (defun wrap-add-delete (sexp) 76 | (destructuring-bind (which &rest wrapped) sexp 77 | (let ((class (string-downcase which)) 78 | (wrapped (mapcar *retagger* wrapped))) 79 | ;; Recursive wrapping necessary since :moved-to and moved-from 80 | ;; elements can contain :adds and :deletes 81 | (cond 82 | ((and (consp (first wrapped)) (block-element-p (car (first wrapped)))) 83 | `((:div :class ,class) ,@wrapped)) 84 | (t `((:span :class ,class) ,@wrapped)))))) 85 | 86 | (defun block-element-p (x) 87 | (member x 88 | '(:body :colgroup :div :dl :fieldset :form :head :html :map :noscript 89 | :object :ol :optgroup :pre :script :select :style :table :tbody 90 | :tfoot :thead :tr :ul 91 | :area :base :blockquote :br :button :caption :col :dd :div :dt :h1 92 | :h2 :h3 :h4 :h5 :h6 :hr :input :li :link :meta :option :p :param 93 | :td :textarea :th :title))) 94 | -------------------------------------------------------------------------------- /lcs.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | (defun lcs (a b) 4 | "Compute the longest common subsequence of vectors `a' and `b'" 5 | (map 'vector (lambda (i) (aref a i)) (lcs-positions a b))) 6 | 7 | (defun lcs-length (a b) 8 | "Compute the length of the longest common subsequence of vectors `a' and `b'" 9 | (multiple-value-bind (table m n) (%lcs-table a b) 10 | (aref table n m))) 11 | 12 | (defun lcs-positions (a b) 13 | "Find the indices in a and b of the elements of the LCS." 
14 | (multiple-value-bind (table m n) (%lcs-table a b) 15 | (let* ((len (aref table n m)) 16 | (a-indices (make-array len)) 17 | (b-indices (make-array len)) 18 | (idx (1- len)) 19 | (i (length a)) 20 | (j (length b))) 21 | 22 | (loop while (> (aref table j i) 0) do 23 | (let* ((current (aref table j i)) 24 | (previous (1- current))) 25 | 26 | (cond 27 | ((and (= previous (aref table (1- j) (1- i))) 28 | (= previous (aref table j (1- i))) 29 | (= previous (aref table (1- j) i))) 30 | (decf j) 31 | (decf i) 32 | (setf (aref a-indices idx) i) 33 | (setf (aref b-indices idx) j) 34 | (decf idx)) 35 | ((= current (aref table (1- j) i)) (decf j)) 36 | ((= current (aref table j (1- i))) (decf i)) 37 | (t (error "Assertion gone haywire: ~s ~s" j i))))) 38 | (values a-indices b-indices)))) 39 | 40 | (defun %lcs-table (a b) 41 | "Compute the MxN table from which we can extract the LCS, and a 42 | bunch of other good stuff." 43 | (let* ((m (length a)) 44 | (n (length b)) 45 | (table (make-array (list (1+ n) (1+ m)) :initial-element 0))) 46 | 47 | (flet ((lcs-length (j i) 48 | (cond 49 | ((eql (aref a (1- i)) (aref b (1- j))) 50 | (+ 1 (aref table (1- j) (1- i)))) 51 | (t 52 | (max (aref table (1- j) i) (aref table j (1- i))))))) 53 | 54 | (loop for j from 1 to n do 55 | (loop for i from 1 to m do 56 | (setf (aref table j i) (lcs-length j i))))) 57 | 58 | (values table m n))) 59 | 60 | (defun similarity (a b) 61 | "Compute the similarity of vectors `a' and `b' in terms of the 62 | average of the ratios of the length of the LCS to their length." 63 | (let ((lcs-length (lcs-length a b))) 64 | (/ (+ (/ lcs-length (length a)) (/ lcs-length (length b))) 2.0d0))) 65 | 66 | (defun one-way-similarity (a b) 67 | "Like `similarity' but in only one direction." 
68 | (float (/ (lcs-length a b) (length a)) 0d0)) 69 | 70 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | -*- mode: markup; -*- 2 | 3 | * Prose diff algorithm: 4 | 5 | Hash (or intern) the paragraphs of the two documents and compute the 6 | diff using LCS of the sequence of hashes. This will find a backbone of 7 | unchanged paragraphs in their original order as well as the places 8 | where paragraphs were "deleted" and "added". However there are a few 9 | more possibilities. 10 | 11 | # A paragraph may actually have been cut. This will show up as a 12 | deletion with none of the new paragraphs being at all similar to 13 | the deleted paragraph. (Cut) 14 | 15 | # A paragraph may actually have been added. This will show up as an 16 | addition with the new paragraph not similar to any of the deleted 17 | paragraphs. (Addition) 18 | 19 | # A paragraph may have been moved which will show up as a deletion 20 | in one place and an addition somewhere else of the same hash. 21 | (Moved) 22 | 23 | # A paragraph may have been edited in place which will show up as a 24 | deletion and addition in the same position with a lot of 25 | similarity between the deleted and added paragraphs. (Edited) 26 | 27 | # A paragraph may have been moved and edited which will show up as a 28 | deletion at the original position and an addition somewhere else 29 | with a lot of similarity between the deleted and added paragraphs. 30 | (Moved and edited) 31 | 32 | # A paragraph may have been split, perhaps with edits to both the 33 | new paragraphs. This will show up as a deletion and an addition of 34 | two (or more) paragraphs at the same position with each of the new 35 | paragraphs having a LCS with the old paragraph that is almost the 36 | full length of the new paragraph. 
37 | 38 | # N paragraphs may have been joined which will look like N deletions 39 | and one addition where each of the deleted paragraphs will have an 40 | LCS with the new paragraph that is almost the full length of the 41 | deleted paragraph. 42 | 43 | # A chunk of a paragraph may have been removed and added into 44 | another paragraph. This will look like edits to both paragraphs 45 | and the LCS of the deleted bits of the first paragraph with the 46 | added bits of the second paragraph will be almost the whole 47 | length. (Sentences moved) 48 | 49 | To determine what has happened: 50 | 51 | # Find exactly moved paragraphs and remove them from further 52 | consideration. (Another way to implement this is to simply find 53 | all the paragraphs that occur in both the old and new document.) 54 | 55 | # Build a matrix of the LCSs of each remaining paragraph against 56 | every other. 57 | 58 | 59 | 60 | 61 | * Output 62 | 63 | A set of paragraphs. Some paragraphs occur in both the original and 64 | edited document and we record their position in each document. 65 | 66 | Some paragraphs exist only in the original and some only in the 67 | edited. There is a mapping between the original-only and edited-only 68 | paragraphs that defines how they are related. 69 | 70 | - Original to nothing: a deleted paragraph 71 | 72 | - nothing to Edited: a completely new paragraph 73 | 74 | - One original to one edited: an edited paragraph 75 | 76 | - One original to multiple edited: a split (and possibly edited) paragraph. 77 | 78 | - Multiple original to one edited: a joined (and possibly edited) paragraph. 79 | 80 | - Two originals to two edited: chunk removed from original and moved 81 | to another paragraph. I.e. basically an edit of one paragraph to 82 | remove a chunk of text and then an edit of another paragraph to add 83 | a very similar chunk of text. 
84 | 85 | 86 | 87 | A set of paragraphs, each with a position (possibly Nil) in the 88 | original document, a position (possibly Nil) in the new document, and 89 | 90 | 91 | * Unmoved paragraphs 92 | 93 | After we determine the simply equivalent paragraphs between original 94 | and edited (identical paragraphs and symmetrically similar\note{Though 95 | need to think about splits and joins}) we can find a sort of 96 | abstracted LCS at the paragraph level. E.g. If we have a trivial case 97 | of: 98 | 99 | A B C D E -> A' B' C' D' E' 100 | 101 | where X' is equivalent to X, then the paragraph-level LCS is the whole 102 | document. In general the unmoved paragraphs are the ones that are part 103 | of the LCS. 104 | 105 | We can then run through the rest of the paragraphs in edited and try 106 | to figure out where they come from. 107 | 108 | Moved paragraphs will be ones that have versions in both original and 109 | edited but which are not part of the LCS. 110 | 111 | Extra paragraphs resulting from splits will have high one-way 112 | similarity with the original version of a preceding or following 113 | paragraph 114 | 115 | Paragraphs resulting from joins will have high one-way similarity with multiple 116 | 117 | Totally new paragraphs will have low similarity to any paragraphs. 118 | 119 | 120 | Truly deleted paragraphs 121 | will be ones with no version 122 | 123 | 124 | 125 | 126 | Simple case: A single paragraph is in the “same” location if its 127 | edited version follows an edited version of the paragraph that preceded 128 | it in the original or precedes an edited version of the paragraph that 129 | followed it in the original. 130 | 131 | Slightly more complicated: if the preceding or following paragraphs in 132 | edited are the result of a split, they should be considered descended 133 | from the original paragraph. 
Thus if we have: 134 | 135 | AB C DE -> A' B' C' D' E' 136 | 137 | Then paragraphs B and D are descendants of AB and DE, thus C' is in 138 | the same position. 139 | 140 | If the preceding or following paragraphs resulted from a join we have: 141 | 142 | A B C D E -> AB' C DE' 143 | 144 | Since AB' descends from both A and B and DE' descends from both D and 145 | E, C is similarly unmoved. 146 | 147 | An unmoved paragraph may have 148 | 149 | 150 | 151 | * Deleted paragraphs 152 | 153 | For paragraphs deleted from the original, find the paragraph 154 | preceeding it in the original and determine what happened to it in the 155 | editied. If it exists (perhaps in modified form) in the edited, put 156 | the deleted paragraph, marked as deleted, after it. If it was also 157 | deleted, find its preceeding paragraph, and so on, until a paragraph 158 | that lives on in the edited verion is found. (Screw case #1: if the 159 | preceeding paragraph was moved far away it would probably make more 160 | sense to leave the deleted paragraphs where they were. Not sure how to 161 | determine when a paragraph was moved. Screw case #2: if the preceeding 162 | paragraph was joined with the paragraph following the deleted 163 | paragraph(s). Probably should put the deleted paragraphs after the 164 | joined paragraph. 165 | -------------------------------------------------------------------------------- /packages.lisp: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) 2010, Peter Seibel. All rights reserved. 
3 | ;; 4 | 5 | (in-package :cl-user) 6 | 7 | (defpackage :com.gigamonkeys.prose-diff 8 | (:use :common-lisp 9 | :cl-ppcre 10 | :com.gigamonkeys.pathnames 11 | :com.gigamonkeys.utilities 12 | :com.gigamonkeys.markup 13 | :com.gigamonkeys.markup.html) 14 | (:import-from :alexandria :compose) 15 | (:export 16 | :show-cuts)) 17 | -------------------------------------------------------------------------------- /prose-diff.lisp: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) 2010, Peter Seibel. All rights reserved. 3 | ;; 4 | 5 | (in-package :com.gigamonkeys.prose-diff) 6 | 7 | (defparameter *word-regexp* (create-scanner "((?:\\\\n{.*?})|(?:\\w*’\\w+)|(?:\\w+))")) 8 | (defparameter *old-mode-regexp* (create-scanner "-\\*- mode: .*; -\\*-")) 9 | (defparameter *new-mode-line* "-*- mode: coders-at-work-editing; -*-") 10 | (defparameter *minimum-match* 4) 11 | 12 | (defun text (file) 13 | (with-output-to-string (s) 14 | (with-open-file (in file) 15 | (loop for line = (read-line in nil nil) 16 | while line do (write-line line s))))) 17 | 18 | (defun words (text) 19 | (let (words) 20 | (do-matches (start end *word-regexp* text (coerce (nreverse words) 'vector)) 21 | (push (intern (string-upcase (subseq text start end)) :keyword) words)))) 22 | 23 | 24 | (defun word-starts (text) 25 | (let (positions) 26 | (do-matches (start end *word-regexp* text (coerce (nreverse positions) 'vector)) 27 | (push start positions)))) 28 | 29 | (defun positions-table (words) 30 | (let ((table (make-hash-table))) 31 | (loop for position from (1- (length words)) downto 0 32 | do (push position (gethash (aref words position) table ()))) 33 | table)) 34 | 35 | (defun dump-table (table) 36 | (loop for k being the hash-keys of table 37 | do (format t "~a => ~a~%" k (gethash k table)))) 38 | 39 | ;; XXX -- the policy of removing the first match if there are ties (by 40 | ;; length) is not necessarily quite right since you could have 41 | ;; 
something like: 'a b c d a b c e' in the original and then in the 42 | ;; edited: 'a b c x a b c d' in which case the first match attempt on 43 | ;; 'a' would find the two 'a b c' sequences and then would take the 44 | ;; first one leaving '- - - d a b c e'. Then when matching starting at 45 | ;; the second 'a' in the edited text the 'a b c d' has been broken up 46 | ;; so it will only match 'a b c' leaving an extra 'e'. Don't know how 47 | ;; much this will matter in practice. (Hmmm, maybe could do something 48 | ;; like: for each occurence of word W in edited find the longest match 49 | ;; in original. Then assign the longest overall to which ever 50 | ;; occurence of W it goes with and then axe any matches that overlap 51 | ;; with it and assign the longest remaining to it's W and so on. Then 52 | ;; move to the next word in edited.) 53 | ;; 54 | ;; Maybe for section that has ties, we should remember the position in 55 | ;; edited but leave the word vectors alone. Then after we've gone 56 | ;; through all of the edited words, go back and try the remembered 57 | ;; positions again. This will deal with the case where a short phrase 58 | ;; matches a number of places and the first one happens to be the 59 | ;; wrong one. If we don't take it out, then that instance of the short 60 | ;; phrase will hopefully be taken as part of some bigger piece of text 61 | ;; used later in the edited text. As it stands now, we'll take the 62 | ;; short phrase out of the middle of the longer text which will then 63 | ;; get broken into two bits surrounding the short phrase. (Which I 64 | ;; guess will then be taken from somewhere else.) 65 | 66 | (defun find-original-text (original-text edited-text) 67 | "Given `original-text' and `edited-text' return two word vectors 68 | representing the given texts with the sequences of words occuring in 69 | both texts replaced by nil. 
Thus the sequences of words that remain 70 | in original are the ones not used in the edited version and the 71 | words that remain in edited are the ones added during editing." 72 | (loop 73 | with original = (words original-text) 74 | with edited = (words edited-text) 75 | with positions-table = (positions-table original) 76 | with edited-idx = 0 77 | 78 | for word = (aref edited edited-idx) 79 | for positions = (gethash word positions-table) 80 | 81 | do 82 | (loop 83 | with longest-match = 0 84 | with longest-match-starts = () 85 | 86 | for original-idx in positions 87 | for match = (match-length original edited original-idx edited-idx longest-match) 88 | 89 | do (when (>= match longest-match) 90 | (when (> match longest-match) 91 | (setf longest-match-starts ()) 92 | (setf longest-match match)) 93 | (push original-idx longest-match-starts)) 94 | 95 | finally (cond 96 | ((> longest-match *minimum-match*) 97 | (null-words original (first longest-match-starts) longest-match) 98 | (null-words edited edited-idx longest-match) 99 | (incf edited-idx longest-match)) 100 | (t (incf edited-idx)))) 101 | 102 | while (< edited-idx (length edited)) 103 | finally (return (values original edited)))) 104 | 105 | (defun null-words (words start count) 106 | (loop repeat count for i from start do (setf (aref words i) nil))) 107 | 108 | (defun map-in-and-out (words word-starts in-fn out-fn) 109 | "Given a vector `words' with some words nulled out and a parallel 110 | vector `word-starts' containing the starting positions of the words 111 | in a text (not given), map over the chunks of text, calling `in-fn' 112 | for each chunk of words present in `words' and `out-fn' for each 113 | chunk of words missing from `words'. The functions are called with a 114 | bounding index designator of the underlying text." 
115 | (loop with text-idx = 0 116 | with word-idx = 0 117 | while word-idx 118 | do (let ((word (aref words word-idx)) 119 | (end (aref word-starts word-idx))) 120 | (unless (zerop word-idx) 121 | (funcall (if word out-fn in-fn) text-idx end) 122 | (setf text-idx end)) 123 | (setf word-idx (position nil words :start (1+ word-idx) :key (if word #'identity #'not)))) 124 | finally (funcall (if (aref words (1- (length words))) in-fn out-fn) text-idx nil))) 125 | 126 | (defun show-cuts (&key master edited output) 127 | (with-open-file (out output :direction :output :if-exists :supersede) 128 | (let ((text (text master))) 129 | (flet ((make-emitter (label) 130 | (lambda (start end) 131 | (let ((text (subseq text start end))) 132 | (when (zerop start) 133 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 134 | (format out "{{~a}}~a" label text))))) 135 | (let ((words (find-original-text text (text edited)))) 136 | (map-in-and-out words (word-starts text) (make-emitter "toss") (make-emitter "keep"))))))) 137 | 138 | (defun show-leftovers (&key master edited output) 139 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 140 | (let ((text (text master))) 141 | (flet ((make-emitter (label) 142 | (lambda (start end) 143 | (let ((text (subseq text start end))) 144 | (when (zerop start) 145 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 146 | (format out "{{~a}}~a" label text))))) 147 | (let ((words (find-original-text text (text edited)))) 148 | (map-in-and-out words (word-starts text) (make-emitter "keep") (make-emitter "book"))))))) 149 | 150 | (defun show-unused (&key master edited output) 151 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 152 | (let ((master-text (text master)) 153 | (edited-text (text edited))) 154 | (flet ((make-emitter () 155 | (lambda (start end) 156 | (let ((text (subseq master-text start end))) 157 | 
(write-string text out)))) 158 | (make-elipsator () 159 | (lambda (start end) 160 | (let ((text (subseq master-text start end))) 161 | (if (find #\Newline text) 162 | (format out "~2&§~2%") 163 | (format out " … ")))))) 164 | (let ((words (find-original-text master-text edited-text))) 165 | (map-in-and-out words (word-starts master-text) (make-emitter) (make-elipsator))))))) 166 | 167 | (defun show-possible-cuts (&key master edited output (minimum 0)) 168 | "Emit all sections left in `master' after `edited' text has been 169 | removed, longer than `minimum' characters. Sections are output in 170 | the order they appear in `master'." 171 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 172 | (let ((master-text (text master)) 173 | (edited-text (text edited))) 174 | (let ((chunks-found 0)) 175 | (flet ((make-emitter () 176 | (lambda (start end) 177 | (let ((text (subseq master-text start end))) 178 | (when (> (length text) minimum) 179 | (when (zerop start) 180 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 181 | (incf chunks-found) 182 | (format t "~&Found chunk of ~:d characters." (length text)) 183 | (format out "~2&{{~a}}" text)))))) 184 | (let ((words (find-original-text master-text edited-text))) 185 | (map-in-and-out words (word-starts master-text) (make-emitter) (constantly nil)))) 186 | (format t "~&~:d chunks of more than ~:d characters found." chunks-found minimum))))) 187 | 188 | (defun used-sections (&key master edited output (minimum 0)) 189 | "Emit all sections from `edited' found in `master' in the order they 190 | appear in `master'." 
191 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 192 | (let ((master-text (text master)) 193 | (edited-text (text edited))) 194 | (let ((chunks-found 0)) 195 | (flet ((make-emitter () 196 | (lambda (start end) 197 | (let ((text (subseq master-text start end))) 198 | (when (> (length text) minimum) 199 | (when (zerop start) 200 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 201 | (incf chunks-found) 202 | (format t "~&Found chunk of ~:d characters." (length text)) 203 | (format out "~2&{{~a}}" text)))))) 204 | (let ((words (find-original-text master-text edited-text))) 205 | (map-in-and-out words (word-starts master-text) (constantly nil) (make-emitter)))) 206 | (format t "~&~:d chunks of more than ~:d characters found." chunks-found minimum))))) 207 | 208 | (defun show-sorted-cuts (&key master edited output) 209 | "Emit all sections left in `master' after `edited' text has been 210 | removed, sorted by size, longer sections first." 
211 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 212 | (let ((master-text (text master)) 213 | (edited-text (text edited))) 214 | (let ((words (find-original-text master-text edited-text)) 215 | (chunks ())) 216 | (flet ((collector (start end) 217 | (let ((text (subseq master-text start end))) 218 | (push (list (length text) text start end) chunks)))) 219 | (map-in-and-out words (word-starts master-text) #'collector (constantly nil))) 220 | (loop for (length text start end) in (sort chunks #'> :key #'first) do 221 | (format out "~2&** ~:d characters (~:d-~:d)~2%" length start end) 222 | (write-string text out)))))) 223 | 224 | 225 | (defun show-additions (&key master edited output) 226 | (with-open-file (out output :direction :output :if-exists :supersede) 227 | (let ((text (text edited))) 228 | (flet ((make-emitter (label) 229 | (lambda (start end) 230 | (let ((text (subseq text start end))) 231 | (when (zerop start) 232 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 233 | (format out "{{~a}}~a" label text))))) 234 | (let ((words (nth-value 1 (find-original-text (text master) text)))) 235 | (map-in-and-out words (word-starts text) (make-emitter "book") (make-emitter "keep"))))))) 236 | 237 | 238 | (defun match-length (original edited o-start e-start longest-match) 239 | (flet ((same (i) 240 | (let ((e (+ e-start i)) 241 | (o (+ o-start i))) 242 | (and (< e (length edited)) 243 | (< o (length original)) 244 | (eql (aref edited e) (aref original o)))))) 245 | ;; Check the zeroth element in case the original got nulled out 246 | ;; due to a previous match. Then check that this match could at 247 | ;; least in theory be as long as the previous longest match, i.e. 248 | ;; the last element matches, before we go to the bother of 249 | ;; checking all the intervening elements. 
250 | (if (and (same 0) (or (zerop longest-match) (same (1- longest-match)))) 251 | (loop for i from 1 when (not (same i)) return i) 252 | 0))) 253 | 254 | ;;; Scrap 255 | 256 | (defun words-with-text-positions (text) 257 | (let (words positions) 258 | (do-matches (start end *word-regexp* text 259 | (values (coerce (nreverse words) 'vector) 260 | (nreverse (cons (length text) positions)))) 261 | (push (intern (string-upcase (subseq text start end)) :keyword) words) 262 | (push start positions)))) 263 | 264 | (defun reconstitute-text (text) 265 | (multiple-value-bind (words positions) (words-with-text-positions text) 266 | (declare (ignore words)) 267 | (string= text 268 | (with-output-to-string (s) 269 | (loop for (start end) on (cons 0 positions ) 270 | while end 271 | do (format s "~a" (subseq text start end))))))) 272 | 273 | -------------------------------------------------------------------------------- /test-edited.txt: -------------------------------------------------------------------------------- 1 | -*- mode: markup; -*- 2 | 3 | * Prose diff test document 4 | 5 | Moving up is what this paragraph will do. 6 | 7 | Upward movement with some editing is what happens to this text. 8 | 9 | A test document. This is a paragraph that should appear in both versions. 10 | 11 | Will be edited a little bit in the edited version. 12 | 13 | This is a long paragraph that will be split into to two parts in the 14 | edited version. 15 | 16 | As it stands now it just goes on and on and on. 17 | 18 | This is a short paragraph that will be joined with the next paragraph. 19 | This paragraph will be joined with the previous one. 20 | 21 | The text of this paragraph is completely new. It doesn't appear in any 22 | form in the original document. 23 | 24 | I like dogs a lot. Puppies are neat too. I also like cats. Kittens are 25 | fuzzy furballs. 26 | 27 | This is \b{quux} formatted text. 28 | 29 | This paragraph will be moved. 30 | 31 | This paragraph will be moved and changed. 
32 | 33 | ** Tokenization tests 34 | 35 | Depending on how we split text into tokens when computing the LCS we 36 | can get slightly better or worse diffs. The remaining paragraphs in 37 | this section show some of the issues. 38 | 39 | Quux biff boom and stuff. Ideally there's one addition and one deletion. 40 | 41 | One quux biff boom and stuff. Ideally there's one addition and one deletion. 42 | 43 | Two quux biff and stuff. Ideally there's one addition and one deletion. 44 | 45 | Three quux biff boom blat and stuff. Ideally there's one addition and one deletion. 46 | 47 | Four I hope there's only one deletion and addition following: quux biff boom 48 | 49 | Five I hope there's only one deletion and addition following: quux biff boom and stuff. 50 | 51 | A sentence that ends with foo and bar. Ideally foo will not be struck out in the diff. 52 | 53 | A sentence that ends with "foo". Ideally the only diff will be the addition of quotes. 54 | 55 | . 56 | -------------------------------------------------------------------------------- /test-original.txt: -------------------------------------------------------------------------------- 1 | -*- mode: markup; -*- 2 | 3 | * Prose diff test document 4 | 5 | A test document. This is a paragraph that should appear in both versions. 6 | 7 | Will be slightly edited in the edited version. 8 | 9 | This is a long paragraph that will be split into to two parts in the 10 | edited version. As it stands now it just goes on and on and on. 11 | 12 | This is a short paragraph that will be joined with the next paragraph. 13 | 14 | This paragraph will be joined with the previous one. 15 | 16 | This paragraph will be moved. 17 | 18 | This paragraph will be moved and edited. 19 | 20 | Deletion is the fate of this block of text. 21 | 22 | I like dogs a lot. I also like cats. 23 | 24 | Puppies are neat too. Kittens are fuzzy furballs. 25 | 26 | This is \i{quux} formatted text. 27 | 28 | Moving up is what this paragraph will do. 
29 | 30 | Upward movement with editing is what happens to this text. 31 | 32 | ** Tokenization tests 33 | 34 | Depending on how we split text into tokens when computing the LCS we 35 | can get slightly better or worse diffs. The remaining paragraphs in 36 | this section show some of the issues. 37 | 38 | Foo bar baz and stuff. Ideally there's one addition and one deletion. 39 | 40 | One foo bar baz and stuff. Ideally there's one addition and one deletion. 41 | 42 | Two foo bar baz and stuff. Ideally there's one addition and one deletion. 43 | 44 | Three foo bar baz and stuff. Ideally there's one addition and one deletion. 45 | 46 | Four I hope there's only one deletion and addition following: foo bar baz 47 | 48 | Five I hope there's only one deletion and addition following: foo bar baz and stuff. 49 | 50 | A sentence that ends with foo. Ideally foo will not be struck out in the diff. 51 | 52 | A sentence that ends with foo. Ideally the only diff will be the addition of quotes. 53 | 54 | -------------------------------------------------------------------------------- /tests.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | 4 | (defun remove-tag-test () 5 | (and 6 | (equalp 7 | (remove-tag :add '(:i (:b (:add "foo" (:x "abc"))))) 8 | '(:i (:b "foo" (:x "abc")))) 9 | 10 | (equalp 11 | (remove-tag :b '(:i (:b (:add "foo" (:x "abc"))))) 12 | '(:i (:add "foo" (:x "abc")))) 13 | 14 | (equalp 15 | (remove-tag :i '(:i (:b (:add "foo" (:x "abc"))))) 16 | '(:b (:add "foo" (:x "abc")))))) 17 | 18 | (defun diff-textified-markup (a b) 19 | (diff-textified (textify-markup a) (textify-markup b))) 20 | 21 | (defun foo-test () 22 | (list 23 | (rewrite-adds-and-deletes 24 | (detextify-markup 25 | (diff-textified-markup 26 | '(:p "foo " (:i "quux") " baz") 27 | '(:p "foo " (:b "quux") " baz")))) 28 | 29 | (rewrite-adds-and-deletes 30 | (detextify-markup 31 | (diff-textified-markup 32 | '(:p "foo 
" (:i "quux bar biff") " baz") 33 | '(:p "foo " (:i "quux baz boom") " baz")))))) -------------------------------------------------------------------------------- /text.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | (defvar *interned-text* (make-hash-table :test #'equal)) 4 | 5 | (defparameter *token-scanner* (cl-ppcre:create-scanner "(\\w+|\\s+|[\\W\\S])")) 6 | 7 | (defclass propertied-text () 8 | ((text :initarg :text :accessor text) 9 | (properties :initarg :properties :accessor properties))) 10 | 11 | (defun clear-interned-text () 12 | (setf *interned-text* (make-hash-table :test #'equal))) 13 | 14 | (defun intern-text (text &optional properties) 15 | (let ((key (cons text properties))) 16 | (let ((existing (gethash key *interned-text*))) 17 | (or 18 | existing 19 | (setf 20 | (gethash key *interned-text*) 21 | (make-instance 'propertied-text 22 | :text text 23 | :properties properties)))))) 24 | 25 | (defun add-property (text new-prop) 26 | (intern-text (text text) (cons new-prop (properties text)))) 27 | 28 | (defun remove-properties (text props) 29 | (intern-text (text text) (set-difference (properties text) props))) 30 | 31 | (defun has-property-p (prop text) 32 | (member prop (properties text))) 33 | 34 | (defmethod print-object ((object propertied-text) stream) 35 | (when *print-readably* 36 | (error 'print-not-readable object)) 37 | (format stream "\"~a\"~@[[~{~(~a~)~^:~}]~]" (text object) (properties object))) 38 | 39 | (defun textify-markup (markup-list) 40 | "Convert a list of Markup sexps into a vector of interned 41 | propertied-text objects." 
42 | (let ((v (make-array 10 :adjustable t :fill-pointer 0))) 43 | (loop for markup in markup-list do (vector-push-extend* (%textify-markup markup ()) v)) 44 | v)) 45 | 46 | (defun detextify-markup (v) 47 | "Convert a vector of propertied-text objects (such as produced by 48 | textify-markup) into a list of Markup sexps." 49 | (values (%detextify-markup v 0 (length v) ()))) 50 | 51 | (defgeneric %textify-markup (markup properties)) 52 | 53 | (defmethod %textify-markup ((markup cons) properties) 54 | (destructuring-bind (tag &rest body) markup 55 | (let ((props (cons tag properties)) 56 | (results ())) 57 | (loop for element in body do 58 | (loop for x in (%textify-markup element props) do (push x results))) 59 | ;;; This next line is a bit of a kludge to keep adjacent 60 | ;;; elements with the same tag, e.g. adjacent :P's, from 61 | ;;; merging. 62 | (push (intern-text "" properties) results) 63 | (nreverse results)))) 64 | 65 | (defmethod %textify-markup ((markup string) properties) 66 | (mapcar (lambda (tok) (intern-text tok properties)) (tokenize-text markup))) 67 | 68 | (defun tokenize-text (text) 69 | "Split a text string into a list of tokens containing either all the 70 | text split into words, whitespace, and punctuation or just the words." 
71 | (cl-ppcre:all-matches-as-strings *token-scanner* text)) 72 | 73 | (defun %detextify-markup (v start end open-props) 74 | (assert (not (null start))) 75 | (let ((result ()) 76 | (i start)) 77 | (with-output-to-string (s) 78 | (flet ((save-text () 79 | (let ((str (get-output-stream-string s))) 80 | (when (plusp (length str)) (push str result))))) 81 | (loop while (< i end) do 82 | (let* ((text (aref v i)) 83 | (text-props (properties text))) 84 | (cond 85 | ((equal text-props open-props) 86 | (write-string (text text) s) 87 | (incf i)) 88 | 89 | ((new-open-p text-props open-props) 90 | (save-text) 91 | (let* ((unopen (unopen text-props open-props)) 92 | (tag (first (last unopen))) 93 | (now-open (cons tag open-props))) 94 | (multiple-value-bind (thing idx) (%detextify-markup v i end now-open) 95 | (push (cons tag thing) result) 96 | (setf i idx)))) 97 | (t (decf i) 98 | (save-text) 99 | (loop-finish))))) 100 | (save-text))) 101 | (values (nreverse result) (1+ i)))) 102 | 103 | (defun new-open-p (text-props open-props) 104 | (equal (last text-props (length open-props)) open-props)) 105 | 106 | (defun unopen (text-props open-props) 107 | (multiple-value-bind (unopen open) 108 | (take text-props (- (length text-props) (length open-props))) 109 | (assert (equal open-props open) () 110 | "Remaining props ~s not equal to open props ~s" open open-props) 111 | unopen)) 112 | 113 | (defun diff-textified (a b) 114 | "Diff two vectors of propertied-text, returning a single vector 115 | containing the diff as propertied-text." 116 | (flet ((translate-textified (x) 117 | (destructuring-bind (label . thing) x 118 | (ecase label 119 | (:lcs thing) 120 | ((:add :delete) (add-property thing label))))) 121 | 122 | (collapse-spaces-in-lcs (lcs) 123 | ;; Since spaces are quite common in text, the LCS of any 124 | ;; two bits of text will include a lot of them. 
However, 125 | ;; when there are no words between them in the LCS it is 126 | ;; better to collapse them so that instead of diffing: 'a 127 | ;; b' and 'd e' as ((:delete a) (:add d) " " (:delete b) 128 | ;; (:add e))' we get ((:delete "a b") (:add "d e")) We 129 | ;; also get rid of any leading spaces for a similar 130 | ;; reason. 131 | (let ((just-saw-space t)) 132 | (flet ((collapsable? (x) 133 | (if (string= (text x) " ") 134 | (prog1 just-saw-space (setf just-saw-space t)) 135 | (setf just-saw-space nil)))) 136 | (remove-if #'collapsable? lcs))))) 137 | 138 | (let ((diff (diff-vectors a b #'collapse-spaces-in-lcs))) 139 | (map-into diff #'translate-textified diff)))) -------------------------------------------------------------------------------- /tokenize.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | (defparameter *short-length* 5) 4 | 5 | (defclass chunk () 6 | ((markup :initarg :markup :accessor markup) 7 | (textified :reader textified) 8 | (most-similar :initform nil :accessor most-similar) 9 | (pair :initform nil :accessor pair))) 10 | 11 | (defclass pair () 12 | ((original :initarg :original :reader original) 13 | (edited :initarg :edited :reader edited))) 14 | 15 | (defmethod initialize-instance :after ((chunk chunk) &key &allow-other-keys) 16 | (with-slots (markup textified) chunk 17 | (setf textified (textify-markup markup)))) 18 | 19 | (defun make-chunk (markup) 20 | (make-instance 'chunk :markup markup)) 21 | 22 | (defun make-empty-chunk () 23 | (make-chunk ())) 24 | 25 | (defun empty-chunk-p (chunk) 26 | (null (markup chunk))) 27 | 28 | (defun combine-chunks (&rest chunks) 29 | (make-instance 'chunk :markup (apply #'append (mapcar #'markup chunks)))) 30 | 31 | (defun paragraphs (parsed) 32 | "Split file into chunks, one per paragraph." 
33 | (loop for p in (rest parsed) collect (make-chunk (list p)))) 34 | 35 | (defmethod print-object ((o chunk) stream) 36 | (print-unreadable-object (o stream :type t) 37 | (format stream "~s identical: ~a; symmetric: ~a" (markup o) (identical-p o) (symmetrical-p o)))) 38 | 39 | (defmethod print-object ((o pair) stream) 40 | (print-unreadable-object (o stream :type t) 41 | (format stream "~&original: ~s~&edited: ~s" (original o) (edited o)))) 42 | 43 | (defun identical-p (p) 44 | (and (most-similar p) (equal (markup p) (markup (most-similar p))))) 45 | 46 | (defun symmetrical-p (p) 47 | (and (most-similar p) (eql (most-similar (most-similar p)) p))) 48 | 49 | (defun establish-pairs (original edited &key unpair-short) 50 | "Establish the pairing between two lists of chunks." 51 | (pair-identical original edited) 52 | ;; XXX -- should perhaps remove elements from original and edited 53 | ;; that have most-similar set by pair-identical. 54 | (pair-symmetrical original edited) 55 | (unpair-asymmetrical original t) 56 | (unpair-asymmetrical edited nil) 57 | (when unpair-short 58 | (unpair-short original t) 59 | (unpair-short edited nil))) 60 | 61 | (defun pair-identical (original edited) 62 | (let ((identical (make-hash-table :test #'equal))) 63 | (loop for o in original do (push o (gethash (markup o) identical nil))) 64 | ;; Reverse the lists so we can pop them off in the right order below. 
65 | (loop for k being the hash-keys of identical do 66 | (setf (gethash k identical) (nreverse (gethash k identical)))) 67 | (loop for e in edited 68 | for o = (pop (gethash (markup e) identical nil)) 69 | do 70 | (when o 71 | (setf (most-similar e) o) 72 | (setf (most-similar o) e) 73 | (pair-chunks o e))))) 74 | 75 | (defun pair-symmetrical (original edited) 76 | (set-most-similar original edited) 77 | (loop for chunk in original 78 | when (symmetrical-p chunk) do 79 | (pair-chunks chunk (most-similar chunk)))) 80 | 81 | (defun set-most-similar (original edited) 82 | "Make each chunk in original and edited point to the most similar 83 | chunk on the other side unless they have already been paired with an 84 | identical chunk." 85 | (loop for c in edited unless (most-similar c) do (find-most-similar c original)) 86 | (loop for c in original unless (most-similar c) do (find-most-similar c edited))) 87 | 88 | (defun find-most-similar (chunk others) 89 | (let ((textified (textified chunk))) 90 | (flet ((score (other) (similarity textified (textified other)))) 91 | (setf (most-similar chunk) (maximum others :key #'score))))) 92 | 93 | (defun pair-chunks (original edited) 94 | (setf (pair original) 95 | (setf (pair edited) 96 | (make-instance 'pair :original original :edited edited)))) 97 | 98 | (defun unpair-asymmetrical (chunks original-p) 99 | (loop for chunk in chunks when (not (symmetrical-p chunk)) do (unpair chunk original-p))) 100 | 101 | (defun unpair-short (chunks original-p) 102 | "Unpair very short chunks." 
103 | (loop for chunk in chunks when (< (length (textified chunk)) *short-length*) 104 | do (unpair chunk original-p))) 105 | 106 | (defun unpair (chunk original-p) 107 | (let ((empty (make-empty-chunk))) 108 | (setf (most-similar chunk) empty) 109 | (if original-p 110 | (pair-chunks chunk empty) 111 | (pair-chunks empty chunk)))) 112 | 113 | (defun diff-pair (pair) 114 | (diff-textified (textified (original pair)) (textified (edited pair)))) 115 | 116 | (defun diff-to-markup (original edited) 117 | (clean-adds-and-deletes (mark-moves (diff-to-markup/no-moves original edited)))) 118 | 119 | (defun diff-to-markup/no-moves (original-parsed edited-parsed) 120 | (let ((original (paragraphs original-parsed)) 121 | (edited (paragraphs edited-parsed))) 122 | (establish-pairs original edited) 123 | (loop for (label . pair) across (diff-vectors (as-pairs original) (as-pairs edited)) 124 | for diff = (clean-empties (diff-pair pair)) 125 | nconc 126 | (cond 127 | ((and (eql label :add) (not (empty-chunk-p (original pair)))) 128 | `((:add ,@diff))) 129 | ((and (eql label :delete) (not (empty-chunk-p (edited pair)))) 130 | `((:delete ,@diff))) 131 | (t diff))))) 132 | 133 | (defun mark-moves (markup) 134 | "Take the output of diff-to-markup/no-moves and find the adds and 135 | deletes which are actually moves." 136 | (let ((chunks (make-hash-table))) 137 | (flet ((chunkify (thing) 138 | (setf (gethash thing chunks) (make-chunk (rest thing))))) 139 | (let ((adds (mapcar #'chunkify (extract markup :add))) 140 | (deletes (mapcar #'chunkify (extract markup :delete)))) 141 | (establish-pairs deletes adds :unpair-short t) 142 | 143 | ;; Walk the tree. For each :delete find the chunk. If chunk is 144 | ;; unpaired then leave the :delete as is. Otherwise replace 145 | ;; :delete with :moved-away containing the diff of the :delete 146 | ;; and the :add (original and edited chunks of the pair.) 
For 147 | ;; each :add similarly, if the chunk is unpaired, leave as is, 148 | ;; otherwise replace with :moved-to containing diff of the 149 | ;; :delete and :add. Also leave things as they are if the new 150 | ;; diff is actually more complex than the originals. 151 | (flet ((rewrite (x) 152 | (let ((label (car x))) 153 | (case label 154 | ((:add :delete) 155 | (let* ((chunk (gethash x chunks)) 156 | (pair (pair chunk)) 157 | (new-diff (diff-pair pair)) 158 | (other (if (eql label :add) (original pair) (edited pair))) 159 | (new-diff-length (length new-diff)) 160 | (old-diff-length (length (textified chunk))) 161 | (other-old-diff-length (length (textified other))) 162 | (move-label (if (eql label :add) :moved-to :moved-from)) 163 | (more-complex (and (> new-diff-length old-diff-length) (> new-diff-length other-old-diff-length)))) 164 | (cond 165 | ((and (symmetrical-p chunk) (not more-complex)) 166 | `(,move-label ,@(detextify-markup new-diff))) 167 | (t `(,label ,@(rest x)))))) 168 | (t x))))) 169 | (map-tree #'rewrite markup)))))) 170 | 171 | (defun show-pairing (label add) 172 | (let* ((ta (textified add)) 173 | (tb (textified (most-similar add))) 174 | (diff (diff-textified tb ta)) 175 | #+(or)(undiffed (undiffed (detextify-markup diff)))) 176 | (when (finer-grained-p diff ta) 177 | (format t "~2&~a: ~s => ~s~&diff: ~s~&identical: ~a; symmetric: ~a; similarity: ~f; one-way: ~f)" 178 | label 179 | (markup add) 180 | (markup (most-similar add)) 181 | (detextify-markup diff) 182 | (identical-p add) 183 | (symmetrical-p add) 184 | (similarity ta tb) 185 | (one-way-similarity ta tb))))) 186 | 187 | (defun finer-grained-p (new-diff orig-diff) 188 | "Is the new-diff finer-grained than the original?" 
189 | (< (length (remove-if-not #'part-of-diff-p new-diff)) (length orig-diff))) 190 | 191 | (defun part-of-diff-p (text) 192 | (or (has-property-p :add text) (has-property-p :delete text))) 193 | 194 | (defun pair-adds-and-deletes (original-file edited-file) 195 | (let ((markup (diff-to-markup/no-moves original-file edited-file))) 196 | (flet ((chunkify (things) 197 | (mapcar (lambda (x) (make-chunk (rest x))) things))) 198 | (let ((adds (chunkify (extract markup :add))) 199 | (deletes (chunkify (extract markup :delete)))) 200 | (pair-identical adds deletes) 201 | (pair-symmetrical (remove-if #'identical-p adds) (remove-if #'identical-p deletes)) 202 | (loop for x in deletes do (show-pairing "DELETE" x)) 203 | (loop for x in adds do (show-pairing "ADD" x)) 204 | )))) 205 | 206 | (defun dediff (textified) 207 | (map 'vector (lambda (x) (remove-properties x '(:add :delete))) textified)) 208 | 209 | (defun refinement (delete add) 210 | (let* ((refined-diff (diff-textified (dediff (textified delete)) (dediff (textified add)))) 211 | (diff-length (length (remove-if-not #'part-of-diff-p refined-diff))) 212 | (original-add-length (length (textified add)))) 213 | (* original-add-length (/ (- original-add-length diff-length) original-add-length)))) 214 | 215 | (defun find-most-refining (add deletes) 216 | (flet ((score (delete) (refinement delete add))) 217 | (maximum deletes :key #'score))) 218 | 219 | (defun extract (markup what) 220 | "Find all the what elements in markup." 
221 | (cond 222 | ((consp markup) 223 | (if (eql (car markup) what) 224 | (list markup) 225 | (mapcan (lambda (x) (extract x what)) markup))) 226 | (t nil))) 227 | 228 | (defun undiffed (markup) 229 | (cond 230 | ((consp markup) 231 | (if (or (eql (car markup) :add) 232 | (eql (car markup) :delete)) 233 | nil 234 | (mapcan #'undiffed markup))) 235 | (t (list markup)))) 236 | 237 | (defun as-pairs (chunks) (map 'vector #'pair chunks)) 238 | 239 | (defun cleaned-diff-output (diff) 240 | (clean-adds-and-deletes (clean-empties diff))) 241 | 242 | (defun clean-adds-and-deletes (diff) 243 | (mapcar #'rewrite-adds-and-deletes diff)) 244 | 245 | (defun clean-empties (diff) 246 | (remove nil (mapcar #'remove-empties (detextify-markup diff)))) 247 | 248 | (defun remove-empties (tree) 249 | "Remove empty sub-trees from tree." 250 | (labels ((helper (x) 251 | (cond 252 | ((and (consp x) (null (rest x))) nil) 253 | ((consp x) (list (mapcan #'helper x))) 254 | (t (list x))))) 255 | (first (helper tree)))) 256 | 257 | (defun rewrite-adds-and-deletes (tree) 258 | "Rewrite the Markup tree so that :add and :delete tags are moved out 259 | as far as possible. E.g. given (:p (:add something)) we transform it 260 | to: (:add (:p something))." 261 | (cond 262 | ((consp tree) 263 | (let ((add-or-delete (has-nested-add-or-delete-p tree))) 264 | (mapcar 265 | #'rewrite-adds-and-deletes 266 | (if add-or-delete (promote-tag add-or-delete tree) tree)))) 267 | (t tree))) 268 | 269 | (defun has-nested-add-or-delete-p (tree) 270 | "Tree has a nested :add or :delete tag and it's not the only tag. 271 | Returns which one it is since there should only be one or the other." 
272 | (let ((tags (nested-tags tree))) 273 | (and (not (null (cdr tags))) 274 | (or (find :add tags) (find :delete tags))))) 275 | 276 | (defun promote-tag (tag tree) 277 | (list tag (remove-tag tag tree))) 278 | 279 | (defun remove-tag (tag tree) 280 | (cond 281 | ((consp tree) 282 | (cond 283 | ((consp (second tree)) 284 | (destructuring-bind (t1 (t2 . rest)) tree 285 | (cond 286 | ((eql t1 tag) `(,t2 ,@rest)) 287 | ((eql t2 tag) `(,t1 ,@rest)) 288 | (t `(,t1 ,(remove-tag tag `(,t2 ,@rest))))))) 289 | (t (destructuring-bind (t1 . rest) tree 290 | (cond 291 | ((eql t1 tag) rest) 292 | (t `(,t1 ,@rest))))))) 293 | (t tree))) 294 | 295 | (defun nested-tags (tree) 296 | "Given a tree like '(:a (:b (:c ...))) the nested tags are (:a :b :c)" 297 | (if (consp tree) 298 | (if (null (cddr tree)) 299 | (cons (car tree) (nested-tags (second tree))) 300 | (list (car tree))))) -------------------------------------------------------------------------------- /utilities.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; Bits of utility code that perhaps should be moved into 4 | ;;; com.gigamonkeys.utilities or replaced with calls to equivalent 5 | ;;; bits of some standard utility library. 6 | 7 | (defun maximum (list &key (key #'identity)) 8 | (when list 9 | (destructuring-bind (first . rest) list 10 | (loop with best-score = (funcall key first) 11 | with best = first 12 | for x in rest 13 | for score = (funcall key x) do 14 | (when (> score best-score) 15 | (setf best-score score) 16 | (setf best x)) 17 | finally (return (values best best-score)))))) 18 | 19 | (defun take (list n) 20 | "Return a list of the first n values of list and the left-over 21 | tail as a secondary value." 
22 | (let ((tail (nthcdr n list))) 23 | (values (ldiff list tail) tail))) 24 | 25 | (defun vector-push-extend* (list v) 26 | "Push all the elements of `list' onto v as if by vector-push-extend" 27 | (loop for item in list do (vector-push-extend item v))) 28 | 29 | (defun split-list (list tail) 30 | (let ((rest (nthcdr (or (search tail list) (error "~a not found in ~a" tail list)) list))) 31 | (values (ldiff list rest) rest))) 32 | 33 | (defun longer (list-a list-b) 34 | (cond 35 | ((endp list-b) 36 | (not (endp list-a))) 37 | ((endp list-a) 38 | nil) 39 | (t (longer (cdr list-a) (cdr list-b))))) 40 | 41 | (defun concatenate-vectors (vectors) 42 | (reduce (lambda (a b) (concatenate (class-of a) a b)) vectors)) 43 | 44 | (defun map-tree (fn tree) 45 | "Map fn down tree, replacing each element of the tree with the 46 | return value of fn. When the return value is identical to the 47 | original sub-tree it is recursively mapped." 48 | (typecase tree 49 | (cons (let ((new (funcall fn tree))) 50 | (if (eql new tree) 51 | (mapcar (lambda (x) (map-tree fn x)) tree) 52 | new))) 53 | (t tree))) --------------------------------------------------------------------------------