├── .gitignore ├── COPYING ├── README ├── chunk-splitting.lisp ├── com.gigamonkeys.prose-diff.asd ├── diff.css ├── diff.js ├── diff.lisp ├── html.lisp ├── jquery-1.4.4.js ├── lcs.lisp ├── notes.txt ├── packages.lisp ├── prose-diff.lisp ├── test-edited.txt ├── test-original.txt ├── tests.lisp ├── text.lisp ├── tokenize.lisp └── utilities.lisp /.gitignore: -------------------------------------------------------------------------------- 1 | publish.sh 2 | *.html 3 | test-files/ 4 | tmp/ -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010, Peter Seibel. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | 15 | * Neither the name of Gigamonkeys Consulting nor the names of its 16 | contributors may be used to endorse or promote products derived 17 | from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 23 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | NOTE: this code is a work in progress. However the goal is to create a 2 | diff program that works better on text files containing prose than the 3 | traditional Unix line-based diff. 4 | 5 | Update 2014-02-14: I used this code when I was running the ill fated Code 6 | Quarterly. It was incredibly useful to be able to take a text file from an 7 | author, edit it to my heart's content, and then generate a diff that was 8 | even better than what Word track changes could show since it shows sections 9 | that are moved and also edited in a sane way. 10 | 11 | The algorithm is not speedy as it does a N^2 comparison of all the paragraphs 12 | that differ at all between the two versions so it's not great for realtime 13 | but for generating a diff offline and then putting on the web for an author 14 | to look at, it was quite nice. 15 | -------------------------------------------------------------------------------- /chunk-splitting.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; 4 | ;;; Code for splitting chunks into parts corresponding to a bunch of 5 | ;;; other chunks derived from them. 
This could, in theory, be used to 6 | ;;; further refine diffs and recognize more moved text. At the moment, 7 | ;;; however, it is not used. 8 | ;;; 9 | 10 | (defun split-positions (one-chunk parts) 11 | "Find the positions where one-chunk should be split to get pieces 12 | corresponding to the given parts, which are derived from one-chunk. 13 | For instance, if one-chunk is a piece of text that was split and the 14 | pieces inserted at various places in the new document, it will show 15 | up in the diff as a single deletion and multiple additions. Or, 16 | conversely, if a bunch of separate pieces (from different 17 | paragraphs) in the original document were combined into contiguous 18 | text in the edited document, we would have a single addition and 19 | multiple deletions." 20 | (multiple-value-bind (one-chunk-lcs-indices combined-lcs-indices) 21 | (lcs-positions one-chunk (concatenate-vectors parts)) 22 | (loop for s in (part-starts parts) 23 | for prev-x = 0 then x 24 | for x = (position-if (lambda (x) (>= x s)) combined-lcs-indices :start prev-x) 25 | collect (aref one-chunk-lcs-indices x)))) 26 | 27 | (defun split-vector (one-chunk parts) 28 | (loop for (start end) on (split-positions one-chunk parts) 29 | collect (subseq one-chunk start end))) 30 | 31 | (defun part-starts (parts) 32 | (loop for (p . rest) on parts 33 | summing (length p) into total 34 | when rest collect total into starts 35 | finally (return (cons 0 starts)))) 36 | 37 | 38 | -------------------------------------------------------------------------------- /com.gigamonkeys.prose-diff.asd: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) 2010, Peter Seibel. All rights reserved. 
3 | ;; 4 | 5 | (defsystem com.gigamonkeys.prose-diff 6 | :name "com.gigamonkeys.prose-diff" 7 | :components 8 | ((:file "packages") 9 | (:file "lcs" :depends-on ("packages")) 10 | (:file "tokenize" :depends-on ("packages")) 11 | (:file "text" :depends-on ("packages")) 12 | (:file "utilities" :depends-on ("packages")) 13 | (:file "diff" :depends-on ("packages")) 14 | (:file "html" :depends-on ("packages"))) 15 | :depends-on (:cl-ppcre 16 | :com.gigamonkeys.pathnames 17 | :com.gigamonkeys.utilities 18 | :com.gigamonkeys.macro-utilities 19 | :com.gigamonkeys.markup 20 | :monkeylib-markup-html)) 21 | -------------------------------------------------------------------------------- /diff.css: -------------------------------------------------------------------------------- 1 | body { margin: 1.5in; } 2 | 3 | #buttons { position: fixed; top: 6px; left: 6px; width: 1in; list-style-type: none; padding: 0; } 4 | #buttons li button { padding-left: 3px; width: 90%; background: #eee; } 5 | 6 | /* Logical styles */ 7 | 8 | .add { } 9 | 10 | .delete { } 11 | 12 | .moved-to { } 13 | 14 | .moved-from { } 15 | 16 | /* Actual display styles */ 17 | 18 | .hidden { display: none; } 19 | 20 | .delete.diff { color: red; text-decoration: line-through; } 21 | 22 | .add.diff { color: blue; } 23 | 24 | .moved-to.diff { background-color: #ffc; } 25 | 26 | .moved-from.diff { background-color: #ccc; } 27 | 28 | /* Button style */ 29 | 30 | .selected { font-weight: 900; background: #888 !important; } -------------------------------------------------------------------------------- /diff.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | 3 | function applyStyles (displayState, button) { 4 | var styles = { 5 | 'original': { 6 | 'delete' : '', 7 | 'add' : 'hidden', 8 | 'moved-from' : '', 9 | 'moved-to' : 'hidden' 10 | }, 11 | 12 | 'new': { 13 | 'delete' : 'hidden', 14 | 'add' : '', 15 | 'moved-from' : 'hidden', 16 | 'moved-to' : '' 17 | }, 18 | 
19 | 'diff': { 20 | 'delete' : 'diff', 21 | 'add' : 'diff', 22 | 'moved-from' : 'diff', 23 | 'moved-to' : 'diff' 24 | }, 25 | }[displayState]; 26 | 27 | for (var style in styles) { 28 | $('.' + style).removeClass().addClass(style).addClass(styles[style]); 29 | } 30 | 31 | $('.selected').toggleClass('selected'); 32 | $(button).toggleClass('selected'); 33 | } 34 | 35 | $(document).ready(function () { 36 | $('#show_diff').bind('click', function () { applyStyles('diff', this); }).click(); 37 | $('#show_original').bind('click', function () { applyStyles('original', this); }); 38 | $('#show_new').bind('click', function () { applyStyles('new', this); }); 39 | }); 40 | 41 | })(); -------------------------------------------------------------------------------- /diff.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; 4 | ;;; Generic functions for diffing vectors of objects. 5 | ;;; 6 | 7 | (defun diff-vectors (old new &optional (lcs-frobber #'identity)) 8 | "Diff two vectors returning a vector with the elements of old and 9 | new wrapped in conses whose CAR is either :LCS, :DELETE, or :ADD. 10 | Optionally frob the computed LCS before computing the diff." 
11 | (loop with output = (make-array (length new) :adjustable t :fill-pointer 0) 12 | with old-i = 0 13 | with old-length = (length old) 14 | with new-i = 0 15 | with new-length = (length new) 16 | for next-lcs across (funcall lcs-frobber (lcs old new)) 17 | do 18 | (setf old-i (emit-diffs next-lcs old old-i old-length :delete output)) 19 | (setf new-i (emit-diffs next-lcs new new-i new-length :add output)) 20 | (vector-push-extend (cons :lcs next-lcs) output) 21 | 22 | finally 23 | (emit-diffs (cons nil nil) old old-i old-length :delete output) 24 | (emit-diffs (cons nil nil) new new-i new-length :add output) 25 | (return output))) 26 | 27 | (defun emit-diffs (next-lcs v i max-i marker output) 28 | (cond 29 | ((< i max-i) 30 | (let ((idx (or (position next-lcs v :start i) max-i))) 31 | (cond 32 | ((> idx i) 33 | (loop for j from i below idx do (vector-push-extend (cons marker (aref v j)) output)) 34 | (1+ idx)) 35 | (t 36 | (1+ i))))) 37 | (t i))) 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /html.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; 4 | ;;; HTML generation -- given two Markup files generate an HTML file of the diff. 
5 | ;;; 6 | 7 | (defparameter *retagger* 8 | (make-retagger 9 | (mapcar (lambda (x) (cons x 'wrap-add-delete)) '(:add :delete :moved-to :moved-from)))) 10 | 11 | (defun diff-to-html (original-file edited-file output &key (css-dir "") (js-dir "")) 12 | (let ((original (remove-comments (parse-file original-file :parse-links-p nil))) 13 | (edited (remove-comments (parse-file edited-file :parse-links-p nil)))) 14 | (with-output-to-file (out output) 15 | (render-sexps-to-stream 16 | `(:body 17 | ,@(diff-to-markup original edited) 18 | ((:ul :id "buttons") 19 | (:li ((:button :id "show_diff") "Diff")) 20 | (:li ((:button :id "show_original") "Original")) 21 | (:li ((:button :id "show_new") "New")))) 22 | out 23 | :stylesheets (loop for x in '("diff.css") collect (format nil "~a~a" css-dir x)) 24 | :scripts (loop for x in '("jquery-1.4.4.js" "diff.js") collect (format nil "~a~a" js-dir x)) 25 | :rewriter (compose (lambda (x) (footnotes :note x)) *retagger*))))) 26 | 27 | (defun extract-comments (sexp) 28 | (let ((num 0) 29 | (comments ())) 30 | (labels ((walker (x) 31 | (cond 32 | ((stringp x) x) 33 | ((numberp x) x) 34 | ((symbolp x) x) 35 | ((eql (car x) :comment) 36 | (push x comments) 37 | `(:commentref ,(princ-to-string (incf num)))) 38 | (t `(,(car x) ,@(mapcar #'walker (cdr x))))))) 39 | (let ((walked (walker sexp))) 40 | (values walked (nreverse comments)))))) 41 | 42 | (defun remove-comments (sexp) 43 | (labels ((walker (x) 44 | (cond 45 | ((stringp x) (list x)) 46 | ((numberp x) (list x)) 47 | ((symbolp x) (list x)) 48 | ((eql (car x) :comment) nil) 49 | (t 50 | (let ((body (mapcan #'walker (cdr x)))) 51 | (if body `((,(car x) ,@body)) nil)))))) 52 | 53 | (first (walker sexp)))) 54 | 55 | (defun commentref->html (sexp) 56 | (destructuring-bind (tag string) sexp 57 | (assert (eql tag :commentref)) 58 | (let ((num (parse-integer string))) 59 | `((:span :href (:format "~(#comment_~d~)" ,num) 60 | :id (:format "commentref_~d" ,num) 61 | :class "comment_ref") 62 | 
(:character 9997))))) 63 | 64 | 65 | (defun comments->html (sexp) 66 | (destructuring-bind (tag &rest comments) sexp 67 | (assert (eql tag :comments)) 68 | `(:progn ,@(loop for comment in comments 69 | for num from 1 70 | collect 71 | (destructuring-bind (tag &rest body) comment 72 | (assert (eql tag :comment)) 73 | `((:div :id (:format "~(~a_~d~)" ,tag ,num) :class "comment") ,@body)))))) 74 | 75 | (defun wrap-add-delete (sexp) 76 | (destructuring-bind (which &rest wrapped) sexp 77 | (let ((class (string-downcase which)) 78 | (wrapped (mapcar *retagger* wrapped))) 79 | ;; Recursive wrapping necessary since :moved-to and moved-from 80 | ;; elements can contain :adds and :deletes 81 | (cond 82 | ((and (consp (first wrapped)) (block-element-p (car (first wrapped)))) 83 | `((:div :class ,class) ,@wrapped)) 84 | (t `((:span :class ,class) ,@wrapped)))))) 85 | 86 | (defun block-element-p (x) 87 | (member x 88 | '(:body :colgroup :div :dl :fieldset :form :head :html :map :noscript 89 | :object :ol :optgroup :pre :script :select :style :table :tbody 90 | :tfoot :thead :tr :ul 91 | :area :base :blockquote :br :button :caption :col :dd :div :dt :h1 92 | :h2 :h3 :h4 :h5 :h6 :hr :input :li :link :meta :option :p :param 93 | :td :textarea :th :title))) 94 | -------------------------------------------------------------------------------- /lcs.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | (defun lcs (a b) 4 | "Compute the longest common subsequence of vectors `a' and `b'" 5 | (map 'vector (lambda (i) (aref a i)) (lcs-positions a b))) 6 | 7 | (defun lcs-length (a b) 8 | "Compute the length of the longest common subsequence of vectors `a' and `b'" 9 | (multiple-value-bind (table m n) (%lcs-table a b) 10 | (aref table n m))) 11 | 12 | (defun lcs-positions (a b) 13 | "Find the indices in a and b of the elements of the LCS." 
14 | (multiple-value-bind (table m n) (%lcs-table a b) 15 | (let* ((len (aref table n m)) 16 | (a-indices (make-array len)) 17 | (b-indices (make-array len)) 18 | (idx (1- len)) 19 | (i (length a)) 20 | (j (length b))) 21 | 22 | (loop while (> (aref table j i) 0) do 23 | (let* ((current (aref table j i)) 24 | (previous (1- current))) 25 | 26 | (cond 27 | ((and (= previous (aref table (1- j) (1- i))) 28 | (= previous (aref table j (1- i))) 29 | (= previous (aref table (1- j) i))) 30 | (decf j) 31 | (decf i) 32 | (setf (aref a-indices idx) i) 33 | (setf (aref b-indices idx) j) 34 | (decf idx)) 35 | ((= current (aref table (1- j) i)) (decf j)) 36 | ((= current (aref table j (1- i))) (decf i)) 37 | (t (error "Assertion gone haywire: ~s ~s" j i))))) 38 | (values a-indices b-indices)))) 39 | 40 | (defun %lcs-table (a b) 41 | "Compute the MxN table from which we can extract the LCS, and a 42 | bunch of other good stuff." 43 | (let* ((m (length a)) 44 | (n (length b)) 45 | (table (make-array (list (1+ n) (1+ m)) :initial-element 0))) 46 | 47 | (flet ((lcs-length (j i) 48 | (cond 49 | ((eql (aref a (1- i)) (aref b (1- j))) 50 | (+ 1 (aref table (1- j) (1- i)))) 51 | (t 52 | (max (aref table (1- j) i) (aref table j (1- i))))))) 53 | 54 | (loop for j from 1 to n do 55 | (loop for i from 1 to m do 56 | (setf (aref table j i) (lcs-length j i))))) 57 | 58 | (values table m n))) 59 | 60 | (defun similarity (a b) 61 | "Compute the similarity of vectors `a' and `b' in terms of the 62 | average of the ratios of the length of the LCS to their length." 63 | (let ((lcs-length (lcs-length a b))) 64 | (/ (+ (/ lcs-length (length a)) (/ lcs-length (length b))) 2.0d0))) 65 | 66 | (defun one-way-similarity (a b) 67 | "Like `similarity' but in only one direction." 
68 | (float (/ (lcs-length a b) (length a)) 0d0)) 69 | 70 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | -*- mode: markup; -*- 2 | 3 | * Prose diff algorithm: 4 | 5 | Hash (or intern) the paragraphs of the two documents and compute the 6 | diff using LCS of the sequence of hashes. This will find a backbone of 7 | unchanged paragraphs in their original order as well as the places 8 | where paragraphs were "deleted" and "added". However there are a few 9 | more possibilities. 10 | 11 | # A paragraph may actually have been cut. This will show up as a 12 | deletion with none of the new paragraphs being at all similar to 13 | the deleted paragraph. (Cut) 14 | 15 | # A paragraph may actually have been added. This will show up as an 16 | addition with the new paragraph not similar to any of the deleted 17 | paragraphs. (Addition) 18 | 19 | # A paragraph may have been moved which will show up as a deletion 20 | in one place and an addition somewhere else of the same hash. 21 | (Moved) 22 | 23 | # A paragraph may have been edited in place which will show up as a 24 | deletion and addition in the same position with a lot of 25 | similarity between the deleted and added paragraphs. (Edited) 26 | 27 | # A paragraph may have been moved and edited which will show up as a 28 | deletion at the original position and an addition somewhere else 29 | with a lot of similarity between the deleted and added paragraphs. 30 | (Moved and edited) 31 | 32 | # A paragraph may have been split, perhaps with edits to both the 33 | new paragraphs. This will show up as a deletion and an addition of 34 | two (or more) paragraphs at the same position with each of the new 35 | paragraphs having a LCS with the old paragraph that is almost the 36 | full length of the new paragraph. 
37 | 38 | # N paragraphs may have been joined which will look like N deletions 39 | and one addition where each of the deleted paragraphs will have an 40 | LCS with the new paragraph that is almost the full length of the 41 | deleted paragraph. 42 | 43 | # A chunk of a paragraph may have been removed and added into 44 | another paragraph. This will look like edits to both paragraphs 45 | and the LCS of the deleted bits of the first paragraph with the 46 | added bits of the second paragraph will be almost the whole 47 | length. (Sentences moved) 48 | 49 | To determine what has happened: 50 | 51 | # Find exactly moved paragraphs and remove them from further 52 | consideration. (Another way to implement this is to simply find 53 | all the paragraphs that occur in both the old and new document.) 54 | 55 | # Build a matrix of the LCSs of each remaining paragraph against 56 | every other. 57 | 58 | 59 | 60 | 61 | * Output 62 | 63 | A set of paragraphs. Some paragraphs occur in both the original and 64 | edited document and we record their position in each document. 65 | 66 | Some paragraphs exist only in the original and some only in the 67 | edited. There is a mapping between the original-only and edited-only 68 | paragraphs that defines how they are related. 69 | 70 | - Original to nothing: a deleted paragraph 71 | 72 | - nothing to Edited: a completely new paragraph 73 | 74 | - One original to one edited: an edited paragraph 75 | 76 | - One original to multiple edited: a split (and possibly edited) paragraph. 77 | 78 | - Multiple original to one edited: a joined (and possibly edited) paragraph. 79 | 80 | - Two originals to two edited: chunk removed from original and moved 81 | to another paragraph. I.e. basically an edit of one paragraph to 82 | remove a chunk of text and then an edit of another paragraph to add 83 | a very similar chunk of text. 
84 | 85 | 86 | 87 | A set of paragraphs, each with a position (possibly Nil) in the 88 | original document, a position (possibly Nil) in the new document, and 89 | 90 | 91 | * Unmoved paragraphs 92 | 93 | After we determine the simply equivalent paragraphs between original 94 | and edited (identical paragraphs and symmetrically similar\note{Though 95 | need to think about splits and joins}) we can find a sort of 96 | abstracted LCS at the paragraph level. E.g. If we have a trivial case 97 | of: 98 | 99 | A B C D E -> A' B' C' D' E' 100 | 101 | where X' is equivalent to X, then the paragraph-level LCS is the whole 102 | document. In general the unmoved paragraphs are the ones that are part 103 | of the LCS. 104 | 105 | We can then run through the rest of the paragraphs in edited and try 106 | to figure out where they come from. 107 | 108 | Moved paragraphs will be ones that have versions in both original and 109 | edited but which are not part of the LCS. 110 | 111 | Extra paragraphs resulting from splits will have high one-way 112 | similarity with the original version of a preceding or following 113 | paragraph 114 | 115 | Paragraphs resulting from joins will have high one-way similarity with multiple 116 | 117 | Totally new paragraphs will have low similarity to any paragraphs. 118 | 119 | 120 | Truly deleted paragraphs 121 | will be ones with no version 122 | 123 | 124 | 125 | 126 | Simple case: A single paragraph is in the “same” location if its 127 | edited version follows an edited version of the paragraph that preceded 128 | it in the original or precedes an edited version of the paragraph that 129 | followed it in the original. 130 | 131 | Slightly more complicated: if the preceding or following paragraphs in 132 | edited are the result of a split, they should be considered descended 133 | from the original paragraph. 
Thus if we have: 134 | 135 | AB C DE -> A' B' C' D' E' 136 | 137 | Then paragraphs B and D are descendants of AB and DE, thus C' is in 138 | the same position. 139 | 140 | If the preceding or following paragraphs resulted from a join we have: 141 | 142 | A B C D E -> AB' C DE' 143 | 144 | Since AB' descends from both A and B and DE' descends from both D and 145 | E, C is similarly unmoved. 146 | 147 | An unmoved paragraph may have 148 | 149 | 150 | 151 | * Deleted paragraphs 152 | 153 | For paragraphs deleted from the original, find the paragraph 154 | preceeding it in the original and determine what happened to it in the 155 | editied. If it exists (perhaps in modified form) in the edited, put 156 | the deleted paragraph, marked as deleted, after it. If it was also 157 | deleted, find its preceeding paragraph, and so on, until a paragraph 158 | that lives on in the edited verion is found. (Screw case #1: if the 159 | preceeding paragraph was moved far away it would probably make more 160 | sense to leave the deleted paragraphs where they were. Not sure how to 161 | determine when a paragraph was moved. Screw case #2: if the preceeding 162 | paragraph was joined with the paragraph following the deleted 163 | paragraph(s). Probably should put the deleted paragraphs after the 164 | joined paragraph. 165 | -------------------------------------------------------------------------------- /packages.lisp: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) 2010, Peter Seibel. All rights reserved. 
3 | ;; 4 | 5 | (in-package :cl-user) 6 | 7 | (defpackage :com.gigamonkeys.prose-diff 8 | (:use :common-lisp 9 | :cl-ppcre 10 | :com.gigamonkeys.pathnames 11 | :com.gigamonkeys.utilities 12 | :com.gigamonkeys.markup 13 | :com.gigamonkeys.markup.html) 14 | (:import-from :alexandria :compose) 15 | (:export 16 | :show-cuts)) 17 | -------------------------------------------------------------------------------- /prose-diff.lisp: -------------------------------------------------------------------------------- 1 | ;; 2 | ;; Copyright (c) 2010, Peter Seibel. All rights reserved. 3 | ;; 4 | 5 | (in-package :com.gigamonkeys.prose-diff) 6 | 7 | (defparameter *word-regexp* (create-scanner "((?:\\\\n{.*?})|(?:\\w*’\\w+)|(?:\\w+))")) 8 | (defparameter *old-mode-regexp* (create-scanner "-\\*- mode: .*; -\\*-")) 9 | (defparameter *new-mode-line* "-*- mode: coders-at-work-editing; -*-") 10 | (defparameter *minimum-match* 4) 11 | 12 | (defun text (file) 13 | (with-output-to-string (s) 14 | (with-open-file (in file) 15 | (loop for line = (read-line in nil nil) 16 | while line do (write-line line s))))) 17 | 18 | (defun words (text) 19 | (let (words) 20 | (do-matches (start end *word-regexp* text (coerce (nreverse words) 'vector)) 21 | (push (intern (string-upcase (subseq text start end)) :keyword) words)))) 22 | 23 | 24 | (defun word-starts (text) 25 | (let (positions) 26 | (do-matches (start end *word-regexp* text (coerce (nreverse positions) 'vector)) 27 | (push start positions)))) 28 | 29 | (defun positions-table (words) 30 | (let ((table (make-hash-table))) 31 | (loop for position from (1- (length words)) downto 0 32 | do (push position (gethash (aref words position) table ()))) 33 | table)) 34 | 35 | (defun dump-table (table) 36 | (loop for k being the hash-keys of table 37 | do (format t "~a => ~a~%" k (gethash k table)))) 38 | 39 | ;; XXX -- the policy of removing the first match if there are ties (by 40 | ;; length) is not necessarily quite right since you could have 41 | ;; 
something like: 'a b c d a b c e' in the original and then in the 42 | ;; edited: 'a b c x a b c d' in which case the first match attempt on 43 | ;; 'a' would find the two 'a b c' sequences and then would take the 44 | ;; first one leaving '- - - d a b c e'. Then when matching starting at 45 | ;; the second 'a' in the edited text the 'a b c d' has been broken up 46 | ;; so it will only match 'a b c' leaving an extra 'e'. Don't know how 47 | ;; much this will matter in practice. (Hmmm, maybe could do something 48 | ;; like: for each occurence of word W in edited find the longest match 49 | ;; in original. Then assign the longest overall to which ever 50 | ;; occurence of W it goes with and then axe any matches that overlap 51 | ;; with it and assign the longest remaining to it's W and so on. Then 52 | ;; move to the next word in edited.) 53 | ;; 54 | ;; Maybe for section that has ties, we should remember the position in 55 | ;; edited but leave the word vectors alone. Then after we've gone 56 | ;; through all of the edited words, go back and try the remembered 57 | ;; positions again. This will deal with the case where a short phrase 58 | ;; matches a number of places and the first one happens to be the 59 | ;; wrong one. If we don't take it out, then that instance of the short 60 | ;; phrase will hopefully be taken as part of some bigger piece of text 61 | ;; used later in the edited text. As it stands now, we'll take the 62 | ;; short phrase out of the middle of the longer text which will then 63 | ;; get broken into two bits surrounding the short phrase. (Which I 64 | ;; guess will then be taken from somewhere else.) 65 | 66 | (defun find-original-text (original-text edited-text) 67 | "Given `original-text' and `edited-text' return two word vectors 68 | representing the given texts with the sequences of words occuring in 69 | both texts replaced by nil. 
Thus the sequences of words that remain 70 | in original are the ones not used in the edited version and the 71 | words that remain in edited are the ones added during editing." 72 | (loop 73 | with original = (words original-text) 74 | with edited = (words edited-text) 75 | with positions-table = (positions-table original) 76 | with edited-idx = 0 77 | 78 | for word = (aref edited edited-idx) 79 | for positions = (gethash word positions-table) 80 | 81 | do 82 | (loop 83 | with longest-match = 0 84 | with longest-match-starts = () 85 | 86 | for original-idx in positions 87 | for match = (match-length original edited original-idx edited-idx longest-match) 88 | 89 | do (when (>= match longest-match) 90 | (when (> match longest-match) 91 | (setf longest-match-starts ()) 92 | (setf longest-match match)) 93 | (push original-idx longest-match-starts)) 94 | 95 | finally (cond 96 | ((> longest-match *minimum-match*) 97 | (null-words original (first longest-match-starts) longest-match) 98 | (null-words edited edited-idx longest-match) 99 | (incf edited-idx longest-match)) 100 | (t (incf edited-idx)))) 101 | 102 | while (< edited-idx (length edited)) 103 | finally (return (values original edited)))) 104 | 105 | (defun null-words (words start count) 106 | (loop repeat count for i from start do (setf (aref words i) nil))) 107 | 108 | (defun map-in-and-out (words word-starts in-fn out-fn) 109 | "Given a vector `words' with some words nulled out and a parallel 110 | vector `word-starts' containing the starting positions of the words 111 | in a text (not given), map over the chunks of text, calling `in-fn' 112 | for each chunk of words present in `words' and `out-fn' for each 113 | chunk of words missing from `words'. The functions are called with a 114 | bounding index designator of the underlying text." 
115 | (loop with text-idx = 0 116 | with word-idx = 0 117 | while word-idx 118 | do (let ((word (aref words word-idx)) 119 | (end (aref word-starts word-idx))) 120 | (unless (zerop word-idx) 121 | (funcall (if word out-fn in-fn) text-idx end) 122 | (setf text-idx end)) 123 | (setf word-idx (position nil words :start (1+ word-idx) :key (if word #'identity #'not)))) 124 | finally (funcall (if (aref words (1- (length words))) in-fn out-fn) text-idx nil))) 125 | 126 | (defun show-cuts (&key master edited output) 127 | (with-open-file (out output :direction :output :if-exists :supersede) 128 | (let ((text (text master))) 129 | (flet ((make-emitter (label) 130 | (lambda (start end) 131 | (let ((text (subseq text start end))) 132 | (when (zerop start) 133 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 134 | (format out "{{~a}}~a" label text))))) 135 | (let ((words (find-original-text text (text edited)))) 136 | (map-in-and-out words (word-starts text) (make-emitter "toss") (make-emitter "keep"))))))) 137 | 138 | (defun show-leftovers (&key master edited output) 139 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 140 | (let ((text (text master))) 141 | (flet ((make-emitter (label) 142 | (lambda (start end) 143 | (let ((text (subseq text start end))) 144 | (when (zerop start) 145 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 146 | (format out "{{~a}}~a" label text))))) 147 | (let ((words (find-original-text text (text edited)))) 148 | (map-in-and-out words (word-starts text) (make-emitter "keep") (make-emitter "book"))))))) 149 | 150 | (defun show-unused (&key master edited output) 151 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 152 | (let ((master-text (text master)) 153 | (edited-text (text edited))) 154 | (flet ((make-emitter () 155 | (lambda (start end) 156 | (let ((text (subseq master-text start end))) 157 | 
(write-string text out)))) 158 | (make-elipsator () 159 | (lambda (start end) 160 | (let ((text (subseq master-text start end))) 161 | (if (find #\Newline text) 162 | (format out "~2&§~2%") 163 | (format out " … ")))))) 164 | (let ((words (find-original-text master-text edited-text))) 165 | (map-in-and-out words (word-starts master-text) (make-emitter) (make-elipsator))))))) 166 | 167 | (defun show-possible-cuts (&key master edited output (minimum 0)) 168 | "Emit all sections left in `master' after `edited' text has been 169 | removed, longer than `minimum' characters. Sections are output in 170 | the order they appear in `master'." 171 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 172 | (let ((master-text (text master)) 173 | (edited-text (text edited))) 174 | (let ((chunks-found 0)) 175 | (flet ((make-emitter () 176 | (lambda (start end) 177 | (let ((text (subseq master-text start end))) 178 | (when (> (length text) minimum) 179 | (when (zerop start) 180 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 181 | (incf chunks-found) 182 | (format t "~&Found chunk of ~:d characters." (length text)) 183 | (format out "~2&{{~a}}" text)))))) 184 | (let ((words (find-original-text master-text edited-text))) 185 | (map-in-and-out words (word-starts master-text) (make-emitter) (constantly nil)))) 186 | (format t "~&~:d chunks of more than ~:d characters found." chunks-found minimum))))) 187 | 188 | (defun used-sections (&key master edited output (minimum 0)) 189 | "Emit all sections from `edited' found in `master' in the order they 190 | appear in `master'." 
191 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 192 | (let ((master-text (text master)) 193 | (edited-text (text edited))) 194 | (let ((chunks-found 0)) 195 | (flet ((make-emitter () 196 | (lambda (start end) 197 | (let ((text (subseq master-text start end))) 198 | (when (> (length text) minimum) 199 | (when (zerop start) 200 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 201 | (incf chunks-found) 202 | (format t "~&Found chunk of ~:d characters." (length text)) 203 | (format out "~2&{{~a}}" text)))))) 204 | (let ((words (find-original-text master-text edited-text))) 205 | (map-in-and-out words (word-starts master-text) (constantly nil) (make-emitter)))) 206 | (format t "~&~:d chunks of more than ~:d characters found." chunks-found minimum))))) 207 | 208 | (defun show-sorted-cuts (&key master edited output) 209 | "Emit all sections left in `master' after `edited' text has been 210 | removed, sorted by size, longer sections first." 
211 | (with-open-file (out (ensure-directories-exist output) :direction :output :if-exists :supersede) 212 | (let ((master-text (text master)) 213 | (edited-text (text edited))) 214 | (let ((words (find-original-text master-text edited-text)) 215 | (chunks ())) 216 | (flet ((collector (start end) 217 | (let ((text (subseq master-text start end))) 218 | (push (list (length text) text start end) chunks)))) 219 | (map-in-and-out words (word-starts master-text) #'collector (constantly nil))) 220 | (loop for (length text start end) in (sort chunks #'> :key #'first) do 221 | (format out "~2&** ~:d characters (~:d-~:d)~2%" length start end) 222 | (write-string text out)))))) 223 | 224 | 225 | (defun show-additions (&key master edited output) 226 | (with-open-file (out output :direction :output :if-exists :supersede) 227 | (let ((text (text edited))) 228 | (flet ((make-emitter (label) 229 | (lambda (start end) 230 | (let ((text (subseq text start end))) 231 | (when (zerop start) 232 | (setf text (regex-replace *old-mode-regexp* text *new-mode-line*))) 233 | (format out "{{~a}}~a" label text))))) 234 | (let ((words (nth-value 1 (find-original-text (text master) text)))) 235 | (map-in-and-out words (word-starts text) (make-emitter "book") (make-emitter "keep"))))))) 236 | 237 | 238 | (defun match-length (original edited o-start e-start longest-match) 239 | (flet ((same (i) 240 | (let ((e (+ e-start i)) 241 | (o (+ o-start i))) 242 | (and (< e (length edited)) 243 | (< o (length original)) 244 | (eql (aref edited e) (aref original o)))))) 245 | ;; Check the zeroth element in case the original got nulled out 246 | ;; due to a previous match. Then check that this match could at 247 | ;; least in theory be as long as the previous longest match, i.e. 248 | ;; the last element matches, before we go to the bother of 249 | ;; checking all the intervening elements. 
250 | (if (and (same 0) (or (zerop longest-match) (same (1- longest-match)))) 251 | (loop for i from 1 when (not (same i)) return i) 252 | 0))) 253 | 254 | ;;; Scrap 255 | 256 | (defun words-with-text-positions (text) 257 | (let (words positions) 258 | (do-matches (start end *word-regexp* text 259 | (values (coerce (nreverse words) 'vector) 260 | (nreverse (cons (length text) positions)))) 261 | (push (intern (string-upcase (subseq text start end)) :keyword) words) 262 | (push start positions)))) 263 | 264 | (defun reconstitute-text (text) 265 | (multiple-value-bind (words positions) (words-with-text-positions text) 266 | (declare (ignore words)) 267 | (string= text 268 | (with-output-to-string (s) 269 | (loop for (start end) on (cons 0 positions ) 270 | while end 271 | do (format s "~a" (subseq text start end))))))) 272 | 273 | -------------------------------------------------------------------------------- /test-edited.txt: -------------------------------------------------------------------------------- 1 | -*- mode: markup; -*- 2 | 3 | * Prose diff test document 4 | 5 | Moving up is what this paragraph will do. 6 | 7 | Upward movement with some editing is what happens to this text. 8 | 9 | A test document. This is a paragraph that should appear in both versions. 10 | 11 | Will be edited a little bit in the edited version. 12 | 13 | This is a long paragraph that will be split into to two parts in the 14 | edited version. 15 | 16 | As it stands now it just goes on and on and on. 17 | 18 | This is a short paragraph that will be joined with the next paragraph. 19 | This paragraph will be joined with the previous one. 20 | 21 | The text of this paragraph is completely new. It doesn't appear in any 22 | form in the original document. 23 | 24 | I like dogs a lot. Puppies are neat too. I also like cats. Kittens are 25 | fuzzy furballs. 26 | 27 | This is \b{quux} formatted text. 28 | 29 | This paragraph will be moved. 30 | 31 | This paragraph will be moved and changed. 
32 | 33 | ** Tokenization tests 34 | 35 | Depending on how we split text into tokens when computing the LCS we 36 | can get slightly better or worse diffs. The remaining paragraphs in 37 | this section show some of the issues. 38 | 39 | Quux biff boom and stuff. Ideally there's one addition and one deletion. 40 | 41 | One quux biff boom and stuff. Ideally there's one addition and one deletion. 42 | 43 | Two quux biff and stuff. Ideally there's one addition and one deletion. 44 | 45 | Three quux biff boom blat and stuff. Ideally there's one addition and one deletion. 46 | 47 | Four I hope there's only one deletion and addition following: quux biff boom 48 | 49 | Five I hope there's only one deletion and addition following: quux biff boom and stuff. 50 | 51 | A sentence that ends with foo and bar. Ideally foo will not be struck out in the diff. 52 | 53 | A sentence that ends with "foo". Ideally the only diff will be the addition of quotes. 54 | 55 | . 56 | -------------------------------------------------------------------------------- /test-original.txt: -------------------------------------------------------------------------------- 1 | -*- mode: markup; -*- 2 | 3 | * Prose diff test document 4 | 5 | A test document. This is a paragraph that should appear in both versions. 6 | 7 | Will be slightly edited in the edited version. 8 | 9 | This is a long paragraph that will be split into to two parts in the 10 | edited version. As it stands now it just goes on and on and on. 11 | 12 | This is a short paragraph that will be joined with the next paragraph. 13 | 14 | This paragraph will be joined with the previous one. 15 | 16 | This paragraph will be moved. 17 | 18 | This paragraph will be moved and edited. 19 | 20 | Deletion is the fate of this block of text. 21 | 22 | I like dogs a lot. I also like cats. 23 | 24 | Puppies are neat too. Kittens are fuzzy furballs. 25 | 26 | This is \i{quux} formatted text. 27 | 28 | Moving up is what this paragraph will do. 
29 | 30 | Upward movement with editing is what happens to this text. 31 | 32 | ** Tokenization tests 33 | 34 | Depending on how we split text into tokens when computing the LCS we 35 | can get slightly better or worse diffs. The remaining paragraphs in 36 | this section show some of the issues. 37 | 38 | Foo bar baz and stuff. Ideally there's one addition and one deletion. 39 | 40 | One foo bar baz and stuff. Ideally there's one addition and one deletion. 41 | 42 | Two foo bar baz and stuff. Ideally there's one addition and one deletion. 43 | 44 | Three foo bar baz and stuff. Ideally there's one addition and one deletion. 45 | 46 | Four I hope there's only one deletion and addition following: foo bar baz 47 | 48 | Five I hope there's only one deletion and addition following: foo bar baz and stuff. 49 | 50 | A sentence that ends with foo. Ideally foo will not be struck out in the diff. 51 | 52 | A sentence that ends with foo. Ideally the only diff will be the addition of quotes. 53 | 54 | -------------------------------------------------------------------------------- /tests.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | 4 | (defun remove-tag-test () 5 | (and 6 | (equalp 7 | (remove-tag :add '(:i (:b (:add "foo" (:x "abc"))))) 8 | '(:i (:b "foo" (:x "abc")))) 9 | 10 | (equalp 11 | (remove-tag :b '(:i (:b (:add "foo" (:x "abc"))))) 12 | '(:i (:add "foo" (:x "abc")))) 13 | 14 | (equalp 15 | (remove-tag :i '(:i (:b (:add "foo" (:x "abc"))))) 16 | '(:b (:add "foo" (:x "abc")))))) 17 | 18 | (defun diff-textified-markup (a b) 19 | (diff-textified (textify-markup a) (textify-markup b))) 20 | 21 | (defun foo-test () 22 | (list 23 | (rewrite-adds-and-deletes 24 | (detextify-markup 25 | (diff-textified-markup 26 | '(:p "foo " (:i "quux") " baz") 27 | '(:p "foo " (:b "quux") " baz")))) 28 | 29 | (rewrite-adds-and-deletes 30 | (detextify-markup 31 | (diff-textified-markup 32 | '(:p "foo 
" (:i "quux bar biff") " baz") 33 | '(:p "foo " (:i "quux baz boom") " baz")))))) -------------------------------------------------------------------------------- /text.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | (defvar *interned-text* (make-hash-table :test #'equal)) 4 | 5 | (defparameter *token-scanner* (cl-ppcre:create-scanner "(\\w+|\\s+|[\\W\\S])")) 6 | 7 | (defclass propertied-text () 8 | ((text :initarg :text :accessor text) 9 | (properties :initarg :properties :accessor properties))) 10 | 11 | (defun clear-interned-text () 12 | (setf *interned-text* (make-hash-table :test #'equal))) 13 | 14 | (defun intern-text (text &optional properties) 15 | (let ((key (cons text properties))) 16 | (let ((existing (gethash key *interned-text*))) 17 | (or 18 | existing 19 | (setf 20 | (gethash key *interned-text*) 21 | (make-instance 'propertied-text 22 | :text text 23 | :properties properties)))))) 24 | 25 | (defun add-property (text new-prop) 26 | (intern-text (text text) (cons new-prop (properties text)))) 27 | 28 | (defun remove-properties (text props) 29 | (intern-text (text text) (set-difference (properties text) props))) 30 | 31 | (defun has-property-p (prop text) 32 | (member prop (properties text))) 33 | 34 | (defmethod print-object ((object propertied-text) stream) 35 | (when *print-readably* 36 | (error 'print-not-readable object)) 37 | (format stream "\"~a\"~@[[~{~(~a~)~^:~}]~]" (text object) (properties object))) 38 | 39 | (defun textify-markup (markup-list) 40 | "Convert a list of Markup sexps into a vector of interned 41 | propertied-text objects." 
42 | (let ((v (make-array 10 :adjustable t :fill-pointer 0))) 43 | (loop for markup in markup-list do (vector-push-extend* (%textify-markup markup ()) v)) 44 | v)) 45 | 46 | (defun detextify-markup (v) 47 | "Convert a vector of propertied-text objects (such as produced by 48 | textify-markup) into a list of Markup sexps." 49 | (values (%detextify-markup v 0 (length v) ()))) 50 | 51 | (defgeneric %textify-markup (markup properties)) 52 | 53 | (defmethod %textify-markup ((markup cons) properties) 54 | (destructuring-bind (tag &rest body) markup 55 | (let ((props (cons tag properties)) 56 | (results ())) 57 | (loop for element in body do 58 | (loop for x in (%textify-markup element props) do (push x results))) 59 | ;;; This next line is a bit of a kludge to keep adjacent 60 | ;;; elements with the same tag, e.g. adjacent :P's, from 61 | ;;; merging. 62 | (push (intern-text "" properties) results) 63 | (nreverse results)))) 64 | 65 | (defmethod %textify-markup ((markup string) properties) 66 | (mapcar (lambda (tok) (intern-text tok properties)) (tokenize-text markup))) 67 | 68 | (defun tokenize-text (text) 69 | "Split a text string into a list of tokens containing either all the 70 | text split into words, whitespace, and punctuation or just the words." 
71 | (cl-ppcre:all-matches-as-strings *token-scanner* text)) 72 | 73 | (defun %detextify-markup (v start end open-props) 74 | (assert (not (null start))) 75 | (let ((result ()) 76 | (i start)) 77 | (with-output-to-string (s) 78 | (flet ((save-text () 79 | (let ((str (get-output-stream-string s))) 80 | (when (plusp (length str)) (push str result))))) 81 | (loop while (< i end) do 82 | (let* ((text (aref v i)) 83 | (text-props (properties text))) 84 | (cond 85 | ((equal text-props open-props) 86 | (write-string (text text) s) 87 | (incf i)) 88 | 89 | ((new-open-p text-props open-props) 90 | (save-text) 91 | (let* ((unopen (unopen text-props open-props)) 92 | (tag (first (last unopen))) 93 | (now-open (cons tag open-props))) 94 | (multiple-value-bind (thing idx) (%detextify-markup v i end now-open) 95 | (push (cons tag thing) result) 96 | (setf i idx)))) 97 | (t (decf i) 98 | (save-text) 99 | (loop-finish))))) 100 | (save-text))) 101 | (values (nreverse result) (1+ i)))) 102 | 103 | (defun new-open-p (text-props open-props) 104 | (equal (last text-props (length open-props)) open-props)) 105 | 106 | (defun unopen (text-props open-props) 107 | (multiple-value-bind (unopen open) 108 | (take text-props (- (length text-props) (length open-props))) 109 | (assert (equal open-props open) () 110 | "Remaining props ~s not equal to open props ~s" open open-props) 111 | unopen)) 112 | 113 | (defun diff-textified (a b) 114 | "Diff two vectors of propertied-text, returning a single vector 115 | containing the diff as propertied-text." 116 | (flet ((translate-textified (x) 117 | (destructuring-bind (label . thing) x 118 | (ecase label 119 | (:lcs thing) 120 | ((:add :delete) (add-property thing label))))) 121 | 122 | (collapse-spaces-in-lcs (lcs) 123 | ;; Since spaces are quite common in text, the LCS of any 124 | ;; two bits of text will include a lot of them. 
However, 125 | ;; when there are no words between them in the LCS it is 126 | ;; better to collapse them so that instead of diffing: 'a 127 | ;; b' and 'd e' as ((:delete a) (:add d) " " (:delete b) 128 | ;; (:add e))' we get ((:delete "a b") (:add "d e")) We 129 | ;; also get rid of any leading spaces for a similar 130 | ;; reason. 131 | (let ((just-saw-space t)) 132 | (flet ((collapsable? (x) 133 | (if (string= (text x) " ") 134 | (prog1 just-saw-space (setf just-saw-space t)) 135 | (setf just-saw-space nil)))) 136 | (remove-if #'collapsable? lcs))))) 137 | 138 | (let ((diff (diff-vectors a b #'collapse-spaces-in-lcs))) 139 | (map-into diff #'translate-textified diff)))) -------------------------------------------------------------------------------- /tokenize.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | (defparameter *short-length* 5) 4 | 5 | (defclass chunk () 6 | ((markup :initarg :markup :accessor markup) 7 | (textified :reader textified) 8 | (most-similar :initform nil :accessor most-similar) 9 | (pair :initform nil :accessor pair))) 10 | 11 | (defclass pair () 12 | ((original :initarg :original :reader original) 13 | (edited :initarg :edited :reader edited))) 14 | 15 | (defmethod initialize-instance :after ((chunk chunk) &key &allow-other-keys) 16 | (with-slots (markup textified) chunk 17 | (setf textified (textify-markup markup)))) 18 | 19 | (defun make-chunk (markup) 20 | (make-instance 'chunk :markup markup)) 21 | 22 | (defun make-empty-chunk () 23 | (make-chunk ())) 24 | 25 | (defun empty-chunk-p (chunk) 26 | (null (markup chunk))) 27 | 28 | (defun combine-chunks (&rest chunks) 29 | (make-instance 'chunk :markup (apply #'append (mapcar #'markup chunks)))) 30 | 31 | (defun paragraphs (parsed) 32 | "Split file into chunks, one per paragraph." 
33 | (loop for p in (rest parsed) collect (make-chunk (list p)))) 34 | 35 | (defmethod print-object ((o chunk) stream) 36 | (print-unreadable-object (o stream :type t) 37 | (format stream "~s identical: ~a; symmetric: ~a" (markup o) (identical-p o) (symmetrical-p o)))) 38 | 39 | (defmethod print-object ((o pair) stream) 40 | (print-unreadable-object (o stream :type t) 41 | (format stream "~&original: ~s~&edited: ~s" (original o) (edited o)))) 42 | 43 | (defun identical-p (p) 44 | (and (most-similar p) (equal (markup p) (markup (most-similar p))))) 45 | 46 | (defun symmetrical-p (p) 47 | (and (most-similar p) (eql (most-similar (most-similar p)) p))) 48 | 49 | (defun establish-pairs (original edited &key unpair-short) 50 | "Establish the pairing between two lists of chunks." 51 | (pair-identical original edited) 52 | ;; XXX -- should perhaps remove elements from original and edited 53 | ;; that have most-similar set by pair-identical. 54 | (pair-symmetrical original edited) 55 | (unpair-asymmetrical original t) 56 | (unpair-asymmetrical edited nil) 57 | (when unpair-short 58 | (unpair-short original t) 59 | (unpair-short edited nil))) 60 | 61 | (defun pair-identical (original edited) 62 | (let ((identical (make-hash-table :test #'equal))) 63 | (loop for o in original do (push o (gethash (markup o) identical nil))) 64 | ;; Reverse the lists so we can pop them off in the right order below. 
65 | (loop for k being the hash-keys of identical do 66 | (setf (gethash k identical) (nreverse (gethash k identical)))) 67 | (loop for e in edited 68 | for o = (pop (gethash (markup e) identical nil)) 69 | do 70 | (when o 71 | (setf (most-similar e) o) 72 | (setf (most-similar o) e) 73 | (pair-chunks o e))))) 74 | 75 | (defun pair-symmetrical (original edited) 76 | (set-most-similar original edited) 77 | (loop for chunk in original 78 | when (symmetrical-p chunk) do 79 | (pair-chunks chunk (most-similar chunk)))) 80 | 81 | (defun set-most-similar (original edited) 82 | "Make each chunk in original and edited point to the most similar 83 | chunk on the other side unless they have already been paired with an 84 | identical chunk." 85 | (loop for c in edited unless (most-similar c) do (find-most-similar c original)) 86 | (loop for c in original unless (most-similar c) do (find-most-similar c edited))) 87 | 88 | (defun find-most-similar (chunk others) 89 | (let ((textified (textified chunk))) 90 | (flet ((score (other) (similarity textified (textified other)))) 91 | (setf (most-similar chunk) (maximum others :key #'score))))) 92 | 93 | (defun pair-chunks (original edited) 94 | (setf (pair original) 95 | (setf (pair edited) 96 | (make-instance 'pair :original original :edited edited)))) 97 | 98 | (defun unpair-asymmetrical (chunks original-p) 99 | (loop for chunk in chunks when (not (symmetrical-p chunk)) do (unpair chunk original-p))) 100 | 101 | (defun unpair-short (chunks original-p) 102 | "Unpair very short chunks." 
103 | (loop for chunk in chunks when (< (length (textified chunk)) *short-length*) 104 | do (unpair chunk original-p))) 105 | 106 | (defun unpair (chunk original-p) 107 | (let ((empty (make-empty-chunk))) 108 | (setf (most-similar chunk) empty) 109 | (if original-p 110 | (pair-chunks chunk empty) 111 | (pair-chunks empty chunk)))) 112 | 113 | (defun diff-pair (pair) 114 | (diff-textified (textified (original pair)) (textified (edited pair)))) 115 | 116 | (defun diff-to-markup (original edited) 117 | (clean-adds-and-deletes (mark-moves (diff-to-markup/no-moves original edited)))) 118 | 119 | (defun diff-to-markup/no-moves (original-parsed edited-parsed) 120 | (let ((original (paragraphs original-parsed)) 121 | (edited (paragraphs edited-parsed))) 122 | (establish-pairs original edited) 123 | (loop for (label . pair) across (diff-vectors (as-pairs original) (as-pairs edited)) 124 | for diff = (clean-empties (diff-pair pair)) 125 | nconc 126 | (cond 127 | ((and (eql label :add) (not (empty-chunk-p (original pair)))) 128 | `((:add ,@diff))) 129 | ((and (eql label :delete) (not (empty-chunk-p (edited pair)))) 130 | `((:delete ,@diff))) 131 | (t diff))))) 132 | 133 | (defun mark-moves (markup) 134 | "Take the output of diff-to-markup/no-moves and find the adds and 135 | deletes which are actually moves." 136 | (let ((chunks (make-hash-table))) 137 | (flet ((chunkify (thing) 138 | (setf (gethash thing chunks) (make-chunk (rest thing))))) 139 | (let ((adds (mapcar #'chunkify (extract markup :add))) 140 | (deletes (mapcar #'chunkify (extract markup :delete)))) 141 | (establish-pairs deletes adds :unpair-short t) 142 | 143 | ;; Walk the tree. For each :delete find the chunk. If chunk is 144 | ;; unpaired then leave the :delete as is. Otherwise replace 145 | ;; :delete with :moved-away containing the diff of the :delete 146 | ;; and the :add (original and edited chunks of the pair.) 
For 147 | ;; each :add similarly, if the chunk is unpaired, leave as is, 148 | ;; otherwise replace with :moved-to containing diff of the 149 | ;; :delete and :add. Also leave things as they are if the new 150 | ;; diff is actually more complex than the originals. 151 | (flet ((rewrite (x) 152 | (let ((label (car x))) 153 | (case label 154 | ((:add :delete) 155 | (let* ((chunk (gethash x chunks)) 156 | (pair (pair chunk)) 157 | (new-diff (diff-pair pair)) 158 | (other (if (eql label :add) (original pair) (edited pair))) 159 | (new-diff-length (length new-diff)) 160 | (old-diff-length (length (textified chunk))) 161 | (other-old-diff-length (length (textified other))) 162 | (move-label (if (eql label :add) :moved-to :moved-from)) 163 | (more-complex (and (> new-diff-length old-diff-length) (> new-diff-length other-old-diff-length)))) 164 | (cond 165 | ((and (symmetrical-p chunk) (not more-complex)) 166 | `(,move-label ,@(detextify-markup new-diff))) 167 | (t `(,label ,@(rest x)))))) 168 | (t x))))) 169 | (map-tree #'rewrite markup)))))) 170 | 171 | (defun show-pairing (label add) 172 | (let* ((ta (textified add)) 173 | (tb (textified (most-similar add))) 174 | (diff (diff-textified tb ta)) 175 | #+(or)(undiffed (undiffed (detextify-markup diff)))) 176 | (when (finer-grained-p diff ta) 177 | (format t "~2&~a: ~s => ~s~&diff: ~s~&identical: ~a; symmetric: ~a; similarity: ~f; one-way: ~f)" 178 | label 179 | (markup add) 180 | (markup (most-similar add)) 181 | (detextify-markup diff) 182 | (identical-p add) 183 | (symmetrical-p add) 184 | (similarity ta tb) 185 | (one-way-similarity ta tb))))) 186 | 187 | (defun finer-grained-p (new-diff orig-diff) 188 | "Is the new-diff finer-grained than the original?" 
189 | (< (length (remove-if-not #'part-of-diff-p new-diff)) (length orig-diff))) 190 | 191 | (defun part-of-diff-p (text) 192 | (or (has-property-p :add text) (has-property-p :delete text))) 193 | 194 | (defun pair-adds-and-deletes (original-file edited-file) 195 | (let ((markup (diff-to-markup/no-moves original-file edited-file))) 196 | (flet ((chunkify (things) 197 | (mapcar (lambda (x) (make-chunk (rest x))) things))) 198 | (let ((adds (chunkify (extract markup :add))) 199 | (deletes (chunkify (extract markup :delete)))) 200 | (pair-identical adds deletes) 201 | (pair-symmetrical (remove-if #'identical-p adds) (remove-if #'identical-p deletes)) 202 | (loop for x in deletes do (show-pairing "DELETE" x)) 203 | (loop for x in adds do (show-pairing "ADD" x)) 204 | )))) 205 | 206 | (defun dediff (textified) 207 | (map 'vector (lambda (x) (remove-properties x '(:add :delete))) textified)) 208 | 209 | (defun refinement (delete add) 210 | (let* ((refined-diff (diff-textified (dediff (textified delete)) (dediff (textified add)))) 211 | (diff-length (length (remove-if-not #'part-of-diff-p refined-diff))) 212 | (original-add-length (length (textified add)))) 213 | (* original-add-length (/ (- original-add-length diff-length) original-add-length)))) 214 | 215 | (defun find-most-refining (add deletes) 216 | (flet ((score (delete) (refinement delete add))) 217 | (maximum deletes :key #'score))) 218 | 219 | (defun extract (markup what) 220 | "Find all the what elements in markup." 
221 | (cond 222 | ((consp markup) 223 | (if (eql (car markup) what) 224 | (list markup) 225 | (mapcan (lambda (x) (extract x what)) markup))) 226 | (t nil))) 227 | 228 | (defun undiffed (markup) 229 | (cond 230 | ((consp markup) 231 | (if (or (eql (car markup) :add) 232 | (eql (car markup) :delete)) 233 | nil 234 | (mapcan #'undiffed markup))) 235 | (t (list markup)))) 236 | 237 | (defun as-pairs (chunks) (map 'vector #'pair chunks)) 238 | 239 | (defun cleaned-diff-output (diff) 240 | (clean-adds-and-deletes (clean-empties diff))) 241 | 242 | (defun clean-adds-and-deletes (diff) 243 | (mapcar #'rewrite-adds-and-deletes diff)) 244 | 245 | (defun clean-empties (diff) 246 | (remove nil (mapcar #'remove-empties (detextify-markup diff)))) 247 | 248 | (defun remove-empties (tree) 249 | "Remove empty sub-trees from tree." 250 | (labels ((helper (x) 251 | (cond 252 | ((and (consp x) (null (rest x))) nil) 253 | ((consp x) (list (mapcan #'helper x))) 254 | (t (list x))))) 255 | (first (helper tree)))) 256 | 257 | (defun rewrite-adds-and-deletes (tree) 258 | "Rewrite the Markup tree so that :add and :delete tags are moved out 259 | as far as possible. E.g. given (:p (:add something)) we transform it 260 | to: (:add (:p something))." 261 | (cond 262 | ((consp tree) 263 | (let ((add-or-delete (has-nested-add-or-delete-p tree))) 264 | (mapcar 265 | #'rewrite-adds-and-deletes 266 | (if add-or-delete (promote-tag add-or-delete tree) tree)))) 267 | (t tree))) 268 | 269 | (defun has-nested-add-or-delete-p (tree) 270 | "Tree has a nested :add or :delete tag and it's not the only tag. 271 | Returns which one it is since there should only be one or the other." 
272 | (let ((tags (nested-tags tree))) 273 | (and (not (null (cdr tags))) 274 | (or (find :add tags) (find :delete tags))))) 275 | 276 | (defun promote-tag (tag tree) 277 | (list tag (remove-tag tag tree))) 278 | 279 | (defun remove-tag (tag tree) 280 | (cond 281 | ((consp tree) 282 | (cond 283 | ((consp (second tree)) 284 | (destructuring-bind (t1 (t2 . rest)) tree 285 | (cond 286 | ((eql t1 tag) `(,t2 ,@rest)) 287 | ((eql t2 tag) `(,t1 ,@rest)) 288 | (t `(,t1 ,(remove-tag tag `(,t2 ,@rest))))))) 289 | (t (destructuring-bind (t1 . rest) tree 290 | (cond 291 | ((eql t1 tag) rest) 292 | (t `(,t1 ,@rest))))))) 293 | (t tree))) 294 | 295 | (defun nested-tags (tree) 296 | "Given a tree like '(:a (:b (:c ...))) the nested tags are (:a :b :c)" 297 | (if (consp tree) 298 | (if (null (cddr tree)) 299 | (cons (car tree) (nested-tags (second tree))) 300 | (list (car tree))))) -------------------------------------------------------------------------------- /utilities.lisp: -------------------------------------------------------------------------------- 1 | (in-package :com.gigamonkeys.prose-diff) 2 | 3 | ;;; Bits of utility code that perhaps should be moved into 4 | ;;; com.gigamonkeys.utilities or replaced with calls to equivalent 5 | ;;; bits of some standard utility library. 6 | 7 | (defun maximum (list &key (key #'identity)) 8 | (when list 9 | (destructuring-bind (first . rest) list 10 | (loop with best-score = (funcall key first) 11 | with best = first 12 | for x in rest 13 | for score = (funcall key x) do 14 | (when (> score best-score) 15 | (setf best-score score) 16 | (setf best x)) 17 | finally (return (values best best-score)))))) 18 | 19 | (defun take (list n) 20 | "Return a list of the first n values of list and the left-over 21 | tail as a secondary value." 
22 | (let ((tail (nthcdr n list))) 23 | (values (ldiff list tail) tail))) 24 | 25 | (defun vector-push-extend* (list v) 26 | "Push all the elements of `list' onto v as if by vector-push-extend" 27 | (loop for item in list do (vector-push-extend item v))) 28 | 29 | (defun split-list (list tail) 30 | (let ((rest (nthcdr (or (search tail list) (error "~a not found in ~a" tail list)) list))) 31 | (values (ldiff list rest) rest))) 32 | 33 | (defun longer (list-a list-b) 34 | (cond 35 | ((endp list-b) 36 | (not (endp list-a))) 37 | ((endp list-a) 38 | nil) 39 | (t (longer (cdr list-a) (cdr list-b))))) 40 | 41 | (defun concatenate-vectors (vectors) 42 | (reduce (lambda (a b) (concatenate (class-of a) a b)) vectors)) 43 | 44 | (defun map-tree (fn tree) 45 | "Map fn down tree, replacing each element of the tree with the 46 | return value of fn. When the return value is identical to the 47 | original sub-tree it is recursively mapped." 48 | (typecase tree 49 | (cons (let ((new (funcall fn tree))) 50 | (if (eql new tree) 51 | (mapcar (lambda (x) (map-tree fn x)) tree) 52 | new))) 53 | (t tree))) --------------------------------------------------------------------------------