├── dune ├── dune-project ├── .ocamlformat ├── .gitignore ├── test ├── test_merge_diff.mli ├── test_limit_context.mli ├── import.ml ├── test_plain_diff_cutoff.mli ├── dune ├── patience_diff_lib_test.ml ├── test_plain_diff_cutoff.ml ├── test_limit_context.ml └── test_merge_diff.ml ├── src ├── patience_diff.mli ├── patience_diff_lib.ml ├── dune ├── matching_block.mli ├── matching_block.ml ├── move_kind.ml ├── move_id.ml ├── move_id.mli ├── hunks.mli ├── move_kind.mli ├── plain_diff.mli ├── hunks.ml ├── hunk.mli ├── range.mli ├── range.ml ├── hunk.ml ├── patience_diff_intf.ml ├── plain_diff.ml └── patience_diff.ml ├── Makefile ├── patience_diff.opam ├── CHANGES.md ├── LICENSE.md └── CONTRIBUTING.md /dune: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 3.17) 2 | -------------------------------------------------------------------------------- /.ocamlformat: -------------------------------------------------------------------------------- 1 | profile=janestreet 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.install 3 | *.merlin 4 | _opam 5 | 6 | -------------------------------------------------------------------------------- /test/test_merge_diff.mli: -------------------------------------------------------------------------------- 1 | (* This interface is deliberately empty. *) 2 | -------------------------------------------------------------------------------- /test/test_limit_context.mli: -------------------------------------------------------------------------------- 1 | (* This interface is deliberately empty. 
*) 2 | -------------------------------------------------------------------------------- /src/patience_diff.mli: -------------------------------------------------------------------------------- 1 | include Patience_diff_intf.Patience_diff (** @inline *) 2 | -------------------------------------------------------------------------------- /test/import.ml: -------------------------------------------------------------------------------- 1 | include Expect_test_helpers_core 2 | include Patience_diff_lib 3 | -------------------------------------------------------------------------------- /test/test_plain_diff_cutoff.mli: -------------------------------------------------------------------------------- 1 | (* This interface is deliberately empty. *) 2 | -------------------------------------------------------------------------------- /src/patience_diff_lib.ml: -------------------------------------------------------------------------------- 1 | module Patience_diff = Patience_diff 2 | module Plain_diff = Plain_diff 3 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name patience_diff_lib) 3 | (public_name patience_diff) 4 | (libraries base core) 5 | (preprocess 6 | (pps ppx_jane))) 7 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name patience_diff_lib_test) 3 | (libraries base core expect_test_helpers_core patience_diff_lib) 4 | (preprocess 5 | (pps ppx_jane))) 6 | -------------------------------------------------------------------------------- /test/patience_diff_lib_test.ml: -------------------------------------------------------------------------------- 1 | module Import = Import 2 | module Test_limit_context = Test_limit_context 3 | module Test_merge_diff = Test_merge_diff 4 | module 
Test_plain_diff_cutoff = Test_plain_diff_cutoff 5 | -------------------------------------------------------------------------------- /src/matching_block.mli: -------------------------------------------------------------------------------- 1 | open! Core 2 | 3 | type t = 4 | { prev_start : int 5 | ; next_start : int 6 | ; length : int 7 | } 8 | 9 | module Stable : sig 10 | module V1 : sig 11 | type nonrec t = t [@@deriving sexp, bin_io] 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /src/matching_block.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V1 = struct 5 | type t = 6 | { prev_start : int 7 | ; next_start : int 8 | ; length : int 9 | } 10 | [@@deriving sexp, bin_io] 11 | end 12 | end 13 | 14 | open! Core 15 | include Stable.V1 16 | -------------------------------------------------------------------------------- /src/move_kind.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V1 = struct 5 | type t = 6 | | Move of Move_id.Stable.V1.t 7 | | Within_move of Move_id.Stable.V1.t 8 | [@@deriving sexp, bin_io, compare ~localize] 9 | end 10 | end 11 | 12 | open! 
Core 13 | include Stable.V1 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | INSTALL_ARGS := $(if $(PREFIX),--prefix $(PREFIX),) 2 | 3 | default: 4 | dune build 5 | 6 | install: 7 | dune install $(INSTALL_ARGS) 8 | 9 | uninstall: 10 | dune uninstall $(INSTALL_ARGS) 11 | 12 | reinstall: uninstall install 13 | 14 | clean: 15 | dune clean 16 | 17 | .PHONY: default install uninstall reinstall clean 18 | -------------------------------------------------------------------------------- /src/move_id.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V1 = struct 5 | type t = int [@@deriving sexp, bin_io, compare ~localize] 6 | end 7 | end 8 | 9 | open! Core 10 | include Stable.V1 11 | include Comparable.Make_plain (Stable.V1) 12 | 13 | let zero = 0 14 | let succ = Int.succ 15 | let to_string = Int.to_string 16 | -------------------------------------------------------------------------------- /src/move_id.mli: -------------------------------------------------------------------------------- 1 | open! Core 2 | 3 | (** Each move identified in the code is given a unique move ID which can be used to 4 | distinguish it from other moves. *) 5 | type t [@@deriving sexp, compare ~localize] 6 | 7 | include Comparable.S_plain with type t := t 8 | 9 | val to_string : t -> string 10 | 11 | (** Return the 0th move index *) 12 | val zero : t 13 | 14 | (** Get the next move index *) 15 | val succ : t -> t 16 | 17 | module Stable : sig 18 | module V1 : sig 19 | type nonrec t = t [@@deriving sexp, bin_io, compare ~localize] 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /src/hunks.mli: -------------------------------------------------------------------------------- 1 | open! 
Core 2 | 3 | type 'a t = 'a Hunk.t list 4 | 5 | (** [unified t] converts all Replace ranges in [t] to an Prev range followed by a Next 6 | range. *) 7 | val unified : 'a t -> 'a t 8 | 9 | (** [ranges t] concatenates all the ranges of all hunks together **) 10 | val ranges : 'a t -> 'a Range.t list 11 | 12 | val concat_map_ranges : 'a t -> f:('a Range.t -> 'b Range.t list) -> 'b t 13 | 14 | module Stable : sig 15 | module V2 : sig 16 | type nonrec 'a t = 'a t [@@deriving sexp, bin_io] 17 | end 18 | 19 | module V1 : sig 20 | type nonrec 'a t [@@deriving sexp, bin_io] 21 | 22 | val to_v2 : 'a t -> 'a V2.t 23 | val of_v2_no_moves_exn : 'a V2.t -> 'a t 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /test/test_plain_diff_cutoff.ml: -------------------------------------------------------------------------------- 1 | open! Base 2 | open Import 3 | 4 | let%expect_test "exercise cutoff code path" = 5 | let arr1 = [| 0; 1; 2; 3; 4; 5; 6; 42; 43; 7; 8; 9 |] in 6 | let arr2 = [| 9; 8; 7; 42; 43; 6; 5; 4; 3; 2; 1; 0 |] in 7 | let do_diff ~cutoff = 8 | Plain_diff.iter_matches 9 | ?cutoff 10 | arr1 11 | arr2 12 | ~f:(fun (i, j) -> 13 | assert (arr1.(i) = arr2.(j)); 14 | print_s [%sexp (arr1.(i) : int), ((i, j) : int * int)]) 15 | ~hashable:(module Int) 16 | in 17 | do_diff ~cutoff:None; 18 | [%expect 19 | {| 20 | (42 (7 3)) 21 | (43 (8 4)) 22 | |}]; 23 | do_diff ~cutoff:(Some 3); 24 | (* worse diff, but correct *) 25 | [%expect {| (9 (11 0)) |}] 26 | ;; 27 | -------------------------------------------------------------------------------- /patience_diff.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | maintainer: "Jane Street developers" 3 | authors: ["Jane Street Group, LLC"] 4 | homepage: "https://github.com/janestreet/patience_diff" 5 | bug-reports: "https://github.com/janestreet/patience_diff/issues" 6 | dev-repo: 
"git+https://github.com/janestreet/patience_diff.git" 7 | doc: "https://ocaml.janestreet.com/ocaml-core/latest/doc/patience_diff/index.html" 8 | license: "MIT" 9 | build: [ 10 | ["dune" "build" "-p" name "-j" jobs] 11 | ] 12 | depends: [ 13 | "ocaml" {>= "5.1.0"} 14 | "base" 15 | "core" 16 | "ppx_jane" 17 | "dune" {>= "3.17.0"} 18 | ] 19 | available: arch != "arm32" & arch != "x86_32" 20 | synopsis: "Diff library using Bram Cohen's patience diff algorithm" 21 | description: " 22 | " 23 | -------------------------------------------------------------------------------- /src/move_kind.mli: -------------------------------------------------------------------------------- 1 | (** If a given range is part of a move it will have a [Move_kind.t]. If the move is simple 2 | with no ranges there will just be two ranges: One [Prev] and one [Next] that share the 3 | same [Move MOVE_INDEX] where the index is used to identify a given move as the same. 4 | 5 | If the move has modifications like additions and deletions then the [Next] part of the 6 | move will have replaces with [Within_move MOVE_INDEX] to denote they are just 7 | modifications to the moved code. *) 8 | type t = 9 | | Move of Move_id.t 10 | | Within_move of Move_id.t 11 | [@@deriving sexp, compare ~localize] 12 | 13 | module Stable : sig 14 | module V1 : sig 15 | type nonrec t = t [@@deriving sexp, bin_io, compare ~localize] 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /src/plain_diff.mli: -------------------------------------------------------------------------------- 1 | (** Basic Myers diff algorithm, translated from GNU diff. **) 2 | 3 | (** [iter_matches ?cutoff ~f ~hashable a b] diffs the arrays [a] and [b] (as in 4 | /usr/bin/diff), and calls [f] on each element of the longest common subsequence in 5 | increasing order. The arguments of [f] are the indices in [a] and [b], respectively, 6 | of that element. 
7 | 8 | The [cutoff] is an upper bound on the minimum edit distance between [a] and [b]. When 9 | [cutoff] is exceeded, [iter_matches] returns a correct, but not necessarily minimal 10 | diff. It defaults to about [sqrt (Array.length a + Array.length b)]. *) 11 | val iter_matches 12 | : ?cutoff:int 13 | -> f:(int * int -> unit) 14 | -> hashable:'a Base.Hashtbl.Key.t 15 | -> 'a array 16 | -> 'a array 17 | -> unit 18 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | ## Release v0.17.0 2 | 3 | - Can now find text that has been moved with the `find_moves` parameter 4 | - `Range` now can have a `Move_id` which identifies which move the current `Range` belongs to. 5 | Multiple ranges can belong to the same move. 6 | 7 | ## Release v0.16.0 8 | 9 | * Added `?max_slide` and `?score` arguments to `get_matching_blocks` and `get_hunks`. 10 | 11 | ## Old pre-v0.15 changelogs (very likely stale and incomplete) 12 | 13 | ## 113.24.00 14 | 15 | - Switch to PPX. 16 | 17 | ## 112.24.00 18 | 19 | Update references to `Core.Std.Dequeue` to refer to `Core.Std.Deque` 20 | 21 | ## 111.25.00 22 | 23 | - refactoring and more unit tests 24 | 25 | ## 111.21.00 26 | 27 | - Added plain differ `Plain_diff` and use it in some cases for 28 | improved results. 29 | - Move modules under `Patience_diff_lib.Std`. 30 | 31 | ## 111.17.00 32 | 33 | - Exposed `Patience_diff.matches`. 34 | 35 | ## 111.13.00 36 | 37 | - Moved `Patience_diff` out of `Core_extended` into its own library 38 | depending only on `Core_kernel`. 39 | 40 | -------------------------------------------------------------------------------- /src/hunks.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! 
Core.Core_stable 3 | module Hunk = Hunk.Stable 4 | 5 | module V2 = struct 6 | type 'a t = 'a Hunk.V2.t list [@@deriving sexp, bin_io] 7 | end 8 | 9 | module V1 = struct 10 | type 'a t = 'a Hunk.V1.t list [@@deriving sexp, bin_io] 11 | 12 | let to_v2 = Core.List.map ~f:Hunk.V1.to_v2 13 | let of_v2_no_moves_exn = Core.List.map ~f:Hunk.V1.of_v2_no_moves_exn 14 | end 15 | end 16 | 17 | open! Core 18 | include Stable.V2 19 | 20 | let concat_map_ranges hunks ~f = List.map hunks ~f:(Hunk.concat_map ~f) 21 | 22 | let unified hunks = 23 | let f : 'a Range.t -> 'a Range.t list = function 24 | | Replace (l_range, r_range, move_id) -> 25 | let move_kind = 26 | Option.map move_id ~f:(fun move_id -> Move_kind.Within_move move_id) 27 | in 28 | [ Prev (l_range, move_kind); Next (r_range, move_kind) ] 29 | | range -> [ range ] 30 | in 31 | concat_map_ranges hunks ~f 32 | ;; 33 | 34 | let ranges hunks = List.concat_map hunks ~f:(fun hunk -> hunk.Hunk.ranges) 35 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2009--2025 Jane Street Group, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/hunk.mli: -------------------------------------------------------------------------------- 1 | (** In diff terms, a hunk is a unit of consecutive ranges with some [Same] context before 2 | and after [Next], [Prev], and [Replace] ranges. Each hunk contains information about 3 | the original arrays, specifically the starting indexes and the number of elements in 4 | both arrays to which the hunk refers. 5 | 6 | Furthermore, a diff is essentially a list of hunks. The simplest case is a diff with 7 | infinite context, consisting of exactly one hunk. *) 8 | 9 | open! Core 10 | 11 | type 'a t = 12 | { prev_start : int 13 | ; prev_size : int 14 | ; next_start : int 15 | ; next_size : int 16 | ; ranges : 'a Range.t list 17 | } 18 | [@@deriving fields ~getters, sexp_of, compare ~localize] 19 | 20 | (** [all_same t] returns true if [t] contains only Same ranges. *) 21 | val all_same : 'a t -> bool 22 | 23 | (** [concat_map t ~f] applies [List.concat_map] on [t.ranges]. *) 24 | val concat_map : 'a t -> f:('a Range.t -> 'b Range.t list) -> 'b t 25 | 26 | (** Take a hunk that was generated with infinite context and limit it to the provided 27 | [context]. 
*) 28 | val limit_infinite_context_hunk_to_context : context:int -> 'a t -> 'a t list 29 | 30 | module Stable : sig 31 | module V2 : sig 32 | type nonrec 'a t = 'a t [@@deriving sexp, bin_io] 33 | end 34 | 35 | module V1 : sig 36 | type nonrec 'a t [@@deriving sexp, bin_io] 37 | 38 | val to_v2 : 'a t -> 'a V2.t 39 | val of_v2_no_moves_exn : 'a V2.t -> 'a t 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /test/test_limit_context.ml: -------------------------------------------------------------------------------- 1 | open! Core 2 | open Import 3 | 4 | module Test_case = struct 5 | type t = 6 | { prev : string array 7 | ; next : string array 8 | ; context : int 9 | } 10 | [@@deriving sexp_of] 11 | 12 | let gen = 13 | let open Quickcheck.Let_syntax in 14 | let%bind context = Int.gen_incl 1 10 in 15 | let%bind prev_length = Int.gen_incl 30 60 in 16 | let%bind next_length = Int.gen_incl 30 60 in 17 | let line_gen = 18 | Quickcheck.Generator.of_list [ "apple"; "banana"; "cherry"; "dog"; "egg"; "fish" ] 19 | in 20 | let%bind prev = Quickcheck.Generator.list_with_length prev_length line_gen in 21 | let%bind next = Quickcheck.Generator.list_with_length next_length line_gen in 22 | return { prev = List.to_array prev; next = List.to_array next; context } 23 | ;; 24 | end 25 | 26 | let%test_unit "Ensure that limiting infinite context hunk to context is correct" = 27 | Quickcheck.test ~sexp_of:Test_case.sexp_of_t Test_case.gen ~f:(fun test_case -> 28 | let hunks = 29 | Patience_diff.String.get_hunks 30 | ~transform:Fn.id 31 | ~context:test_case.context 32 | ~prev:test_case.prev 33 | ~next:test_case.next 34 | () 35 | in 36 | let infinite_context_hunk = 37 | Patience_diff.String.get_hunks 38 | ~transform:Fn.id 39 | ~context:(-1) 40 | ~prev:test_case.prev 41 | ~next:test_case.next 42 | () 43 | |> List.hd_exn 44 | in 45 | let limited_hunks = 46 | Patience_diff.Hunk.limit_infinite_context_hunk_to_context 47 | 
~context:test_case.context 48 | infinite_context_hunk 49 | in 50 | [%test_result: string Patience_diff.Hunk.t list] limited_hunks ~expect:hunks) 51 | ;; 52 | -------------------------------------------------------------------------------- /src/range.mli: -------------------------------------------------------------------------------- 1 | (** For handling diffs abstractly. A range is a subarray of the two original arrays with a 2 | constructor defining its relationship to the two original arrays. A [Same] range 3 | contains a series of elements which can be found in both arrays. A [Next] range 4 | contains elements found only in the second array, while an [Prev] range contains 5 | elements found only in the first array. 6 | 7 | If a range is part of a move it will have a non-None [Move_kind.t] or [Move_id.t] in 8 | the case of [Replace] and [Unified]. A [Prev] with a [Move _] [Move_kind] means that 9 | [Prev] has a corresponding [Next] that it was moved to. A [Prev] with a 10 | [Within_move _] [Move_kind] means that this was some code that was deleted within a 11 | block that moved to a [Next] position of the file. If a [Replace] or [Unified] range 12 | is associated with a move it can only be change within a move so they only hove a 13 | [Move_id.t option] instead of a [Move_kind.t option] like [Prev] or [Next]. 14 | 15 | A [Replace] contains two arrays: elements in the first output array are elements found 16 | only in the first input array, which have been replaced by elements in the second 17 | output array, which are elements found only in the second input array. 
*) 18 | 19 | type 'a t = 20 | | Same of ('a * 'a) array 21 | | Prev of 'a array * Move_kind.t option 22 | | Next of 'a array * Move_kind.t option 23 | | Replace of 'a array * 'a array * Move_id.t option 24 | | Unified of 'a array * Move_id.t option 25 | [@@deriving sexp, compare ~localize] 26 | 27 | (** [all_same ranges] returns true if all [ranges] are Same *) 28 | val all_same : 'a t list -> bool 29 | 30 | (** [prev_only ranges] drops all Next ranges and converts all Replace ranges to Prev 31 | ranges. *) 32 | val prev_only : 'a t list -> 'a t list 33 | 34 | (** [next_only ranges] drops all Prev ranges and converts all Replace ranges to Next 35 | ranges. *) 36 | val next_only : 'a t list -> 'a t list 37 | 38 | (** Counts number of elements. *) 39 | val prev_size : 'a t -> int 40 | 41 | val next_size : 'a t -> int 42 | 43 | module Stable : sig 44 | module V2 : sig 45 | type nonrec 'a t = 'a t [@@deriving sexp, bin_io, compare ~localize] 46 | end 47 | 48 | module V1 : sig 49 | type nonrec 'a t [@@deriving sexp, bin_io] 50 | 51 | val to_v2 : 'a t -> 'a V2.t 52 | val of_v2_no_moves_exn : 'a V2.t -> 'a t 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | This repository contains open source software that is developed and 2 | maintained by [Jane Street][js]. 3 | 4 | Contributions to this project are welcome and should be submitted via 5 | GitHub pull requests. 6 | 7 | Signing contributions 8 | --------------------- 9 | 10 | We require that you sign your contributions. Your signature certifies 11 | that you wrote the patch or otherwise have the right to pass it on as 12 | an open-source patch. 
The rules are pretty simple: if you can certify 13 | the below (from [developercertificate.org][dco]): 14 | 15 | ``` 16 | Developer Certificate of Origin 17 | Version 1.1 18 | 19 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 20 | 1 Letterman Drive 21 | Suite D4700 22 | San Francisco, CA, 94129 23 | 24 | Everyone is permitted to copy and distribute verbatim copies of this 25 | license document, but changing it is not allowed. 26 | 27 | 28 | Developer's Certificate of Origin 1.1 29 | 30 | By making a contribution to this project, I certify that: 31 | 32 | (a) The contribution was created in whole or in part by me and I 33 | have the right to submit it under the open source license 34 | indicated in the file; or 35 | 36 | (b) The contribution is based upon previous work that, to the best 37 | of my knowledge, is covered under an appropriate open source 38 | license and I have the right under that license to submit that 39 | work with modifications, whether created in whole or in part 40 | by me, under the same open source license (unless I am 41 | permitted to submit under a different license), as indicated 42 | in the file; or 43 | 44 | (c) The contribution was provided directly to me by some other 45 | person who certified (a), (b) or (c) and I have not modified 46 | it. 47 | 48 | (d) I understand and agree that this project and the contribution 49 | are public and that a record of the contribution (including all 50 | personal information I submit with it, including my sign-off) is 51 | maintained indefinitely and may be redistributed consistent with 52 | this project or the open source license(s) involved. 53 | ``` 54 | 55 | Then you just add a line to every git commit message: 56 | 57 | ``` 58 | Signed-off-by: Joe Smith 59 | ``` 60 | 61 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 62 | 63 | If you set your `user.name` and `user.email` git configs, you can sign 64 | your commit automatically with git commit -s. 
65 | 66 | [dco]: http://developercertificate.org/ 67 | [js]: https://opensource.janestreet.com/ 68 | -------------------------------------------------------------------------------- /src/range.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V2 = struct 5 | type 'a t = 6 | | Same of ('a * 'a) array 7 | | Prev of 'a array * Move_kind.Stable.V1.t option 8 | | Next of 'a array * Move_kind.Stable.V1.t option 9 | | Replace of 'a array * 'a array * Move_id.Stable.V1.t option 10 | | Unified of 'a array * Move_id.Stable.V1.t option 11 | [@@deriving sexp, bin_io, compare ~localize] 12 | end 13 | 14 | module V1 = struct 15 | type 'a t = 16 | | Same of ('a * 'a) array 17 | | Prev of 'a array 18 | | Next of 'a array 19 | | Replace of 'a array * 'a array 20 | | Unified of 'a array 21 | [@@deriving sexp, bin_io] 22 | 23 | let to_v2 : 'a t -> 'a V2.t = function 24 | | Same lines -> Same lines 25 | | Prev lines -> Prev (lines, None) 26 | | Next lines -> Next (lines, None) 27 | | Replace (lines_prev, lines_next) -> Replace (lines_prev, lines_next, None) 28 | | Unified lines -> Unified (lines, None) 29 | ;; 30 | 31 | let of_v2_no_moves_exn : 'a V2.t -> 'a t = function 32 | | Prev (_, Some _) | Next (_, Some _) | Replace (_, _, Some _) | Unified (_, Some _) 33 | -> Core.raise_s [%sexp "cannot convert to old patdiff version with a move"] 34 | | Same lines -> Same lines 35 | | Prev (lines, None) -> Prev lines 36 | | Next (lines, None) -> Next lines 37 | | Replace (lines_prev, lines_next, None) -> Replace (lines_prev, lines_next) 38 | | Unified (lines, None) -> Unified lines 39 | ;; 40 | end 41 | end 42 | 43 | open! 
Core 44 | include Stable.V2 45 | 46 | let all_same ranges = 47 | List.for_all ranges ~f:(fun range -> 48 | match range with 49 | | Same _ -> true 50 | | _ -> false) 51 | ;; 52 | 53 | let prev_and_next range = 54 | match range with 55 | | Same _ -> [ range ], [ range ] 56 | | Prev (_, (None | Some (Move _))) -> [ range ], [] 57 | | Prev (_, Some (Within_move _)) -> 58 | (* Don't include [Prev]s that are within moves because they are showing that some code 59 | was deleted from a different [Prev] to create a corresponding [Next]. *) 60 | [], [] 61 | | Next (_, _) -> [], [ range ] 62 | | Replace (l_range, r_range, None) -> [ Prev (l_range, None) ], [ Next (r_range, None) ] 63 | | Replace (_, r_range, Some move_id) -> 64 | (* Don't include [Replace]s for the same reason as a [Prev] within a move. *) 65 | [], [ Next (r_range, Some (Within_move move_id)) ] 66 | | Unified (_, Some _) -> [], [ range ] 67 | | Unified (_, None) -> [ range ], [ range ] 68 | ;; 69 | 70 | let prev_only ranges = List.concat_map ranges ~f:(fun range -> fst (prev_and_next range)) 71 | let next_only ranges = List.concat_map ranges ~f:(fun range -> snd (prev_and_next range)) 72 | 73 | let prev_size = function 74 | | Unified (lines, None) 75 | | Replace (lines, _, None) 76 | | Prev (lines, None) 77 | | Prev (lines, Some (Move _)) -> Array.length lines 78 | | Same lines -> Array.length lines 79 | (* Don't include [Prev]s that are within moves as they are conceptually [Next]s *) 80 | | Replace (_, _, Some _) | Prev (_, Some (Within_move _)) | Next _ | Unified (_, Some _) 81 | -> 0 82 | ;; 83 | 84 | let next_size = function 85 | | Unified (lines, _) | Replace (_, lines, _) | Next (lines, _) -> Array.length lines 86 | | Same lines -> Array.length lines 87 | | Prev _ -> 0 88 | ;; 89 | -------------------------------------------------------------------------------- /test/test_merge_diff.ml: -------------------------------------------------------------------------------- 1 | open! 
Base 2 | open Import 3 | 4 | let merge_and_print arrs = 5 | Patience_diff.String.merge arrs 6 | |> List.iter ~f:(function 7 | | Same arr -> 8 | print_endline "---- Same -----"; 9 | Array.iter arr ~f:(fun line -> print_endline (" " ^ line)) 10 | | Different arrs -> 11 | print_endline "-- Different --"; 12 | Array.iteri arrs ~f:(fun idx -> 13 | Array.iter ~f:(fun line -> print_endline ("(" ^ Int.to_string idx ^ ")" ^ line)))) 14 | ;; 15 | 16 | let%expect_test "Identical documents" = 17 | let doc = [| "a"; "b"; "c"; "d" |] in 18 | merge_and_print [| doc |]; 19 | [%expect 20 | {| 21 | ---- Same ----- 22 | a 23 | b 24 | c 25 | d 26 | |}]; 27 | merge_and_print [| doc; doc |]; 28 | [%expect 29 | {| 30 | ---- Same ----- 31 | a 32 | b 33 | c 34 | d 35 | |}]; 36 | merge_and_print [| doc; doc; doc |]; 37 | [%expect 38 | {| 39 | ---- Same ----- 40 | a 41 | b 42 | c 43 | d 44 | |}] 45 | ;; 46 | 47 | let%expect_test "Empty documents" = 48 | let doc = [| "a"; "b"; "c" |] in 49 | merge_and_print [| doc; [||] |]; 50 | [%expect 51 | {| 52 | -- Different -- 53 | (0)a 54 | (0)b 55 | (0)c 56 | |}]; 57 | merge_and_print [| [||]; doc |]; 58 | [%expect 59 | {| 60 | -- Different -- 61 | (1)a 62 | (1)b 63 | (1)c 64 | |}]; 65 | let doc = [| "a"; "b"; "c" |] in 66 | merge_and_print [| doc; [| "" |] |]; 67 | [%expect 68 | {| 69 | -- Different -- 70 | (0)a 71 | (0)b 72 | (0)c 73 | (1) 74 | |}]; 75 | merge_and_print [| [| "" |]; doc |]; 76 | [%expect 77 | {| 78 | -- Different -- 79 | (0) 80 | (1)a 81 | (1)b 82 | (1)c 83 | |}] 84 | ;; 85 | 86 | let%expect_test "Documents with trailing added lines" = 87 | let short = [| "a"; "b" |] in 88 | let long = [| "a"; "b"; "c"; "d" |] in 89 | merge_and_print [| short; long |]; 90 | [%expect 91 | {| 92 | ---- Same ----- 93 | a 94 | b 95 | -- Different -- 96 | (1)c 97 | (1)d 98 | |}]; 99 | merge_and_print [| long; short |]; 100 | [%expect 101 | {| 102 | ---- Same ----- 103 | a 104 | b 105 | -- Different -- 106 | (0)c 107 | (0)d 108 | |}] 109 | ;; 110 | 111 | 
let%expect_test "Documents with leading added lines" = 112 | let short = [| "c"; "d" |] in 113 | let long = [| "a"; "b"; "c"; "d" |] in 114 | merge_and_print [| short; long |]; 115 | [%expect 116 | {| 117 | -- Different -- 118 | (1)a 119 | (1)b 120 | ---- Same ----- 121 | c 122 | d 123 | |}]; 124 | merge_and_print [| long; short |]; 125 | [%expect 126 | {| 127 | -- Different -- 128 | (0)a 129 | (0)b 130 | ---- Same ----- 131 | c 132 | d 133 | |}] 134 | ;; 135 | 136 | let%expect_test "Mixed documents with changes" = 137 | let short = [| "a"; "b"; "foo"; "c"; "d" |] in 138 | let long = [| "a"; "b"; "c"; "bar"; "d" |] in 139 | merge_and_print [| short; long |]; 140 | [%expect 141 | {| 142 | ---- Same ----- 143 | a 144 | b 145 | -- Different -- 146 | (0)foo 147 | ---- Same ----- 148 | c 149 | -- Different -- 150 | (1)bar 151 | ---- Same ----- 152 | d 153 | |}] 154 | ;; 155 | -------------------------------------------------------------------------------- /src/hunk.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! 
Core.Core_stable 3 | module Range = Range.Stable 4 | 5 | module V2 = struct 6 | type 'a t = 7 | { prev_start : int 8 | ; prev_size : int 9 | ; next_start : int 10 | ; next_size : int 11 | ; ranges : 'a Range.V2.t list 12 | } 13 | [@@deriving fields ~getters, sexp, bin_io, compare ~localize] 14 | end 15 | 16 | module V1 = struct 17 | type 'a t = 18 | { prev_start : int 19 | ; prev_size : int 20 | ; next_start : int 21 | ; next_size : int 22 | ; ranges : 'a Range.V1.t list 23 | } 24 | [@@deriving fields ~getters, sexp, bin_io] 25 | 26 | let to_v2 t = 27 | { V2.prev_start = t.prev_start 28 | ; prev_size = t.prev_size 29 | ; next_start = t.next_start 30 | ; next_size = t.next_size 31 | ; ranges = Core.List.map t.ranges ~f:Range.V1.to_v2 32 | } 33 | ;; 34 | 35 | let of_v2_no_moves_exn (t : _ V2.t) = 36 | { prev_start = t.prev_start 37 | ; prev_size = t.prev_size 38 | ; next_start = t.next_start 39 | ; next_size = t.next_size 40 | ; ranges = Core.List.map t.ranges ~f:Range.V1.of_v2_no_moves_exn 41 | } 42 | ;; 43 | end 44 | end 45 | 46 | open! 
Core
include Stable.V2

(* Sanity check (currently unused, hence the leading underscore): the cached
   [prev_size] / [next_size] fields must agree with the sizes recomputed from
   [t.ranges]. *)
let _invariant t =
  Invariant.invariant t [%sexp_of: _ t] (fun () ->
    [%test_result: int]
      (List.sum (module Int) t.ranges ~f:Range.prev_size)
      ~expect:t.prev_size
      ~message:"prev_size";
    [%test_result: int]
      (List.sum (module Int) t.ranges ~f:Range.next_size)
      ~expect:t.next_size
      ~message:"next_size")
;;

(* A hunk is "all same" iff every one of its ranges is a [Same] range. *)
let all_same hunk = Range.all_same hunk.ranges

(* Replace the ranges with the concatenation of [f] applied to each range.
   NOTE(review): the cached size/start fields are kept as-is, so [f] is
   presumably expected to preserve total sizes — confirm at call sites. *)
let concat_map t ~f = { t with ranges = List.concat_map t.ranges ~f }

(* Split one hunk (built with infinite context, i.e. a single hunk covering the
   whole input) into a list of hunks whose [Same] ranges keep at most [context]
   lines on each side of a difference.  Interior [Same] ranges longer than
   [2 * context] are cut in two (a [context]-line tail for the previous hunk
   and a [context]-line head for the next); leading/trailing [Same] ranges are
   simply truncated. *)
let limit_infinite_context_hunk_to_context ~context hunk =
  (* [trim_range] decides whether a range must be shortened: only [Same]
     ranges longer than [context] are trimmed. *)
  let trim_range range =
    match range with
    | Range.Same arr ->
      let len = Array.length arr in
      if len <= context
      then `Did_not_trim range
      else (
        let start = Range.Same (Array.sub arr ~pos:0 ~len:context) in
        let end_ = Range.Same (Array.sub arr ~pos:(len - context) ~len:context) in
        (* May be <= 0 when context < len <= 2 * context; see the
           [lines_trimmed > 0] test below. *)
        let lines_trimmed = len - (2 * context) in
        `Trimmed (lines_trimmed, start, end_))
    | _ -> `Did_not_trim range
  in
  (* Ranges accumulated for the hunk currently being built, plus how far this
     hunk starts after the end of the previous one. *)
  let working_ranges = Queue.create () in
  let working_ranges_offset_from_previous = ref 0 in
  let all_hunks = Queue.create () in
  let finish_hunk ~offset_from_previous_end =
    Queue.enqueue
      all_hunks
      (!working_ranges_offset_from_previous, Queue.to_list working_ranges);
    Queue.clear working_ranges;
    working_ranges_offset_from_previous := offset_from_previous_end
  in
  let last_range_index = List.length hunk.ranges - 1 in
  List.iteri hunk.ranges ~f:(fun i range ->
    match trim_range range with
    | `Did_not_trim range ->
      Queue.enqueue working_ranges range;
      if i = last_range_index then finish_hunk ~offset_from_previous_end:0
    | `Trimmed (lines_trimmed, start_range, end_range) ->
      if i = 0
      then (
        (* Leading [Same] range: drop its head entirely; the first hunk starts
           after the dropped lines plus the kept prefix. *)
        working_ranges_offset_from_previous := lines_trimmed + Range.prev_size start_range;
        Queue.enqueue working_ranges end_range)
      else if i = last_range_index
      then (
        (* Trailing [Same] range: keep only the head as trailing context. *)
        Queue.enqueue working_ranges start_range;
        finish_hunk ~offset_from_previous_end:0)
      else if lines_trimmed > 0
      then (
        (* Interior range long enough to split: close the current hunk and
           start the next one with the kept tail. *)
        Queue.enqueue working_ranges start_range;
        finish_hunk ~offset_from_previous_end:lines_trimmed;
        Queue.enqueue working_ranges end_range)
      else Queue.enqueue working_ranges range);
  (* Second pass: turn (offset, ranges) pairs into hunks with 1-based
     start positions and recomputed sizes. *)
  let prev_end = ref 1 in
  let next_end = ref 1 in
  Queue.to_list all_hunks
  |> List.filter_map ~f:(fun (offset_from_previous, ranges) ->
    if List.is_empty ranges
    then None
    else (
      let hunk =
        { ranges
        ; prev_start = !prev_end + offset_from_previous
        ; prev_size = List.sum (module Int) ranges ~f:Range.prev_size
        ; next_start = !next_end + offset_from_previous
        ; next_size = List.sum (module Int) ranges ~f:Range.next_size
        }
      in
      prev_end := hunk.prev_size + hunk.prev_start;
      next_end := hunk.next_size + hunk.next_start;
      Some hunk))
;;
--------------------------------------------------------------------------------
/src/patience_diff_intf.ml:
--------------------------------------------------------------------------------
(** This is a port of Bram Cohen's patience diff algorithm, as found in the Bazaar 1.14.1
    source code, available at http://bazaar-vcs.org.

    This copyright notice was included:

    # Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd # # This
    program is free software; you can redistribute it and/or modify # it under the terms
    of the GNU General Public License as published by # the Free Software Foundation;
    either version 2 of the License, or # (at your option) any later version. # # This
    program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY;
    without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR
    PURPOSE. See the # GNU General Public License for more details.
# # You should have 13 | received a copy of the GNU General Public License # along with this program; if not, 14 | write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, 15 | Boston, MA 02110-1301 USA *) 16 | 17 | (** {v 18 | Bram Cohen's comment from the original Python code (with syntax changed to OCaml): 19 | 20 | [get_matching_blocks a b] returns a list of triples describing matching 21 | subsequences. 22 | 23 | Each triple is of the form (i, j, n), and means that 24 | a <|> (i,i+n) = b <|> (j,j+n). The triples are monotonically increasing in 25 | i and in j. 26 | 27 | The last triple is a dummy, (Array.length a, Array.length b, 0), and is the only 28 | triple with n=0. 29 | 30 | Example: 31 | get_matching_blocks [|"a";"b";"x";"c";"d"|] [|"a";"b";"c";"d"|] 32 | returns 33 | [(0, 0, 2), (3, 2, 2), (5, 4, 0)] 34 | v} *) 35 | 36 | open! Core 37 | module Hunk = Hunk 38 | module Hunks = Hunks 39 | module Matching_block = Matching_block 40 | module Range = Range 41 | module Move_id = Move_id 42 | 43 | module type S = sig 44 | type elt 45 | 46 | (** Get_matching_blocks not only aggregates the data from [matches a b] but also 47 | attempts to remove random, semantically meaningless matches ("semantic cleanup"). 48 | The value of [big_enough] governs how aggressively we do so. See [get_hunks] below 49 | for more details. *) 50 | val get_matching_blocks 51 | : transform:('a -> elt) 52 | -> ?big_enough:int 53 | -> ?max_slide:int 54 | -> ?score:([ `left | `right ] -> 'a -> 'a -> int) 55 | -> prev:'a array 56 | -> next:'a array 57 | -> unit 58 | -> Matching_block.t list 59 | 60 | (** [matches a b] returns a list of pairs (i,j) such that a.(i) = b.(j) and such that 61 | the list is strictly increasing in both its first and second coordinates. This is 62 | essentially a "unfolded" version of what [get_matching_blocks] returns. 
Instead of 63 | grouping the consecutive matching blocks using [length] this function would return 64 | all the pairs (prev_start * next_start). *) 65 | val matches : elt array -> elt array -> (int * int) list 66 | 67 | (** [match_ratio a b] computes the ratio defined as: 68 | 69 | {[ 70 | 2 * len (matches a b) / (len a + len b) 71 | ]} 72 | 73 | It is an indication of how much alike a and b are. A ratio closer to 1.0 will 74 | indicate a number of matches close to the number of elements that can potentially 75 | match, thus is a sign that a and b are very much alike. On the other hand, a low 76 | ratio means very little match. *) 77 | val match_ratio : elt array -> elt array -> float 78 | 79 | (** [get_hunks ~transform ~context ~prev ~next] will compare the arrays [prev] and 80 | [next] and produce a list of hunks. (The hunks will contain Same ranges of at most 81 | [context] elements.) Negative [context] is equivalent to infinity (producing a 82 | singleton hunk list). The value of [big_enough] governs how aggressively we try to 83 | clean up spurious matches, by restricting our attention to only matches of length 84 | less than [big_enough]. Thus, setting [big_enough] to a higher value results in more 85 | aggressive cleanup, and the default value of 1 results in no cleanup at all. When 86 | this function is called by [Patdiff_core], the value of [big_enough] is 3 at the 87 | line level, and 7 at the word level. 88 | 89 | The value of [max_slide] controls how far we are willing to shift a diff (which is 90 | immediately preceded/followed by the same lines as it ends/starts with). We choose 91 | between equivalent positions by maximising the sum of the [score] function applied 92 | to the two boundaries of the diff. By default, [max_slide] is 0.
The arguments 93 | passed to [score] are firstly whether the boundary is at the start or end of the 94 | diff and then the values on either side of the boundary (if a boundary is considered 95 | at the start or end of the input, it gets a score of 100). *) 96 | val get_hunks 97 | : transform:('a -> elt) 98 | -> context:int 99 | -> ?big_enough:int 100 | -> ?max_slide:int 101 | -> ?score:([ `left | `right ] -> 'a -> 'a -> int) 102 | -> prev:'a array 103 | -> next:'a array 104 | -> unit 105 | -> 'a Hunk.t list 106 | 107 | type 'a segment = 108 | | Same of 'a array 109 | | Different of 'a array array 110 | 111 | type 'a merged_array = 'a segment list 112 | 113 | val merge : elt array array -> elt merged_array 114 | end 115 | 116 | module type Patience_diff = sig 117 | module Hunk = Hunk 118 | module Hunks = Hunks 119 | module Matching_block = Matching_block 120 | module Range = Range 121 | module Move_id = Move_id 122 | module Make (Elt : Hashtbl.Key) : S with type elt = Elt.t 123 | 124 | (* [String] uses String.compare *) 125 | module String : S with type elt = string 126 | 127 | module Stable : sig 128 | module Hunk = Hunk.Stable 129 | module Hunks = Hunks.Stable 130 | module Matching_block = Matching_block.Stable 131 | module Range = Range.Stable 132 | end 133 | end 134 | -------------------------------------------------------------------------------- /src/plain_diff.ml: -------------------------------------------------------------------------------- 1 | (* This files comes from camlp5 (ocaml_src/lib/diff.ml). *) 2 | (* 3 | * Copyright (c) 2007-2013, INRIA (Institut National de Recherches en 4 | * Informatique et Automatique). All rights reserved. 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * * Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 
10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * * Neither the name of INRIA, nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without 15 | * specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY INRIA AND CONTRIBUTORS ``AS IS'' AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 20 | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INRIA AND 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 24 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 27 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | *) 30 | (* $Id: diff.ml,v 1.2 2013-02-26 08:15:06 deraugla Exp $ *) 31 | (* Parts of Code of GNU diff (diffseq.h and analyze.c) translated to OCaml and adjusted. 32 | 33 | Basic algorithm described by Eugene W.Myers in: "An O(ND) Difference Algorithm and Its 34 | Variations" *) 35 | 36 | open Base 37 | 38 | (* A partition is the midpoint of the shortest edit script for a specified portion of two 39 | vectors. 40 | 41 | [xmid, ymid] is the midpoint discovered. The diagonal number [xmid - ymid] equals the 42 | number of inserted elements minus the number of deleted elements (counting only 43 | elements before the midpoint). 
44 | 45 | [lo_minimal] is true iff the minimal edit script for the left half of the partition is 46 | known; similarly for [hi_minimal]. 47 | *) 48 | module Partition = struct 49 | type t = 50 | { xmid : int 51 | ; ymid : int 52 | ; lo_minimal : bool 53 | ; hi_minimal : bool 54 | } 55 | end 56 | 57 | (* We keep this file in a C-like style so that we can easily compare against the original 58 | C, in which we have great confidence. *) 59 | (* Find the midpoint of the shortest edit script for a specified portion of the two 60 | vectors. 61 | 62 | Scan from the beginnings of the vectors, and simultaneously from the ends, doing a 63 | breadth-first search through the space of edit-sequence. When the two searches meet, we 64 | have found the midpoint of the shortest edit sequence. 65 | 66 | If [find_minimal] is true, find the minimal edit script regardless of expense. 67 | Otherwise, if the search is too expensive, use heuristics to stop the search and report 68 | a suboptimal answer. 69 | 70 | This function assumes that the first elements of the specified portions of the two 71 | vectors do not match, and likewise that the last elements do not match. The caller must 72 | trim matching elements from the beginning and end of the portions it is going to 73 | specify. 74 | 75 | If we return the "wrong" partitions, the worst this can do is cause suboptimal diff 76 | output. It cannot cause incorrect diff output. *) 77 | let diag ~fd ~bd ~sh ~xv ~yv ~xoff ~xlim ~yoff ~ylim ~too_expensive ~find_minimal 78 | : Partition.t 79 | = 80 | let dmin = xoff - ylim (* minimum valid diagonal *) in 81 | let dmax = xlim - yoff (* maximum valid diagonal *) in 82 | let fmid = xoff - yoff (* center diagonal of forward search *) in 83 | let bmid = xlim - ylim (* center diagonal of backward search *) in 84 | (* southeast corner is on an odd diagonal w.r.t the northwest *) 85 | let odd = (fmid - bmid) land 1 <> 0 in 86 | (* [sh] is an offset that lets us use indices in [[-(m+1), n+1]]. 
*) 87 | fd.(sh + fmid) <- xoff; 88 | bd.(sh + bmid) <- xlim; 89 | With_return.with_return (fun ({ return } : Partition.t With_return.return) -> 90 | (* [c] is cost. 91 | [fmin], [fmax] are limits of the forward search. 92 | [bmin], [bmax] are limits of the backward search. *) 93 | let rec loop ~c ~fmin ~fmax ~bmin ~bmax = 94 | (* Extend the forward search by one edit step in each diagonal. *) 95 | let fmin = 96 | if fmin > dmin 97 | then ( 98 | fd.(sh + fmin - 2) <- -1; 99 | fmin - 1) 100 | else fmin + 1 101 | in 102 | let fmax = 103 | if fmax < dmax 104 | then ( 105 | fd.(sh + fmax + 2) <- -1; 106 | fmax + 1) 107 | else fmax - 1 108 | in 109 | (* [d] is the active diagonal. *) 110 | (let rec loop d = 111 | if d < fmin 112 | then () 113 | else ( 114 | let tlo = fd.(sh + d - 1) in 115 | let thi = fd.(sh + d + 1) in 116 | let x = if tlo >= thi then tlo + 1 else thi in 117 | let x, y = 118 | let rec loop ~xv ~yv ~xlim ~ylim ~x ~y = 119 | if x < xlim && y < ylim && phys_equal (xv x) (yv y) 120 | then loop ~xv ~yv ~xlim ~ylim ~x:(x + 1) ~y:(y + 1) 121 | else x, y 122 | in 123 | loop ~xv ~yv ~xlim ~ylim ~x ~y:(x - d) 124 | in 125 | fd.(sh + d) <- x; 126 | if odd && bmin <= d && d <= bmax && bd.(sh + d) <= fd.(sh + d) 127 | then return { xmid = x; ymid = y; lo_minimal = true; hi_minimal = true } 128 | else loop (d - 2)) 129 | in 130 | loop fmax); 131 | (* Similarly extend the backward search. 
*) 132 | let bmin = 133 | if bmin > dmin 134 | then ( 135 | bd.(sh + bmin - 2) <- Int.max_value; 136 | bmin - 1) 137 | else bmin + 1 138 | in 139 | let bmax = 140 | if bmax < dmax 141 | then ( 142 | bd.(sh + bmax + 2) <- Int.max_value; 143 | bmax + 1) 144 | else bmax - 1 145 | in 146 | (let rec loop d = 147 | if d < bmin 148 | then () 149 | else ( 150 | let tlo = bd.(sh + d - 1) in 151 | let thi = bd.(sh + d + 1) in 152 | let x = if tlo < thi then tlo else thi - 1 in 153 | let x, y = 154 | let rec loop ~xv ~yv ~xoff ~yoff ~x ~y = 155 | if x > xoff && y > yoff && phys_equal (xv (x - 1)) (yv (y - 1)) 156 | then loop ~xv ~yv ~xoff ~yoff ~x:(x - 1) ~y:(y - 1) 157 | else x, y 158 | in 159 | loop ~xv ~yv ~xoff ~yoff ~x ~y:(x - d) 160 | in 161 | bd.(sh + d) <- x; 162 | if (not odd) && fmin <= d && d <= fmax && bd.(sh + d) <= fd.(sh + d) 163 | then return { xmid = x; ymid = y; lo_minimal = true; hi_minimal = true } 164 | else loop (d - 2)) 165 | in 166 | loop bmax); 167 | (* Heuristic: if we've gone well beyond the call of duty, give up and report halfway 168 | between our best results so far. *) 169 | if (not find_minimal) && c >= too_expensive 170 | then ( 171 | (* Find forward diagonal that maximizes [x + y]. *) 172 | let fxybest, fxbest = 173 | let rec loop ~d ~fxybest ~fxbest = 174 | if d < fmin 175 | then fxybest, fxbest 176 | else ( 177 | let x = Int.min fd.(sh + d) xlim in 178 | let y = x - d in 179 | let x, y = if ylim < y then ylim + d, ylim else x, y in 180 | let fxybest, fxbest = 181 | if fxybest < x + y then x + y, x else fxybest, fxbest 182 | in 183 | loop ~d:(d - 2) ~fxybest ~fxbest) 184 | in 185 | loop ~d:fmax ~fxybest:(-1) ~fxbest:fmax 186 | in 187 | (* Find backward diagonal that minimizes [x + y]. 
*) 188 | let bxybest, bxbest = 189 | let rec loop ~d ~bxybest ~bxbest = 190 | if d < bmin 191 | then bxybest, bxbest 192 | else ( 193 | let x = Int.max xoff bd.(sh + d) in 194 | let y = x - d in 195 | let x, y = if y < yoff then yoff + d, yoff else x, y in 196 | let bxybest, bxbest = 197 | if x + y < bxybest then x + y, x else bxybest, bxbest 198 | in 199 | loop ~d:(d - 2) ~bxybest ~bxbest) 200 | in 201 | loop ~d:bmax ~bxybest:Int.max_value ~bxbest:bmax 202 | in 203 | if xlim + ylim - bxybest < fxybest - (xoff + yoff) 204 | then 205 | return 206 | { xmid = fxbest 207 | ; ymid = fxybest - fxbest 208 | ; lo_minimal = true 209 | ; hi_minimal = false 210 | } 211 | else 212 | return 213 | { xmid = bxbest 214 | ; ymid = bxybest - bxbest 215 | ; lo_minimal = false 216 | ; hi_minimal = true 217 | }) 218 | else loop ~c:(c + 1) ~fmin ~fmax ~bmin ~bmax 219 | in 220 | loop ~c:1 ~fmin:fmid ~fmax:fmid ~bmin:bmid ~bmax:bmid) 221 | ;; 222 | 223 | let diff_loop ~cutoff a ai b bi n m = 224 | let fd = Array.create ~len:(n + m + 3) 0 in 225 | let bd = Array.create ~len:(n + m + 3) 0 in 226 | let sh = m + 1 in 227 | let too_expensive = 228 | match cutoff with 229 | | Some c -> c 230 | | None -> 231 | let diags = n + m + 3 in 232 | let rec loop diags too_expensive = 233 | if diags = 0 then too_expensive else loop (diags asr 2) (too_expensive lsl 1) 234 | in 235 | Int.max 4096 (loop diags 1) 236 | in 237 | let xvec i = a.(ai.(i)) in 238 | let yvec j = b.(bi.(j)) in 239 | let chng1 = Array.create ~len:(Array.length a) true in 240 | let chng2 = Array.create ~len:(Array.length b) true in 241 | for i = 0 to n - 1 do 242 | chng1.(ai.(i)) <- false 243 | done; 244 | for j = 0 to m - 1 do 245 | chng2.(bi.(j)) <- false 246 | done; 247 | let rec loop ~xoff ~xlim ~yoff ~ylim ~find_minimal = 248 | let xoff, yoff = 249 | let rec loop ~xoff ~yoff = 250 | if xoff < xlim && yoff < ylim && phys_equal (xvec xoff) (yvec yoff) 251 | then loop ~xoff:(xoff + 1) ~yoff:(yoff + 1) 252 | else xoff, yoff 253 | in 
254 | loop ~xoff ~yoff 255 | in 256 | let xlim, ylim = 257 | let rec loop ~xlim ~ylim = 258 | if xlim > xoff && ylim > yoff && phys_equal (xvec (xlim - 1)) (yvec (ylim - 1)) 259 | then loop ~xlim:(xlim - 1) ~ylim:(ylim - 1) 260 | else xlim, ylim 261 | in 262 | loop ~xlim ~ylim 263 | in 264 | if xoff = xlim 265 | then 266 | for y = yoff to ylim - 1 do 267 | chng2.(bi.(y)) <- true 268 | done 269 | else if yoff = ylim 270 | then 271 | for x = xoff to xlim - 1 do 272 | chng1.(ai.(x)) <- true 273 | done 274 | else ( 275 | let { Partition.xmid; ymid; lo_minimal; hi_minimal } = 276 | diag 277 | ~fd 278 | ~bd 279 | ~sh 280 | ~xv:xvec 281 | ~yv:yvec 282 | ~xoff 283 | ~xlim 284 | ~yoff 285 | ~ylim 286 | ~too_expensive 287 | ~find_minimal 288 | in 289 | loop ~xoff ~xlim:xmid ~yoff ~ylim:ymid ~find_minimal:lo_minimal; 290 | loop ~xoff:xmid ~xlim ~yoff:ymid ~ylim ~find_minimal:hi_minimal) 291 | in 292 | loop ~xoff:0 ~xlim:n ~yoff:0 ~ylim:m ~find_minimal:false; 293 | chng1, chng2 294 | ;; 295 | 296 | (* [make_indexer a b] returns an array of the indices of items of [a] which are also 297 | present in [b]; this way, the main algorithm can skip items which, anyway, are 298 | different. This improves the speed much. At the same time, this function updates the 299 | items of [a] and [b] so that all equal items point to the same unique item. All item 300 | comparisons in the main algorithm can therefore be done with [phys_equal] instead of 301 | [=], which can improve speed much. 
*)
let make_indexer hashable a b =
  let n = Array.length a in
  (* Interning table: first occurrence of each element of [b] becomes the
     canonical representative; later equal elements are overwritten with it so
     that equality reduces to [phys_equal]. *)
  let htb = Hashtbl.create hashable ~size:(10 * Array.length b) in
  Array.iteri
    ~f:(fun i e ->
      match Hashtbl.find htb e with
      | Some v -> b.(i) <- v
      | None -> Hashtbl.add_exn htb ~key:e ~data:e)
    b;
  (* Collect (in order) the indices of elements of [a] that also occur in [b],
     canonicalizing [a]'s elements along the way. *)
  let ai = Array.create ~len:n 0 in
  let k =
    let rec loop i k =
      if i = n
      then k
      else (
        let k =
          match Hashtbl.find htb a.(i) with
          | Some v ->
            a.(i) <- v;
            ai.(k) <- i;
            k + 1
          | None -> k
        in
        loop (i + 1) k)
    in
    loop 0 0
  in
  Array.sub ai ~pos:0 ~len:k
;;

(* Run the Myers diff over the indexed (shared-elements-only) views of [a] and
   [b]; returns the two per-element "changed" bit-arrays. *)
let f ~cutoff ~hashable a b =
  let ai = make_indexer hashable a b in
  let bi = make_indexer hashable b a in
  let n = Array.length ai in
  let m = Array.length bi in
  diff_loop ~cutoff a ai b bi n m
;;

(* Walk the two "changed" arrays in lockstep and call [ff (i1, i2)] for every
   pair of positions that the diff marked as matching. *)
let iter_matches ?cutoff ~f:ff ~hashable a b =
  let d1, d2 = f ~cutoff ~hashable a b in
  let rec aux i1 i2 =
    if i1 >= Array.length d1 || i2 >= Array.length d2
    then ()
    else if not d1.(i1)
    then
      if not d2.(i2)
      then (
        ff (i1, i2);
        aux (i1 + 1) (i2 + 1))
      else aux i1 (i2 + 1)
    else if not d2.(i2)
    then aux (i1 + 1) i2
    else aux (i1 + 1) (i2 + 1)
  in
  aux 0 0
;;
--------------------------------------------------------------------------------
/src/patience_diff.ml:
--------------------------------------------------------------------------------
open!
Core 2 | include Patience_diff_intf 3 | module Hunk = Hunk 4 | module Hunks = Hunks 5 | module Matching_block = Matching_block 6 | module Range = Range 7 | module Move_id = Move_id 8 | 9 | let ( <|> ) ar (i, j) = if j <= i then [||] else Array.slice ar i j 10 | 11 | (* Does the nitty gritty of turning indexes into 12 | line numbers and reversing the ranges, returning a nice new hunk *) 13 | let create_hunk prev_start prev_stop next_start next_stop ranges : _ Hunk.t = 14 | { prev_start = prev_start + 1 15 | ; prev_size = prev_stop - prev_start 16 | ; next_start = next_start + 1 17 | ; next_size = next_stop - next_start 18 | ; ranges = List.rev ranges 19 | } 20 | ;; 21 | 22 | module Ordered_sequence : sig 23 | type elt = int * int [@@deriving compare ~localize] 24 | 25 | (* A [t] has its second coordinates in increasing order *) 26 | 27 | type t = private elt array [@@deriving sexp_of] 28 | 29 | val create : (int * int) array -> t 30 | val is_empty : t -> bool 31 | end = struct 32 | type elt = int * int [@@deriving sexp_of] 33 | 34 | let%template compare_elt a b = 35 | (Comparable.lexicographic [@mode m]) 36 | [ (fun (_, y0) (_, y1) -> Int.compare y0 y1) 37 | ; (fun (x0, _) (x1, _) -> Int.compare x0 x1) 38 | ] 39 | a 40 | b 41 | [@@mode m = (local, global)] 42 | ;; 43 | 44 | type t = elt array [@@deriving sexp_of] 45 | 46 | let create t = 47 | Array.sort t ~compare:compare_elt; 48 | t 49 | ;; 50 | 51 | let is_empty = Array.is_empty 52 | end 53 | 54 | (* This is an implementation of the patience sorting algorithm as explained at 55 | http://en.wikipedia.org/wiki/Patience_sorting *) 56 | module Patience : sig 57 | val longest_increasing_subsequence : Ordered_sequence.t -> (int * int) list 58 | end = struct 59 | module Pile = struct 60 | type 'a t = 'a Stack.t 61 | 62 | let create x = 63 | let t = Stack.create () in 64 | Stack.push t x; 65 | t 66 | ;; 67 | 68 | let top t = Stack.top t |> Option.value_exn 69 | let put_on_top t x = Stack.push t x 70 | end 71 | 72 | module 
Piles = struct 73 | type 'a t = 'a Pile.t Deque.t 74 | 75 | let empty () : 'a t = Deque.create ~never_shrink:true () 76 | 77 | let get_ith_pile t i dir = 78 | let get index offset = 79 | Option.bind (index t) ~f:(fun index -> Deque.get_opt t (index + offset)) 80 | in 81 | match dir with 82 | | `From_left -> get Deque.front_index i 83 | | `From_right -> get Deque.back_index (-i) 84 | ;; 85 | 86 | let new_rightmost_pile t pile = Deque.enqueue_back t pile 87 | end 88 | 89 | module Backpointers = struct 90 | (* in the terminology of the Wikipedia article, this corresponds to a card together 91 | with its backpointers *) 92 | type 'a tag = 'a t 93 | 94 | and 'a t = 95 | { value : 'a 96 | ; tag : 'a tag option 97 | } 98 | 99 | let to_list t = 100 | let rec to_list acc t = 101 | match t.tag with 102 | | None -> t.value :: acc 103 | | Some t' -> to_list (t.value :: acc) t' 104 | in 105 | to_list [] t 106 | ;; 107 | end 108 | 109 | module Play_patience : sig 110 | val play_patience 111 | : Ordered_sequence.t 112 | -> get_tag: 113 | (pile_opt:int option 114 | -> piles:Ordered_sequence.elt Backpointers.t Piles.t 115 | -> Ordered_sequence.elt Backpointers.tag option) 116 | -> Ordered_sequence.elt Backpointers.t Piles.t 117 | end = struct 118 | let optimized_findi_from_left piles x = 119 | (* first see if any work *) 120 | let last_pile = Piles.get_ith_pile piles 0 `From_right in 121 | (* [x_pile] is a dummy pile just used for comparisons *) 122 | let x_pile = Pile.create { Backpointers.value = x, 0; tag = None } in 123 | let compare_top_values pile1 pile2 = 124 | let top pile = fst (Pile.top pile).Backpointers.value in 125 | Int.compare (top pile1) (top pile2) 126 | in 127 | let%bind.Option last_pile in 128 | if compare_top_values last_pile x_pile < 0 129 | then None 130 | else ( 131 | (* do binary search *) 132 | let r = 133 | Deque.binary_search 134 | piles 135 | `First_strictly_greater_than 136 | x_pile 137 | ~compare:compare_top_values 138 | in 139 | [%globalize: int 
option] r [@nontail]) 140 | ;; 141 | 142 | (* [play_patience ar ~get_tag] plays patience with the greedy algorithm as described 143 | in the Wikipedia article, taking [ar] to be the deck of cards. It returns the 144 | resulting [Piles.t]. Before putting an element of [ar] in a pile, it tags it using 145 | [get_tag]. [get_tag] takes as its arguments the full [Piles.t] in its current 146 | state, and also the specific [Pile.t] that the element of [ar] is being added to. 147 | *) 148 | let play_patience ar ~get_tag = 149 | let ar = (ar : Ordered_sequence.t :> Ordered_sequence.elt array) in 150 | if Array.length ar = 0 then raise (Invalid_argument "Patience_diff.play_patience"); 151 | let piles = Piles.empty () in 152 | Array.iter ar ~f:(fun x -> 153 | let pile_opt = optimized_findi_from_left piles (fst x) in 154 | let tagged_x = { Backpointers.value = x; tag = get_tag ~pile_opt ~piles } in 155 | match pile_opt with 156 | | None -> Piles.new_rightmost_pile piles (Pile.create tagged_x) 157 | | Some i -> 158 | let pile = Deque.get piles i in 159 | Pile.put_on_top pile tagged_x); 160 | piles 161 | ;; 162 | end 163 | 164 | let longest_increasing_subsequence ar = 165 | if Ordered_sequence.is_empty ar 166 | then [] 167 | else 168 | let module P = Play_patience in 169 | let get_tag ~pile_opt ~piles = 170 | match pile_opt with 171 | | None -> Piles.get_ith_pile piles 0 `From_right |> Option.map ~f:Pile.top 172 | | Some i -> 173 | if i = 0 174 | then None 175 | else 176 | Piles.get_ith_pile piles (i - 1) `From_left 177 | |> Option.value_exn 178 | |> Pile.top 179 | |> Option.some 180 | in 181 | let piles = P.play_patience ar ~get_tag in 182 | Piles.get_ith_pile piles 0 `From_right 183 | |> Option.value_exn 184 | |> Pile.top 185 | |> Backpointers.to_list 186 | ;; 187 | end 188 | 189 | let compare_int_pair = Tuple.T2.compare ~cmp1:Int.compare ~cmp2:Int.compare 190 | 191 | let%template _longest_increasing_subsequence ar = 192 | let ar = (ar : Ordered_sequence.t :> (int * int) 
array) in 193 | let len = Array.length ar in 194 | if len <= 1 195 | then Array.to_list ar 196 | else ( 197 | let maxlen = ref 0 in 198 | let m = Array.create ~len:(len + 1) (-1) in 199 | let pred = Array.create ~len:(len + 1) (-1) in 200 | for i = 0 to len - 1 do 201 | let p = 202 | Array.binary_search 203 | ~compare:Ordered_sequence.compare_elt 204 | ar 205 | `First_greater_than_or_equal_to 206 | ar.(i) 207 | ~len:(max (!maxlen - 1) 0) 208 | ~pos:1 209 | |> (Option.value [@mode local]) ~default:0 210 | in 211 | pred.(i) <- m.(p); 212 | if p = !maxlen || compare_int_pair ar.(i) ar.(p + 1) < 0 213 | then ( 214 | m.(p + 1) <- i; 215 | if p + 1 > !maxlen then maxlen := p + 1) 216 | done; 217 | let rec loop ac p = if p = -1 then ac else loop (ar.(p) :: ac) pred.(p) in 218 | loop [] m.(!maxlen)) 219 | ;; 220 | 221 | (* Configurable parameters for [semantic_cleanup] and [unique_lcs], all chosen based 222 | on empirical observation. *) 223 | (* This function is called on the edge case of semantic cleanup, when there's a change 224 | that's exactly the same length as the size of the match. If the insert on the next 225 | side is a LOT larger than the match, it should be semantically cleaned up, but 226 | most of the time it should be left alone. *) 227 | let should_discard_if_other_side_equal ~big_enough = 100 / big_enough 228 | 229 | (* These are the numerator and denominator of the cutoff for aborting the patience diff 230 | algorithm in [unique_lcs]. (This will result in us using [Plain_diff] instead.) 231 | Lowering [switch_to_plain_diff_numerator] / [switch_to_plain_diff_denominator] 232 | makes us switch to plain diff less often. The range of this cutoff is from 0 to 1, 233 | where 0 means we always switch and 1 means we never switch. 
*) 234 | let switch_to_plain_diff_numerator = 1 235 | let switch_to_plain_diff_denominator = 10 236 | 237 | module Make (Elt : Hashtbl.Key) = struct 238 | module Table = Hashtbl.Make (Elt) 239 | 240 | type elt = Elt.t 241 | 242 | (* This is an implementation of the patience diff algorithm by Bram Cohen as seen in 243 | Bazaar version 1.14.1 *) 244 | 245 | module Line_metadata = struct 246 | type t = 247 | | Unique_in_a of { index_in_a : int } 248 | | Unique_in_a_b of 249 | { index_in_a : int 250 | ; index_in_b : int 251 | } 252 | | Not_unique of { occurrences_in_a : int } 253 | end 254 | 255 | let unique_lcs (alpha, alo, ahi) (bravo, blo, bhi) = 256 | (* Create a hashtable which takes elements of a to their index in a iff they're 257 | unique. If an element is not unique, it takes it to its frequency in a. *) 258 | let unique : (elt, Line_metadata.t) Table.hashtbl = 259 | Table.create ~size:(Int.min (ahi - alo) (bhi - blo)) () 260 | in 261 | for x's_pos_in_a = alo to ahi - 1 do 262 | let x = alpha.(x's_pos_in_a) in 263 | match Hashtbl.find unique x with 264 | | None -> 265 | Hashtbl.set unique ~key:x ~data:(Unique_in_a { index_in_a = x's_pos_in_a }) 266 | | Some (Unique_in_a _) -> 267 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = 2 }) 268 | | Some (Not_unique { occurrences_in_a = n }) -> 269 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = n + 1 }) 270 | (* This case doesn't occur until the second pass through [unique] *) 271 | | Some (Unique_in_a_b _) -> assert false 272 | done; 273 | (* [num_pairs] is the size of the list we use for Longest Increasing Subsequence. 274 | [intersection_size] is the number of tokens in the intersection of the two 275 | sequences, with multiplicity, and is an upper bound on the size of the LCS. 
*) 276 | let num_pairs = ref 0 in 277 | let intersection_size = ref 0 in 278 | for x's_pos_in_b = blo to bhi - 1 do 279 | let x = bravo.(x's_pos_in_b) in 280 | Hashtbl.find unique x 281 | |> Option.iter ~f:(fun pos -> 282 | match pos with 283 | | Not_unique { occurrences_in_a = n } -> 284 | if n > 0 285 | then ( 286 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = n - 1 }); 287 | incr intersection_size) 288 | | Unique_in_a { index_in_a = x's_pos_in_a } -> 289 | incr num_pairs; 290 | incr intersection_size; 291 | Hashtbl.set 292 | unique 293 | ~key:x 294 | ~data:(Unique_in_a_b { index_in_a = x's_pos_in_a; index_in_b = x's_pos_in_b }) 295 | | Unique_in_a_b _ -> 296 | decr num_pairs; 297 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = 0 })) 298 | done; 299 | (* If we're ignoring almost all of the text when we perform the patience 300 | diff algorithm, it will often give bad results. *) 301 | if !num_pairs * switch_to_plain_diff_denominator 302 | < !intersection_size * switch_to_plain_diff_numerator 303 | then `Not_enough_unique_tokens 304 | else ( 305 | let a_b = 306 | let arr = Array.init !num_pairs ~f:(fun _ -> 0, 0) in 307 | let i = ref 0 in 308 | Hashtbl.iter unique ~f:(function 309 | | Not_unique _ | Unique_in_a _ -> () 310 | | Unique_in_a_b { index_in_a = i_a; index_in_b = i_b } -> 311 | arr.(!i) <- i_a, i_b; 312 | Int.incr i); 313 | Ordered_sequence.create arr 314 | in 315 | `Computed_lcs (Patience.longest_increasing_subsequence a_b)) 316 | ;; 317 | 318 | (* [matches a b] returns a list of pairs (i,j) such that a.(i) = b.(j) and such that 319 | the list is strictly increasing in both its first and second coordinates. 320 | 321 | This is done by first applying unique_lcs to find matches from a to b among those 322 | elements which are unique in both a and b, and then recursively applying [matches] to 323 | each subinterval determined by those matches. 
   The uniqueness requirement is waived
   for blocks of matching lines at the beginning or end.

   I couldn't figure out how to do this efficiently in a functional way, so
   this is pretty much a straight translation of the original Python code. *)
let matches alpha bravo =
  let matches_ref_length = ref 0 in
  let matches_ref = ref [] in
  (* Matches are pushed in increasing order and the accumulator is reversed at
     the end. *)
  let add_match m =
    incr matches_ref_length;
    matches_ref := m :: !matches_ref
  in
  (* Appends all matches for the sub-rectangle [alo, ahi) x [blo, bhi). *)
  let rec recurse_matches alo blo ahi bhi =
    (* printf "alo %d blo %d ahi %d bhi %d\n%!" alo blo ahi bhi; *)
    let old_length = !matches_ref_length in
    if not (alo >= ahi || blo >= bhi)
    then
      if Elt.compare alpha.(alo) bravo.(blo) = 0
      then (
        (* Matching prefix: consume it greedily, then recurse on the rest. *)
        let alo = ref alo in
        let blo = ref blo in
        while !alo < ahi && !blo < bhi && Elt.compare alpha.(!alo) bravo.(!blo) = 0 do
          add_match (!alo, !blo);
          incr alo;
          incr blo
        done;
        recurse_matches !alo !blo ahi bhi)
      else if Elt.compare alpha.(ahi - 1) bravo.(bhi - 1) = 0
      then (
        (* Matching suffix: locate where it begins, recurse on everything
           before it, then record the suffix matches in increasing order. *)
        let nahi = ref (ahi - 1) in
        let nbhi = ref (bhi - 1) in
        while
          !nahi > alo
          && !nbhi > blo
          && Elt.compare alpha.(!nahi - 1) bravo.(!nbhi - 1) = 0
        do
          decr nahi;
          decr nbhi
        done;
        recurse_matches alo blo !nahi !nbhi;
        for i = 0 to ahi - !nahi - 1 do
          add_match (!nahi + i, !nbhi + i)
        done)
      else (
        (* Middle: anchor on the unique-token LCS and recursively diff the gap
           between each consecutive pair of anchors.  Fall back to [Plain_diff]
           when [unique_lcs] declines or finds nothing. *)
        let last_a_pos = ref (alo - 1) in
        let last_b_pos = ref (blo - 1) in
        let plain_diff () =
          Plain_diff.iter_matches
            ~hashable:(module Elt)
            (Array.sub alpha ~pos:alo ~len:(ahi - alo))
            (Array.sub bravo ~pos:blo ~len:(bhi - blo))
            ~f:(fun (i1, i2) -> add_match (alo + i1, blo + i2))
        in
        match unique_lcs (alpha, alo, ahi) (bravo, blo, bhi) with
        | `Not_enough_unique_tokens -> plain_diff ()
        | `Computed_lcs lcs ->
          lcs
          |> List.iter ~f:(fun (apos, bpos) ->
            if !last_a_pos + 1 <> apos || !last_b_pos + 1 <> bpos
            then recurse_matches (!last_a_pos + 1) (!last_b_pos + 1) apos bpos;
            last_a_pos := apos;
            last_b_pos := bpos;
            add_match (apos, bpos));
          if !matches_ref_length > old_length (* Did unique_lcs find anything at all? *)
          then recurse_matches (!last_a_pos + 1) (!last_b_pos + 1) ahi bhi
          else plain_diff ())
  in
  recurse_matches 0 0 (Array.length alpha) (Array.length bravo);
  List.rev !matches_ref
;;

(* Collapse a strictly-increasing list of (prev, next) index pairs into maximal
   runs of consecutive pairs, emitting one [Matching_block.t] per run. *)
let collapse_sequences matches =
  let collapsed = ref [] in
  let start_a = ref None in
  let start_b = ref None in
  let length = ref 0 in
  List.iter matches ~f:(fun (i_a, i_b) ->
    match !start_a, !start_b with
    | Some start_a_val, Some start_b_val
      (* Both indices advanced by exactly one: extend the current run. *)
      when i_a = start_a_val + !length && i_b = start_b_val + !length -> incr length
    | _ ->
      (* Run broken: flush the finished run (if any) and start a new one. *)
      (match !start_a, !start_b with
       | Some start_a_val, Some start_b_val ->
         let matching_block =
           { Matching_block.prev_start = start_a_val
           ; next_start = start_b_val
           ; length = !length
           }
         in
         collapsed := matching_block :: !collapsed
       | _ -> ());
      start_a := Some i_a;
      start_b := Some i_b;
      length := 1);
  (* Flush the final run. *)
  (match !start_a, !start_b with
   | Some start_a_val, Some start_b_val when !length <> 0 ->
     let matching_block =
       { Matching_block.prev_start = start_a_val
       ; next_start = start_b_val
       ; length = !length
       }
     in
     collapsed := matching_block :: !collapsed
   | _ -> ());
  List.rev !collapsed
;;

(* Given that there's an insert/delete of size [left_change] to the left, and
   an insert/delete of size [right_change] to the right, should we keep
   this block of length [block_len] in our list of matches, or discard it?
*)
let should_discard_match ~big_enough ~left_change ~right_change ~block_len =
  (* Throw away if its effective length is too small,
     relative to its surrounding inserts / deletes. *)
  block_len < big_enough
  && ((left_change > block_len && right_change > block_len)
      (* dwarfed by changes on both sides *)
      || (left_change >= block_len + should_discard_if_other_side_equal ~big_enough
          && right_change = block_len)
      || (right_change >= block_len + should_discard_if_other_side_equal ~big_enough
          && left_change = block_len))
;;

(* Size of the insert/delete gap between the end of [left_matching_block] and
   the start of [right_matching_block]: the larger of the prev-side and
   next-side spans between the two starts, minus the left block's length. *)
let change_between
  (left_matching_block : Matching_block.t)
  (right_matching_block : Matching_block.t)
  =
  max
    (right_matching_block.prev_start - left_matching_block.prev_start)
    (right_matching_block.next_start - left_matching_block.next_start)
  - left_matching_block.length
;;

(* See the "Semantic Chaff" section of https://neil.fraser.name/writing/diff/ *)
let basic_semantic_cleanup ~big_enough matching_blocks =
  if big_enough <= 1
  then matching_blocks
  else (
    match matching_blocks with
    | [] -> []
    | first_block :: other_blocks ->
      (* [pending] is the most recent block not yet committed to [ans].
         Discarding [pending] can expose the previous block to discarding too,
         so [loop] walks back down [ans] until nothing more is discarded. *)
      let final_ans, final_pending =
        List.fold
          other_blocks
          ~init:([], first_block)
          ~f:(fun (ans, pending) current_block ->
            let rec loop ans pending =
              match ans with
              | [] -> ans, pending
              | hd :: tl ->
                if should_discard_match
                     ~big_enough
                     ~left_change:(change_between hd pending)
                     ~right_change:(change_between pending current_block)
                     ~block_len:pending.length
                then loop tl hd
                else ans, pending
            in
            let updated_ans, updated_pending = loop ans pending in
            updated_pending :: updated_ans, current_block)
      in
      List.rev (final_pending :: final_ans))
;;

(* Attempts to eliminate the "tunnel vision" problem described in the
   "Semantic Chaff" section of https://neil.fraser.name/writing/diff/.
   To do this, we go through each pair of consecutive matches
   and pretend to combine them into one match. If that match would
   be deleted by [basic_semantic_cleanup], we delete both. *)
let advanced_semantic_cleanup ~big_enough matching_blocks =
  if big_enough <= 1
  then matching_blocks
  else (
    match matching_blocks with
    | [] -> []
    | [ first_block ] -> [ first_block ]
    | first_block :: second_block :: other_blocks ->
      (* [pendingA] and [pendingB] are the two most recent uncommitted blocks;
         the hypothetical merged block's length is [pendingB.length] plus
         pendingB's offset from pendingA (the smaller of the two sides). *)
      let final_ans, final_pendingA, final_pendingB =
        List.fold
          other_blocks
          ~init:([], first_block, second_block)
          ~f:(fun (ans, pendingA, pendingB) current_block ->
            let rec loop ans pendingA pendingB =
              match ans with
              | [] -> ans, pendingA, pendingB
              | hd :: tl ->
                if should_discard_match
                     ~big_enough
                     ~left_change:(change_between hd pendingA)
                     ~right_change:(change_between pendingB current_block)
                     ~block_len:
                       (pendingB.length
                        + min
                            (pendingB.prev_start - pendingA.prev_start)
                            (pendingB.next_start - pendingA.next_start))
                then loop tl hd pendingA
                else ans, pendingA, pendingB
            in
            let updated_ans, updated_pendingA, updated_pendingB =
              loop ans pendingA pendingB
            in
            updated_pendingA :: updated_ans, updated_pendingB, current_block)
      in
      List.rev (final_pendingB :: final_pendingA :: final_ans)
      (* The loop above only deleted the second element of each pair we're supposed to
         delete. This call to [basic_semantic_cleanup] is guaranteed to finish the job
         by deleting the remaining element of those pairs. *)
      |> basic_semantic_cleanup ~big_enough)
;;

(* Goal: eliminate small, semantically meaningless matches.
*)
(* Run the pairwise ("basic") pass first, then the pair-merging ("advanced")
   pass over the survivors. *)
let semantic_cleanup ~big_enough matching_blocks =
  basic_semantic_cleanup ~big_enough matching_blocks
  |> advanced_semantic_cleanup ~big_enough
;;

(* When we have a choice, we'd prefer one block of equality to two.
   For example, instead of A B A C D E F, we prefer
   A B A C D E F. There are two reasons:

   (1) A is usually something like "let", and so the second version is more
   semantically accurate
   (2) Semantic cleanup may delete the lone A match, but it will not delete
   the A C D E F match). So by moving the A match, we've also saved it. *)
let combine_equalities ~prev ~next ~matches =
  match matches with
  | [] -> []
  | first_block :: tl ->
    List.fold tl ~init:([], first_block) ~f:(fun (ans, pending) block ->
      (* Slide the boundary left one token at a time — shrinking [pending] and
         growing [new_block] — while the last token of [pending] equals the
         token just before [new_block] in both [prev] and [next]. *)
      let rec loop ans ~(pending : Matching_block.t) ~(new_block : Matching_block.t) =
        if pending.length = 0
        then ans, pending, new_block
        else (
          let advance_in_prev =
            Elt.compare
              prev.(pending.prev_start + pending.length - 1)
              prev.(new_block.prev_start - 1)
            = 0
          in
          let advance_in_next =
            Elt.compare
              next.(pending.next_start + pending.length - 1)
              next.(new_block.next_start - 1)
            = 0
          in
          if advance_in_prev && advance_in_next
          then
            loop
              ans
              ~pending:
                { prev_start = pending.prev_start
                ; next_start = pending.next_start
                ; length = pending.length - 1
                }
              ~new_block:
                { prev_start = new_block.prev_start - 1
                ; next_start = new_block.next_start - 1
                ; length = new_block.length + 1
                }
          else ans, pending, new_block)
      in
      let updated_ans, updated_pending, updated_new_block =
        loop ans ~pending ~new_block:block
      in
      (* In the original Google heuristic, we would either move all or none
         of pending. But because it might start with an unmatched `Newline(0, None),
         we are fine with moving all but one token of it. *)
      if updated_pending.length = 0 || updated_pending.length = 1
      then (
        let new_ans =
          if updated_pending.length = 0
          then updated_ans
          else updated_pending :: updated_ans
        in
        new_ans, updated_new_block)
      else (* Do nothing *)
        pending :: ans, block)
    |> fun (ans, pending) -> List.rev (pending :: ans)
;;

(* The aim here is to be able to transform e.g. foo frob flip into foo
   frob flip by scoring each boundary. *)
(* {prev,next}_{elts,scorable} are the two arrays being diffed as (a) [Elt.t array]s
   which comes from mapping with the [transform] function and (b) ['a array]s where the
   [score] function operates on values of type ['a].

   [blocks] is a list of [Matching_block.t]s which specify the contiguous chunks
   of prev/next which are the same (when index ranges are missing, that missing
   chunk is unique to the array they are missing from).

   [align_diffs] slides each block left or right by up to [max_slide] positions
   when an equally-valid alignment scores better; [score kind before after]
   rates the boundary between two adjacent elements. *)
let align_diffs
  ~prev_elts
  ~prev_scorable
  ~next_elts
  ~next_scorable
  ~max_slide
  ~score
  blocks
  =
  if max_slide = 0
  then blocks
  else (
    match blocks with
    | [] -> []
    | (first_block : Matching_block.t) :: tl ->
      (* Score the boundary between arr.(i-1) and arr.(i); out-of-range
         boundaries get a fixed score of 100. *)
      let score_elt_and_prev arr kind i =
        let i0 = i - 1 in
        let i1 = i in
        if i0 < 0 || i1 >= Array.length arr then 100 else score kind arr.(i0) arr.(i1)
      in
      (* [left] and [right] are both exclusive indexes, on either side of the inserted
         or deleted region.
*)
      (* Total score for a change spanning the exclusive interval
         (left, right): the boundary at [right] plus the boundary just inside
         [left]. *)
      let score' arr ~left ~right =
        score_elt_and_prev arr `right right + score_elt_and_prev arr `left (left + 1)
      in
      let score ~prev_left ~prev_right ~next_left ~next_right =
        match prev_left + 1 = prev_right, next_left + 1 = next_right with
        | true, true ->
          (* This shouldn't happen; it means the two blocks of equal elements touch and
             should have been merged. *)
          0
        | true, false ->
          (* inserting *)
          score' next_scorable ~left:next_left ~right:next_right
        | false, true ->
          (* deleting *)
          score' prev_scorable ~left:prev_left ~right:prev_right
        | false, false ->
          (* editing. I think this also shouldn’t happen as if e.g. sliding right is
             viable you could just have had a bigger matching block on the right instead
          *)
          min
            (score' prev_scorable ~left:prev_left ~right:prev_right)
            (score' next_scorable ~left:next_left ~right:next_right)
      in
      (* Walk consecutive block pairs; for each pair pick the slide offset
         within [-max_slide, max_slide] with the best boundary score (0 means
         leave the pair as-is). *)
      let rec align_consecutive_blocks acc (left_block : Matching_block.t) right_blocks =
        let best_score = ref 0 in
        let offset_of_best_score = ref 0 in
        let score_initial
          (left_block : Matching_block.t)
          (right_block : Matching_block.t)
          =
          best_score
          := score
               ~prev_left:(left_block.prev_start + left_block.length - 1)
               ~prev_right:right_block.prev_start
               ~next_left:(left_block.next_start + left_block.length - 1)
               ~next_right:right_block.next_start
        in
        let rec try_to_slide_left
          ~i
          (left_block : Matching_block.t)
          (right_block : Matching_block.t)
          =
          let offset_into_left = left_block.length - i in
          if offset_into_left < 0
          then ()
          else if i > max_slide
          then
            ()
            (* {v
                 next: A B C D X P Q R S T X E F G H I
                 prev: A B C D X E F G H I
                       ^     ^   ^       ^
                       \..left../ \..right../ v}

               We want to see if shortening [left] and lengthening and moving-left
               [right] is viable. Requires last element of [left] to be equal to the
               element before [right]. *)
          else (
            let prev_left = left_block.prev_start + offset_into_left - 1 in
            let prev_right = right_block.prev_start - i in
            let next_left = left_block.next_start + offset_into_left - 1 in
            let next_right = right_block.next_start - i in
            if Elt.compare next_elts.(next_left + 1) next_elts.(next_right) <> 0
            then ()
            else if Elt.compare prev_elts.(prev_left + 1) prev_elts.(prev_right) <> 0
            then ()
            else (
              let score = score ~prev_left ~prev_right ~next_left ~next_right in
              if score > !best_score
              then (
                best_score := score;
                offset_of_best_score := -i);
              try_to_slide_left ~i:(i + 1) left_block right_block))
        in
        let rec try_to_slide_right
          ~i
          (left_block : Matching_block.t)
          (right_block : Matching_block.t)
          =
          let offset_into_left = left_block.length + i - 1 in
          if i > right_block.length
          then ()
          else if i > max_slide
          then
            ()
            (* {v
                 next: A B C D X P Q R S T X E F G H I
                 prev: A B C D X E F G H I
                       ^   ^     ^         ^
                       \.left./ \...right.../ v}

               We want to see if lengthening [left] and shortening and moving-right
               [right] is viable. Requires element after [left] to be equal to the first
               element of [right]. *)
          else (
            let prev_left = left_block.prev_start + offset_into_left in
            let prev_right = right_block.prev_start + i in
            let next_left = left_block.next_start + offset_into_left in
            let next_right = right_block.next_start + i in
            if Elt.compare next_elts.(next_left) next_elts.(next_right - 1) <> 0
            then ()
            else if Elt.compare prev_elts.(prev_left) prev_elts.(prev_right - 1) <> 0
            then ()
            else (
              let score = score ~prev_left ~prev_right ~next_left ~next_right in
              if score > !best_score
              then (
                best_score := score;
                offset_of_best_score := i);
              try_to_slide_right ~i:(i + 1) left_block right_block))
        in
        match right_blocks with
        | [] -> List.rev (left_block :: acc)
        | right_block :: rest ->
          (match left_block.length with
           | 0 ->
             (* It is possible to end up with a length=0 block due to sliding the
                previous block to include it. If this happens we should drop it (so long
                as it isn’t the last block; we want a length=0 block at the end). *)
             align_consecutive_blocks acc right_block rest
           | _ ->
             score_initial left_block right_block;
             try_to_slide_left ~i:1 left_block right_block;
             try_to_slide_right ~i:1 left_block right_block;
             (match !offset_of_best_score with
              | 0 -> align_consecutive_blocks (left_block :: acc) right_block rest
              | slide ->
                (* Apply the winning slide: grow/shrink [left_block] and shift
                   [right_block] by the same signed offset. *)
                let new_left = { left_block with length = left_block.length + slide } in
                let new_right : Matching_block.t =
                  { prev_start = right_block.prev_start + slide
                  ; next_start = right_block.next_start + slide
                  ; length = right_block.length - slide
                  }
                in
                let acc = if new_left.length > 0 then new_left :: acc else acc in
                align_consecutive_blocks acc new_right rest))
      in
      align_consecutive_blocks [] first_block tl)
;;

(* Full pipeline: transform both inputs, compute patience matches, collapse
   them into blocks, merge adjacent equalities, append a length-0 sentinel
   block at the end, run semantic cleanup, then slide-align the result. *)
let get_matching_blocks
  ~transform
  ?(big_enough = 1)
  ?(max_slide = 0)
  ?(score = fun _ _ _ -> 100)
  ~prev:prev_scorable
  ~next:next_scorable
  ()
  =
  let prev = Array.map prev_scorable ~f:transform in
  let next = Array.map next_scorable ~f:transform in
  let matches = matches prev next |> collapse_sequences in
  let matches = combine_equalities ~prev ~next ~matches in
  let last_match =
    { Matching_block.prev_start = Array.length prev
    ; next_start = Array.length next
    ; length = 0
    }
  in
  List.append matches [ last_match ]
  |> semantic_cleanup ~big_enough
  (* We want to score untransformed elements (e.g.
still with whitespace) *)
  |> align_diffs
       ~prev_elts:prev
       ~prev_scorable
       ~next_elts:next
       ~next_scorable
       ~max_slide
       ~score
;;

(* Convert matching blocks into a [Range.t] list: a [Same] for each nonempty
   block, preceded by a [Replace]/[Prev]/[Next] covering the gap before it.
   Blocks that would move either index backwards (crossover matches) are
   skipped. *)
let get_ranges_rev ~transform ~big_enough ?max_slide ?score ~prev ~next () =
  let rec aux (matching_blocks : Matching_block.t list) i j l : _ Range.t list =
    match matching_blocks with
    | current_block :: remaining_blocks ->
      let prev_index, next_index, size =
        current_block.prev_start, current_block.next_start, current_block.length
      in
      (* Throw away crossover matches *)
      if prev_index < i || next_index < j
      then aux remaining_blocks i j l
      else (
        (* The gap between the previous block's end (i, j) and this block's
           start: both sides changed -> Replace; only prev -> Prev (delete);
           only next -> Next (insert). *)
        let range_opt : _ Range.t option =
          if i < prev_index && j < next_index
          then (
            let prev_range = prev <|> (i, prev_index) in
            let next_range = next <|> (j, next_index) in
            Some (Replace (prev_range, next_range, None)))
          else if i < prev_index
          then (
            let prev_range = prev <|> (i, prev_index) in
            Some (Prev (prev_range, None)))
          else if j < next_index
          then (
            let next_range = next <|> (j, next_index) in
            Some (Next (next_range, None)))
          else None
        in
        let l =
          match range_opt with
          | Some range -> range :: l
          | None -> l
        in
        let prev_stop, next_stop = prev_index + size, next_index + size in
        let l =
          if size = 0
          then l
          else (
            let prev_range = prev <|> (prev_index, prev_stop) in
            let next_range = next <|> (next_index, next_stop) in
            let range = Array.map2_exn prev_range next_range ~f:(fun x y -> x, y) in
            Same range :: l)
        in
        aux remaining_blocks prev_stop next_stop l)
    | [] -> List.rev l
  in
  let matching_blocks =
    get_matching_blocks ~transform ~big_enough ?max_slide ?score ~prev ~next ()
  in
  aux matching_blocks 0 0 []
;;

(* Group ranges into hunks, each carrying up to [context] elements of [Same]
   context on either side.  A negative [context] yields a single hunk spanning
   both inputs in full. *)
let get_hunks ~transform ~context ?(big_enough = 1) ?max_slide ?score ~prev ~next () =
  let ranges = get_ranges_rev ~transform ~big_enough ?max_slide ?score ~prev ~next () in
  let a = prev in
  let b = next in
  if context < 0
  then (
    let singleton_hunk =
      create_hunk 0 (Array.length a) 0 (Array.length b) (List.rev ranges)
    in
    [ singleton_hunk ])
  else (
    let rec aux ranges_remaining curr_ranges alo ahi blo bhi acc_hunks =
      match (ranges_remaining : _ Range.t list) with
      | [] ->
        (* Finish the last hunk *)
        let new_hunk = create_hunk alo ahi blo bhi curr_ranges in
        (* Add it to the accumulator *)
        let acc_hunks = new_hunk :: acc_hunks in
        (* Finished! Return the accumulator *)
        List.rev acc_hunks
      | [ Same range ] ->
        (* If the last range is a Same, we might need to crop to context. *)
        let stop = min (Array.length range) context in
        let new_range = Range.Same (range <|> (0, stop)) in
        let curr_ranges = new_range :: curr_ranges in
        (* Finish the current hunk *)
        let ahi = ahi + stop in
        let bhi = bhi + stop in
        let new_hunk = create_hunk alo ahi blo bhi curr_ranges in
        (* Add it to the accumulator *)
        let acc_hunks = new_hunk :: acc_hunks in
        (* Finished!
Return the accumulator *)
        List.rev acc_hunks
      | Same range :: rest ->
        let size = Array.length range in
        if size > context * 2
        then (
          (* If this Same range is sufficiently large, split off a new hunk *)
          let new_range = Range.Same (range <|> (0, context)) in
          let curr_ranges = new_range :: curr_ranges in
          (* Advance both hi's by context *)
          let ahi = ahi + context in
          let bhi = bhi + context in
          (* Finish the current hunk *)
          let new_hunk = create_hunk alo ahi blo bhi curr_ranges in
          (* Add it to the accumulator *)
          let acc_hunks = new_hunk :: acc_hunks in
          (* Calculate ranges for the next hunk *)
          let alo = ahi + size - (2 * context) in
          let ahi = alo in
          let blo = bhi + size - (2 * context) in
          let bhi = blo in
          (* Push the remainder of the Equal range back onto the remaining_ranges *)
          let rest = Range.Same (range <|> (size - context, size)) :: rest in
          aux rest [] alo ahi blo bhi acc_hunks)
        else (
          (* Otherwise, this range is small enough that it qualifies as context for
             the both the previous and forthcoming range, so simply add it to
             curr_ranges untouched *)
          let curr_ranges = Range.Same range :: curr_ranges in
          let ahi = ahi + size in
          let bhi = bhi + size in
          aux rest curr_ranges alo ahi blo bhi acc_hunks)
      | range :: rest ->
        (* Any range that isn't an Equal is important and not just context, so keep
           it in curr_ranges *)
        let curr_ranges = range :: curr_ranges in
        (* rest could be anything, so extract hunk_info from range *)
        let ahi, bhi =
          match range with
          | Same _ ->
            (* We eliminate the possibility of a Same above *)
            assert false
          | Unified _ ->
            (* get_ranges_rev never returns a Unified range *)
            assert false
          | Prev (_, Some _) | Next (_, Some _) | Replace (_, _, Some _) ->
            (* get_ranges_rev never finds moves *)
            assert false
          | Next (range, None) ->
            let stop = bhi + Array.length range in
            ahi, stop
          | Prev (range, None) ->
            let stop = ahi + Array.length range in
            stop, bhi
          | Replace (a_range, b_range, None) ->
            let prev_stop = ahi + Array.length a_range in
            let next_stop = bhi + Array.length b_range in
            prev_stop, next_stop
        in
        aux rest curr_ranges alo ahi blo bhi acc_hunks
    in
    let ranges, alo, ahi, blo, bhi =
      match ranges with
      (* If the first range is an Equal, shave off the front of the range, according to
         context. Keep it on the ranges list so hunk construction can see where the range
         begins *)
      | Same range :: rest ->
        let stop = Array.length range in
        let start = max 0 (stop - context) in
        let new_range = Range.Same (range <|> (start, stop)) in
        new_range :: rest, start, start, start, start
      | rest -> rest, 0, 0, 0, 0
    in
    aux ranges [] alo ahi blo bhi [])
;;

(* 2 * |matches| / (|a| + |b|): the fraction of tokens across both inputs that
   participate in some match. *)
let match_ratio a b =
  (matches a b |> List.length |> ( * ) 2 |> float)
  /.
(Array.length a + Array.length b |> float)
;;

(* N-way analogue of [collapse_sequences]: [matches] is a list of index tuples
   (represented as lists, one index per input sequence).  Collapse runs where
   every coordinate advances by exactly one per step into
   ([starting indices], run_length) pairs. *)
let collapse_multi_sequences matches =
  let collapsed = ref [] in
  let value_exn x = Option.value_exn x in
  if List.is_empty matches
  then []
  else (
    (* [start] holds the first tuple of the current run; [None]s mean no run
       has been started yet. *)
    let start = Array.create ~len:(List.length (List.hd_exn matches)) None in
    let length = ref 0 in
    List.iter matches ~f:(fun il ->
      if Array.for_all start ~f:Option.is_some
         && List.mapi il ~f:(fun i x -> x = value_exn start.(i) + !length)
            |> List.for_all ~f:(fun x -> x)
      then incr length
      else (
        (* Run broken: flush the finished run (if any), start a new one. *)
        if Array.for_all start ~f:Option.is_some
        then
          collapsed
          := (Array.map start ~f:value_exn |> Array.to_list, !length) :: !collapsed;
        List.iteri il ~f:(fun i x -> start.(i) <- Some x);
        length := 1));
    (* Flush the final run. *)
    if Array.for_all start ~f:Option.is_some && !length <> 0
    then
      collapsed
      := (Array.map start ~f:value_exn |> Array.to_list, !length) :: !collapsed;
    List.rev !collapsed)
;;

type 'a segment =
  | Same of 'a array
  | Different of 'a array array

type 'a merged_array = 'a segment list

(* [Array.mapi] over two arrays zipped elementwise; raises (via
   [Array.zip_exn]) if the lengths differ. *)
let array_mapi2 ar1 ar2 ~f =
  Array.zip_exn ar1 ar2 |> Array.mapi ~f:(fun i (x, y) -> f i x y)
;;

(* Merge several sequences into alternating [Same]/[Different] segments.  Each
   ar.(i>0) is diffed against ar.(0); a position of ar.(0) contributes to a
   [Same] segment only when it is matched in every other sequence. *)
let merge ar =
  if Array.length ar = 0
  then []
  else if Array.length ar = 1
  then [ Same ar.(0) ]
  else (
    let matches's = Array.map (ar <|> (1, Array.length ar)) ~f:(matches ar.(0)) in
    let len = Array.length ar in
    (* hashtbl : index in ar.(0) -> [(which other sequence, index there)] for
       every pairwise match found above. *)
    let hashtbl = Int.Table.create () ~size:0 in
    Array.iteri matches's ~f:(fun i matches ->
      List.iter matches ~f:(fun (a, b) ->
        match Hashtbl.find hashtbl a with
        | None -> Hashtbl.set hashtbl ~key:a ~data:[ i, b ]
        | Some l -> Hashtbl.set hashtbl ~key:a ~data:((i, b) :: l)));
    (* Keep only positions matched in all (len - 1) other sequences. *)
    let list =
      Hashtbl.to_alist hashtbl
      |> List.filter_map ~f:(fun (a, l) ->
        if List.length l = len - 1
        then Some (a :: (List.sort l ~compare:compare_int_pair |> List.map ~f:snd))
        else None)
      |> List.sort ~compare:(List.compare Int.compare)
    in
    let matching_blocks = collapse_multi_sequences list in
    let last_pos = Array.create ~len:(Array.length ar) 0 in
    let merged_array = ref [] in
    List.iter matching_blocks ~f:(fun (l, len) ->
      let ar' = Array.of_list l in
      (* Any gap before this run becomes a [Different] segment. *)
      if Array.compare Int.compare last_pos ar' <> 0
      then
        merged_array
        := Different (array_mapi2 last_pos ar' ~f:(fun i n m -> ar.(i) <|> (n, m)))
           :: !merged_array;
      merged_array := Same (ar.(0) <|> (ar'.(0), ar'.(0) + len)) :: !merged_array;
      Array.iteri last_pos ~f:(fun i _ -> last_pos.(i) <- ar'.(i) + len));
    (* Trailing unmatched material, if any, forms one last [Different]. *)
    let trailing_lines =
      Array.existsi last_pos ~f:(fun i last_pos -> Array.length ar.(i) > last_pos)
    in
    if trailing_lines
    then
      merged_array
      := Different
           (Array.mapi last_pos ~f:(fun i n -> ar.(i) <|> (n, Array.length ar.(i))))
         :: !merged_array;
    List.rev !merged_array)
;;
end

module%test _ = struct
  module P = Make (Int)

  let%test_unit _ =
    let check a b ~expect = [%test_result: (int * int) list] (P.matches a b) ~expect in
    check [||] [||] ~expect:[];
    check [| 0 |] [| 0 |] ~expect:[ 0, 0 ];
    check [| 0; 1; 1; 2 |] [| 3; 1; 4; 5 |] ~expect:[ 1, 1 ]
  ;;

  (* Needs the plain diff section *)

  (* [is_increasing a l] holds when [a :: l] is non-decreasing. *)
  let rec is_increasing a = function
    | [] -> true
    | hd :: tl -> Int.compare a hd <= 0 && is_increasing hd tl
  ;;

  (* Assert that the longest-increasing-subsequence result is non-decreasing in
     both coordinates. *)
  let check_lis a =
    let b =
      Patience.longest_increasing_subsequence (Ordered_sequence.create (Array.of_list a))
    in
    if is_increasing (-1) (List.map b ~f:fst) && is_increasing (-1) (List.map b ~f:snd)
    then ()
    else
      failwiths "invariant failure" (a, b) [%sexp_of: (int * int)
list * (int * int) list]
  ;;

  let%test_unit _ = check_lis [ 2, 0; 5, 1; 6, 2; 3, 3; 0, 4; 4, 5; 1, 6 ]
  let%test_unit _ = check_lis [ 0, 0; 2, 0; 5, 1; 6, 2; 3, 3; 0, 4; 4, 5; 1, 6 ]
  let%test_unit _ = check_lis [ 5, 1; 6, 2; 3, 3; 0, 4; 4, 5; 1, 6 ]

  (* [P.matches] must return pairs that are non-decreasing in both
     coordinates, even on a permuted input. *)
  let%test_unit _ =
    let check a b =
      let matches = P.matches a b in
      if is_increasing (-1) (List.map matches ~f:fst)
         && is_increasing (-1) (List.map matches ~f:snd)
      then ()
      else
        failwiths
          "invariant failure"
          (a, b, matches)
          [%sexp_of: int array * int array * (int * int) list]
    in
    check [| 0; 1; 2; 3; 4; 5; 6 |] [| 2; 5; 6; 3; 0; 4; 1 |]
  ;;
end

(* Ready-made instantiation for diffing arrays of strings (e.g. lines). *)
module String = Make (String)

(* Re-export the [Stable] submodules of each serialized type. *)
module Stable = struct
  module Matching_block = Matching_block.Stable
  module Range = Range.Stable
  module Hunk = Hunk.Stable
  module Hunks = Hunks.Stable
end