├── dune ├── dune-project ├── .ocamlformat ├── .gitignore ├── test ├── test_merge_diff.mli ├── test_limit_context.mli ├── import.ml ├── test_plain_diff_cutoff.mli ├── dune ├── patience_diff_lib_test.ml ├── test_plain_diff_cutoff.ml ├── test_limit_context.ml └── test_merge_diff.ml ├── src ├── patience_diff.mli ├── patience_diff_lib.ml ├── dune ├── matching_block.mli ├── matching_block.ml ├── move_kind.ml ├── move_id.ml ├── move_id.mli ├── hunks.mli ├── move_kind.mli ├── plain_diff.mli ├── hunks.ml ├── hunk.mli ├── range.mli ├── range.ml ├── hunk.ml ├── patience_diff_intf.ml ├── plain_diff.ml └── patience_diff.ml ├── Makefile ├── patience_diff.opam ├── CHANGES.md ├── LICENSE.md └── CONTRIBUTING.md /dune: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 3.17) 2 | -------------------------------------------------------------------------------- /.ocamlformat: -------------------------------------------------------------------------------- 1 | profile=janestreet 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.install 3 | *.merlin 4 | _opam 5 | 6 | -------------------------------------------------------------------------------- /test/test_merge_diff.mli: -------------------------------------------------------------------------------- 1 | (* This interface is deliberately empty. *) 2 | -------------------------------------------------------------------------------- /test/test_limit_context.mli: -------------------------------------------------------------------------------- 1 | (* This interface is deliberately empty. 
*) 2 | -------------------------------------------------------------------------------- /src/patience_diff.mli: -------------------------------------------------------------------------------- 1 | include Patience_diff_intf.Patience_diff (** @inline *) 2 | -------------------------------------------------------------------------------- /test/import.ml: -------------------------------------------------------------------------------- 1 | include Expect_test_helpers_core 2 | include Patience_diff_lib 3 | -------------------------------------------------------------------------------- /test/test_plain_diff_cutoff.mli: -------------------------------------------------------------------------------- 1 | (* This interface is deliberately empty. *) 2 | -------------------------------------------------------------------------------- /src/patience_diff_lib.ml: -------------------------------------------------------------------------------- 1 | module Patience_diff = Patience_diff 2 | module Plain_diff = Plain_diff 3 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name patience_diff_lib) 3 | (public_name patience_diff) 4 | (libraries base core) 5 | (preprocess 6 | (pps ppx_jane))) 7 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name patience_diff_lib_test) 3 | (libraries base core expect_test_helpers_core patience_diff_lib) 4 | (preprocess 5 | (pps ppx_jane))) 6 | -------------------------------------------------------------------------------- /test/patience_diff_lib_test.ml: -------------------------------------------------------------------------------- 1 | module Import = Import 2 | module Test_limit_context = Test_limit_context 3 | module Test_merge_diff = Test_merge_diff 4 | module 
Test_plain_diff_cutoff = Test_plain_diff_cutoff 5 | -------------------------------------------------------------------------------- /src/matching_block.mli: -------------------------------------------------------------------------------- 1 | open! Core 2 | 3 | type t = 4 | { prev_start : int 5 | ; next_start : int 6 | ; length : int 7 | } 8 | 9 | module Stable : sig 10 | module V1 : sig 11 | type nonrec t = t [@@deriving sexp, bin_io] 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /src/matching_block.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V1 = struct 5 | type t = 6 | { prev_start : int 7 | ; next_start : int 8 | ; length : int 9 | } 10 | [@@deriving sexp, bin_io] 11 | end 12 | end 13 | 14 | open! Core 15 | include Stable.V1 16 | -------------------------------------------------------------------------------- /src/move_kind.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V1 = struct 5 | type t = 6 | | Move of Move_id.Stable.V1.t 7 | | Within_move of Move_id.Stable.V1.t 8 | [@@deriving sexp, bin_io, compare ~localize] 9 | end 10 | end 11 | 12 | open! 
Core 13 | include Stable.V1 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | INSTALL_ARGS := $(if $(PREFIX),--prefix $(PREFIX),) 2 | 3 | default: 4 | dune build 5 | 6 | install: 7 | dune install $(INSTALL_ARGS) 8 | 9 | uninstall: 10 | dune uninstall $(INSTALL_ARGS) 11 | 12 | reinstall: uninstall install 13 | 14 | clean: 15 | dune clean 16 | 17 | .PHONY: default install uninstall reinstall clean 18 | -------------------------------------------------------------------------------- /src/move_id.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V1 = struct 5 | type t = int [@@deriving sexp, bin_io, compare ~localize] 6 | end 7 | end 8 | 9 | open! Core 10 | include Stable.V1 11 | include Comparable.Make_plain (Stable.V1) 12 | 13 | let zero = 0 14 | let succ = Int.succ 15 | let to_string = Int.to_string 16 | -------------------------------------------------------------------------------- /src/move_id.mli: -------------------------------------------------------------------------------- 1 | open! Core 2 | 3 | (** Each move identified in the code is given a unique move ID which can be used to 4 | distinguish it from other moves. *) 5 | type t [@@deriving sexp, compare ~localize] 6 | 7 | include Comparable.S_plain with type t := t 8 | 9 | val to_string : t -> string 10 | 11 | (** Return the 0th move index *) 12 | val zero : t 13 | 14 | (** Get the next move index *) 15 | val succ : t -> t 16 | 17 | module Stable : sig 18 | module V1 : sig 19 | type nonrec t = t [@@deriving sexp, bin_io, compare ~localize] 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /src/hunks.mli: -------------------------------------------------------------------------------- 1 | open! 
Core 2 | 3 | type 'a t = 'a Hunk.t list 4 | 5 | (** [unified t] converts all Replace ranges in [t] to an Prev range followed by a Next 6 | range. *) 7 | val unified : 'a t -> 'a t 8 | 9 | (** [ranges t] concatenates all the ranges of all hunks together **) 10 | val ranges : 'a t -> 'a Range.t list 11 | 12 | val concat_map_ranges : 'a t -> f:('a Range.t -> 'b Range.t list) -> 'b t 13 | 14 | module Stable : sig 15 | module V2 : sig 16 | type nonrec 'a t = 'a t [@@deriving sexp, bin_io] 17 | end 18 | 19 | module V1 : sig 20 | type nonrec 'a t [@@deriving sexp, bin_io] 21 | 22 | val to_v2 : 'a t -> 'a V2.t 23 | val of_v2_no_moves_exn : 'a V2.t -> 'a t 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /test/test_plain_diff_cutoff.ml: -------------------------------------------------------------------------------- 1 | open! Base 2 | open Import 3 | 4 | let%expect_test "exercise cutoff code path" = 5 | let arr1 = [| 0; 1; 2; 3; 4; 5; 6; 42; 43; 7; 8; 9 |] in 6 | let arr2 = [| 9; 8; 7; 42; 43; 6; 5; 4; 3; 2; 1; 0 |] in 7 | let do_diff ~cutoff = 8 | Plain_diff.iter_matches 9 | ?cutoff 10 | arr1 11 | arr2 12 | ~f:(fun (i, j) -> 13 | assert (arr1.(i) = arr2.(j)); 14 | print_s [%sexp (arr1.(i) : int), ((i, j) : int * int)]) 15 | ~hashable:(module Int) 16 | in 17 | do_diff ~cutoff:None; 18 | [%expect 19 | {| 20 | (42 (7 3)) 21 | (43 (8 4)) 22 | |}]; 23 | do_diff ~cutoff:(Some 3); 24 | (* worse diff, but correct *) 25 | [%expect {| (9 (11 0)) |}] 26 | ;; 27 | -------------------------------------------------------------------------------- /patience_diff.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | maintainer: "Jane Street developers" 3 | authors: ["Jane Street Group, LLC"] 4 | homepage: "https://github.com/janestreet/patience_diff" 5 | bug-reports: "https://github.com/janestreet/patience_diff/issues" 6 | dev-repo: 
"git+https://github.com/janestreet/patience_diff.git" 7 | doc: "https://ocaml.janestreet.com/ocaml-core/latest/doc/patience_diff/index.html" 8 | license: "MIT" 9 | build: [ 10 | ["dune" "build" "-p" name "-j" jobs] 11 | ] 12 | depends: [ 13 | "ocaml" {>= "5.1.0"} 14 | "base" 15 | "core" 16 | "ppx_jane" 17 | "dune" {>= "3.17.0"} 18 | ] 19 | available: arch != "arm32" & arch != "x86_32" 20 | synopsis: "Diff library using Bram Cohen's patience diff algorithm" 21 | description: " 22 | " 23 | -------------------------------------------------------------------------------- /src/move_kind.mli: -------------------------------------------------------------------------------- 1 | (** If a given range is part of a move it will have a [Move_kind.t]. If the move is simple 2 | with no ranges there will just be two ranges: One [Prev] and one [Next] that share the 3 | same [Move MOVE_INDEX] where the index is used to identify a given move as the same. 4 | 5 | If the move has modifications like additions and deletions then the [Next] part of the 6 | move will have replaces with [Within_move MOVE_INDEX] to denote they are just 7 | modifications to the moved code. *) 8 | type t = 9 | | Move of Move_id.t 10 | | Within_move of Move_id.t 11 | [@@deriving sexp, compare ~localize] 12 | 13 | module Stable : sig 14 | module V1 : sig 15 | type nonrec t = t [@@deriving sexp, bin_io, compare ~localize] 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /src/plain_diff.mli: -------------------------------------------------------------------------------- 1 | (** Basic Myers diff algorithm, translated from GNU diff. **) 2 | 3 | (** [iter_matches ?cutoff ~f ~hashable a b] diffs the arrays [a] and [b] (as in 4 | /usr/bin/diff), and calls [f] on each element of the longest common subsequence in 5 | increasing order. The arguments of [f] are the indices in [a] and [b], respectively, 6 | of that element. 
7 | 8 | The [cutoff] is an upper bound on the minimum edit distance between [a] and [b]. When 9 | [cutoff] is exceeded, [iter_matches] returns a correct, but not necessarily minimal 10 | diff. It defaults to about [sqrt (Array.length a + Array.length b)]. *) 11 | val iter_matches 12 | : ?cutoff:int 13 | -> f:(int * int -> unit) 14 | -> hashable:'a Base.Hashtbl.Key.t 15 | -> 'a array 16 | -> 'a array 17 | -> unit 18 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | ## Release v0.17.0 2 | 3 | - Can now find text that has been moved with the `find_moves` parameter 4 | - `Range` now can have a `Move_id` which identifies which move the current `Range` belongs to. 5 | Multiple ranges can belong to the same move. 6 | 7 | ## Release v0.16.0 8 | 9 | * Added `?max_slide` and `?score` arguments to `get_matching_blocks` and `get_hunks`. 10 | 11 | ## Old pre-v0.15 changelogs (very likely stale and incomplete) 12 | 13 | ## 113.24.00 14 | 15 | - Switch to PPX. 16 | 17 | ## 112.24.00 18 | 19 | Update references to `Core.Std.Dequeue` to refer to `Core.Std.Deque` 20 | 21 | ## 111.25.00 22 | 23 | - refactoring and more unit tests 24 | 25 | ## 111.21.00 26 | 27 | - Added plain differ `Plain_diff` and use it in some cases for 28 | improved results. 29 | - Move modules under `Patience_diff_lib.Std`. 30 | 31 | ## 111.17.00 32 | 33 | - Exposed `Patience_diff.matches`. 34 | 35 | ## 111.13.00 36 | 37 | - Moved `Patience_diff` out of `Core_extended` into its own library 38 | depending only on `Core_kernel`. 39 | 40 | -------------------------------------------------------------------------------- /src/hunks.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! 
Core.Core_stable 3 | module Hunk = Hunk.Stable 4 | 5 | module V2 = struct 6 | type 'a t = 'a Hunk.V2.t list [@@deriving sexp, bin_io] 7 | end 8 | 9 | module V1 = struct 10 | type 'a t = 'a Hunk.V1.t list [@@deriving sexp, bin_io] 11 | 12 | let to_v2 = Core.List.map ~f:Hunk.V1.to_v2 13 | let of_v2_no_moves_exn = Core.List.map ~f:Hunk.V1.of_v2_no_moves_exn 14 | end 15 | end 16 | 17 | open! Core 18 | include Stable.V2 19 | 20 | let concat_map_ranges hunks ~f = List.map hunks ~f:(Hunk.concat_map ~f) 21 | 22 | let unified hunks = 23 | let f : 'a Range.t -> 'a Range.t list = function 24 | | Replace (l_range, r_range, move_id) -> 25 | let move_kind = 26 | Option.map move_id ~f:(fun move_id -> Move_kind.Within_move move_id) 27 | in 28 | [ Prev (l_range, move_kind); Next (r_range, move_kind) ] 29 | | range -> [ range ] 30 | in 31 | concat_map_ranges hunks ~f 32 | ;; 33 | 34 | let ranges hunks = List.concat_map hunks ~f:(fun hunk -> hunk.Hunk.ranges) 35 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2009--2025 Jane Street Group, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/hunk.mli: -------------------------------------------------------------------------------- 1 | (** In diff terms, a hunk is a unit of consecutive ranges with some [Same] context before 2 | and after [Next], [Prev], and [Replace] ranges. Each hunk contains information about 3 | the original arrays, specifically the starting indexes and the number of elements in 4 | both arrays to which the hunk refers. 5 | 6 | Furthermore, a diff is essentially a list of hunks. The simplest case is a diff with 7 | infinite context, consisting of exactly one hunk. *) 8 | 9 | open! Core 10 | 11 | type 'a t = 12 | { prev_start : int 13 | ; prev_size : int 14 | ; next_start : int 15 | ; next_size : int 16 | ; ranges : 'a Range.t list 17 | } 18 | [@@deriving fields ~getters, sexp_of, compare ~localize] 19 | 20 | (** [all_same t] returns true if [t] contains only Same ranges. *) 21 | val all_same : 'a t -> bool 22 | 23 | (** [concat_map t ~f] applies [List.concat_map] on [t.ranges]. *) 24 | val concat_map : 'a t -> f:('a Range.t -> 'b Range.t list) -> 'b t 25 | 26 | (** Take a hunk that was generated with infinite context and limit it to the provided 27 | [context]. 
*) 28 | val limit_infinite_context_hunk_to_context : context:int -> 'a t -> 'a t list 29 | 30 | module Stable : sig 31 | module V2 : sig 32 | type nonrec 'a t = 'a t [@@deriving sexp, bin_io] 33 | end 34 | 35 | module V1 : sig 36 | type nonrec 'a t [@@deriving sexp, bin_io] 37 | 38 | val to_v2 : 'a t -> 'a V2.t 39 | val of_v2_no_moves_exn : 'a V2.t -> 'a t 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /test/test_limit_context.ml: -------------------------------------------------------------------------------- 1 | open! Core 2 | open Import 3 | 4 | module Test_case = struct 5 | type t = 6 | { prev : string array 7 | ; next : string array 8 | ; context : int 9 | } 10 | [@@deriving sexp_of] 11 | 12 | let gen = 13 | let open Quickcheck.Let_syntax in 14 | let%bind context = Int.gen_incl 1 10 in 15 | let%bind prev_length = Int.gen_incl 30 60 in 16 | let%bind next_length = Int.gen_incl 30 60 in 17 | let line_gen = 18 | Quickcheck.Generator.of_list [ "apple"; "banana"; "cherry"; "dog"; "egg"; "fish" ] 19 | in 20 | let%bind prev = Quickcheck.Generator.list_with_length prev_length line_gen in 21 | let%bind next = Quickcheck.Generator.list_with_length next_length line_gen in 22 | return { prev = List.to_array prev; next = List.to_array next; context } 23 | ;; 24 | end 25 | 26 | let%test_unit "Ensure that limiting infinite context hunk to context is correct" = 27 | Quickcheck.test ~sexp_of:Test_case.sexp_of_t Test_case.gen ~f:(fun test_case -> 28 | let hunks = 29 | Patience_diff.String.get_hunks 30 | ~transform:Fn.id 31 | ~context:test_case.context 32 | ~prev:test_case.prev 33 | ~next:test_case.next 34 | () 35 | in 36 | let infinite_context_hunk = 37 | Patience_diff.String.get_hunks 38 | ~transform:Fn.id 39 | ~context:(-1) 40 | ~prev:test_case.prev 41 | ~next:test_case.next 42 | () 43 | |> List.hd_exn 44 | in 45 | let limited_hunks = 46 | Patience_diff.Hunk.limit_infinite_context_hunk_to_context 47 | 
~context:test_case.context 48 | infinite_context_hunk 49 | in 50 | [%test_result: string Patience_diff.Hunk.t list] limited_hunks ~expect:hunks) 51 | ;; 52 | -------------------------------------------------------------------------------- /src/range.mli: -------------------------------------------------------------------------------- 1 | (** For handling diffs abstractly. A range is a subarray of the two original arrays with a 2 | constructor defining its relationship to the two original arrays. A [Same] range 3 | contains a series of elements which can be found in both arrays. A [Next] range 4 | contains elements found only in the second array, while an [Prev] range contains 5 | elements found only in the first array. 6 | 7 | If a range is part of a move it will have a non-None [Move_kind.t] or [Move_id.t] in 8 | the case of [Replace] and [Unified]. A [Prev] with a [Move _] [Move_kind] means that 9 | [Prev] has a corresponding [Next] that it was moved to. A [Prev] with a 10 | [Within_move _] [Move_kind] means that this was some code that was deleted within a 11 | block that moved to a [Next] position of the file. If a [Replace] or [Unified] range 12 | is associated with a move it can only be change within a move so they only hove a 13 | [Move_id.t option] instead of a [Move_kind.t option] like [Prev] or [Next]. 14 | 15 | A [Replace] contains two arrays: elements in the first output array are elements found 16 | only in the first input array, which have been replaced by elements in the second 17 | output array, which are elements found only in the second input array. 
*) 18 | 19 | type 'a t = 20 | | Same of ('a * 'a) array 21 | | Prev of 'a array * Move_kind.t option 22 | | Next of 'a array * Move_kind.t option 23 | | Replace of 'a array * 'a array * Move_id.t option 24 | | Unified of 'a array * Move_id.t option 25 | [@@deriving sexp, compare ~localize] 26 | 27 | (** [all_same ranges] returns true if all [ranges] are Same *) 28 | val all_same : 'a t list -> bool 29 | 30 | (** [prev_only ranges] drops all Next ranges and converts all Replace ranges to Prev 31 | ranges. *) 32 | val prev_only : 'a t list -> 'a t list 33 | 34 | (** [next_only ranges] drops all Prev ranges and converts all Replace ranges to Next 35 | ranges. *) 36 | val next_only : 'a t list -> 'a t list 37 | 38 | (** Counts number of elements. *) 39 | val prev_size : 'a t -> int 40 | 41 | val next_size : 'a t -> int 42 | 43 | module Stable : sig 44 | module V2 : sig 45 | type nonrec 'a t = 'a t [@@deriving sexp, bin_io, compare ~localize] 46 | end 47 | 48 | module V1 : sig 49 | type nonrec 'a t [@@deriving sexp, bin_io] 50 | 51 | val to_v2 : 'a t -> 'a V2.t 52 | val of_v2_no_moves_exn : 'a V2.t -> 'a t 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | This repository contains open source software that is developed and 2 | maintained by [Jane Street][js]. 3 | 4 | Contributions to this project are welcome and should be submitted via 5 | GitHub pull requests. 6 | 7 | Signing contributions 8 | --------------------- 9 | 10 | We require that you sign your contributions. Your signature certifies 11 | that you wrote the patch or otherwise have the right to pass it on as 12 | an open-source patch. 
The rules are pretty simple: if you can certify 13 | the below (from [developercertificate.org][dco]): 14 | 15 | ``` 16 | Developer Certificate of Origin 17 | Version 1.1 18 | 19 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 20 | 1 Letterman Drive 21 | Suite D4700 22 | San Francisco, CA, 94129 23 | 24 | Everyone is permitted to copy and distribute verbatim copies of this 25 | license document, but changing it is not allowed. 26 | 27 | 28 | Developer's Certificate of Origin 1.1 29 | 30 | By making a contribution to this project, I certify that: 31 | 32 | (a) The contribution was created in whole or in part by me and I 33 | have the right to submit it under the open source license 34 | indicated in the file; or 35 | 36 | (b) The contribution is based upon previous work that, to the best 37 | of my knowledge, is covered under an appropriate open source 38 | license and I have the right under that license to submit that 39 | work with modifications, whether created in whole or in part 40 | by me, under the same open source license (unless I am 41 | permitted to submit under a different license), as indicated 42 | in the file; or 43 | 44 | (c) The contribution was provided directly to me by some other 45 | person who certified (a), (b) or (c) and I have not modified 46 | it. 47 | 48 | (d) I understand and agree that this project and the contribution 49 | are public and that a record of the contribution (including all 50 | personal information I submit with it, including my sign-off) is 51 | maintained indefinitely and may be redistributed consistent with 52 | this project or the open source license(s) involved. 53 | ``` 54 | 55 | Then you just add a line to every git commit message: 56 | 57 | ``` 58 | Signed-off-by: Joe Smith 59 | ``` 60 | 61 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 62 | 63 | If you set your `user.name` and `user.email` git configs, you can sign 64 | your commit automatically with git commit -s. 
65 | 66 | [dco]: http://developercertificate.org/ 67 | [js]: https://opensource.janestreet.com/ 68 | -------------------------------------------------------------------------------- /src/range.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! Core.Core_stable 3 | 4 | module V2 = struct 5 | type 'a t = 6 | | Same of ('a * 'a) array 7 | | Prev of 'a array * Move_kind.Stable.V1.t option 8 | | Next of 'a array * Move_kind.Stable.V1.t option 9 | | Replace of 'a array * 'a array * Move_id.Stable.V1.t option 10 | | Unified of 'a array * Move_id.Stable.V1.t option 11 | [@@deriving sexp, bin_io, compare ~localize] 12 | end 13 | 14 | module V1 = struct 15 | type 'a t = 16 | | Same of ('a * 'a) array 17 | | Prev of 'a array 18 | | Next of 'a array 19 | | Replace of 'a array * 'a array 20 | | Unified of 'a array 21 | [@@deriving sexp, bin_io] 22 | 23 | let to_v2 : 'a t -> 'a V2.t = function 24 | | Same lines -> Same lines 25 | | Prev lines -> Prev (lines, None) 26 | | Next lines -> Next (lines, None) 27 | | Replace (lines_prev, lines_next) -> Replace (lines_prev, lines_next, None) 28 | | Unified lines -> Unified (lines, None) 29 | ;; 30 | 31 | let of_v2_no_moves_exn : 'a V2.t -> 'a t = function 32 | | Prev (_, Some _) | Next (_, Some _) | Replace (_, _, Some _) | Unified (_, Some _) 33 | -> Core.raise_s [%sexp "cannot convert to old patdiff version with a move"] 34 | | Same lines -> Same lines 35 | | Prev (lines, None) -> Prev lines 36 | | Next (lines, None) -> Next lines 37 | | Replace (lines_prev, lines_next, None) -> Replace (lines_prev, lines_next) 38 | | Unified (lines, None) -> Unified lines 39 | ;; 40 | end 41 | end 42 | 43 | open! 
Core 44 | include Stable.V2 45 | 46 | let all_same ranges = 47 | List.for_all ranges ~f:(fun range -> 48 | match range with 49 | | Same _ -> true 50 | | _ -> false) 51 | ;; 52 | 53 | let prev_and_next range = 54 | match range with 55 | | Same _ -> [ range ], [ range ] 56 | | Prev (_, (None | Some (Move _))) -> [ range ], [] 57 | | Prev (_, Some (Within_move _)) -> 58 | (* Don't include [Prev]s that are within moves because they are showing that some code 59 | was deleted from a different [Prev] to create a corresponding [Next]. *) 60 | [], [] 61 | | Next (_, _) -> [], [ range ] 62 | | Replace (l_range, r_range, None) -> [ Prev (l_range, None) ], [ Next (r_range, None) ] 63 | | Replace (_, r_range, Some move_id) -> 64 | (* Don't include [Replace]s for the same reason as a [Prev] within a move. *) 65 | [], [ Next (r_range, Some (Within_move move_id)) ] 66 | | Unified (_, Some _) -> [], [ range ] 67 | | Unified (_, None) -> [ range ], [ range ] 68 | ;; 69 | 70 | let prev_only ranges = List.concat_map ranges ~f:(fun range -> fst (prev_and_next range)) 71 | let next_only ranges = List.concat_map ranges ~f:(fun range -> snd (prev_and_next range)) 72 | 73 | let prev_size = function 74 | | Unified (lines, None) 75 | | Replace (lines, _, None) 76 | | Prev (lines, None) 77 | | Prev (lines, Some (Move _)) -> Array.length lines 78 | | Same lines -> Array.length lines 79 | (* Don't include [Prev]s that are within moves as they are conceptually [Next]s *) 80 | | Replace (_, _, Some _) | Prev (_, Some (Within_move _)) | Next _ | Unified (_, Some _) 81 | -> 0 82 | ;; 83 | 84 | let next_size = function 85 | | Unified (lines, _) | Replace (_, lines, _) | Next (lines, _) -> Array.length lines 86 | | Same lines -> Array.length lines 87 | | Prev _ -> 0 88 | ;; 89 | -------------------------------------------------------------------------------- /test/test_merge_diff.ml: -------------------------------------------------------------------------------- 1 | open! 
Base 2 | open Import 3 | 4 | let merge_and_print arrs = 5 | Patience_diff.String.merge arrs 6 | |> List.iter ~f:(function 7 | | Same arr -> 8 | print_endline "---- Same -----"; 9 | Array.iter arr ~f:(fun line -> print_endline (" " ^ line)) 10 | | Different arrs -> 11 | print_endline "-- Different --"; 12 | Array.iteri arrs ~f:(fun idx -> 13 | Array.iter ~f:(fun line -> print_endline ("(" ^ Int.to_string idx ^ ")" ^ line)))) 14 | ;; 15 | 16 | let%expect_test "Identical documents" = 17 | let doc = [| "a"; "b"; "c"; "d" |] in 18 | merge_and_print [| doc |]; 19 | [%expect 20 | {| 21 | ---- Same ----- 22 | a 23 | b 24 | c 25 | d 26 | |}]; 27 | merge_and_print [| doc; doc |]; 28 | [%expect 29 | {| 30 | ---- Same ----- 31 | a 32 | b 33 | c 34 | d 35 | |}]; 36 | merge_and_print [| doc; doc; doc |]; 37 | [%expect 38 | {| 39 | ---- Same ----- 40 | a 41 | b 42 | c 43 | d 44 | |}] 45 | ;; 46 | 47 | let%expect_test "Empty documents" = 48 | let doc = [| "a"; "b"; "c" |] in 49 | merge_and_print [| doc; [||] |]; 50 | [%expect 51 | {| 52 | -- Different -- 53 | (0)a 54 | (0)b 55 | (0)c 56 | |}]; 57 | merge_and_print [| [||]; doc |]; 58 | [%expect 59 | {| 60 | -- Different -- 61 | (1)a 62 | (1)b 63 | (1)c 64 | |}]; 65 | let doc = [| "a"; "b"; "c" |] in 66 | merge_and_print [| doc; [| "" |] |]; 67 | [%expect 68 | {| 69 | -- Different -- 70 | (0)a 71 | (0)b 72 | (0)c 73 | (1) 74 | |}]; 75 | merge_and_print [| [| "" |]; doc |]; 76 | [%expect 77 | {| 78 | -- Different -- 79 | (0) 80 | (1)a 81 | (1)b 82 | (1)c 83 | |}] 84 | ;; 85 | 86 | let%expect_test "Documents with trailing added lines" = 87 | let short = [| "a"; "b" |] in 88 | let long = [| "a"; "b"; "c"; "d" |] in 89 | merge_and_print [| short; long |]; 90 | [%expect 91 | {| 92 | ---- Same ----- 93 | a 94 | b 95 | -- Different -- 96 | (1)c 97 | (1)d 98 | |}]; 99 | merge_and_print [| long; short |]; 100 | [%expect 101 | {| 102 | ---- Same ----- 103 | a 104 | b 105 | -- Different -- 106 | (0)c 107 | (0)d 108 | |}] 109 | ;; 110 | 111 | 
let%expect_test "Documents with leading added lines" = 112 | let short = [| "c"; "d" |] in 113 | let long = [| "a"; "b"; "c"; "d" |] in 114 | merge_and_print [| short; long |]; 115 | [%expect 116 | {| 117 | -- Different -- 118 | (1)a 119 | (1)b 120 | ---- Same ----- 121 | c 122 | d 123 | |}]; 124 | merge_and_print [| long; short |]; 125 | [%expect 126 | {| 127 | -- Different -- 128 | (0)a 129 | (0)b 130 | ---- Same ----- 131 | c 132 | d 133 | |}] 134 | ;; 135 | 136 | let%expect_test "Mixed documents with changes" = 137 | let short = [| "a"; "b"; "foo"; "c"; "d" |] in 138 | let long = [| "a"; "b"; "c"; "bar"; "d" |] in 139 | merge_and_print [| short; long |]; 140 | [%expect 141 | {| 142 | ---- Same ----- 143 | a 144 | b 145 | -- Different -- 146 | (0)foo 147 | ---- Same ----- 148 | c 149 | -- Different -- 150 | (1)bar 151 | ---- Same ----- 152 | d 153 | |}] 154 | ;; 155 | -------------------------------------------------------------------------------- /src/hunk.ml: -------------------------------------------------------------------------------- 1 | module Stable = struct 2 | open! 
Core.Core_stable 3 | module Range = Range.Stable 4 | 5 | module V2 = struct 6 | type 'a t = 7 | { prev_start : int 8 | ; prev_size : int 9 | ; next_start : int 10 | ; next_size : int 11 | ; ranges : 'a Range.V2.t list 12 | } 13 | [@@deriving fields ~getters, sexp, bin_io, compare ~localize] 14 | end 15 | 16 | module V1 = struct 17 | type 'a t = 18 | { prev_start : int 19 | ; prev_size : int 20 | ; next_start : int 21 | ; next_size : int 22 | ; ranges : 'a Range.V1.t list 23 | } 24 | [@@deriving fields ~getters, sexp, bin_io] 25 | 26 | let to_v2 t = 27 | { V2.prev_start = t.prev_start 28 | ; prev_size = t.prev_size 29 | ; next_start = t.next_start 30 | ; next_size = t.next_size 31 | ; ranges = Core.List.map t.ranges ~f:Range.V1.to_v2 32 | } 33 | ;; 34 | 35 | let of_v2_no_moves_exn (t : _ V2.t) = 36 | { prev_start = t.prev_start 37 | ; prev_size = t.prev_size 38 | ; next_start = t.next_start 39 | ; next_size = t.next_size 40 | ; ranges = Core.List.map t.ranges ~f:Range.V1.of_v2_no_moves_exn 41 | } 42 | ;; 43 | end 44 | end 45 | 46 | open! 
Core
include Stable.V2

(* Sanity check (currently unused, hence the leading underscore): the cached
   [prev_size] / [next_size] fields must agree with the sizes recomputed from
   [t.ranges]. *)
let _invariant t =
  Invariant.invariant t [%sexp_of: _ t] (fun () ->
    [%test_result: int]
      (List.sum (module Int) t.ranges ~f:Range.prev_size)
      ~expect:t.prev_size
      ~message:"prev_size";
    [%test_result: int]
      (List.sum (module Int) t.ranges ~f:Range.next_size)
      ~expect:t.next_size
      ~message:"next_size")
;;

(* A hunk is "all same" iff every one of its ranges is a [Same] range. *)
let all_same hunk = Range.all_same hunk.ranges

(* Replace the ranges with the concatenation of [f] applied to each range.
   NOTE(review): the cached size/start fields are kept as-is, so [f] is
   presumably expected to preserve total sizes — confirm at call sites. *)
let concat_map t ~f = { t with ranges = List.concat_map t.ranges ~f }

(* Split one hunk (built with infinite context, i.e. a single hunk covering the
   whole input) into a list of hunks whose [Same] ranges keep at most [context]
   lines on each side of a difference.  Interior [Same] ranges longer than
   [2 * context] are cut in two (a [context]-line tail for the previous hunk
   and a [context]-line head for the next); leading/trailing [Same] ranges are
   simply truncated. *)
let limit_infinite_context_hunk_to_context ~context hunk =
  (* [trim_range] decides whether a range must be shortened: only [Same]
     ranges longer than [context] are trimmed. *)
  let trim_range range =
    match range with
    | Range.Same arr ->
      let len = Array.length arr in
      if len <= context
      then `Did_not_trim range
      else (
        let start = Range.Same (Array.sub arr ~pos:0 ~len:context) in
        let end_ = Range.Same (Array.sub arr ~pos:(len - context) ~len:context) in
        (* May be <= 0 when context < len <= 2 * context; see the
           [lines_trimmed > 0] test below. *)
        let lines_trimmed = len - (2 * context) in
        `Trimmed (lines_trimmed, start, end_))
    | _ -> `Did_not_trim range
  in
  (* Ranges accumulated for the hunk currently being built, plus how far this
     hunk starts after the end of the previous one. *)
  let working_ranges = Queue.create () in
  let working_ranges_offset_from_previous = ref 0 in
  let all_hunks = Queue.create () in
  let finish_hunk ~offset_from_previous_end =
    Queue.enqueue
      all_hunks
      (!working_ranges_offset_from_previous, Queue.to_list working_ranges);
    Queue.clear working_ranges;
    working_ranges_offset_from_previous := offset_from_previous_end
  in
  let last_range_index = List.length hunk.ranges - 1 in
  List.iteri hunk.ranges ~f:(fun i range ->
    match trim_range range with
    | `Did_not_trim range ->
      Queue.enqueue working_ranges range;
      if i = last_range_index then finish_hunk ~offset_from_previous_end:0
    | `Trimmed (lines_trimmed, start_range, end_range) ->
      if i = 0
      then (
        (* Leading [Same] range: drop its head entirely; the first hunk starts
           after the dropped lines plus the kept prefix. *)
        working_ranges_offset_from_previous := lines_trimmed + Range.prev_size start_range;
        Queue.enqueue working_ranges end_range)
      else if i = last_range_index
      then (
        (* Trailing [Same] range: keep only the head as trailing context. *)
        Queue.enqueue working_ranges start_range;
        finish_hunk ~offset_from_previous_end:0)
      else if lines_trimmed > 0
      then (
        (* Interior range long enough to split: close the current hunk and
           start the next one with the kept tail. *)
        Queue.enqueue working_ranges start_range;
        finish_hunk ~offset_from_previous_end:lines_trimmed;
        Queue.enqueue working_ranges end_range)
      else Queue.enqueue working_ranges range);
  (* Second pass: turn (offset, ranges) pairs into hunks with 1-based
     start positions and recomputed sizes. *)
  let prev_end = ref 1 in
  let next_end = ref 1 in
  Queue.to_list all_hunks
  |> List.filter_map ~f:(fun (offset_from_previous, ranges) ->
    if List.is_empty ranges
    then None
    else (
      let hunk =
        { ranges
        ; prev_start = !prev_end + offset_from_previous
        ; prev_size = List.sum (module Int) ranges ~f:Range.prev_size
        ; next_start = !next_end + offset_from_previous
        ; next_size = List.sum (module Int) ranges ~f:Range.next_size
        }
      in
      prev_end := hunk.prev_size + hunk.prev_start;
      next_end := hunk.next_size + hunk.next_start;
      Some hunk))
;;
--------------------------------------------------------------------------------
/src/patience_diff_intf.ml:
--------------------------------------------------------------------------------
(** This is a port of Bram Cohen's patience diff algorithm, as found in the Bazaar 1.14.1
    source code, available at http://bazaar-vcs.org.

    This copyright notice was included:

    # Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd # # This
    program is free software; you can redistribute it and/or modify # it under the terms
    of the GNU General Public License as published by # the Free Software Foundation;
    either version 2 of the License, or # (at your option) any later version. # # This
    program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY;
    without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR
    PURPOSE. See the # GNU General Public License for more details.
# # You should have 13 | received a copy of the GNU General Public License # along with this program; if not, 14 | write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, 15 | Boston, MA 02110-1301 USA *) 16 | 17 | (** {v 18 | Bram Cohen's comment from the original Python code (with syntax changed to OCaml): 19 | 20 | [get_matching_blocks a b] returns a list of triples describing matching 21 | subsequences. 22 | 23 | Each triple is of the form (i, j, n), and means that 24 | a <|> (i,i+n) = b <|> (j,j+n). The triples are monotonically increasing in 25 | i and in j. 26 | 27 | The last triple is a dummy, (Array.length a, Array.length b, 0), and is the only 28 | triple with n=0. 29 | 30 | Example: 31 | get_matching_blocks [|"a";"b";"x";"c";"d"|] [|"a";"b";"c";"d"|] 32 | returns 33 | [(0, 0, 2), (3, 2, 2), (5, 4, 0)] 34 | v} *) 35 | 36 | open! Core 37 | module Hunk = Hunk 38 | module Hunks = Hunks 39 | module Matching_block = Matching_block 40 | module Range = Range 41 | module Move_id = Move_id 42 | 43 | module type S = sig 44 | type elt 45 | 46 | (** Get_matching_blocks not only aggregates the data from [matches a b] but also 47 | attempts to remove random, semantically meaningless matches ("semantic cleanup"). 48 | The value of [big_enough] governs how aggressively we do so. See [get_hunks] below 49 | for more details. *) 50 | val get_matching_blocks 51 | : transform:('a -> elt) 52 | -> ?big_enough:int 53 | -> ?max_slide:int 54 | -> ?score:([ `left | `right ] -> 'a -> 'a -> int) 55 | -> prev:'a array 56 | -> next:'a array 57 | -> unit 58 | -> Matching_block.t list 59 | 60 | (** [matches a b] returns a list of pairs (i,j) such that a.(i) = b.(j) and such that 61 | the list is strictly increasing in both its first and second coordinates. This is 62 | essentially a "unfolded" version of what [get_matching_blocks] returns. 
Instead of 63 | grouping the consecutive matching blocks using [length] this function would return 64 | all the pairs (prev_start * next_start). *) 65 | val matches : elt array -> elt array -> (int * int) list 66 | 67 | (** [match_ratio a b] computes the ratio defined as: 68 | 69 | {[ 70 | 2 * len (matches a b) / (len a + len b) 71 | ]} 72 | 73 | It is an indication of how much alike a and b are. A ratio closer to 1.0 will 74 | indicate a number of matches close to the number of elements that can potentially 75 | match, thus is a sign that a and b are very much alike. On the other hand, a low 76 | ratio means very little match. *) 77 | val match_ratio : elt array -> elt array -> float 78 | 79 | (** [get_hunks ~transform ~context ~prev ~next] will compare the arrays [prev] and 80 | [next] and produce a list of hunks. (The hunks will contain Same ranges of at most 81 | [context] elements.) Negative [context] is equivalent to infinity (producing a 82 | singleton hunk list). The value of [big_enough] governs how aggressively we try to 83 | clean up spurious matches, by restricting our attention to only matches of length 84 | less than [big_enough]. Thus, setting [big_enough] to a higher value results in more 85 | aggressive cleanup, and the default value of 1 results in no cleanup at all. When 86 | this function is called by [Patdiff_core], the value of [big_enough] is 3 at the 87 | line level, and 7 at the word level. 88 | 89 | The value of [max_slide] controls how far we are willing to shift a diff (which is 90 | immediately preceded/followed by the same lines as it ends/starts with). We choose 91 | between equivalent positions by maximising the sum of the [score] function applied 92 | to the two boundaries of the diff. By default, [max_slide] is 0.
The arguments 93 | passed to [score] are firstly whether the boundary is at the start or end of the 94 | diff and then the values on either side of the boundary (if a boundary is considered 95 | at the start or end of the input, it gets a score of 100). *) 96 | val get_hunks 97 | : transform:('a -> elt) 98 | -> context:int 99 | -> ?big_enough:int 100 | -> ?max_slide:int 101 | -> ?score:([ `left | `right ] -> 'a -> 'a -> int) 102 | -> prev:'a array 103 | -> next:'a array 104 | -> unit 105 | -> 'a Hunk.t list 106 | 107 | type 'a segment = 108 | | Same of 'a array 109 | | Different of 'a array array 110 | 111 | type 'a merged_array = 'a segment list 112 | 113 | val merge : elt array array -> elt merged_array 114 | end 115 | 116 | module type Patience_diff = sig 117 | module Hunk = Hunk 118 | module Hunks = Hunks 119 | module Matching_block = Matching_block 120 | module Range = Range 121 | module Move_id = Move_id 122 | module Make (Elt : Hashtbl.Key) : S with type elt = Elt.t 123 | 124 | (* [String] uses String.compare *) 125 | module String : S with type elt = string 126 | 127 | module Stable : sig 128 | module Hunk = Hunk.Stable 129 | module Hunks = Hunks.Stable 130 | module Matching_block = Matching_block.Stable 131 | module Range = Range.Stable 132 | end 133 | end 134 | -------------------------------------------------------------------------------- /src/plain_diff.ml: -------------------------------------------------------------------------------- 1 | (* This files comes from camlp5 (ocaml_src/lib/diff.ml). *) 2 | (* 3 | * Copyright (c) 2007-2013, INRIA (Institut National de Recherches en 4 | * Informatique et Automatique). All rights reserved. 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * * Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 
10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * * Neither the name of INRIA, nor the names of its contributors may be 14 | * used to endorse or promote products derived from this software without 15 | * specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY INRIA AND CONTRIBUTORS ``AS IS'' AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 20 | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INRIA AND 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 24 | * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 27 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | *) 30 | (* $Id: diff.ml,v 1.2 2013-02-26 08:15:06 deraugla Exp $ *) 31 | (* Parts of Code of GNU diff (diffseq.h and analyze.c) translated to OCaml and adjusted. 32 | 33 | Basic algorithm described by Eugene W.Myers in: "An O(ND) Difference Algorithm and Its 34 | Variations" *) 35 | 36 | open Base 37 | 38 | (* A partition is the midpoint of the shortest edit script for a specified portion of two 39 | vectors. 40 | 41 | [xmid, ymid] is the midpoint discovered. The diagonal number [xmid - ymid] equals the 42 | number of inserted elements minus the number of deleted elements (counting only 43 | elements before the midpoint). 
44 | 45 | [lo_minimal] is true iff the minimal edit script for the left half of the partition is 46 | known; similarly for [hi_minimal]. 47 | *) 48 | module Partition = struct 49 | type t = 50 | { xmid : int 51 | ; ymid : int 52 | ; lo_minimal : bool 53 | ; hi_minimal : bool 54 | } 55 | end 56 | 57 | (* We keep this file in a C-like style so that we can easily compare against the original 58 | C, in which we have great confidence. *) 59 | (* Find the midpoint of the shortest edit script for a specified portion of the two 60 | vectors. 61 | 62 | Scan from the beginnings of the vectors, and simultaneously from the ends, doing a 63 | breadth-first search through the space of edit-sequence. When the two searches meet, we 64 | have found the midpoint of the shortest edit sequence. 65 | 66 | If [find_minimal] is true, find the minimal edit script regardless of expense. 67 | Otherwise, if the search is too expensive, use heuristics to stop the search and report 68 | a suboptimal answer. 69 | 70 | This function assumes that the first elements of the specified portions of the two 71 | vectors do not match, and likewise that the last elements do not match. The caller must 72 | trim matching elements from the beginning and end of the portions it is going to 73 | specify. 74 | 75 | If we return the "wrong" partitions, the worst this can do is cause suboptimal diff 76 | output. It cannot cause incorrect diff output. *) 77 | let diag ~fd ~bd ~sh ~xv ~yv ~xoff ~xlim ~yoff ~ylim ~too_expensive ~find_minimal 78 | : Partition.t 79 | = 80 | let dmin = xoff - ylim (* minimum valid diagonal *) in 81 | let dmax = xlim - yoff (* maximum valid diagonal *) in 82 | let fmid = xoff - yoff (* center diagonal of forward search *) in 83 | let bmid = xlim - ylim (* center diagonal of backward search *) in 84 | (* southeast corner is on an odd diagonal w.r.t the northwest *) 85 | let odd = (fmid - bmid) land 1 <> 0 in 86 | (* [sh] is an offset that lets us use indices in [[-(m+1), n+1]]. 
*) 87 | fd.(sh + fmid) <- xoff; 88 | bd.(sh + bmid) <- xlim; 89 | With_return.with_return (fun ({ return } : Partition.t With_return.return) -> 90 | (* [c] is cost. 91 | [fmin], [fmax] are limits of the forward search. 92 | [bmin], [bmax] are limits of the backward search. *) 93 | let rec loop ~c ~fmin ~fmax ~bmin ~bmax = 94 | (* Extend the forward search by one edit step in each diagonal. *) 95 | let fmin = 96 | if fmin > dmin 97 | then ( 98 | fd.(sh + fmin - 2) <- -1; 99 | fmin - 1) 100 | else fmin + 1 101 | in 102 | let fmax = 103 | if fmax < dmax 104 | then ( 105 | fd.(sh + fmax + 2) <- -1; 106 | fmax + 1) 107 | else fmax - 1 108 | in 109 | (* [d] is the active diagonal. *) 110 | (let rec loop d = 111 | if d < fmin 112 | then () 113 | else ( 114 | let tlo = fd.(sh + d - 1) in 115 | let thi = fd.(sh + d + 1) in 116 | let x = if tlo >= thi then tlo + 1 else thi in 117 | let x, y = 118 | let rec loop ~xv ~yv ~xlim ~ylim ~x ~y = 119 | if x < xlim && y < ylim && phys_equal (xv x) (yv y) 120 | then loop ~xv ~yv ~xlim ~ylim ~x:(x + 1) ~y:(y + 1) 121 | else x, y 122 | in 123 | loop ~xv ~yv ~xlim ~ylim ~x ~y:(x - d) 124 | in 125 | fd.(sh + d) <- x; 126 | if odd && bmin <= d && d <= bmax && bd.(sh + d) <= fd.(sh + d) 127 | then return { xmid = x; ymid = y; lo_minimal = true; hi_minimal = true } 128 | else loop (d - 2)) 129 | in 130 | loop fmax); 131 | (* Similarly extend the backward search. 
*) 132 | let bmin = 133 | if bmin > dmin 134 | then ( 135 | bd.(sh + bmin - 2) <- Int.max_value; 136 | bmin - 1) 137 | else bmin + 1 138 | in 139 | let bmax = 140 | if bmax < dmax 141 | then ( 142 | bd.(sh + bmax + 2) <- Int.max_value; 143 | bmax + 1) 144 | else bmax - 1 145 | in 146 | (let rec loop d = 147 | if d < bmin 148 | then () 149 | else ( 150 | let tlo = bd.(sh + d - 1) in 151 | let thi = bd.(sh + d + 1) in 152 | let x = if tlo < thi then tlo else thi - 1 in 153 | let x, y = 154 | let rec loop ~xv ~yv ~xoff ~yoff ~x ~y = 155 | if x > xoff && y > yoff && phys_equal (xv (x - 1)) (yv (y - 1)) 156 | then loop ~xv ~yv ~xoff ~yoff ~x:(x - 1) ~y:(y - 1) 157 | else x, y 158 | in 159 | loop ~xv ~yv ~xoff ~yoff ~x ~y:(x - d) 160 | in 161 | bd.(sh + d) <- x; 162 | if (not odd) && fmin <= d && d <= fmax && bd.(sh + d) <= fd.(sh + d) 163 | then return { xmid = x; ymid = y; lo_minimal = true; hi_minimal = true } 164 | else loop (d - 2)) 165 | in 166 | loop bmax); 167 | (* Heuristic: if we've gone well beyond the call of duty, give up and report halfway 168 | between our best results so far. *) 169 | if (not find_minimal) && c >= too_expensive 170 | then ( 171 | (* Find forward diagonal that maximizes [x + y]. *) 172 | let fxybest, fxbest = 173 | let rec loop ~d ~fxybest ~fxbest = 174 | if d < fmin 175 | then fxybest, fxbest 176 | else ( 177 | let x = Int.min fd.(sh + d) xlim in 178 | let y = x - d in 179 | let x, y = if ylim < y then ylim + d, ylim else x, y in 180 | let fxybest, fxbest = 181 | if fxybest < x + y then x + y, x else fxybest, fxbest 182 | in 183 | loop ~d:(d - 2) ~fxybest ~fxbest) 184 | in 185 | loop ~d:fmax ~fxybest:(-1) ~fxbest:fmax 186 | in 187 | (* Find backward diagonal that minimizes [x + y]. 
*) 188 | let bxybest, bxbest = 189 | let rec loop ~d ~bxybest ~bxbest = 190 | if d < bmin 191 | then bxybest, bxbest 192 | else ( 193 | let x = Int.max xoff bd.(sh + d) in 194 | let y = x - d in 195 | let x, y = if y < yoff then yoff + d, yoff else x, y in 196 | let bxybest, bxbest = 197 | if x + y < bxybest then x + y, x else bxybest, bxbest 198 | in 199 | loop ~d:(d - 2) ~bxybest ~bxbest) 200 | in 201 | loop ~d:bmax ~bxybest:Int.max_value ~bxbest:bmax 202 | in 203 | if xlim + ylim - bxybest < fxybest - (xoff + yoff) 204 | then 205 | return 206 | { xmid = fxbest 207 | ; ymid = fxybest - fxbest 208 | ; lo_minimal = true 209 | ; hi_minimal = false 210 | } 211 | else 212 | return 213 | { xmid = bxbest 214 | ; ymid = bxybest - bxbest 215 | ; lo_minimal = false 216 | ; hi_minimal = true 217 | }) 218 | else loop ~c:(c + 1) ~fmin ~fmax ~bmin ~bmax 219 | in 220 | loop ~c:1 ~fmin:fmid ~fmax:fmid ~bmin:bmid ~bmax:bmid) 221 | ;; 222 | 223 | let diff_loop ~cutoff a ai b bi n m = 224 | let fd = Array.create ~len:(n + m + 3) 0 in 225 | let bd = Array.create ~len:(n + m + 3) 0 in 226 | let sh = m + 1 in 227 | let too_expensive = 228 | match cutoff with 229 | | Some c -> c 230 | | None -> 231 | let diags = n + m + 3 in 232 | let rec loop diags too_expensive = 233 | if diags = 0 then too_expensive else loop (diags asr 2) (too_expensive lsl 1) 234 | in 235 | Int.max 4096 (loop diags 1) 236 | in 237 | let xvec i = a.(ai.(i)) in 238 | let yvec j = b.(bi.(j)) in 239 | let chng1 = Array.create ~len:(Array.length a) true in 240 | let chng2 = Array.create ~len:(Array.length b) true in 241 | for i = 0 to n - 1 do 242 | chng1.(ai.(i)) <- false 243 | done; 244 | for j = 0 to m - 1 do 245 | chng2.(bi.(j)) <- false 246 | done; 247 | let rec loop ~xoff ~xlim ~yoff ~ylim ~find_minimal = 248 | let xoff, yoff = 249 | let rec loop ~xoff ~yoff = 250 | if xoff < xlim && yoff < ylim && phys_equal (xvec xoff) (yvec yoff) 251 | then loop ~xoff:(xoff + 1) ~yoff:(yoff + 1) 252 | else xoff, yoff 253 | in 
254 | loop ~xoff ~yoff 255 | in 256 | let xlim, ylim = 257 | let rec loop ~xlim ~ylim = 258 | if xlim > xoff && ylim > yoff && phys_equal (xvec (xlim - 1)) (yvec (ylim - 1)) 259 | then loop ~xlim:(xlim - 1) ~ylim:(ylim - 1) 260 | else xlim, ylim 261 | in 262 | loop ~xlim ~ylim 263 | in 264 | if xoff = xlim 265 | then 266 | for y = yoff to ylim - 1 do 267 | chng2.(bi.(y)) <- true 268 | done 269 | else if yoff = ylim 270 | then 271 | for x = xoff to xlim - 1 do 272 | chng1.(ai.(x)) <- true 273 | done 274 | else ( 275 | let { Partition.xmid; ymid; lo_minimal; hi_minimal } = 276 | diag 277 | ~fd 278 | ~bd 279 | ~sh 280 | ~xv:xvec 281 | ~yv:yvec 282 | ~xoff 283 | ~xlim 284 | ~yoff 285 | ~ylim 286 | ~too_expensive 287 | ~find_minimal 288 | in 289 | loop ~xoff ~xlim:xmid ~yoff ~ylim:ymid ~find_minimal:lo_minimal; 290 | loop ~xoff:xmid ~xlim ~yoff:ymid ~ylim ~find_minimal:hi_minimal) 291 | in 292 | loop ~xoff:0 ~xlim:n ~yoff:0 ~ylim:m ~find_minimal:false; 293 | chng1, chng2 294 | ;; 295 | 296 | (* [make_indexer a b] returns an array of the indices of items of [a] which are also 297 | present in [b]; this way, the main algorithm can skip items which, anyway, are 298 | different. This improves the speed much. At the same time, this function updates the 299 | items of [a] and [b] so that all equal items point to the same unique item. All item 300 | comparisons in the main algorithm can therefore be done with [phys_equal] instead of 301 | [=], which can improve speed much. 
*)
let make_indexer hashable a b =
  let n = Array.length a in
  (* Interning table: first occurrence of each element of [b] becomes the
     canonical representative; later equal elements are overwritten with it so
     that equality reduces to [phys_equal]. *)
  let htb = Hashtbl.create hashable ~size:(10 * Array.length b) in
  Array.iteri
    ~f:(fun i e ->
      match Hashtbl.find htb e with
      | Some v -> b.(i) <- v
      | None -> Hashtbl.add_exn htb ~key:e ~data:e)
    b;
  (* Collect (in order) the indices of elements of [a] that also occur in [b],
     canonicalizing [a]'s elements along the way. *)
  let ai = Array.create ~len:n 0 in
  let k =
    let rec loop i k =
      if i = n
      then k
      else (
        let k =
          match Hashtbl.find htb a.(i) with
          | Some v ->
            a.(i) <- v;
            ai.(k) <- i;
            k + 1
          | None -> k
        in
        loop (i + 1) k)
    in
    loop 0 0
  in
  Array.sub ai ~pos:0 ~len:k
;;

(* Run the Myers diff over the indexed (shared-elements-only) views of [a] and
   [b]; returns the two per-element "changed" bit-arrays. *)
let f ~cutoff ~hashable a b =
  let ai = make_indexer hashable a b in
  let bi = make_indexer hashable b a in
  let n = Array.length ai in
  let m = Array.length bi in
  diff_loop ~cutoff a ai b bi n m
;;

(* Walk the two "changed" arrays in lockstep and call [ff (i1, i2)] for every
   pair of positions that the diff marked as matching. *)
let iter_matches ?cutoff ~f:ff ~hashable a b =
  let d1, d2 = f ~cutoff ~hashable a b in
  let rec aux i1 i2 =
    if i1 >= Array.length d1 || i2 >= Array.length d2
    then ()
    else if not d1.(i1)
    then
      if not d2.(i2)
      then (
        ff (i1, i2);
        aux (i1 + 1) (i2 + 1))
      else aux i1 (i2 + 1)
    else if not d2.(i2)
    then aux (i1 + 1) i2
    else aux (i1 + 1) (i2 + 1)
  in
  aux 0 0
;;
--------------------------------------------------------------------------------
/src/patience_diff.ml:
--------------------------------------------------------------------------------
open!
Core 2 | include Patience_diff_intf 3 | module Hunk = Hunk 4 | module Hunks = Hunks 5 | module Matching_block = Matching_block 6 | module Range = Range 7 | module Move_id = Move_id 8 | 9 | let ( <|> ) ar (i, j) = if j <= i then [||] else Array.slice ar i j 10 | 11 | (* Does the nitty gritty of turning indexes into 12 | line numbers and reversing the ranges, returning a nice new hunk *) 13 | let create_hunk prev_start prev_stop next_start next_stop ranges : _ Hunk.t = 14 | { prev_start = prev_start + 1 15 | ; prev_size = prev_stop - prev_start 16 | ; next_start = next_start + 1 17 | ; next_size = next_stop - next_start 18 | ; ranges = List.rev ranges 19 | } 20 | ;; 21 | 22 | module Ordered_sequence : sig 23 | type elt = int * int [@@deriving compare ~localize] 24 | 25 | (* A [t] has its second coordinates in increasing order *) 26 | 27 | type t = private elt array [@@deriving sexp_of] 28 | 29 | val create : (int * int) array -> t 30 | val is_empty : t -> bool 31 | end = struct 32 | type elt = int * int [@@deriving sexp_of] 33 | 34 | let%template compare_elt a b = 35 | (Comparable.lexicographic [@mode m]) 36 | [ (fun (_, y0) (_, y1) -> Int.compare y0 y1) 37 | ; (fun (x0, _) (x1, _) -> Int.compare x0 x1) 38 | ] 39 | a 40 | b 41 | [@@mode m = (local, global)] 42 | ;; 43 | 44 | type t = elt array [@@deriving sexp_of] 45 | 46 | let create t = 47 | Array.sort t ~compare:compare_elt; 48 | t 49 | ;; 50 | 51 | let is_empty = Array.is_empty 52 | end 53 | 54 | (* This is an implementation of the patience sorting algorithm as explained at 55 | http://en.wikipedia.org/wiki/Patience_sorting *) 56 | module Patience : sig 57 | val longest_increasing_subsequence : Ordered_sequence.t -> (int * int) list 58 | end = struct 59 | module Pile = struct 60 | type 'a t = 'a Stack.t 61 | 62 | let create x = 63 | let t = Stack.create () in 64 | Stack.push t x; 65 | t 66 | ;; 67 | 68 | let top t = Stack.top t |> Option.value_exn 69 | let put_on_top t x = Stack.push t x 70 | end 71 | 72 | module 
Piles = struct 73 | type 'a t = 'a Pile.t Deque.t 74 | 75 | let empty () : 'a t = Deque.create ~never_shrink:true () 76 | 77 | let get_ith_pile t i dir = 78 | let get index offset = 79 | Option.bind (index t) ~f:(fun index -> Deque.get_opt t (index + offset)) 80 | in 81 | match dir with 82 | | `From_left -> get Deque.front_index i 83 | | `From_right -> get Deque.back_index (-i) 84 | ;; 85 | 86 | let new_rightmost_pile t pile = Deque.enqueue_back t pile 87 | end 88 | 89 | module Backpointers = struct 90 | (* in the terminology of the Wikipedia article, this corresponds to a card together 91 | with its backpointers *) 92 | type 'a tag = 'a t 93 | 94 | and 'a t = 95 | { value : 'a 96 | ; tag : 'a tag option 97 | } 98 | 99 | let to_list t = 100 | let rec to_list acc t = 101 | match t.tag with 102 | | None -> t.value :: acc 103 | | Some t' -> to_list (t.value :: acc) t' 104 | in 105 | to_list [] t 106 | ;; 107 | end 108 | 109 | module Play_patience : sig 110 | val play_patience 111 | : Ordered_sequence.t 112 | -> get_tag: 113 | (pile_opt:int option 114 | -> piles:Ordered_sequence.elt Backpointers.t Piles.t 115 | -> Ordered_sequence.elt Backpointers.tag option) 116 | -> Ordered_sequence.elt Backpointers.t Piles.t 117 | end = struct 118 | let optimized_findi_from_left piles x = 119 | (* first see if any work *) 120 | let last_pile = Piles.get_ith_pile piles 0 `From_right in 121 | (* [x_pile] is a dummy pile just used for comparisons *) 122 | let x_pile = Pile.create { Backpointers.value = x, 0; tag = None } in 123 | let compare_top_values pile1 pile2 = 124 | let top pile = fst (Pile.top pile).Backpointers.value in 125 | Int.compare (top pile1) (top pile2) 126 | in 127 | let%bind.Option last_pile in 128 | if compare_top_values last_pile x_pile < 0 129 | then None 130 | else ( 131 | (* do binary search *) 132 | let r = 133 | Deque.binary_search 134 | piles 135 | `First_strictly_greater_than 136 | x_pile 137 | ~compare:compare_top_values 138 | in 139 | [%globalize: int 
option] r [@nontail]) 140 | ;; 141 | 142 | (* [play_patience ar ~get_tag] plays patience with the greedy algorithm as described 143 | in the Wikipedia article, taking [ar] to be the deck of cards. It returns the 144 | resulting [Piles.t]. Before putting an element of [ar] in a pile, it tags it using 145 | [get_tag]. [get_tag] takes as its arguments the full [Piles.t] in its current 146 | state, and also the specific [Pile.t] that the element of [ar] is being added to. 147 | *) 148 | let play_patience ar ~get_tag = 149 | let ar = (ar : Ordered_sequence.t :> Ordered_sequence.elt array) in 150 | if Array.length ar = 0 then raise (Invalid_argument "Patience_diff.play_patience"); 151 | let piles = Piles.empty () in 152 | Array.iter ar ~f:(fun x -> 153 | let pile_opt = optimized_findi_from_left piles (fst x) in 154 | let tagged_x = { Backpointers.value = x; tag = get_tag ~pile_opt ~piles } in 155 | match pile_opt with 156 | | None -> Piles.new_rightmost_pile piles (Pile.create tagged_x) 157 | | Some i -> 158 | let pile = Deque.get piles i in 159 | Pile.put_on_top pile tagged_x); 160 | piles 161 | ;; 162 | end 163 | 164 | let longest_increasing_subsequence ar = 165 | if Ordered_sequence.is_empty ar 166 | then [] 167 | else 168 | let module P = Play_patience in 169 | let get_tag ~pile_opt ~piles = 170 | match pile_opt with 171 | | None -> Piles.get_ith_pile piles 0 `From_right |> Option.map ~f:Pile.top 172 | | Some i -> 173 | if i = 0 174 | then None 175 | else 176 | Piles.get_ith_pile piles (i - 1) `From_left 177 | |> Option.value_exn 178 | |> Pile.top 179 | |> Option.some 180 | in 181 | let piles = P.play_patience ar ~get_tag in 182 | Piles.get_ith_pile piles 0 `From_right 183 | |> Option.value_exn 184 | |> Pile.top 185 | |> Backpointers.to_list 186 | ;; 187 | end 188 | 189 | let compare_int_pair = Tuple.T2.compare ~cmp1:Int.compare ~cmp2:Int.compare 190 | 191 | let%template _longest_increasing_subsequence ar = 192 | let ar = (ar : Ordered_sequence.t :> (int * int) 
array) in 193 | let len = Array.length ar in 194 | if len <= 1 195 | then Array.to_list ar 196 | else ( 197 | let maxlen = ref 0 in 198 | let m = Array.create ~len:(len + 1) (-1) in 199 | let pred = Array.create ~len:(len + 1) (-1) in 200 | for i = 0 to len - 1 do 201 | let p = 202 | Array.binary_search 203 | ~compare:Ordered_sequence.compare_elt 204 | ar 205 | `First_greater_than_or_equal_to 206 | ar.(i) 207 | ~len:(max (!maxlen - 1) 0) 208 | ~pos:1 209 | |> (Option.value [@mode local]) ~default:0 210 | in 211 | pred.(i) <- m.(p); 212 | if p = !maxlen || compare_int_pair ar.(i) ar.(p + 1) < 0 213 | then ( 214 | m.(p + 1) <- i; 215 | if p + 1 > !maxlen then maxlen := p + 1) 216 | done; 217 | let rec loop ac p = if p = -1 then ac else loop (ar.(p) :: ac) pred.(p) in 218 | loop [] m.(!maxlen)) 219 | ;; 220 | 221 | (* Configurable parameters for [semantic_cleanup] and [unique_lcs], all chosen based 222 | on empirical observation. *) 223 | (* This function is called on the edge case of semantic cleanup, when there's a change 224 | that's exactly the same length as the size of the match. If the insert on the next 225 | side is a LOT larger than the match, it should be semantically cleaned up, but 226 | most of the time it should be left alone. *) 227 | let should_discard_if_other_side_equal ~big_enough = 100 / big_enough 228 | 229 | (* These are the numerator and denominator of the cutoff for aborting the patience diff 230 | algorithm in [unique_lcs]. (This will result in us using [Plain_diff] instead.) 231 | Lowering [switch_to_plain_diff_numerator] / [switch_to_plain_diff_denominator] 232 | makes us switch to plain diff less often. The range of this cutoff is from 0 to 1, 233 | where 0 means we always switch and 1 means we never switch. 
*) 234 | let switch_to_plain_diff_numerator = 1 235 | let switch_to_plain_diff_denominator = 10 236 | 237 | module Make (Elt : Hashtbl.Key) = struct 238 | module Table = Hashtbl.Make (Elt) 239 | 240 | type elt = Elt.t 241 | 242 | (* This is an implementation of the patience diff algorithm by Bram Cohen as seen in 243 | Bazaar version 1.14.1 *) 244 | 245 | module Line_metadata = struct 246 | type t = 247 | | Unique_in_a of { index_in_a : int } 248 | | Unique_in_a_b of 249 | { index_in_a : int 250 | ; index_in_b : int 251 | } 252 | | Not_unique of { occurrences_in_a : int } 253 | end 254 | 255 | let unique_lcs (alpha, alo, ahi) (bravo, blo, bhi) = 256 | (* Create a hashtable which takes elements of a to their index in a iff they're 257 | unique. If an element is not unique, it takes it to its frequency in a. *) 258 | let unique : (elt, Line_metadata.t) Table.hashtbl = 259 | Table.create ~size:(Int.min (ahi - alo) (bhi - blo)) () 260 | in 261 | for x's_pos_in_a = alo to ahi - 1 do 262 | let x = alpha.(x's_pos_in_a) in 263 | match Hashtbl.find unique x with 264 | | None -> 265 | Hashtbl.set unique ~key:x ~data:(Unique_in_a { index_in_a = x's_pos_in_a }) 266 | | Some (Unique_in_a _) -> 267 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = 2 }) 268 | | Some (Not_unique { occurrences_in_a = n }) -> 269 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = n + 1 }) 270 | (* This case doesn't occur until the second pass through [unique] *) 271 | | Some (Unique_in_a_b _) -> assert false 272 | done; 273 | (* [num_pairs] is the size of the list we use for Longest Increasing Subsequence. 274 | [intersection_size] is the number of tokens in the intersection of the two 275 | sequences, with multiplicity, and is an upper bound on the size of the LCS. 
*) 276 | let num_pairs = ref 0 in 277 | let intersection_size = ref 0 in 278 | for x's_pos_in_b = blo to bhi - 1 do 279 | let x = bravo.(x's_pos_in_b) in 280 | Hashtbl.find unique x 281 | |> Option.iter ~f:(fun pos -> 282 | match pos with 283 | | Not_unique { occurrences_in_a = n } -> 284 | if n > 0 285 | then ( 286 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = n - 1 }); 287 | incr intersection_size) 288 | | Unique_in_a { index_in_a = x's_pos_in_a } -> 289 | incr num_pairs; 290 | incr intersection_size; 291 | Hashtbl.set 292 | unique 293 | ~key:x 294 | ~data:(Unique_in_a_b { index_in_a = x's_pos_in_a; index_in_b = x's_pos_in_b }) 295 | | Unique_in_a_b _ -> 296 | decr num_pairs; 297 | Hashtbl.set unique ~key:x ~data:(Not_unique { occurrences_in_a = 0 })) 298 | done; 299 | (* If we're ignoring almost all of the text when we perform the patience 300 | diff algorithm, it will often give bad results. *) 301 | if !num_pairs * switch_to_plain_diff_denominator 302 | < !intersection_size * switch_to_plain_diff_numerator 303 | then `Not_enough_unique_tokens 304 | else ( 305 | let a_b = 306 | let arr = Array.init !num_pairs ~f:(fun _ -> 0, 0) in 307 | let i = ref 0 in 308 | Hashtbl.iter unique ~f:(function 309 | | Not_unique _ | Unique_in_a _ -> () 310 | | Unique_in_a_b { index_in_a = i_a; index_in_b = i_b } -> 311 | arr.(!i) <- i_a, i_b; 312 | Int.incr i); 313 | Ordered_sequence.create arr 314 | in 315 | `Computed_lcs (Patience.longest_increasing_subsequence a_b)) 316 | ;; 317 | 318 | (* [matches a b] returns a list of pairs (i,j) such that a.(i) = b.(j) and such that 319 | the list is strictly increasing in both its first and second coordinates. 320 | 321 | This is done by first applying unique_lcs to find matches from a to b among those 322 | elements which are unique in both a and b, and then recursively applying [matches] to 323 | each subinterval determined by those matches. 
   The uniqueness requirement is waived
   for blocks of matching lines at the beginning or end.

   I couldn't figure out how to do this efficiently in a functional way, so
   this is pretty much a straight translation of the original Python code. *)
let matches alpha bravo =
  let matches_ref_length = ref 0 in
  let matches_ref = ref [] in
  (* Matches are pushed in increasing order and the accumulator is reversed at
     the end. *)
  let add_match m =
    incr matches_ref_length;
    matches_ref := m :: !matches_ref
  in
  (* Appends all matches for the sub-rectangle [alo, ahi) x [blo, bhi). *)
  let rec recurse_matches alo blo ahi bhi =
    (* printf "alo %d blo %d ahi %d bhi %d\n%!" alo blo ahi bhi; *)
    let old_length = !matches_ref_length in
    if not (alo >= ahi || blo >= bhi)
    then
      if Elt.compare alpha.(alo) bravo.(blo) = 0
      then (
        (* Matching prefix: consume it greedily, then recurse on the rest. *)
        let alo = ref alo in
        let blo = ref blo in
        while !alo < ahi && !blo < bhi && Elt.compare alpha.(!alo) bravo.(!blo) = 0 do
          add_match (!alo, !blo);
          incr alo;
          incr blo
        done;
        recurse_matches !alo !blo ahi bhi)
      else if Elt.compare alpha.(ahi - 1) bravo.(bhi - 1) = 0
      then (
        (* Matching suffix: locate where it begins, recurse on everything
           before it, then record the suffix matches in increasing order. *)
        let nahi = ref (ahi - 1) in
        let nbhi = ref (bhi - 1) in
        while
          !nahi > alo
          && !nbhi > blo
          && Elt.compare alpha.(!nahi - 1) bravo.(!nbhi - 1) = 0
        do
          decr nahi;
          decr nbhi
        done;
        recurse_matches alo blo !nahi !nbhi;
        for i = 0 to ahi - !nahi - 1 do
          add_match (!nahi + i, !nbhi + i)
        done)
      else (
        (* Middle: anchor on the unique-token LCS and recursively diff the gap
           between each consecutive pair of anchors.  Fall back to [Plain_diff]
           when [unique_lcs] declines or finds nothing. *)
        let last_a_pos = ref (alo - 1) in
        let last_b_pos = ref (blo - 1) in
        let plain_diff () =
          Plain_diff.iter_matches
            ~hashable:(module Elt)
            (Array.sub alpha ~pos:alo ~len:(ahi - alo))
            (Array.sub bravo ~pos:blo ~len:(bhi - blo))
            ~f:(fun (i1, i2) -> add_match (alo + i1, blo + i2))
        in
        match unique_lcs (alpha, alo, ahi) (bravo, blo, bhi) with
        | `Not_enough_unique_tokens -> plain_diff ()
        | `Computed_lcs lcs ->
          lcs
          |> List.iter ~f:(fun (apos, bpos) ->
            if !last_a_pos + 1 <> apos || !last_b_pos + 1 <> bpos
            then recurse_matches (!last_a_pos + 1) (!last_b_pos + 1) apos bpos;
            last_a_pos := apos;
            last_b_pos := bpos;
            add_match (apos, bpos));
          if !matches_ref_length > old_length (* Did unique_lcs find anything at all? *)
          then recurse_matches (!last_a_pos + 1) (!last_b_pos + 1) ahi bhi
          else plain_diff ())
  in
  recurse_matches 0 0 (Array.length alpha) (Array.length bravo);
  List.rev !matches_ref
;;

(* Collapse a strictly-increasing list of (prev, next) index pairs into maximal
   runs of consecutive pairs, emitting one [Matching_block.t] per run. *)
let collapse_sequences matches =
  let collapsed = ref [] in
  let start_a = ref None in
  let start_b = ref None in
  let length = ref 0 in
  List.iter matches ~f:(fun (i_a, i_b) ->
    match !start_a, !start_b with
    | Some start_a_val, Some start_b_val
      (* Both indices advanced by exactly one: extend the current run. *)
      when i_a = start_a_val + !length && i_b = start_b_val + !length -> incr length
    | _ ->
      (* Run broken: flush the finished run (if any) and start a new one. *)
      (match !start_a, !start_b with
       | Some start_a_val, Some start_b_val ->
         let matching_block =
           { Matching_block.prev_start = start_a_val
           ; next_start = start_b_val
           ; length = !length
           }
         in
         collapsed := matching_block :: !collapsed
       | _ -> ());
      start_a := Some i_a;
      start_b := Some i_b;
      length := 1);
  (* Flush the final run. *)
  (match !start_a, !start_b with
   | Some start_a_val, Some start_b_val when !length <> 0 ->
     let matching_block =
       { Matching_block.prev_start = start_a_val
       ; next_start = start_b_val
       ; length = !length
       }
     in
     collapsed := matching_block :: !collapsed
   | _ -> ());
  List.rev !collapsed
;;

(* Given that there's an insert/delete of size [left_change] to the left, and
   an insert/delete of size [right_change] to the right, should we keep
   this block of length [block_len] in our list of matches, or discard it?
*)
let should_discard_match ~big_enough ~left_change ~right_change ~block_len =
  (* Throw away if its effective length is too small,
     relative to its surrounding inserts / deletes. *)
  block_len < big_enough
  && ((left_change > block_len && right_change > block_len)
      (* dwarfed by changes on both sides *)
      || (left_change >= block_len + should_discard_if_other_side_equal ~big_enough
          && right_change = block_len)
      || (right_change >= block_len + should_discard_if_other_side_equal ~big_enough
          && left_change = block_len))
;;

(* Size of the insert/delete gap between the end of [left_matching_block] and
   the start of [right_matching_block]: the larger of the prev-side and
   next-side spans between the two starts, minus the left block's length. *)
let change_between
  (left_matching_block : Matching_block.t)
  (right_matching_block : Matching_block.t)
  =
  max
    (right_matching_block.prev_start - left_matching_block.prev_start)
    (right_matching_block.next_start - left_matching_block.next_start)
  - left_matching_block.length
;;

(* See the "Semantic Chaff" section of https://neil.fraser.name/writing/diff/ *)
let basic_semantic_cleanup ~big_enough matching_blocks =
  if big_enough <= 1
  then matching_blocks
  else (
    match matching_blocks with
    | [] -> []
    | first_block :: other_blocks ->
      (* [pending] is the most recent block not yet committed to [ans].
         Discarding [pending] can expose the previous block to discarding too,
         so [loop] walks back down [ans] until nothing more is discarded. *)
      let final_ans, final_pending =
        List.fold
          other_blocks
          ~init:([], first_block)
          ~f:(fun (ans, pending) current_block ->
            let rec loop ans pending =
              match ans with
              | [] -> ans, pending
              | hd :: tl ->
                if should_discard_match
                     ~big_enough
                     ~left_change:(change_between hd pending)
                     ~right_change:(change_between pending current_block)
                     ~block_len:pending.length
                then loop tl hd
                else ans, pending
            in
            let updated_ans, updated_pending = loop ans pending in
            updated_pending :: updated_ans, current_block)
      in
      List.rev (final_pending :: final_ans))
;;

(* Attempts to eliminate the "tunnel vision" problem described in the
   "Semantic Chaff" section of https://neil.fraser.name/writing/diff/.
   To do this, we go through each pair of consecutive matches
   and pretend to combine them into one match. If that match would
   be deleted by [basic_semantic_cleanup], we delete both. *)
let advanced_semantic_cleanup ~big_enough matching_blocks =
  if big_enough <= 1
  then matching_blocks
  else (
    match matching_blocks with
    | [] -> []
    | [ first_block ] -> [ first_block ]
    | first_block :: second_block :: other_blocks ->
      (* [pendingA] and [pendingB] are the two most recent uncommitted blocks;
         the hypothetical merged block's length is [pendingB.length] plus
         pendingB's offset from pendingA (the smaller of the two sides). *)
      let final_ans, final_pendingA, final_pendingB =
        List.fold
          other_blocks
          ~init:([], first_block, second_block)
          ~f:(fun (ans, pendingA, pendingB) current_block ->
            let rec loop ans pendingA pendingB =
              match ans with
              | [] -> ans, pendingA, pendingB
              | hd :: tl ->
                if should_discard_match
                     ~big_enough
                     ~left_change:(change_between hd pendingA)
                     ~right_change:(change_between pendingB current_block)
                     ~block_len:
                       (pendingB.length
                        + min
                            (pendingB.prev_start - pendingA.prev_start)
                            (pendingB.next_start - pendingA.next_start))
                then loop tl hd pendingA
                else ans, pendingA, pendingB
            in
            let updated_ans, updated_pendingA, updated_pendingB =
              loop ans pendingA pendingB
            in
            updated_pendingA :: updated_ans, updated_pendingB, current_block)
      in
      List.rev (final_pendingB :: final_pendingA :: final_ans)
      (* The loop above only deleted the second element of each pair we're supposed to
         delete. This call to [basic_semantic_cleanup] is guaranteed to finish the job
         by deleting the remaining element of those pairs. *)
      |> basic_semantic_cleanup ~big_enough)
;;

(* Goal: eliminate small, semantically meaningless matches.
*)
(* Run the pairwise ("basic") pass first, then the pair-merging ("advanced")
   pass over the survivors. *)
let semantic_cleanup ~big_enough matching_blocks =
  basic_semantic_cleanup ~big_enough matching_blocks
  |> advanced_semantic_cleanup ~big_enough
;;

(* When we have a choice, we'd prefer one block of equality to two.
   For example, instead of A B A C D E F, we prefer
   A B A C D E F. There are two reasons:

   (1) A is usually something like "let", and so the second version is more
   semantically accurate
   (2) Semantic cleanup may delete the lone A match, but it will not delete
   the A C D E F match). So by moving the A match, we've also saved it. *)
let combine_equalities ~prev ~next ~matches =
  match matches with
  | [] -> []
  | first_block :: tl ->
    List.fold tl ~init:([], first_block) ~f:(fun (ans, pending) block ->
      (* Slide the boundary left one token at a time — shrinking [pending] and
         growing [new_block] — while the last token of [pending] equals the
         token just before [new_block] in both [prev] and [next]. *)
      let rec loop ans ~(pending : Matching_block.t) ~(new_block : Matching_block.t) =
        if pending.length = 0
        then ans, pending, new_block
        else (
          let advance_in_prev =
            Elt.compare
              prev.(pending.prev_start + pending.length - 1)
              prev.(new_block.prev_start - 1)
            = 0
          in
          let advance_in_next =
            Elt.compare
              next.(pending.next_start + pending.length - 1)
              next.(new_block.next_start - 1)
            = 0
          in
          if advance_in_prev && advance_in_next
          then
            loop
              ans
              ~pending:
                { prev_start = pending.prev_start
                ; next_start = pending.next_start
                ; length = pending.length - 1
                }
              ~new_block:
                { prev_start = new_block.prev_start - 1
                ; next_start = new_block.next_start - 1
                ; length = new_block.length + 1
                }
          else ans, pending, new_block)
      in
      let updated_ans, updated_pending, updated_new_block =
        loop ans ~pending ~new_block:block
      in
      (* In the original Google heuristic, we would either move all or none
         of pending. But because it might start with an unmatched `Newline(0, None),
         we are fine with moving all but one token of it. *)
      if updated_pending.length = 0 || updated_pending.length = 1
      then (
        let new_ans =
          if updated_pending.length = 0
          then updated_ans
          else updated_pending :: updated_ans
        in
        new_ans, updated_new_block)
      else (* Do nothing *)
        pending :: ans, block)
    |> fun (ans, pending) -> List.rev (pending :: ans)
;;

(* The aim here is to be able to transform e.g. foo frob flip into foo
   frob flip by scoring each boundary. *)
(* {prev,next}_{elts,scorable} are the two arrays being diffed as (a) [Elt.t array]s
   which comes from mapping with the [transform] function and (b) ['a array]s where the
   [score] function operates on values of type ['a].

   [blocks] is a list of [Matching_block.t]s which specify the contiguous chunks
   of prev/next which are the same (when index ranges are missing, that missing
   chunk is unique to the array they are missing from).

   [align_diffs] slides each block left or right by up to [max_slide] positions
   when an equally-valid alignment scores better; [score kind before after]
   rates the boundary between two adjacent elements. *)
let align_diffs
  ~prev_elts
  ~prev_scorable
  ~next_elts
  ~next_scorable
  ~max_slide
  ~score
  blocks
  =
  if max_slide = 0
  then blocks
  else (
    match blocks with
    | [] -> []
    | (first_block : Matching_block.t) :: tl ->
      (* Score the boundary between arr.(i-1) and arr.(i); out-of-range
         boundaries get a fixed score of 100. *)
      let score_elt_and_prev arr kind i =
        let i0 = i - 1 in
        let i1 = i in
        if i0 < 0 || i1 >= Array.length arr then 100 else score kind arr.(i0) arr.(i1)
      in
      (* [left] and [right] are both exclusive indexes, on either side of the inserted
         or deleted region.
*)
      (* Total score for a change spanning the exclusive interval
         (left, right): the boundary at [right] plus the boundary just inside
         [left]. *)
      let score' arr ~left ~right =
        score_elt_and_prev arr `right right + score_elt_and_prev arr `left (left + 1)
      in
      let score ~prev_left ~prev_right ~next_left ~next_right =
        match prev_left + 1 = prev_right, next_left + 1 = next_right with
        | true, true ->
          (* This shouldn't happen; it means the two blocks of equal elements touch and
             should have been merged. *)
          0
        | true, false ->
          (* inserting *)
          score' next_scorable ~left:next_left ~right:next_right
        | false, true ->
          (* deleting *)
          score' prev_scorable ~left:prev_left ~right:prev_right
        | false, false ->
          (* editing. I think this also shouldn’t happen as if e.g. sliding right is
             viable you could just have had a bigger matching block on the right instead
          *)
          min
            (score' prev_scorable ~left:prev_left ~right:prev_right)
            (score' next_scorable ~left:next_left ~right:next_right)
      in
      (* Walk consecutive block pairs; for each pair pick the slide offset
         within [-max_slide, max_slide] with the best boundary score (0 means
         leave the pair as-is). *)
      let rec align_consecutive_blocks acc (left_block : Matching_block.t) right_blocks =
        let best_score = ref 0 in
        let offset_of_best_score = ref 0 in
        let score_initial
          (left_block : Matching_block.t)
          (right_block : Matching_block.t)
          =
          best_score
          := score
               ~prev_left:(left_block.prev_start + left_block.length - 1)
               ~prev_right:right_block.prev_start
               ~next_left:(left_block.next_start + left_block.length - 1)
               ~next_right:right_block.next_start
        in
        let rec try_to_slide_left
          ~i
          (left_block : Matching_block.t)
          (right_block : Matching_block.t)
          =
          let offset_into_left = left_block.length - i in
          if offset_into_left < 0
          then ()
          else if i > max_slide
          then
            ()
            (* {v
                 next: A B C D X P Q R S T X E F G H I
                 prev: A B C D X E F G H I
                       ^     ^   ^       ^
                       \..left../ \..right../ v}

               We want to see if shortening [left] and lengthening and moving-left
               [right] is viable. Requires last element of [left] to be equal to the
               element before [right]. *)
          else (
            let prev_left = left_block.prev_start + offset_into_left - 1 in
            let prev_right = right_block.prev_start - i in
            let next_left = left_block.next_start + offset_into_left - 1 in
            let next_right = right_block.next_start - i in
            if Elt.compare next_elts.(next_left + 1) next_elts.(next_right) <> 0
            then ()
            else if Elt.compare prev_elts.(prev_left + 1) prev_elts.(prev_right) <> 0
            then ()
            else (
              let score = score ~prev_left ~prev_right ~next_left ~next_right in
              if score > !best_score
              then (
                best_score := score;
                offset_of_best_score := -i);
              try_to_slide_left ~i:(i + 1) left_block right_block))
        in
        let rec try_to_slide_right
          ~i
          (left_block : Matching_block.t)
          (right_block : Matching_block.t)
          =
          let offset_into_left = left_block.length + i - 1 in
          if i > right_block.length
          then ()
          else if i > max_slide
          then
            ()
            (* {v
                 next: A B C D X P Q R S T X E F G H I
                 prev: A B C D X E F G H I
                       ^   ^     ^         ^
                       \.left./ \...right.../ v}

               We want to see if lengthening [left] and shortening and moving-right
               [right] is viable. Requires element after [left] to be equal to the first
               element of [right]. *)
          else (
            let prev_left = left_block.prev_start + offset_into_left in
            let prev_right = right_block.prev_start + i in
            let next_left = left_block.next_start + offset_into_left in
            let next_right = right_block.next_start + i in
            if Elt.compare next_elts.(next_left) next_elts.(next_right - 1) <> 0
            then ()
            else if Elt.compare prev_elts.(prev_left) prev_elts.(prev_right - 1) <> 0
            then ()
            else (
              let score = score ~prev_left ~prev_right ~next_left ~next_right in
              if score > !best_score
              then (
                best_score := score;
                offset_of_best_score := i);
              try_to_slide_right ~i:(i + 1) left_block right_block))
        in
        match right_blocks with
        | [] -> List.rev (left_block :: acc)
        | right_block :: rest ->
          (match left_block.length with
           | 0 ->
             (* It is possible to end up with a length=0 block due to sliding the
                previous block to include it. If this happens we should drop it (so long
                as it isn’t the last block; we want a length=0 block at the end). *)
             align_consecutive_blocks acc right_block rest
           | _ ->
             score_initial left_block right_block;
             try_to_slide_left ~i:1 left_block right_block;
             try_to_slide_right ~i:1 left_block right_block;
             (match !offset_of_best_score with
              | 0 -> align_consecutive_blocks (left_block :: acc) right_block rest
              | slide ->
                (* Apply the winning slide: grow/shrink [left_block] and shift
                   [right_block] by the same signed offset. *)
                let new_left = { left_block with length = left_block.length + slide } in
                let new_right : Matching_block.t =
                  { prev_start = right_block.prev_start + slide
                  ; next_start = right_block.next_start + slide
                  ; length = right_block.length - slide
                  }
                in
                let acc = if new_left.length > 0 then new_left :: acc else acc in
                align_consecutive_blocks acc new_right rest))
      in
      align_consecutive_blocks [] first_block tl)
;;

(* Full pipeline: transform both inputs, compute patience matches, collapse
   them into blocks, merge adjacent equalities, append a length-0 sentinel
   block at the end, run semantic cleanup, then slide-align the result. *)
let get_matching_blocks
  ~transform
  ?(big_enough = 1)
  ?(max_slide = 0)
  ?(score = fun _ _ _ -> 100)
  ~prev:prev_scorable
  ~next:next_scorable
  ()
  =
  let prev = Array.map prev_scorable ~f:transform in
  let next = Array.map next_scorable ~f:transform in
  let matches = matches prev next |> collapse_sequences in
  let matches = combine_equalities ~prev ~next ~matches in
  let last_match =
    { Matching_block.prev_start = Array.length prev
    ; next_start = Array.length next
    ; length = 0
    }
  in
  List.append matches [ last_match ]
  |> semantic_cleanup ~big_enough
  (* We want to score untransformed elements (e.g.
still with whitespace) *)
  |> align_diffs
       ~prev_elts:prev
       ~prev_scorable
       ~next_elts:next
       ~next_scorable
       ~max_slide
       ~score
;;

(* Convert matching blocks into a [Range.t] list: a [Same] for each nonempty
   block, preceded by a [Replace]/[Prev]/[Next] covering the gap before it.
   Blocks that would move either index backwards (crossover matches) are
   skipped. *)
let get_ranges_rev ~transform ~big_enough ?max_slide ?score ~prev ~next () =
  let rec aux (matching_blocks : Matching_block.t list) i j l : _ Range.t list =
    match matching_blocks with
    | current_block :: remaining_blocks ->
      let prev_index, next_index, size =
        current_block.prev_start, current_block.next_start, current_block.length
      in
      (* Throw away crossover matches *)
      if prev_index < i || next_index < j
      then aux remaining_blocks i j l
      else (
        (* The gap between the previous block's end (i, j) and this block's
           start: both sides changed -> Replace; only prev -> Prev (delete);
           only next -> Next (insert). *)
        let range_opt : _ Range.t option =
          if i < prev_index && j < next_index
          then (
            let prev_range = prev <|> (i, prev_index) in
            let next_range = next <|> (j, next_index) in
            Some (Replace (prev_range, next_range, None)))
          else if i < prev_index
          then (
            let prev_range = prev <|> (i, prev_index) in
            Some (Prev (prev_range, None)))
          else if j < next_index
          then (
            let next_range = next <|> (j, next_index) in
            Some (Next (next_range, None)))
          else None
        in
        let l =
          match range_opt with
          | Some range -> range :: l
          | None -> l
        in
        let prev_stop, next_stop = prev_index + size, next_index + size in
        let l =
          if size = 0
          then l
          else (
            let prev_range = prev <|> (prev_index, prev_stop) in
            let next_range = next <|> (next_index, next_stop) in
            let range = Array.map2_exn prev_range next_range ~f:(fun x y -> x, y) in
            Same range :: l)
        in
        aux remaining_blocks prev_stop next_stop l)
    | [] -> List.rev l
  in
  let matching_blocks =
    get_matching_blocks ~transform ~big_enough ?max_slide ?score ~prev ~next ()
  in
  aux matching_blocks 0 0 []
;;

(* Group ranges into hunks, each carrying up to [context] elements of [Same]
   context on either side.  A negative [context] yields a single hunk spanning
   both inputs in full. *)
let get_hunks ~transform ~context ?(big_enough = 1) ?max_slide ?score ~prev ~next () =
  let ranges = get_ranges_rev ~transform ~big_enough ?max_slide ?score ~prev ~next () in
  let a = prev in
  let b = next in
  if context < 0
  then (
    let singleton_hunk =
      create_hunk 0 (Array.length a) 0 (Array.length b) (List.rev ranges)
    in
    [ singleton_hunk ])
  else (
    let rec aux ranges_remaining curr_ranges alo ahi blo bhi acc_hunks =
      match (ranges_remaining : _ Range.t list) with
      | [] ->
        (* Finish the last hunk *)
        let new_hunk = create_hunk alo ahi blo bhi curr_ranges in
        (* Add it to the accumulator *)
        let acc_hunks = new_hunk :: acc_hunks in
        (* Finished! Return the accumulator *)
        List.rev acc_hunks
      | [ Same range ] ->
        (* If the last range is a Same, we might need to crop to context. *)
        let stop = min (Array.length range) context in
        let new_range = Range.Same (range <|> (0, stop)) in
        let curr_ranges = new_range :: curr_ranges in
        (* Finish the current hunk *)
        let ahi = ahi + stop in
        let bhi = bhi + stop in
        let new_hunk = create_hunk alo ahi blo bhi curr_ranges in
        (* Add it to the accumulator *)
        let acc_hunks = new_hunk :: acc_hunks in
        (* Finished!
Return the accumulator *)
        List.rev acc_hunks
      | Same range :: rest ->
        let size = Array.length range in
        if size > context * 2
        then (
          (* If this Same range is sufficiently large, split off a new hunk *)
          let new_range = Range.Same (range <|> (0, context)) in
          let curr_ranges = new_range :: curr_ranges in
          (* Advance both hi's by context *)
          let ahi = ahi + context in
          let bhi = bhi + context in
          (* Finish the current hunk *)
          let new_hunk = create_hunk alo ahi blo bhi curr_ranges in
          (* Add it to the accumulator *)
          let acc_hunks = new_hunk :: acc_hunks in
          (* Calculate ranges for the next hunk *)
          let alo = ahi + size - (2 * context) in
          let ahi = alo in
          let blo = bhi + size - (2 * context) in
          let bhi = blo in
          (* Push the remainder of the Equal range back onto the remaining_ranges *)
          let rest = Range.Same (range <|> (size - context, size)) :: rest in
          aux rest [] alo ahi blo bhi acc_hunks)
        else (
          (* Otherwise, this range is small enough that it qualifies as context for
             the both the previous and forthcoming range, so simply add it to
             curr_ranges untouched *)
          let curr_ranges = Range.Same range :: curr_ranges in
          let ahi = ahi + size in
          let bhi = bhi + size in
          aux rest curr_ranges alo ahi blo bhi acc_hunks)
      | range :: rest ->
        (* Any range that isn't an Equal is important and not just context, so keep
           it in curr_ranges *)
        let curr_ranges = range :: curr_ranges in
        (* rest could be anything, so extract hunk_info from range *)
        let ahi, bhi =
          match range with
          | Same _ ->
            (* We eliminate the possibility of a Same above *)
            assert false
          | Unified _ ->
            (* get_ranges_rev never returns a Unified range *)
            assert false
          | Prev (_, Some _) | Next (_, Some _) | Replace (_, _, Some _) ->
            (* get_ranges_rev never finds moves *)
            assert false
          | Next (range, None) ->
            let stop = bhi + Array.length range in
            ahi, stop
          | Prev (range, None) ->
            let stop = ahi + Array.length range in
            stop, bhi
          | Replace (a_range, b_range, None) ->
            let prev_stop = ahi + Array.length a_range in
            let next_stop = bhi + Array.length b_range in
            prev_stop, next_stop
        in
        aux rest curr_ranges alo ahi blo bhi acc_hunks
    in
    let ranges, alo, ahi, blo, bhi =
      match ranges with
      (* If the first range is an Equal, shave off the front of the range, according to
         context. Keep it on the ranges list so hunk construction can see where the range
         begins *)
      | Same range :: rest ->
        let stop = Array.length range in
        let start = max 0 (stop - context) in
        let new_range = Range.Same (range <|> (start, stop)) in
        new_range :: rest, start, start, start, start
      | rest -> rest, 0, 0, 0, 0
    in
    aux ranges [] alo ahi blo bhi [])
;;

(* 2 * |matches| / (|a| + |b|): the fraction of tokens across both inputs that
   participate in some match. *)
let match_ratio a b =
  (matches a b |> List.length |> ( * ) 2 |> float)
  /.
(Array.length a + Array.length b |> float)
;;

(* N-way analogue of [collapse_sequences]: [matches] is a list of index tuples
   (represented as lists, one index per input sequence).  Collapse runs where
   every coordinate advances by exactly one per step into
   ([starting indices], run_length) pairs. *)
let collapse_multi_sequences matches =
  let collapsed = ref [] in
  let value_exn x = Option.value_exn x in
  if List.is_empty matches
  then []
  else (
    (* [start] holds the first tuple of the current run; [None]s mean no run
       has been started yet. *)
    let start = Array.create ~len:(List.length (List.hd_exn matches)) None in
    let length = ref 0 in
    List.iter matches ~f:(fun il ->
      if Array.for_all start ~f:Option.is_some
         && List.mapi il ~f:(fun i x -> x = value_exn start.(i) + !length)
            |> List.for_all ~f:(fun x -> x)
      then incr length
      else (
        (* Run broken: flush the finished run (if any), start a new one. *)
        if Array.for_all start ~f:Option.is_some
        then
          collapsed
          := (Array.map start ~f:value_exn |> Array.to_list, !length) :: !collapsed;
        List.iteri il ~f:(fun i x -> start.(i) <- Some x);
        length := 1));
    (* Flush the final run. *)
    if Array.for_all start ~f:Option.is_some && !length <> 0
    then
      collapsed
      := (Array.map start ~f:value_exn |> Array.to_list, !length) :: !collapsed;
    List.rev !collapsed)
;;

type 'a segment =
  | Same of 'a array
  | Different of 'a array array

type 'a merged_array = 'a segment list

(* [Array.mapi] over two arrays zipped elementwise; raises (via
   [Array.zip_exn]) if the lengths differ. *)
let array_mapi2 ar1 ar2 ~f =
  Array.zip_exn ar1 ar2 |> Array.mapi ~f:(fun i (x, y) -> f i x y)
;;

(* Merge several sequences into alternating [Same]/[Different] segments.  Each
   ar.(i>0) is diffed against ar.(0); a position of ar.(0) contributes to a
   [Same] segment only when it is matched in every other sequence. *)
let merge ar =
  if Array.length ar = 0
  then []
  else if Array.length ar = 1
  then [ Same ar.(0) ]
  else (
    let matches's = Array.map (ar <|> (1, Array.length ar)) ~f:(matches ar.(0)) in
    let len = Array.length ar in
    (* hashtbl : index in ar.(0) -> [(which other sequence, index there)] for
       every pairwise match found above. *)
    let hashtbl = Int.Table.create () ~size:0 in
    Array.iteri matches's ~f:(fun i matches ->
      List.iter matches ~f:(fun (a, b) ->
        match Hashtbl.find hashtbl a with
        | None -> Hashtbl.set hashtbl ~key:a ~data:[ i, b ]
        | Some l -> Hashtbl.set hashtbl ~key:a ~data:((i, b) :: l)));
    (* Keep only positions matched in all (len - 1) other sequences. *)
    let list =
      Hashtbl.to_alist hashtbl
      |> List.filter_map ~f:(fun (a, l) ->
        if List.length l = len - 1
        then Some (a :: (List.sort l ~compare:compare_int_pair |> List.map ~f:snd))
        else None)
      |> List.sort ~compare:(List.compare Int.compare)
    in
    let matching_blocks = collapse_multi_sequences list in
    let last_pos = Array.create ~len:(Array.length ar) 0 in
    let merged_array = ref [] in
    List.iter matching_blocks ~f:(fun (l, len) ->
      let ar' = Array.of_list l in
      (* Any gap before this run becomes a [Different] segment. *)
      if Array.compare Int.compare last_pos ar' <> 0
      then
        merged_array
        := Different (array_mapi2 last_pos ar' ~f:(fun i n m -> ar.(i) <|> (n, m)))
           :: !merged_array;
      merged_array := Same (ar.(0) <|> (ar'.(0), ar'.(0) + len)) :: !merged_array;
      Array.iteri last_pos ~f:(fun i _ -> last_pos.(i) <- ar'.(i) + len));
    (* Trailing unmatched material, if any, forms one last [Different]. *)
    let trailing_lines =
      Array.existsi last_pos ~f:(fun i last_pos -> Array.length ar.(i) > last_pos)
    in
    if trailing_lines
    then
      merged_array
      := Different
           (Array.mapi last_pos ~f:(fun i n -> ar.(i) <|> (n, Array.length ar.(i))))
         :: !merged_array;
    List.rev !merged_array)
;;
end

module%test _ = struct
  module P = Make (Int)

  let%test_unit _ =
    let check a b ~expect = [%test_result: (int * int) list] (P.matches a b) ~expect in
    check [||] [||] ~expect:[];
    check [| 0 |] [| 0 |] ~expect:[ 0, 0 ];
    check [| 0; 1; 1; 2 |] [| 3; 1; 4; 5 |] ~expect:[ 1, 1 ]
  ;;

  (* Needs the plain diff section *)

  (* [is_increasing a l] holds when [a :: l] is non-decreasing. *)
  let rec is_increasing a = function
    | [] -> true
    | hd :: tl -> Int.compare a hd <= 0 && is_increasing hd tl
  ;;

  (* Assert that the longest-increasing-subsequence result is non-decreasing in
     both coordinates. *)
  let check_lis a =
    let b =
      Patience.longest_increasing_subsequence (Ordered_sequence.create (Array.of_list a))
    in
    if is_increasing (-1) (List.map b ~f:fst) && is_increasing (-1) (List.map b ~f:snd)
    then ()
    else
      failwiths "invariant failure" (a, b) [%sexp_of: (int * int)
list * (int * int) list]
  ;;

  let%test_unit _ = check_lis [ 2, 0; 5, 1; 6, 2; 3, 3; 0, 4; 4, 5; 1, 6 ]
  let%test_unit _ = check_lis [ 0, 0; 2, 0; 5, 1; 6, 2; 3, 3; 0, 4; 4, 5; 1, 6 ]
  let%test_unit _ = check_lis [ 5, 1; 6, 2; 3, 3; 0, 4; 4, 5; 1, 6 ]

  (* [P.matches] must return pairs that are non-decreasing in both
     coordinates, even on a permuted input. *)
  let%test_unit _ =
    let check a b =
      let matches = P.matches a b in
      if is_increasing (-1) (List.map matches ~f:fst)
         && is_increasing (-1) (List.map matches ~f:snd)
      then ()
      else
        failwiths
          "invariant failure"
          (a, b, matches)
          [%sexp_of: int array * int array * (int * int) list]
    in
    check [| 0; 1; 2; 3; 4; 5; 6 |] [| 2; 5; 6; 3; 0; 4; 1 |]
  ;;
end

(* Ready-made instantiation for diffing arrays of strings (e.g. lines). *)
module String = Make (String)

(* Re-export the [Stable] submodules of each serialized type. *)
module Stable = struct
  module Matching_block = Matching_block.Stable
  module Range = Range.Stable
  module Hunk = Hunk.Stable
  module Hunks = Hunks.Stable
end