├── benchmark ├── rust │ ├── rust-toolchain │ ├── Cargo.toml │ ├── Cargo.lock │ └── src │ │ └── main.rs ├── rust-ffi │ ├── libacbench │ │ ├── rust-toolchain │ │ ├── Cargo.toml │ │ ├── Cargo.lock │ │ └── src │ │ │ └── lib.rs │ ├── package.yaml │ └── app │ │ └── Main.hs ├── java │ ├── BUILD │ ├── WORKSPACE │ └── main.java ├── data-utf8 │ └── example.txt ├── data │ └── example.txt ├── .gitignore ├── haskell │ ├── package.yaml │ └── app │ │ └── Main.hs ├── default.nix ├── report.py ├── naive.py ├── benchmark.py └── README.md ├── nix ├── haskell-overlay.nix ├── ghc966-overlay.nix ├── nixpkgs-pinned.nix ├── haskell-dependencies.nix ├── sources.json └── sources.nix ├── Setup.hs ├── performance.png ├── .semaphore ├── install-nix.sha256 └── semaphore.yml ├── .gitignore ├── stack.yaml.lock ├── stack.yaml ├── tests ├── Main.hs └── Data │ └── Text │ ├── TestInstances.hs │ ├── Utf8Spec.hs │ ├── BoyerMooreCISpec.hs │ ├── BoyerMooreSpec.hs │ └── AhoCorasickSpec.hs ├── app └── dump-automaton │ └── Main.hs ├── src └── Data │ ├── Text │ ├── CaseSensitivity.hs │ ├── Utf8 │ │ └── Unlower.hs │ ├── BoyerMooreCI │ │ ├── Replacer.hs │ │ └── Searcher.hs │ ├── BoyerMoore │ │ ├── Replacer.hs │ │ ├── Searcher.hs │ │ └── Automaton.hs │ ├── AhoCorasick │ │ ├── Splitter.hs │ │ ├── Searcher.hs │ │ └── Replacer.hs │ └── Utf8.hs │ └── Primitive │ └── Extended.hs ├── stack-shell.nix ├── default.nix ├── LICENSE ├── benchmark.sh ├── bench ├── uvector-vs-tba │ └── Main.hs └── bm │ └── Main.hs ├── CHANGELOG.md ├── README.md ├── alfred-margaret.cabal └── .stylish-haskell.yaml /benchmark/rust/rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /nix/haskell-overlay.nix: -------------------------------------------------------------------------------- 1 | self: super: { 2 | 3 | } 4 | -------------------------------------------------------------------------------- 
/benchmark/rust-ffi/libacbench/rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/channable/alfred-margaret/HEAD/performance.png -------------------------------------------------------------------------------- /.semaphore/install-nix.sha256: -------------------------------------------------------------------------------- 1 | a2d0e4f6954a6295664994dc4e5492843b7de3e7e23e89a1df9e0820975d2fde install-nix-2.24.12 2 | -------------------------------------------------------------------------------- /benchmark/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "acbench-rust" 3 | version = "0.0.0" 4 | 5 | [dependencies] 6 | aho-corasick = "0.6.8" 7 | filebuffer = "0.4.0" 8 | memchr = "2.1.0" 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Stack build directory. 
2 | /.stack-work/ 3 | 4 | # Graph files generated by dot for debugging 5 | debug/*.png 6 | 7 | # Benchmark results 8 | *.stats 9 | *.results 10 | -------------------------------------------------------------------------------- /nix/ghc966-overlay.nix: -------------------------------------------------------------------------------- 1 | self: super: 2 | let 3 | haskellOverlay = import ./haskell-overlay.nix; 4 | in { 5 | ghc966Packages = super.haskell.packages.ghc966.extend haskellOverlay; 6 | } 7 | -------------------------------------------------------------------------------- /nix/nixpkgs-pinned.nix: -------------------------------------------------------------------------------- 1 | { overlays ? [] }: 2 | let 3 | sources = import ./sources.nix; 4 | in 5 | import sources.nixpkgs { 6 | overlays = [(import ./ghc966-overlay.nix)] ++ overlays; 7 | } 8 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/libacbench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "libacbench" 3 | version = "0.0.0" 4 | 5 | [dependencies] 6 | aho-corasick = "0.6.8" 7 | 8 | [lib] 9 | name = "libacbench" 10 | crate-type = ["staticlib"] -------------------------------------------------------------------------------- /benchmark/java/BUILD: -------------------------------------------------------------------------------- 1 | # NOTE: You can build a standalone jar with `bazel build :acbench_deploy.jar`. 2 | 3 | java_binary( 4 | name = "acbench", 5 | srcs = ["main.java"], 6 | deps = ["@hankcs_aho_corasick//jar"], 7 | main_class = "AcBench", 8 | ) 9 | -------------------------------------------------------------------------------- /stack.yaml.lock: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by Stack. 2 | # You should not edit this file by hand. 
3 | # For more information, please see the documentation at: 4 | # https://docs.haskellstack.org/en/stable/lock_files 5 | 6 | packages: [] 7 | snapshots: [] 8 | -------------------------------------------------------------------------------- /benchmark/java/WORKSPACE: -------------------------------------------------------------------------------- 1 | maven_jar( 2 | name = "hankcs_aho_corasick", 3 | artifact = "com.hankcs:aho-corasick-double-array-trie:1.2.0", 4 | sha1 = "7b75748bad8e193cc048a7fb0c71605e300b2286", 5 | sha1_src = "eaea57776bea89b6067fe75473445a878e4b3469", 6 | ) 7 | -------------------------------------------------------------------------------- /benchmark/data-utf8/example.txt: -------------------------------------------------------------------------------- 1 | Henk 2 | Piet 3 | Klaas 4 | Sjaak 5 | Marieke 6 | 7 | Henk eet een appel en Piet eet kaas. 8 | Klaas eet ook kaas. 9 | Kaas is baas. 10 | Mari en Marieke wandelen door het bos. 11 | De auto van Sjaak heeft geen trekhaak en die van Klaas ook niet. 12 | -------------------------------------------------------------------------------- /nix/haskell-dependencies.nix: -------------------------------------------------------------------------------- 1 | haskellPackages: 2 | with haskellPackages; [ 3 | aeson 4 | bytestring 5 | criterion 6 | hashable 7 | primitive 8 | text 9 | unordered-containers 10 | hspec 11 | hspec-expectations 12 | quickcheck-instances 13 | ] 14 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | resolver: ghc-9.6.6 2 | packages: 3 | - . 4 | - benchmark/haskell 5 | - benchmark/rust-ffi 6 | 7 | # Note: This section will be ignored by stack, on non-NixOS systems. 8 | # It can be explicitly enabled on non-NixOS systems by passing --nix. 9 | # On NixOS this section is needed to bring the non-Haskell dependencies 10 | # into scope. 
11 | nix: 12 | enable: true 13 | shell-file: stack-shell.nix 14 | path: ["nixpkgs=./nix/nixpkgs-pinned.nix"] 15 | -------------------------------------------------------------------------------- /benchmark/data/example.txt: -------------------------------------------------------------------------------- 1 | Henk 2 | Piet 3 | Klaas 4 | Sjaak 5 | Marieke 6 | 7 | Henk eet een appel en Piet eet kaas. 8 | Klaas eet ook kaas. 9 | Kaas is baas. 10 | Mari en Marieke wandelen door het bos. 11 | De auto van Sjaak heeft geen trekhaak en die van Klaas ook niet. 12 | -------------------------------------------------------------------------------- /benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | data/*.txt 2 | data-utf8/*.txt 3 | !data/example.txt 4 | !data-utf8/example.txt 5 | data/*.tar.xz 6 | *.aux 7 | *.hp 8 | *.pdf 9 | *.prof 10 | *.ps 11 | 12 | java/bazel-bin 13 | java/bazel-genfiles 14 | java/bazel-java 15 | java/bazel-out 16 | java/bazel-testlogs 17 | 18 | haskell/ac-bench.cabal 19 | haskell/.stack-work 20 | haskell-utf8/ac-bench-utf8.cabal 21 | haskell-utf8/.stack-work 22 | 23 | rust/target/ 24 | 25 | rust-ffi/ac-bench-ffi.cabal 26 | rust-ffi/.stack-work/ 27 | rust-ffi/libacbench/target/ 28 | -------------------------------------------------------------------------------- /tests/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Test.Hspec (describe, hspec) 4 | 5 | import Data.Text.AhoCorasickSpec as AhoCorasickSpec 6 | import Data.Text.BoyerMooreSpec as BoyerMooreSpec 7 | import Data.Text.BoyerMooreCISpec as BoyerMooreCISpec 8 | import Data.Text.Utf8Spec as Utf8Spec 9 | 10 | main :: IO () 11 | main = hspec $ do 12 | describe "Data.Text.AhoCorasick" AhoCorasickSpec.spec 13 | describe "Data.Text.BoyerMoore" BoyerMooreSpec.spec 14 | describe "Data.Text.BoyerMooreCI" BoyerMooreCISpec.spec 15 | describe "Data.Text.Utf8" Utf8Spec.spec 16 | 
-------------------------------------------------------------------------------- /app/dump-automaton/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Control.Monad (forM) 4 | import qualified Data.Text.Utf8 as Utf8 5 | import Data.Text.AhoCorasick.Automaton (debugBuildDot) 6 | import System.Environment (getArgs) 7 | import System.IO (hPrint, hPutStr, stderr) 8 | 9 | main = do 10 | args <- getArgs 11 | needles <- forM args $ \needle -> do 12 | hPutStr stderr $ needle ++ ": " 13 | let needleBytes = Utf8.unpackUtf8 $ Utf8.pack needle 14 | hPrint stderr needleBytes 15 | pure $ Utf8.pack needle 16 | 17 | let dot = debugBuildDot needles 18 | putStrLn dot 19 | -------------------------------------------------------------------------------- /src/Data/Text/CaseSensitivity.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE DeriveAnyClass #-} 3 | {-# LANGUAGE DeriveGeneric #-} 4 | {-# LANGUAGE DerivingStrategies #-} 5 | 6 | module Data.Text.CaseSensitivity where 7 | 8 | import Control.DeepSeq (NFData) 9 | import Data.Hashable (Hashable) 10 | import GHC.Generics (Generic) 11 | #if defined(HAS_AESON) 12 | import Data.Aeson (FromJSON, ToJSON) 13 | #endif 14 | data CaseSensitivity 15 | = CaseSensitive 16 | | IgnoreCase 17 | deriving stock (Eq, Generic, Show) 18 | #if defined(HAS_AESON) 19 | deriving anyclass (Hashable, NFData, FromJSON, ToJSON) 20 | #else 21 | deriving anyclass (Hashable, NFData) 22 | #endif 23 | -------------------------------------------------------------------------------- /nix/sources.json: -------------------------------------------------------------------------------- 1 | { 2 | "nixpkgs": { 3 | "branch": "nixpkgs-unstable", 4 | "description": "Nix Packages collection", 5 | "homepage": null, 6 | "owner": "NixOS", 7 | "repo": "nixpkgs", 8 | "rev": "b833ff01a0d694b910daca6e2ff4a3f26dee478c", 9 | "sha256": 
"1v3y9km48glcmgzk7h8s9sg5sgv1w86pyad973d981sk84a85cdl", 10 | "type": "tarball", 11 | "url": "https://github.com/NixOS/nixpkgs/archive/b833ff01a0d694b910daca6e2ff4a3f26dee478c.tar.gz", 12 | "url_template": "https://github.com///archive/.tar.gz", 13 | "version": "5e7fb7699c84da3420495e40459dfbff459c16e4" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /stack-shell.nix: -------------------------------------------------------------------------------- 1 | {}: 2 | let 3 | pkgs = import ./nix/nixpkgs-pinned.nix {}; 4 | haskellDependencies = import ./nix/haskell-dependencies.nix; 5 | 6 | libacbench = pkgs.rustPlatform.buildRustPackage rec { 7 | name = "libacbench"; 8 | src = ./benchmark/rust-ffi/libacbench; 9 | buildType = "release"; 10 | cargoLock = { 11 | lockFile = ./benchmark/rust-ffi/libacbench/Cargo.lock; 12 | }; 13 | }; 14 | in 15 | pkgs.haskell.lib.buildStackProject { 16 | name = "alfred-margaret"; 17 | ghc = pkgs.ghc966Packages.ghcWithPackages haskellDependencies; 18 | buildInputs = with pkgs; [ 19 | llvm_13 20 | zlib 21 | libacbench 22 | ]; 23 | } 24 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/libacbench/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "0.6.10" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "libacbench" 16 | version = "0.0.0" 17 | dependencies = [ 18 | "aho-corasick", 19 | ] 20 | 21 | [[package]] 22 | name = "memchr" 23 | version = "2.4.1" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 26 | -------------------------------------------------------------------------------- /benchmark/haskell/package.yaml: -------------------------------------------------------------------------------- 1 | name: ac-bench 2 | maintainer: Channable 3 | category: Other 4 | synopsis: Benchmark code for alfred-margaret 5 | description: Benchmark code for alfred-margaret 6 | version: '0' 7 | github: channable/alfred-margaret 8 | 9 | extra-source-files: 10 | - package.yaml 11 | 12 | library: {} 13 | 14 | dependencies: 15 | - base 16 | 17 | ghc-options: >- 18 | -Wall 19 | -Wincomplete-record-updates 20 | -Wincomplete-uni-patterns 21 | -Wpartial-fields 22 | -j4 23 | +RTS -A64m -RTS 24 | 25 | executables: 26 | ac-bench: 27 | dependencies: 28 | - alfred-margaret 29 | - base 30 | - bytestring 31 | - clock 32 | - deepseq 33 | - text 34 | ghc-options: 35 | - -rtsopts 36 | - -threaded 37 | - '"-with-rtsopts=-I0 -T -N -A32m -n4m -qg"' 38 | main: Main.hs 39 | source-dirs: app/ 40 | -------------------------------------------------------------------------------- /src/Data/Primitive/Extended.hs: -------------------------------------------------------------------------------- 1 | -- | "Data.Primitive" extended with extra definitions. 
2 | -- 3 | -- Based on the ".Extended Modules" pattern: 4 | -- https://jaspervdj.be/posts/2015-01-20-haskell-design-patterns-extended-modules.html 5 | module Data.Primitive.Extended 6 | ( module Data.Primitive 7 | , replicateMutablePrimArray 8 | ) 9 | where 10 | 11 | import Data.Primitive 12 | import Control.Monad.Primitive (PrimMonad(..)) 13 | 14 | -- | Like 'replicatePrimArray', but does not freeze the array afterwards and 15 | -- stays within a monadic context, so it can easily be mutated further. 16 | {-# INLINE replicateMutablePrimArray #-} 17 | replicateMutablePrimArray :: (Prim a, PrimMonad m) => Int -> a -> m (MutablePrimArray (PrimState m) a) 18 | replicateMutablePrimArray len value = do 19 | arr <- newPrimArray len 20 | setPrimArray arr 0 len value 21 | pure arr 22 | -------------------------------------------------------------------------------- /benchmark/default.nix: -------------------------------------------------------------------------------- 1 | let 2 | pinnedPkgs = import (builtins.fetchTarball { 3 | name = "ac-bench-nixpkgs"; 4 | url = "https://github.com/NixOS/nixpkgs/archive/1882c6b7368fd284ad01b0a5b5601ef136321292.tar.gz"; 5 | sha256 = "0zg7ak2mcmwzi2kg29g4v9fvbvs0viykjsg2pwaphm1fi13s7s0i"; 6 | }) {}; 7 | in 8 | { pkgs ? 
pinnedPkgs }: 9 | pkgs.mkShell { 10 | buildInputs = [ 11 | # For benchmark code 12 | (pkgs.python3.withPackages (pyPkgs: [ 13 | pyPkgs.clize 14 | pyPkgs.numpy 15 | ])) 16 | 17 | # For rust implementation 18 | pkgs.cargo 19 | 20 | # For java implementation (uses outdated bazel stuff) 21 | pkgs.jdk8 22 | pkgs.bazel_1 23 | 24 | # For Haskell implementation 25 | pkgs.stack 26 | pkgs.gmp 27 | # GHC 8.10.7 produces faster code with LLVM 9 than with LLVM 12 28 | pkgs.llvmPackages_9.llvm 29 | ]; 30 | } 31 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/package.yaml: -------------------------------------------------------------------------------- 1 | name: ac-bench-ffi 2 | maintainer: Channable 3 | category: Other 4 | synopsis: Benchmark code for alfred-margaret with Rust FFI 5 | description: Benchmark code for alfred-margaret with Rust FFI 6 | version: '0' 7 | github: channable/alfred-margaret 8 | 9 | extra-source-files: 10 | - package.yaml 11 | 12 | library: {} 13 | 14 | dependencies: 15 | - base 16 | 17 | ghc-options: >- 18 | -Wall 19 | -Wincomplete-record-updates 20 | -Wincomplete-uni-patterns 21 | -Wpartial-fields 22 | -j4 23 | +RTS -A64m -RTS 24 | 25 | executables: 26 | ac-bench-ffi: 27 | dependencies: 28 | - alfred-margaret 29 | - base 30 | - bytestring 31 | - clock 32 | - ghc-compact 33 | - deepseq 34 | - primitive 35 | - text 36 | ghc-options: 37 | - -rtsopts 38 | - -threaded 39 | - '"-with-rtsopts=-I0 -T -N -A32m -n4m -qg"' 40 | main: Main.hs 41 | source-dirs: app/ 42 | extra-libraries: 43 | - libacbench 44 | extra-lib-dirs: 45 | - libacbench/target/release -------------------------------------------------------------------------------- /benchmark/report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Report the results of a benchmark run recorded by benchmark.py. 5 | 6 | Usage: 7 | ./report.py PROGRAM0.stats PROGRAM1.stats ... 
8 | """ 9 | 10 | import sys 11 | import numpy as np 12 | 13 | def report_file(file_name: str) -> np.array: 14 | # When running the benchmark on a single file, using ndmin ensures that we actually 15 | # get a 2-dimensional array. 16 | data_ns = np.loadtxt(file_name, ndmin=2, dtype=int) 17 | num_files, num_iterations = data_ns.shape 18 | 19 | data_secs = data_ns / 1e9 20 | mean_times_by_file = np.mean(data_secs, axis=1) 21 | min_times_by_file = np.min(data_secs, axis=1) 22 | variance_by_file = np.var(data_secs, axis=1) 23 | 24 | total_mean_secs = np.sum(mean_times_by_file) 25 | total_min_secs = np.sum(min_times_by_file) 26 | total_variance = np.sum(variance_by_file) 27 | total_stdev = np.sqrt(total_variance) 28 | 29 | print(f'{file_name}:') 30 | print(f' mean time: {total_mean_secs:0.3f} ± {total_stdev:0.3f} seconds') 31 | print(f' min time: {total_min_secs:0.3f} seconds') 32 | 33 | 34 | for file_name in sys.argv[1:]: 35 | report_file(file_name) 36 | print() 37 | -------------------------------------------------------------------------------- /benchmark/naive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import time 5 | 6 | for file_name in sys.argv[1:]: 7 | with open(file_name, 'r', encoding='utf-16le') as f: 8 | needles = [] 9 | 10 | for line in f: 11 | if line == '\n': 12 | break 13 | needles.append(line[:-1]) 14 | 15 | haystack = f.read() 16 | num_matches = 0 17 | 18 | # Measure every input five times. 19 | for i in range(0, 5): 20 | # In Python 3.7 we would be able to use time.monotonic_ns ... 
21 | epoch_ns = int(time.monotonic() * 1e9) 22 | 23 | for needle in needles: 24 | start = 0 25 | while True: 26 | n = haystack.find(needle, start) 27 | if n != -1: 28 | start = n + 1 29 | num_matches += 1 30 | else: 31 | break 32 | 33 | duration_ns = int(time.monotonic() * 1e9) - epoch_ns 34 | print(f'{duration_ns}\t', end='') 35 | 36 | # Print the number of matches once per file to have reference output. 37 | if i == 0: 38 | print(num_matches, file=sys.stderr) 39 | 40 | # Print a newline, so we have one line of timings per input file. 41 | print() 42 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { 2 | pkgs ? import ./nix/nixpkgs-pinned.nix {}, 3 | # Use haskell-languager-server? 4 | hsTools ? false, 5 | # Use tools for the benchmarks in other languages (Cargo, Bazel, etc.)? 6 | benchTools ? false 7 | }: 8 | let 9 | haskellDependencies = import ./nix/haskell-dependencies.nix; 10 | 11 | paths = with pkgs; ( 12 | [ 13 | # Nix tooling 14 | niv 15 | nix-tree 16 | 17 | # Haskell tooling 18 | stack 19 | 20 | # Haskell dependencies 21 | (ghc966Packages.ghcWithPackages haskellDependencies) 22 | 23 | # Other 24 | llvm_13 25 | ] ++ 26 | # We don't use the overlay here because the tooling doesn't need it. 27 | # The advantage of doing so is that these packages are already available in a global cache. 
28 | lib.optionals hsTools (with haskell.packages.ghc966; [ 29 | haskell-language-server 30 | implicit-hie 31 | ]) ++ 32 | lib.optionals benchTools [ 33 | (python3.withPackages (pyPkgs: [ 34 | pyPkgs.clize 35 | pyPkgs.numpy 36 | ])) 37 | 38 | # For rust implementation 39 | cargo 40 | 41 | # For java implementation (uses outdated bazel stuff) 42 | jdk8 43 | bazel_1 44 | 45 | # For Haskell implementation 46 | gmp 47 | ] 48 | ); 49 | in 50 | pkgs.buildEnv { 51 | name = "alfred-margaret-env"; 52 | paths = paths; 53 | } 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Channable 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/libacbench/src/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate aho_corasick; 2 | 3 | use aho_corasick::{AcAutomaton, Automaton, Dense, Sparse}; 4 | 5 | #[repr(C)] 6 | #[derive(Debug)] 7 | pub struct U8Slice { 8 | ptr: *const u8, 9 | off: isize, 10 | len: isize, 11 | } 12 | 13 | impl U8Slice { 14 | fn into_slice<'a>(&self) -> &'a [u8] { 15 | slice_from_pointer(self.ptr, self.off, self.len) 16 | } 17 | } 18 | 19 | // https://doc.rust-lang.org/src/core/slice/raw.rs.html#87 20 | fn slice_from_pointer<'a, T>(ptr: *const T, off: isize, len: isize) -> &'a [T] { 21 | &(unsafe { &*std::ptr::slice_from_raw_parts(ptr, (off + len) as usize) })[off as usize..] 
22 | } 23 | 24 | #[no_mangle] 25 | pub extern "C" fn perform_ac( 26 | use_sparse: bool, 27 | num_needles: isize, 28 | needle_slices_: *const U8Slice, 29 | haystack_slice_: *const U8Slice, 30 | ) -> isize { 31 | let needle_slices = slice_from_pointer(needle_slices_, 0, num_needles); 32 | 33 | let mut needles: Vec<&[u8]> = Vec::with_capacity(num_needles as usize); 34 | for i in 0..num_needles as usize { 35 | needles.insert(i, needle_slices[i].into_slice()); 36 | } 37 | 38 | let haystack = slice_from_pointer(haystack_slice_, 0, 1)[0].into_slice(); 39 | 40 | let num_matches = if use_sparse { 41 | let automaton = AcAutomaton::<_, Sparse>::with_transitions(&needles[..]); 42 | automaton.find_overlapping(haystack).count() 43 | } else { 44 | let automaton = AcAutomaton::<_, Dense>::with_transitions(&needles[..]); 45 | automaton.find_overlapping(haystack).count() 46 | }; 47 | 48 | num_matches as isize 49 | } 50 | -------------------------------------------------------------------------------- /benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: See benchmark/README.md 4 | 5 | # For bold output 6 | function echobold { 7 | tput bold 8 | echo "$@" 9 | tput sgr0 10 | } 11 | 12 | # Read argument 13 | benchcase=$1 14 | 15 | # Prepare variables 16 | revision=$(git rev-parse --short HEAD) 17 | prefix="$benchcase-$revision" 18 | 19 | echobold Selecting benchmark case... 
20 | 21 | # Set $exe and $datadir 22 | case $benchcase in 23 | python) 24 | exe=./benchmark/naive.py 25 | datadir=./benchmark/data 26 | ;; 27 | rust) 28 | exe=./benchmark/rust/target/release/acbench-rust 29 | datadir=./benchmark/data 30 | ;; 31 | java) 32 | exe=./benchmark/java/bazel-bin/acbench 33 | datadir=./benchmark/data 34 | ;; 35 | haskell) 36 | exe=$(stack path --local-install-root)/bin/ac-bench 37 | datadir=./benchmark/data-utf8 38 | ;; 39 | rust-ffi) 40 | exe=$(stack path --local-install-root)/bin/ac-bench-ffi 41 | datadir=./benchmark/data-utf8 42 | ;; 43 | *) 44 | echobold "Invalid benchmark case, please pass one of {python,rust,java,haskell,rust-ffi}" >&2 45 | exit 1 46 | esac 47 | 48 | # Absolute paths may help with debugging 49 | exe_absolute=$(readlink -f "$exe") 50 | datadir_absolute=$(readlink -f "$datadir") 51 | 52 | # Print some debugging information 53 | echobold "I will benchmark this executable:" 54 | echo "$exe_absolute" 55 | echobold "Using data from this directory:" 56 | echo "$datadir_absolute" 57 | echobold "And store the results in these files:" 58 | echo "$prefix.{stats,results}" 59 | 60 | # Run the benchmark 61 | echobold "Running the benchmark..." 62 | ./benchmark/benchmark.py "$exe" --data-directory "$datadir_absolute" --prefix "$prefix" 63 | echobold "Done!" 
-------------------------------------------------------------------------------- /.semaphore/semaphore.yml: -------------------------------------------------------------------------------- 1 | version: v1.0 2 | name: "Semaphore pipeline for alfred-margaret" 3 | 4 | agent: 5 | machine: 6 | type: f1-standard-2 7 | os_image: ubuntu2204 8 | 9 | # Automatically cancel jobs that are already running for all branches but master 10 | auto_cancel: 11 | running: 12 | when: "branch != 'master'" 13 | 14 | blocks: 15 | - name: Stack 16 | task: 17 | prologue: 18 | commands: 19 | # Checkout out the repository 20 | - checkout 21 | - git log --max-count 1 --oneline --no-decorate 22 | # Set up /nix with correct permissions 23 | - sudo mkdir /nix 24 | - sudo chown semaphore:semaphore /nix 25 | # Restore any nix cache that we can find 26 | - cache restore nix-store- 27 | # Install Nix 28 | - curl -o install-nix-2.24.12 https://releases.nixos.org/nix/nix-2.24.12/install 29 | - sha256sum --check .semaphore/install-nix.sha256 30 | 31 | - sh ./install-nix-2.24.12 --no-daemon 32 | # Enable `nix-command` feature, which `nix build` needs to build 33 | - sudo mkdir /etc/nix 34 | - echo 'experimental-features = nix-command' | sudo tee -a /etc/nix/nix.conf 35 | 36 | - source "$HOME/.nix-profile/etc/profile.d/nix.sh" 37 | # Configure cachix *without building the environment in default.nix* 38 | - nix shell -f nix/nixpkgs-pinned.nix cachix -c cachix use channable-public 39 | # Restore .stack-work and ~/.stack from cache for faster builds 40 | - cache restore home-stack-$SEMAPHORE_GIT_BRANCH 41 | - cache restore stack-work-$SEMAPHORE_GIT_BRANCH 42 | 43 | jobs: 44 | - name: Test alfred-margaret 45 | commands: 46 | - nix shell -f default.nix -c stack test alfred-margaret:test-suite 47 | 48 | epilogue: 49 | commands: 50 | # Fill caches 51 | - cache store home-stack-$SEMAPHORE_GIT_BRANCH ~/.stack 52 | - cache store stack-work-$SEMAPHORE_GIT_BRANCH .stack-work 53 | # Store a copy of the nix store. 
This will be refreshed daily, which 54 | # is more than sufficient for this repo. Semaphore's cache is faster 55 | # than Cachix. 56 | - "cache store nix-store-$(date -u -Idate) /nix" 57 | -------------------------------------------------------------------------------- /benchmark/rust/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "acbench-rust" 7 | version = "0.0.0" 8 | dependencies = [ 9 | "aho-corasick", 10 | "filebuffer", 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "aho-corasick" 16 | version = "0.6.8" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "68f56c7353e5a9547cbd76ed90f7bb5ffc3ba09d4ea9bd1d8c06c8b1142eeb5a" 19 | dependencies = [ 20 | "memchr", 21 | ] 22 | 23 | [[package]] 24 | name = "cfg-if" 25 | version = "0.1.5" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "0c4e7bb64a8ebb0d856483e1e682ea3422f883c5f5615a90d51a2c82fe87fdd3" 28 | 29 | [[package]] 30 | name = "filebuffer" 31 | version = "0.4.0" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "1b41bfe1d74263ea9d084be951077614b3b98b4e59a9dafab1467645a9e52305" 34 | dependencies = [ 35 | "libc", 36 | "winapi", 37 | ] 38 | 39 | [[package]] 40 | name = "libc" 41 | version = "0.2.43" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d" 44 | 45 | [[package]] 46 | name = "memchr" 47 | version = "2.1.0" 48 | source = "registry+https://github.com/rust-lang/crates.io-index" 49 | checksum = "4b3629fe9fdbff6daa6c33b90f7c08355c1aca05a3d01fa8063b822fcf185f3b" 50 | dependencies = [ 51 | "cfg-if", 52 | "libc", 53 | "version_check", 54 | ] 55 | 56 | [[package]] 57 | name = "version_check" 58 | version = 
"0.1.5" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | checksum = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" 61 | 62 | [[package]] 63 | name = "winapi" 64 | version = "0.3.6" 65 | source = "registry+https://github.com/rust-lang/crates.io-index" 66 | checksum = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" 67 | dependencies = [ 68 | "winapi-i686-pc-windows-gnu", 69 | "winapi-x86_64-pc-windows-gnu", 70 | ] 71 | 72 | [[package]] 73 | name = "winapi-i686-pc-windows-gnu" 74 | version = "0.4.0" 75 | source = "registry+https://github.com/rust-lang/crates.io-index" 76 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 77 | 78 | [[package]] 79 | name = "winapi-x86_64-pc-windows-gnu" 80 | version = "0.4.0" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 83 | -------------------------------------------------------------------------------- /benchmark/java/main.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.lang.StringBuilder; 3 | import java.nio.charset.Charset; 4 | import java.nio.file.Files; 5 | import java.nio.file.Paths; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.TreeMap; 9 | 10 | import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie; 11 | import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie.IHit; 12 | 13 | class MatchCounter implements IHit { 14 | public int count; 15 | 16 | public MatchCounter() { 17 | this.count = 0; 18 | } 19 | 20 | public void hit(int begin, int end, String value) { 21 | this.count++; 22 | } 23 | } 24 | 25 | class AcBench { 26 | public static void main(String[] args) { 27 | for (int i = 0; i < args.length; i++) { 28 | try { 29 | AcBench.processFile(args[i]); 30 | } catch (IOException ex) { 31 | 
System.out.println(ex.toString()); 32 | System.exit(1); 33 | } 34 | } 35 | } 36 | 37 | static void processFile(String filename) throws IOException { 38 | // Preparation step: read the file, needles first, then the haystack. 39 | Charset utf16le = Charset.forName("UTF-16LE"); 40 | List fileLines = Files.readAllLines(Paths.get(filename), utf16le); 41 | TreeMap needles = new TreeMap(); 42 | StringBuilder haystackBuilder = new StringBuilder(); 43 | 44 | boolean isNeedles = true; 45 | for (int i = 0; i < fileLines.size(); i++) { 46 | String line = fileLines.get(i); 47 | 48 | if (line.equals("")) { 49 | // A blank line signals the end of the needle section. 50 | isNeedles = false; 51 | continue; 52 | } 53 | 54 | if (isNeedles) { 55 | needles.put(line, line); 56 | } else { 57 | haystackBuilder.append(line); 58 | haystackBuilder.append('\n'); 59 | } 60 | } 61 | 62 | String haystack = haystackBuilder.toString(); 63 | 64 | // Now for the real work: build the automaton and print the number of matches. 65 | 66 | // Measure every input 5 times. 67 | for (int i = 0; i < 5; i++) { 68 | MatchCounter matchCounter = new MatchCounter(); 69 | 70 | long epoch = System.nanoTime(); 71 | if (needles.size() > 0) { 72 | AhoCorasickDoubleArrayTrie automaton = new AhoCorasickDoubleArrayTrie(); 73 | automaton.build(needles); 74 | automaton.parseText(haystack, matchCounter); 75 | } 76 | long durationNs = System.nanoTime() - epoch; 77 | 78 | if (i == 0) { 79 | // Only print count to stderr the first iteration, to check correctness. 80 | System.err.format("%d\n", matchCounter.count); 81 | } 82 | 83 | System.out.format("%d\t", durationNs); 84 | } 85 | 86 | // Print a newline, so we print one line of measurements per file. 
87 | System.out.println(""); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /bench/uvector-vs-tba/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | -- | Microbenchmark to measure the difference in 'TypedByteArray' and 'Data.Vector.Unboxed.Vector' indexing performance. 4 | -- The latter is a slice into a 'ByteArray' which means that every time you retrieve and element, an offset 5 | -- needs to be added to your index. 6 | -- 7 | -- To reproduce: 8 | -- 9 | -- @ 10 | -- stack bench alfred-margaret:uvector-vs-tba --ba '--output uvector-vs-tba.html' 11 | -- @ 12 | -- 13 | -- You can pass a greater @--time-limit@ (in the single quotes) to increase the number of iterations. 14 | -- 15 | -- NOTE: 'readUVector' and 'genUVector' are marked @NOINLINE@ to prevent GHC optimizing away the indexing addition. 16 | -- 'readTba' and 'genTba' are marked @NOINLINE@ as well for fairness, altough it shouldn't make a difference there. 17 | module Main where 18 | 19 | import Control.Monad.ST (runST) 20 | import Criterion.Main (Benchmark, bench, bgroup, defaultMain, nf) 21 | import Data.Foldable (for_) 22 | import Text.Printf (printf) 23 | 24 | import qualified Data.Vector.Unboxed as UVector 25 | import qualified Data.Vector.Unboxed.Mutable as UMVector 26 | 27 | import qualified Data.TypedByteArray as TBA 28 | 29 | main :: IO () 30 | main = defaultMain 31 | [ bgroup "tba" $ mkReadBenchs readTba genTba [7, 8] 32 | , bgroup "uvector" $ mkReadBenchs readUVector genUVector [7, 8] 33 | ] 34 | 35 | mkReadBenchs 36 | :: (Int -> a -> Int) -- ^ Function that reads from an array @n@ times. 37 | -> (Int -> a) -- ^ Function that constructs an array of length @n@. 38 | -> [Int] -- ^ Which powers of 10 to pass for @n@. 
39 | -> [Benchmark] 40 | mkReadBenchs readPattern gen powers = 41 | [ bench (printf "%d reads" n) $ nf (readPattern n) $ gen n 42 | | n <- map (10^) powers 43 | ] 44 | 45 | {-# NOINLINE readTba #-} 46 | readTba :: Int -> TBA.TypedByteArray Int -> Int 47 | readTba !n !arr = go 0 48 | where 49 | go !i 50 | | i >= n = 42 51 | | otherwise = go $ TBA.unsafeIndex arr i 52 | 53 | {-# NOINLINE readUVector #-} 54 | readUVector :: Int -> UVector.Vector Int -> Int 55 | readUVector !n !arr = go 0 56 | where 57 | go !i 58 | | i >= n = 42 59 | | otherwise = go $ UVector.unsafeIndex arr i 60 | 61 | -- NOTE: We should probably measure pseudo-random access time as well, e.g. by shuffling the generated arrays. 62 | 63 | {-# NOINLINE genTba #-} 64 | genTba :: Int -> TBA.TypedByteArray Int 65 | genTba n = runST $ do 66 | mutArr <- TBA.newTypedByteArray n 67 | for_ [0 .. n-1] $ \i -> TBA.writeTypedByteArray mutArr i $ i + 1 68 | TBA.unsafeFreezeTypedByteArray mutArr 69 | 70 | -- | Generate an unboxed vector @v@ such that @v[i] == i + 1@. 71 | {-# NOINLINE genUVector #-} 72 | genUVector :: Int -> UVector.Vector Int 73 | genUVector n = runST $ do 74 | mutArr <- UMVector.new n 75 | for_ [0 .. n-1] $ \i -> UMVector.write mutArr i $ i + 1 76 | UVector.unsafeFreeze mutArr 77 | -------------------------------------------------------------------------------- /benchmark/haskell/app/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# OPTIONS_GHC -fllvm -O2 -optlo=-O3 -optlo=-tailcallelim #-} 5 | 6 | -- | Benchmark for our Aho-Corasick implementation. 
7 | module Main where 8 | 9 | import Control.DeepSeq (force) 10 | import Control.Exception (evaluate) 11 | import Control.Monad (void, when) 12 | import Data.Foldable (for_, traverse_) 13 | import System.IO (hPrint, stderr, stdout) 14 | import Text.Printf (hPrintf) 15 | 16 | import qualified Data.ByteString as ByteString 17 | import qualified Data.Text.Encoding as Encoding 18 | import qualified System.Clock as Clock 19 | import qualified System.Environment as Env 20 | 21 | import Data.Text.Utf8 (CodeUnitIndex(..), Text (..)) 22 | 23 | import qualified Data.Text.Utf8 as Utf8 24 | import qualified Data.Text.AhoCorasick.Automaton as Aho 25 | 26 | readNeedleHaystackFile :: FilePath -> IO ([Text], Text) 27 | readNeedleHaystackFile path = do 28 | (Text u8data off len) <- Encoding.decodeUtf8 <$> ByteString.readFile path 29 | pure $ go u8data off len [] 30 | where 31 | go u8data off 0 needles = (reverse needles, Text u8data off 0) 32 | go u8data off len needles 33 | -- "line starts with newline char" ==> empty line, emit haystack as slice of u8data 34 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = (reverse needles, Text u8data (off + 1) (len - 1)) 35 | | otherwise = consumeNeedle u8data off len needles off 36 | 37 | consumeNeedle u8data off len needles needleStart 38 | -- Newline ==> emit needle as slice of u8data 39 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = go u8data (off + 1) (len - 1) $ Text u8data needleStart (off - needleStart) : needles 40 | | otherwise = consumeNeedle u8data (off + 1) (len - 1) needles needleStart 41 | 42 | main :: IO () 43 | main = Env.getArgs >>= traverse_ processFile 44 | 45 | processFile :: FilePath -> IO () 46 | processFile path = do 47 | (needles, haystack) <- readNeedleHaystackFile path 48 | 49 | void $ evaluate $ force needles 50 | void $ evaluate $ force haystack 51 | 52 | for_ [0 :: Int .. 
4] $ \i -> do 53 | (count, duration) <- acBench needles haystack 54 | when (i == 0) $ 55 | hPrint stderr count 56 | hPrintf stdout "%d\t" (Clock.toNanoSecs duration) 57 | hPrintf stdout "\n" 58 | 59 | acBench :: [Text] -> Text -> IO (Int, Clock.TimeSpec) 60 | {-# NOINLINE acBench #-} 61 | acBench needles haystack = do 62 | start <- Clock.getTime Clock.Monotonic 63 | matchCount <- evaluate $ countMatches needles haystack 64 | end <- Clock.getTime Clock.Monotonic 65 | pure (matchCount, Clock.diffTimeSpec start end) 66 | 67 | countMatches :: [Text] -> Text -> Int 68 | {-# NOINLINE countMatches #-} 69 | countMatches needles haystack = case needles of 70 | [] -> 0 71 | _ -> 72 | let 73 | ac = Aho.build $ zip needles (repeat ()) 74 | onMatch !n _match = Aho.Step (n + 1) 75 | in 76 | Aho.runText 0 onMatch ac haystack 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v2.1.0.2 - "All the bounds!" (2024-09-06) 4 | 5 | Publishing requires `cabal` to know the version bounds as well. 6 | 7 | ## v2.1.0.1 - "Higher bounds!" (2024-09-05) 8 | 9 | Tested on GHC 9.6.6. 10 | 11 | - Revise dependency bounds ([#62](https://github.com/channable/alfred-margaret/issues/62) thanks @Bodigrim) 12 | - Allow using primitive < 0.9 and vector < 0.14 ([#59](https://github.com/channable/alfred-margaret/pull/59) thanks @rampion) 13 | 14 | ## v2.1.0.0 - "All The Cases!" (2022-08-31) 15 | 16 | - Added a case-insensitive variant of the Boyer-Moore algorithm in the `Data.Text.BoyerMooreCI.*` modules. ([#47](https://github.com/channable/alfred-margaret/pull/47)) 17 | - Fixed a bug in the case-insensitive Aho-Corasick replacer where it would 18 | replace the wrong section of the haystack when the needle had a different 19 | byte-length than the matching part of the haystack. 
([#47](https://github.com/channable/alfred-margaret/pull/47)) 20 | - Allow mapping the payloads of Aho-Corasick automatons. ([#46](https://github.com/channable/alfred-margaret/pull/46)) 21 | 22 | ## v2.0.0.0 - "So Long Surrogates" (2022-05-02) 23 | 24 | Switched to text-2.0 which uses UTF-8 encoding internally. 25 | 26 | - Removed `Data.Text.Utf8.*` modules 27 | - Replaced `Data.Text.AhoCorasick.*` and `Data.Text.BoyerMoore.*` (previously using UTF-16) with the UTF-8 implementation 28 | 29 | ## v1.1.2.0 - "ByteArray Boogaloo" (2022-04-21) 30 | 31 | Added UTF-8 implementations on a mock `Text` type (in `Data.Text.Utf8`). 32 | 33 | - Added `Data.Text.Utf8*` modules 34 | - Moved `CaseSensitivity` to its own `Data.Text.CaseSensitivity` module. 35 | - Added the private module `Data.TypedByteArray` which contains thin wrappers over `ByteArray` and `MutableByteArray`. 36 | - Replaced uses of `Data.Vector.Unboxed.Vector` by `TypedByteArray`. 37 | 38 | ## v1.1.0.0 - "Moore Features" (2020-10-13) 39 | 40 | The most notable addition in this release is the implementation of the Boyer-Moore string search algorithm. 
41 | 42 | **Compatibility:** 43 | 44 | - Extracted the UTF-16 manipulation functions from `Data.Text.AhoCorasick.Automaton` into `Data.Text.Utf16` 45 | - Changed `Data.Text.AhoCorasick.Searcher.Searcher` to remember the case sensitivity used for constructing the searcher 46 | - Removed `Data.Text.AhoCorasick.Searcher.containsAnyIgnoreCase`, the correct implementation is now chosen by `containsAny` based on the case sensitivity of the searcher 47 | 48 | Other changes: 49 | 50 | - Added `Data.Text.AhoCorasick.Splitter` for splitting a lot of text using the same needle 51 | - Added `Data.Text.BoyerMoore.Automaton`, a UTF-16 implementation of Boyer-Moore 52 | - Added `Data.Text.BoyerMoore.Searcher` for searching for multiple needles at once using Boyer-Moore 53 | - Added `Data.Text.BoyerMoore.Replacer` for replacing text based on the Boyer-Moore search 54 | - Added optional `FromJSON`/`ToJSON` instances for most types (can be toggled via `aeson` cabal flag) 55 | 56 | ## v1.0.0.0 - "Initial Release" (2019-03-19) 57 | 58 | This is the initial open-source release. 
59 | 60 | - Added `Data.Text.AhoCorasick.Automaton`, a UTF-16 implementation of the Aho-Corasick search algorithm 61 | - Added `Data.Text.AhoCorasick.Searcher`, a bulk search abstraction based on Aho-Corasick 62 | - Added `Data.Text.AhoCorasick.Replacer`, a bulk replace abstraction based on Aho-Corasick 63 | -------------------------------------------------------------------------------- /src/Data/Text/Utf8/Unlower.hs: -------------------------------------------------------------------------------- 1 | 2 | module Data.Text.Utf8.Unlower 3 | ( 4 | unlowerCodePoint 5 | 6 | , printUnlowerings 7 | ) where 8 | 9 | import Control.Monad (forM_) 10 | 11 | import qualified Data.Char as Char 12 | import qualified Data.HashMap.Strict as HashMap 13 | import qualified Data.List as List 14 | 15 | 16 | -- | Inverse of Char.toLower/Utf8.lowerCodePoint 17 | -- 18 | -- Returns all the characters that have the given character as their lower case, for example: 19 | -- 20 | -- unlowerCodePoint 'a' == "aA" 21 | -- unlowerCodePoint 'A' == "" 22 | -- unlowerCodePoint '1' == "1" 23 | -- unlowerCodePoint 'i' == "İiI" 24 | -- unlowerCodePoint 'ß' == "ẞß" 25 | -- 26 | unlowerCodePoint :: Char -> [Char] 27 | unlowerCodePoint = 28 | \c -> maybe [c] id $ HashMap.lookup c unlowerings 29 | 30 | -- | This map contains all the unlowerings for which the result is not just a singleton with the 31 | -- input character. It's marked NOINLINE to make sure that it only gets constructed once. 32 | unlowerings :: HashMap.HashMap Char [Char] 33 | {-# NOINLINE unlowerings #-} 34 | unlowerings = 35 | HashMap.filterWithKey isNotId $ List.foldl' (flip addUnlowering) initialMap [minBound..maxBound] 36 | where 37 | initialMap = HashMap.fromList $ zip [minBound..maxBound] (repeat []) 38 | addUnlowering c hm = 39 | HashMap.insertWith (++) (Char.toLower c) [c] hm 40 | isNotId lc ucs = ucs /= [lc] 41 | 42 | 43 | -- | This function prints all the special cases of unlowerCodePoint where it's not @(pure . 
id)@: 44 | -- 45 | -- SPECIAL: i (105) -> İ (304) i (105) I (73) 46 | -- SPECIAL: k (107) -> K (8490) k (107) K (75) 47 | -- SPECIAL: ß (223) -> ẞ (7838) ß (223) 48 | -- SPECIAL: å (229) -> Å (8491) å (229) Å (197) 49 | -- SPECIAL: dž (454) -> dž (454) Dž (453) DŽ (452) 50 | -- SPECIAL: lj (457) -> lj (457) Lj (456) LJ (455) 51 | -- SPECIAL: nj (460) -> nj (460) Nj (459) NJ (458) 52 | -- SPECIAL: dz (499) -> dz (499) Dz (498) DZ (497) 53 | -- SPECIAL: θ (952) -> ϴ (1012) θ (952) Θ (920) 54 | -- SPECIAL: ω (969) -> Ω (8486) ω (969) Ω (937) 55 | -- [..] 56 | -- Inverse of Char.toUpper: a (97) -> a (97) A (65) 57 | -- Inverse of Char.toUpper: b (98) -> b (98) B (66) 58 | -- Inverse of Char.toUpper: c (99) -> c (99) C (67) 59 | -- [..] 60 | -- 61 | printUnlowerings :: IO () 62 | printUnlowerings = do 63 | 64 | let 65 | showCP :: Char -> String 66 | showCP c = case Char.ord c of 67 | co | co > 68000 -> show co -- Some RTL languages above these code points are annoying to print 68 | co -> c : " (" <> show co <> ")" 69 | 70 | showCPs :: [Char] -> String 71 | showCPs cs = List.intercalate " " (map showCP cs) 72 | 73 | isInverse (lc, ucs) = ucs == [lc, Char.toUpper lc] || ucs == [Char.toUpper lc, lc] 74 | isAlreadyUppercase (_, ucs) = ucs == [] 75 | isSpecial p = not (isInverse p) && not (isAlreadyUppercase p) 76 | 77 | lst :: [(Char, [Char])] 78 | lst = HashMap.toList unlowerings 79 | 80 | forM_ (filter isSpecial lst) $ \(lc, ucs) -> do 81 | putStrLn $ "SPECIAL: " <> showCP lc <> " -> " <> showCPs ucs 82 | 83 | forM_ (filter isAlreadyUppercase lst) $ \(lc, _) -> do 84 | putStrLn $ "Already uppercase (there is no unlowering): " <> showCP lc 85 | 86 | forM_ (filter isInverse lst) $ \(lc, ucs) -> do 87 | putStrLn $ "Inverse of Char.toUpper: " <> showCP lc <> " -> " <> List.intercalate " " (map showCP ucs) 88 | -------------------------------------------------------------------------------- /benchmark/benchmark.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import clize 4 | import os 5 | import os.path 6 | import subprocess 7 | import sys 8 | 9 | # clize turns this into a CLI where program is a required argument and prefix is a required option 10 | def benchmark(program, *, prefix, data_directory="data"): 11 | """ 12 | This script runs an executable 5 times and collects the running times it 13 | reports. Names of the .txt files in the data/ directory are provivided to the 14 | program on the command line. Expected output of the program is tab-separated 15 | times (with trailing tab) per run in nanoseconds on stdout, and the number of 16 | matches on stderr (only once, not per run). The output should be one line per 17 | input file. 18 | 19 | Example: 20 | ./benchmark.py ./naive.py --prefix python 21 | 22 | :param program: What to benchmark. 23 | :param prefix: Used for generating output files $prefix.{stats,results} 24 | :param data_directory: Directory containing the benchmark data files. 25 | """ 26 | 27 | # # Disable automatic CPU frequency scaling to get lower variance measurements. 
28 | with open('/sys/devices/system/cpu/cpu1/cpufreq/scaling_governor', 'r') as f: 29 | scaling_governor = next(f) 30 | 31 | if scaling_governor != 'performance\n': 32 | print('Please run the following to clock CPU frequency to its maximum:') 33 | print('echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor') 34 | sys.exit(1) 35 | 36 | input_file_names = [] 37 | for f in os.listdir(data_directory): 38 | file_name = os.path.join(data_directory, f) 39 | if os.path.isfile(file_name) and file_name.endswith('.txt'): 40 | input_file_names.append(os.path.abspath(file_name)) 41 | 42 | # Makes comparison of .results files easier 43 | input_file_names.sort() 44 | 45 | print(f'Found {len(input_file_names)} files to benchmark.') 46 | 47 | # Run the program to benchmark under taskset to lock it to CPU core 1. This 48 | # avoids variance due to CPU migrations. 49 | cmdline = ['taskset', '-c', '1', program] + input_file_names 50 | 51 | times = [] 52 | results = None 53 | 54 | num_rounds = 5 55 | for i in range(0, num_rounds): 56 | print(f'Round {i + 1} of {num_rounds}.') 57 | p = subprocess.run(cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 58 | 59 | if p.returncode != 0: 60 | print(f'Running {cmdline[:6]}... failed.') 61 | print(p.stdout.decode('utf-8')) 62 | print(p.stderr.decode('utf-8')) 63 | sys.exit(1) 64 | 65 | assert results is None or results == p.stderr, ( 66 | 'Program should have consistent output.\n' 67 | f'{results!r} != {p.stderr!r}' 68 | ) 69 | results = p.stderr 70 | 71 | # Paste outputs of the different runs together, like GNU Paste. 
72 | if len(times) == 0: 73 | times = p.stdout.splitlines() 74 | else: 75 | times = [acc + new for acc, new in zip(times, p.stdout.splitlines())] 76 | 77 | with open(f'{prefix}.stats', 'wb') as f: 78 | f.writelines(ts + b'\n' for ts in times) 79 | 80 | with open(f'{prefix}.results', 'wb') as f: 81 | f.write(results) 82 | 83 | print(f'Results written to {prefix}.{{stats,results}}.') 84 | 85 | if __name__ == '__main__': 86 | clize.run(benchmark) 87 | -------------------------------------------------------------------------------- /benchmark/rust/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate aho_corasick; 2 | extern crate filebuffer; 3 | extern crate memchr; 4 | 5 | use std::env; 6 | use std::io; 7 | 8 | use aho_corasick::{AcAutomaton, Automaton, Dense}; 9 | use filebuffer::FileBuffer; 10 | use memchr::memchr; 11 | 12 | /// Print the number of matches found in the haystack in the file, and timings. 13 | /// 14 | /// The file format consists of needles, one per line, followed by a blank line, 15 | /// followed by the haystack. The file is encoded as UTF-16 LE without byte 16 | /// order mark. 17 | fn process_file(file_name: String) -> io::Result<()> { 18 | let file = FileBuffer::open(file_name)?; 19 | 20 | let mut needles = Vec::with_capacity(1000); 21 | let mut start = 0; 22 | 23 | loop { 24 | // The UTF-16 LE newline sequence is an ascii newline (0x0a) and a null 25 | // byte. Locate the newline byte (0x0a) first, then check that it is 26 | // part of 0x0a, 0x00 with the proper alignment. 27 | let mut len = 0; 28 | loop { 29 | len += memchr(0x0a, &file[start + len..]).expect("Unexpected EOF before haystack"); 30 | if len % 2 != 0 { 31 | // We are looking for the first byte of the 16-bit integer, 32 | // not the second one. 33 | continue; 34 | } 35 | 36 | if let Some(0x00) = file.get(start + len + 1) { 37 | // The second byte is as expected, this really was a newline. 
38 | break; 39 | } 40 | 41 | // Skip over the 0x0a that is not a newline on the next memchr. 42 | len += 1; 43 | } 44 | 45 | // A blank line ends the needle section of the file, what follows is 46 | // haystack. 47 | if len == 0 { 48 | break; 49 | } 50 | 51 | // The newline byte is not part of the needle. 52 | let end = start + len; 53 | needles.push(&file[start..end]); 54 | 55 | // The next needle starts after the null byte after the newline byte. 56 | start = start + len + 2; 57 | } 58 | 59 | let haystack = &file[start..]; 60 | 61 | // Run the benchmark 5 times. 62 | for i in 0..5 { 63 | let epoch = std::time::Instant::now(); 64 | // NOTE: You can opt for aho_corasick::Dense or Sparse. Dense is more memory 65 | // efficient and Sparse is faster according to the docs, however, in my 66 | // measurements, Dense is faster. You can also leave off the .into_full() 67 | // which is also more memory efficient but slower according to the docs, 68 | // and also according to my measurements. 69 | // TODO: Re-enable .into_full() once https://github.com/BurntSushi/aho-corasick/issues/35 70 | // is fixed. 71 | let automaton = AcAutomaton::<_, Dense>::with_transitions(&needles[..]); 72 | let num_matches = automaton.find_overlapping(haystack).count(); 73 | 74 | let duration = epoch.elapsed(); 75 | 76 | // Print duration in nanoseconds, tab separated. 77 | print!("{}\t", duration.as_nanos()); 78 | 79 | // In the first iteration, print the match count to stderr, so we can 80 | // verify it against the reference implementation for correctness. 81 | if i == 0 { 82 | eprintln!("{}", num_matches); 83 | } 84 | } 85 | 86 | // Print a newline, so we print one line of measurements per file. 87 | println!(""); 88 | 89 | Ok(()) 90 | } 91 | 92 | fn main() { 93 | // Skip the program name. 
94 | for file_name in env::args().skip(1) { 95 | process_file(file_name).unwrap(); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMooreCI/Replacer.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE GHC2021 #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE DerivingStrategies #-} 10 | {-# LANGUAGE FlexibleInstances #-} 11 | 12 | module Data.Text.BoyerMooreCI.Replacer 13 | ( -- Replacer 14 | replaceSingleLimited 15 | ) where 16 | 17 | import GHC.Generics (Generic) 18 | import Data.Text.Utf8 (Text) 19 | import Data.Text.BoyerMooreCI.Automaton (Automaton, CodeUnitIndex) 20 | 21 | import qualified Data.Text.Utf8 as Text 22 | import qualified Data.Text.Utf8 as Utf8 23 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMoore 24 | 25 | -- | Replace all occurrences matched by the Boyer-Moore automaton 26 | -- with the given replacement text in some haystack. 27 | -- Performs case-sensitive replacement. 
28 | replaceSingleLimited 29 | :: Automaton -- ^ Matches the needles 30 | -> Text -- ^ Replacement string 31 | -> Text -- ^ Haystack 32 | -> CodeUnitIndex -- ^ Maximum number of code units in the returned text 33 | -> Maybe Text 34 | replaceSingleLimited needle replacement haystack maxLength 35 | | needleLength == 0 = Just $ if haystackLength == 0 then replacement else haystack 36 | | otherwise = finish $ BoyerMoore.runText initial foundMatch needle haystack 37 | where 38 | needleLength = BoyerMoore.patternLength needle 39 | haystackLength = Utf8.lengthUtf8 haystack 40 | replacementLength = Utf8.lengthUtf8 replacement 41 | 42 | initial = ReplaceState 43 | { rsChunks = [] 44 | , rsPreviousMatchEnd = 0 45 | , rsLength = 0 46 | } 47 | 48 | foundMatch rs matchStart matchEnd = 49 | let 50 | -- Slice the part of the haystack between the end of the previous match 51 | -- and the start of the current match 52 | haystackPartLength = matchStart - rsPreviousMatchEnd rs 53 | haystackPart = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 54 | 55 | -- Add the preceding part of the haystack and the replacement in reverse 56 | -- order to the chunk list (all chunks will be reversed at once in the final step). 57 | newChunks = replacement : haystackPart : rsChunks rs 58 | newLength = replacementLength + haystackPartLength + rsLength rs 59 | 60 | newState = ReplaceState 61 | { rsChunks = newChunks 62 | , rsPreviousMatchEnd = matchEnd + 1 63 | , rsLength = newLength 64 | } 65 | in 66 | if newLength > maxLength 67 | then BoyerMoore.Done newState 68 | else BoyerMoore.Step newState 69 | 70 | finish rs = 71 | let 72 | -- Slice the remaining part of the haystack from the end of the last match 73 | -- to the end of the haystack. 
74 | haystackPartLength = haystackLength - rsPreviousMatchEnd rs 75 | finalChunks 76 | = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 77 | : rsChunks rs 78 | finalLength = rsLength rs + haystackPartLength 79 | in 80 | if finalLength > maxLength 81 | then Nothing 82 | else Just $ Text.concat $ reverse finalChunks 83 | 84 | -- | Internal accumulator state for performing a replace while stepping an automaton 85 | data ReplaceState = ReplaceState 86 | { rsChunks :: [Text] 87 | -- ^ Chunks of the final text, in reverse order so that we can efficiently prepend 88 | , rsPreviousMatchEnd :: !CodeUnitIndex 89 | -- ^ Index one past the end of the last match. 90 | , rsLength :: !CodeUnitIndex 91 | -- ^ Length of the newly build string so far, measured in CodeUnits 92 | } 93 | deriving Generic 94 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMoore/Replacer.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE GHC2021 #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE DerivingStrategies #-} 10 | {-# LANGUAGE FlexibleInstances #-} 11 | 12 | module Data.Text.BoyerMoore.Replacer 13 | ( -- Replacer 14 | replaceSingleLimited 15 | ) where 16 | 17 | import GHC.Generics (Generic) 18 | import Data.Text.Utf8 (Text) 19 | import Data.Text.BoyerMoore.Automaton (Automaton, CodeUnitIndex) 20 | 21 | import qualified Data.Text.Utf8 as Text 22 | import qualified Data.Text.Utf8 as Utf8 23 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 24 | 25 | -- | Replace all occurrences matched by the Boyer-Moore automaton 26 | -- with the given replacement text in some haystack. 27 | -- Performs case-sensitive replacement. 
28 | replaceSingleLimited 29 | :: Automaton -- ^ Matches the needles 30 | -> Text -- ^ Replacement string 31 | -> Text -- ^ Haystack 32 | -> CodeUnitIndex -- ^ Maximum number of code units in the returned text 33 | -> Maybe Text 34 | replaceSingleLimited needle replacement haystack maxLength 35 | | needleLength == 0 = Just $ if haystackLength == 0 then replacement else haystack 36 | | otherwise = finish $ BoyerMoore.runText initial foundMatch needle haystack 37 | where 38 | needleLength = BoyerMoore.patternLength needle 39 | haystackLength = Utf8.lengthUtf8 haystack 40 | replacementLength = Utf8.lengthUtf8 replacement 41 | 42 | initial = ReplaceState 43 | { rsChunks = [] 44 | , rsPreviousMatchEnd = 0 45 | , rsLength = 0 46 | } 47 | 48 | foundMatch rs matchStart = 49 | let 50 | matchEnd = matchStart + needleLength 51 | 52 | -- Slice the part of the haystack between the end of the previous match 53 | -- and the start of the current match 54 | haystackPartLength = matchStart - rsPreviousMatchEnd rs 55 | haystackPart = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 56 | 57 | -- Add the preceding part of the haystack and the replacement in reverse 58 | -- order to the chunk list (all chunks will be reversed at once in the final step). 59 | newChunks = replacement : haystackPart : rsChunks rs 60 | newLength = replacementLength + haystackPartLength + rsLength rs 61 | 62 | newState = ReplaceState 63 | { rsChunks = newChunks 64 | , rsPreviousMatchEnd = matchEnd 65 | , rsLength = newLength 66 | } 67 | in 68 | if newLength > maxLength 69 | then BoyerMoore.Done newState 70 | else BoyerMoore.Step newState 71 | 72 | finish rs = 73 | let 74 | -- Slice the remaining part of the haystack from the end of the last match 75 | -- to the end of the haystack. 
76 | haystackPartLength = haystackLength - rsPreviousMatchEnd rs 77 | finalChunks 78 | = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 79 | : rsChunks rs 80 | finalLength = rsLength rs + haystackPartLength 81 | in 82 | if finalLength > maxLength 83 | then Nothing 84 | else Just $ Text.concat $ reverse finalChunks 85 | 86 | -- | Internal accumulator state for performing a replace while stepping an automaton 87 | data ReplaceState = ReplaceState 88 | { rsChunks :: [Text] 89 | -- ^ Chunks of the final text, in reverse order so that we can efficiently prepend 90 | , rsPreviousMatchEnd :: !CodeUnitIndex 91 | -- ^ Index one past the end of the last match. 92 | , rsLength :: !CodeUnitIndex 93 | -- ^ Length of the newly build string so far, measured in CodeUnits 94 | } 95 | deriving Generic 96 | -------------------------------------------------------------------------------- /tests/Data/Text/TestInstances.hs: -------------------------------------------------------------------------------- 1 | module Data.Text.TestInstances where 2 | 3 | import Data.Text.Internal (Text (..)) 4 | import Test.QuickCheck (Arbitrary (..), Gen) 5 | 6 | import qualified Data.Text as Text 7 | import qualified Data.Text.Array as TextArray 8 | import qualified Test.QuickCheck as QuickCheck 9 | import qualified Test.QuickCheck.Gen as Gen 10 | 11 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 12 | import qualified Data.Text.Utf8 as Utf8 13 | 14 | 15 | 16 | instance Arbitrary CaseSensitivity where 17 | arbitrary = Gen.elements [CaseSensitive, IgnoreCase] 18 | 19 | instance Arbitrary Utf8.CodeUnitIndex where 20 | arbitrary = fmap Utf8.CodeUnitIndex arbitrary 21 | 22 | 23 | 24 | -- | Copy the text such that it has an arbitrary offset, but the represented 25 | -- text stays the same. 
26 | arbitraryOffset :: Text -> Gen Text 27 | arbitraryOffset (Text sourceData sourceOffset sourceLength) = do 28 | QuickCheck.Positive destOffset <- arbitrary 29 | let destData = TextArray.run $ do 30 | arr <- TextArray.new (destOffset + sourceLength) 31 | TextArray.copyI sourceLength arr destOffset sourceData sourceOffset 32 | pure arr 33 | pure $ Text destData destOffset sourceLength 34 | 35 | 36 | -- | Generate random needles and haystacks, such that the needles have a 37 | -- reasonable probability of occuring in the haystack, which would hardly be the 38 | -- case if we just generated random texts for all of them. 39 | -- 40 | -- We do this by first generating a set of fragments, and then building the 41 | -- haystack and needles by combining these fragments. This way we also get a lot 42 | -- of partial matches where part of a needle does occur in the haystack, but the 43 | -- full needle does not, as well as needles with a shared prefix or suffix. This 44 | -- should fully stress the possible transitions in the search algorithms. 
45 | -- 46 | arbitraryNeedleHaystack :: Gen (Text, Text) 47 | arbitraryNeedleHaystack = do 48 | -- Generate a set of fragments, all within the same arbitrarily chosen alphabet 49 | alphabet <- arbitraryAlphabet 50 | fragments <- Gen.listOf1 $ Gen.resize 5 (arbitraryFragment alphabet) 51 | let 52 | genSmall = Gen.scale (`div` 3) $ Gen.listOf1 $ Gen.elements fragments 53 | genBig = Gen.scale (* 4) $ Gen.listOf1 $ Gen.elements fragments 54 | needle <- arbitraryOffset =<< fmap Text.concat genSmall 55 | haystack <- arbitraryOffset =<< fmap Text.concat genBig 56 | pure (needle, haystack) 57 | 58 | arbitraryNeedlesHaystack :: Gen ([Text], Text) 59 | arbitraryNeedlesHaystack = do 60 | -- Generate a set of fragments, all within the same arbitrarily chosen alphabet 61 | alphabet <- arbitraryAlphabet 62 | fragments <- Gen.listOf1 $ Gen.resize 5 (arbitraryFragment alphabet) 63 | let 64 | genSmall = Gen.scale (`div` 3) $ Gen.listOf1 $ Gen.elements fragments 65 | genBig = Gen.scale (* 4) $ Gen.listOf1 $ Gen.elements fragments 66 | needles <- Gen.listOf1 (arbitraryOffset =<< fmap Text.concat genSmall) 67 | haystack <- arbitraryOffset =<< fmap Text.concat genBig 68 | pure (needles, haystack) 69 | 70 | arbitraryFragment :: [Char] -> Gen Text 71 | arbitraryFragment alphabet = 72 | arbitraryOffset =<< Text.pack <$> Gen.listOf1 (Gen.elements alphabet) 73 | 74 | arbitraryAlphabet :: Gen [Char] 75 | arbitraryAlphabet = 76 | Gen.oneof [ pure simpleAlphabet 77 | , pure fancyAlphabet 78 | , randomAlphabet 79 | ] 80 | 81 | where 82 | 83 | simpleAlphabet = "abAB12" -- Some ascii, include numbers so that not everything has upper/lower cases 84 | 85 | fancyAlphabet = 86 | concat 87 | [ "яЯ" -- Cyrillic, two-byte characters 88 | , "åÅÅ" -- Å '\8491' and Å '\197' both have å '\229' as lower case 89 | ++ "𝄞💩" -- Four byte characters \119070 and \128169 90 | ++ "ßẞ" -- Note that ẞ '\7838' lower cases to ß '\223', but ß '\223' upper cases to ß '\223' 91 | ] 92 | 93 | randomAlphabet = sequence $ 
replicate 8 QuickCheck.arbitrary 94 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # benchmark 2 | 3 | This is the code used to benchmark `alfred-margaret`. 4 | The results of this benchmark are shown in this [Channable tech blog article](https://www.channable.com/tech/how-we-made-haskell-search-strings-as-fast-as-rust). 5 | 6 | ## Setup 7 | 8 | We use Nix to manage dependencies such as `cargo`, `bazel`, `stack` and the required python packages. 9 | In order to drop into a environment containing these, run: 10 | 11 | ``` 12 | # In the repository root (alfred-margaret folder) 13 | nix run --arg benchTools true -c $SHELL 14 | ``` 15 | 16 | The first time you do this will likely take a while unless you've already downloaded those packages (some 5 GiB). 17 | 18 | ## Running the Benchmarks 19 | 20 | Files in the `data` directory must follow this format: 21 | 22 | ``` 23 | Lorem 24 | sunt 25 | officia 26 | 27 | Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 28 | ``` 29 | 30 | The first part of this file defines the search terms (the "needles") `Lorem`, `sunt` and `officia`. 31 | The second part, after a blank line, defines the corpus to search in (the "haystack"). 32 | Currently, these files **must be encoded as UTF-16 Little-Endian without BOM**. 33 | 34 | Each run of `benchmark.sh` generates a pair of `.results` and `.stats` files. 
35 | 36 | ### Python 37 | 38 | Run the benchmark in the root of the repository: 39 | 40 | ``` 41 | ./benchmark.sh python 42 | ``` 43 | 44 | ### Rust 45 | 46 | Compile the Rust implementation by running Cargo in the `./benchmark/rust` directory: 47 | 48 | ``` 49 | cargo build --release 50 | ``` 51 | 52 | Run the benchmark using the compiled binary: 53 | 54 | ``` 55 | ./benchmark.sh rust 56 | ``` 57 | 58 | ### Java 59 | 60 | Compile the Java implementation by running Bazel in the `java` directory: 61 | 62 | ``` 63 | bazel build :acbench_deploy.jar 64 | ``` 65 | 66 | Run the benchmark using the script generated by Bazel: 67 | 68 | ``` 69 | ./benchmark.sh java 70 | ``` 71 | 72 | ### Haskell 73 | 74 | Compile the Haskell implementation by running Stack in the `haskell` directory: 75 | 76 | ``` 77 | stack build 78 | ``` 79 | 80 | Run the benchmark using the compiled binary: 81 | 82 | ``` 83 | ./benchmark.sh haskell 84 | ``` 85 | 86 | Note that you must first convert the data files for the benchmark into UTF-8 and put them in `data-utf8`. 87 | You can use `iconv` for this: 88 | 89 | ``` 90 | mkdir data-utf8 91 | for f in $(ls data); do echo $f; iconv -f UTF-16LE -t UTF-8 data/$f -o data-utf8/$f; done 92 | ``` 93 | 94 | ### Haskell, with Rust FFI 95 | 96 | Compile the static `libacbench` library using `cargo` in the `rust-ffi/libacbench` directory: 97 | 98 | ``` 99 | cargo build --release 100 | ``` 101 | 102 | Make sure to pass `--release`! Now run Stack in the `rust-ffi` directory: 103 | 104 | ``` 105 | stack build 106 | ``` 107 | 108 | Run the benchmark using the compiled binary: 109 | 110 | ``` 111 | ./benchmark.sh rust-ffi 112 | ``` 113 | 114 | Since this version uses the UTF-8 as well, you have to generate the UTF-8 data first as described in the previous section. 
115 | 116 | ## Inspecting the Results 117 | 118 | Once you have a bunch of `.stats` files, you can inspect the results using `report.py`: 119 | 120 | ``` 121 | ./report.py haskell.stats haskell-utf8.stats rust.stats java.stats python.stats 122 | # Will print something like (depending on your machine...) 123 | haskell.stats: 124 | mean time: 3.529 ± 0.010 seconds 125 | min time: 3.431 seconds 126 | 127 | haskell-utf8.stats: 128 | mean time: 3.526 ± 0.028 seconds 129 | min time: 3.392 seconds 130 | 131 | rust.stats: 132 | mean time: 4.771 ± 0.013 seconds 133 | min time: 4.685 seconds 134 | 135 | java.stats: 136 | mean time: 7.452 ± 0.090 seconds 137 | min time: 7.143 seconds 138 | 139 | python.stats: 140 | mean time: 11.370 ± 0.057 seconds 141 | min time: 10.973 seconds 142 | ``` 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alfred–Margaret 2 | 3 | Alfred–Margaret is a fast implementation of the Aho–Corasick string 4 | searching algorithm in Haskell. It powers many string-related operations 5 | in [Channable][channable]. 6 | 7 | The library is designed to work with the [`text`][text] package. It matches 8 | directly on the internal UTF-16 representation of `Text` for efficiency. See the 9 | [announcement blog post][blog-post] for a deeper dive into Aho–Corasick, and the 10 | optimizations that make this library fast. 11 | 12 | Alfred–Margaret is named after Alfred Aho and Margaret Corasick. 13 | 14 | ## Performance 15 | 16 | Running time to count all matches, in a real-world data set, 17 | comparing [a Java implementation][hankcs] and [a Rust implementation][burntsushi] 18 | against Alfred–Margaret, and against memcopy to establish a lower bound: 19 | 20 |

21 | 26 |

27 | 28 | For the full details of this benchmark, see 29 | [our announcement blog post][blog-post], which includes more details about the 30 | data set, the benchmark setup, and a few things to keep in mind when 31 | interpreting this graph. 32 | 33 | ### LLVM 34 | 35 | If you are using LLVM instead of the GHC backend, make sure to compare different versions. 36 | Using LLVM 9 instead of LLVM 12 with GHC 8.10.7 made a significant dent in benchmark times. 37 | For more information, see [this issue][llvm]. 38 | 39 | ## Example 40 | 41 | Check if a string contains one of the needles: 42 | 43 | ```haskell 44 | 45 | import qualified Data.Text.AhoCorasick.Automaton as Aho 46 | import qualified Data.Text.AhoCorasick.Searcher as Searcher 47 | 48 | searcher = Searcher.build Aho.CaseSensitive ["tshirt", "shirts", "shorts"] 49 | 50 | Searcher.containsAny searcher "short tshirts" 51 | > True 52 | 53 | Searcher.containsAny searcher "long shirt" 54 | > False 55 | 56 | Searcher.containsAny searcher "Short TSHIRTS" 57 | > False 58 | 59 | searcher' = Searcher.build Aho.IgnoreCase ["tshirt", "shirts", "shorts"] 60 | 61 | Searcher.containsAny searcher' "Short TSHIRTS" 62 | > True 63 | ``` 64 | 65 | Sequentially replace many needles: 66 | 67 | ```haskell 68 | import Data.Text.AhoCorasick.Automaton (CaseSensitivity (..)) 69 | import qualified Data.Text.AhoCorasick.Replacer as Replacer 70 | 71 | replacer = Replacer.build CaseSensitive [("tshirt", "banana"), ("shirt", "pear")] 72 | 73 | Replacer.run replacer "tshirts for sale" 74 | > "bananas for sale" 75 | 76 | Replacer.run replacer "tshirts and shirts for sale" 77 | > "bananas and pears for sale" 78 | 79 | Replacer.run replacer "sweatshirts and shirtshirts" 80 | > "sweabananas and shirbananas" 81 | 82 | Replacer.run replacer "sweatshirts and shirttshirts" 83 | > "sweabananas and pearbananas" 84 | ``` 85 | 86 | Get all matches, possibly overlapping: 87 | 88 | ```haskell 89 | import qualified Data.Text.AhoCorasick.Automaton as Aho 90 | 
91 | pairNeedleWithSelf text = (Aho.unpackUtf16 text, text) 92 | automaton = Aho.build $ fmap pairNeedleWithSelf ["tshirt", "shirts", "shorts"] 93 | allMatches = Aho.runText [] (\matches match -> Aho.Step (match : matches)) 94 | 95 | allMatches automaton "short tshirts" 96 | > [ Match {matchPos = CodeUnitIndex 13, matchValue = "shirts"} 97 | > , Match {matchPos = CodeUnitIndex 12, matchValue = "tshirt"} 98 | > ] 99 | 100 | allMatches automaton "sweatshirts and shirtshirts" 101 | > [ Match {matchPos = CodeUnitIndex 27, matchValue = "shirts"} 102 | > , Match {matchPos = CodeUnitIndex 26, matchValue = "tshirt"} 103 | > , Match {matchPos = CodeUnitIndex 22, matchValue = "shirts"} 104 | > , Match {matchPos = CodeUnitIndex 11, matchValue = "shirts"} 105 | > , Match {matchPos = CodeUnitIndex 10, matchValue = "tshirt"} 106 | > ] 107 | ``` 108 | 109 | ## License 110 | 111 | Alfred–Margaret is licensed under the 3-clause BSD license. 112 | 113 | [channable]: https://www.channable.com/ 114 | [blog-post]: https://tech.channable.com/posts/2019-03-13-how-we-made-haskell-search-strings-as-fast-as-rust.html 115 | [text]: https://github.com/haskell/text 116 | [hankcs]: https://github.com/hankcs/AhoCorasickDoubleArrayTrie/tree/v1.2.0 117 | [burntsushi]: https://github.com/BurntSushi/aho-corasick/tree/0.6.8 118 | [llvm]: https://github.com/channable/alfred-margaret/issues/24 119 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/app/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | 3 | module Main where 4 | 5 | import GHC.Generics (Generic) 6 | import Control.DeepSeq (force) 7 | import Control.Exception (evaluate) 8 | import Control.Monad (void, when) 9 | import Data.Foldable (for_, traverse_) 10 | import Data.Word (Word8) 11 | import Foreign.C.Types (CBool (..), CSize (..)) 12 | import Foreign.Marshal (with, withArray) 13 | import Foreign.Ptr (Ptr, 
castPtr) 14 | import Foreign.Storable (Storable (sizeOf), alignment, peek, poke, pokeByteOff, sizeOf) 15 | import GHC.Compact (compact, getCompact) 16 | import System.IO (hPrint, stderr, stdout) 17 | import Text.Printf (hPrintf) 18 | 19 | import qualified Data.ByteString as ByteString 20 | import qualified Data.Text.Encoding as Encoding 21 | import qualified System.Clock as Clock 22 | import qualified System.Environment as Env 23 | 24 | import Data.Text.Utf8 (CodeUnitIndex (..), Text (..)) 25 | 26 | import qualified Data.Text.Utf8 as Utf8 27 | 28 | foreign import ccall unsafe "perform_ac" 29 | performAc :: CBool -> CSize -> Ptr U8Slice -> Ptr U8Slice -> IO CSize 30 | 31 | -- | A slice of 'Word8's that can be passed to FFI. 32 | -- The pointer should always point to pinned memory. 33 | -- Use 'fromText' for constructing 'U8Slice's to ensure this. 34 | data U8Slice = U8Slice (Ptr Word8) CSize CSize 35 | deriving Generic 36 | 37 | instance Storable U8Slice where 38 | sizeOf _ = sizeOf (undefined :: Ptr Word8) + 2 * sizeOf (undefined :: CSize) 39 | alignment _ = max (alignment (undefined :: Ptr Word8)) (alignment (undefined :: CSize)) 40 | peek _ptr = error "We only write U8Slices to pointer, never read them" 41 | poke ptr (U8Slice u8ptr off len) = do 42 | poke ptr' u8ptr 43 | pokeByteOff ptr' (sizeOf (undefined :: Ptr Word8)) off 44 | pokeByteOff ptr' (sizeOf (undefined :: Ptr Word8) + sizeOf (undefined :: CSize)) len 45 | where ptr' = castPtr ptr 46 | 47 | -- | Turn a 'Text' value into something that can be passed through FFI. 48 | -- The 'ByteArray' values inside must be pinned. 
49 | fromText :: Text -> U8Slice 50 | fromText (Text u8data off len) 51 | | Utf8.isArrayPinned u8data = U8Slice (Utf8.arrayContents u8data) (fromIntegral off) (fromIntegral len) 52 | | otherwise = error "ByteArray is not pinned" 53 | 54 | readNeedleHaystackFile :: FilePath -> IO ([Text], Text) 55 | readNeedleHaystackFile path = do 56 | (Text u8data off len) <- Encoding.decodeUtf8 <$> ByteString.readFile path 57 | pure $ go u8data off len [] 58 | where 59 | go u8data off 0 needles = (reverse needles, Text u8data off 0) 60 | go u8data off len needles 61 | -- "line starts with newline char" ==> empty line, emit haystack as slice of u8data 62 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = (reverse needles, Text u8data (off + 1) (len - 1)) 63 | | otherwise = consumeNeedle u8data off len needles off 64 | 65 | consumeNeedle u8data off len needles needleStart 66 | -- Newline ==> emit needle as slice of u8data 67 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = go u8data (off + 1) (len - 1) $ Text u8data needleStart (off - needleStart) : needles 68 | | otherwise = consumeNeedle u8data (off + 1) (len - 1) needles needleStart 69 | 70 | main :: IO () 71 | main = Env.getArgs >>= traverse_ processFile 72 | 73 | processFile :: FilePath -> IO () 74 | processFile path = do 75 | (needles, haystack) <- getCompact <$> (compact =<< readNeedleHaystackFile path) 76 | 77 | void $ evaluate $ force needles 78 | void $ evaluate $ force haystack 79 | 80 | for_ [0 :: Int .. 
4] $ \i -> do 81 | (count, duration) <- acBench needles haystack 82 | when (i == 0) $ 83 | hPrint stderr count 84 | hPrintf stdout "%d\t" (Clock.toNanoSecs duration) 85 | hPrintf stdout "\n" 86 | 87 | acBench :: [Text] -> Text -> IO (Int, Clock.TimeSpec) 88 | acBench needles haystack = do 89 | start <- Clock.getTime Clock.Monotonic 90 | 91 | let numNeedles = fromIntegral $ length needles 92 | matchCount <- with (fromText haystack) $ \haystackSlice -> 93 | withArray (map fromText needles) $ \needleSlices -> do 94 | performAc useSparse numNeedles needleSlices haystackSlice 95 | 96 | end <- Clock.getTime Clock.Monotonic 97 | pure (fromIntegral matchCount, Clock.diffTimeSpec start end) 98 | where 99 | -- Whether to use the sparse or dense implementation. 100 | -- Set to @0@ to use the dense implementation. 101 | -- Set to @1@ to use the sparse implementation. 102 | useSparse :: CBool 103 | useSparse = 0 104 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMoore/Searcher.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 
6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE FlexibleInstances #-} 10 | 11 | module Data.Text.BoyerMoore.Searcher 12 | ( Searcher 13 | , automata 14 | , build 15 | , buildNeedleIdSearcher 16 | , buildWithValues 17 | , containsAll 18 | , containsAny 19 | , needles 20 | , numNeedles 21 | ) where 22 | 23 | 24 | import Control.DeepSeq (NFData) 25 | import Data.Bifunctor (first) 26 | import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) 27 | import GHC.Generics (Generic) 28 | 29 | import Data.Text.Utf8 (Text) 30 | import Data.Text.BoyerMoore.Automaton (Automaton) 31 | 32 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 33 | 34 | 35 | -- | A set of needles with associated values, and Boyer-Moore automata to 36 | -- efficiently find those needles. 37 | -- 38 | -- INVARIANT: searcherAutomaton = BoyerMoore.buildAutomaton . searcherNeedles 39 | -- To enforce this invariant, the fields are not exposed from this module. 40 | -- There is a separate constructor function. 41 | -- 42 | -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we 43 | -- can derive those for the types that embed the searcher, without 44 | -- requiring the automaton itself to be Hashable or Eq, which would be both 45 | -- wasteful and tedious. Because the automaton is fully determined by the 46 | -- needles and associated values, it is sufficient to implement Eq and Hashable 47 | -- in terms of the needles only. 48 | -- 49 | -- We also use Hashed to cache the hash of the needles. 
50 | data Searcher v = Searcher 51 | { searcherNeedles :: Hashed [(Text, v)] 52 | , searcherNumNeedles :: Int 53 | , searcherAutomata :: [(Automaton, v)] 54 | } deriving (Generic) 55 | 56 | instance Show (Searcher v) where 57 | show _ = "Searcher _ _ _" 58 | 59 | instance Hashable v => Hashable (Searcher v) where 60 | hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher 61 | {-# INLINE hashWithSalt #-} 62 | 63 | instance Eq v => Eq (Searcher v) where 64 | Searcher xs nx _ == Searcher ys ny _ = nx == ny && xs == ys 65 | {-# INLINE (==) #-} 66 | 67 | instance NFData v => NFData (Searcher v) 68 | 69 | -- | Builds the Searcher for a list of needles without values. 70 | -- This is useful for just checking whether the haystack contains the needles. 71 | build :: [Text] -> Searcher () 72 | {-# INLINABLE build #-} 73 | build = buildWithValues . flip zip (repeat ()) 74 | 75 | -- | Builds the Searcher for a list of needles. 76 | buildWithValues :: Hashable v => [(Text, v)] -> Searcher v 77 | {-# INLINABLE buildWithValues #-} 78 | buildWithValues ns = 79 | Searcher (hashed ns) (length ns) $ map (first BoyerMoore.buildAutomaton) ns 80 | 81 | needles :: Searcher v -> [(Text, v)] 82 | needles = unhashed . searcherNeedles 83 | 84 | automata :: Searcher v -> [(Automaton, v)] 85 | automata = searcherAutomata 86 | 87 | numNeedles :: Searcher v -> Int 88 | numNeedles = searcherNumNeedles 89 | 90 | -- | Return whether the haystack contains any of the needles. 91 | -- This function is marked noinline as an inlining boundary. BoyerMoore.runText is 92 | -- marked inline, so this function will be optimized to report only whether 93 | -- there is a match, and not construct a list of matches. We don't want this 94 | -- function be inline, to make sure that the conditions of the caller don't 95 | -- affect how this function is optimized. There is little to gain from 96 | -- additional inlining. 
The pragma is not an optimization in itself, rather it 97 | -- is a defence against fragile optimizer decisions. 98 | {-# NOINLINE containsAny #-} 99 | containsAny :: Searcher () -> Text -> Bool 100 | containsAny !searcher !text = 101 | let 102 | -- On the first match, return True immediately. 103 | f _acc _match = BoyerMoore.Done True 104 | in 105 | any (\(automaton, ()) -> BoyerMoore.runText False f automaton text) (automata searcher) 106 | -- | Build a 'Searcher' that returns the needle's index in the needle list when it matches. 107 | 108 | buildNeedleIdSearcher :: [Text] -> Searcher Int 109 | buildNeedleIdSearcher !ns = 110 | buildWithValues $ zip ns [0..] 111 | 112 | -- | Like 'containsAny', but checks whether all needles match instead. 113 | -- Use 'buildNeedleIdSearcher' to get an appropriate 'Searcher'. 114 | {-# NOINLINE containsAll #-} 115 | containsAll :: Searcher Int -> Text -> Bool 116 | containsAll !searcher !text = 117 | let 118 | -- On the first match, return True immediately. 119 | f _acc _match = BoyerMoore.Done True 120 | in 121 | all (\(automaton, _) -> BoyerMoore.runText False f automaton text) (automata searcher) 122 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMooreCI/Searcher.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 
6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE FlexibleInstances #-} 10 | 11 | module Data.Text.BoyerMooreCI.Searcher 12 | ( Searcher 13 | , automata 14 | , build 15 | , buildNeedleIdSearcher 16 | , buildWithValues 17 | , containsAll 18 | , containsAny 19 | , needles 20 | , numNeedles 21 | ) where 22 | 23 | 24 | import Control.DeepSeq (NFData) 25 | import Data.Bifunctor (first) 26 | import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) 27 | import GHC.Generics (Generic) 28 | 29 | import Data.Text.Utf8 (Text) 30 | import Data.Text.BoyerMooreCI.Automaton (Automaton) 31 | 32 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMoore 33 | 34 | 35 | -- | A set of needles with associated values, and Boyer-Moore automata to 36 | -- efficiently find those needles. 37 | -- 38 | -- INVARIANT: searcherAutomaton = BoyerMoore.buildAutomaton . searcherNeedles 39 | -- To enforce this invariant, the fields are not exposed from this module. 40 | -- There is a separate constructor function. 41 | -- 42 | -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we 43 | -- can derive those for the types that embed the searcher, without 44 | -- requiring the automaton itself to be Hashable or Eq, which would be both 45 | -- wasteful and tedious. Because the automaton is fully determined by the 46 | -- needles and associated values, it is sufficient to implement Eq and Hashable 47 | -- in terms of the needles only. 48 | -- 49 | -- We also use Hashed to cache the hash of the needles. 
50 | data Searcher v = Searcher 51 | { searcherNeedles :: Hashed [(Text, v)] 52 | , searcherNumNeedles :: Int 53 | , searcherAutomata :: [(Automaton, v)] 54 | } deriving (Generic) 55 | 56 | instance Show (Searcher v) where 57 | show _ = "Searcher _ _ _" 58 | 59 | instance Hashable v => Hashable (Searcher v) where 60 | hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher 61 | {-# INLINE hashWithSalt #-} 62 | 63 | instance Eq v => Eq (Searcher v) where 64 | Searcher xs nx _ == Searcher ys ny _ = nx == ny && xs == ys 65 | {-# INLINE (==) #-} 66 | 67 | instance NFData v => NFData (Searcher v) 68 | 69 | -- | Builds the Searcher for a list of needles without values. 70 | -- This is useful for just checking whether the haystack contains the needles. 71 | build :: [Text] -> Searcher () 72 | {-# INLINABLE build #-} 73 | build = buildWithValues . flip zip (repeat ()) 74 | 75 | -- | Builds the Searcher for a list of needles. 76 | buildWithValues :: Hashable v => [(Text, v)] -> Searcher v 77 | {-# INLINABLE buildWithValues #-} 78 | buildWithValues ns = 79 | Searcher (hashed ns) (length ns) $ map (first BoyerMoore.buildAutomaton) ns 80 | 81 | needles :: Searcher v -> [(Text, v)] 82 | needles = unhashed . searcherNeedles 83 | 84 | automata :: Searcher v -> [(Automaton, v)] 85 | automata = searcherAutomata 86 | 87 | numNeedles :: Searcher v -> Int 88 | numNeedles = searcherNumNeedles 89 | 90 | -- | Return whether the haystack contains any of the needles. 91 | -- This function is marked noinline as an inlining boundary. BoyerMoore.runText is 92 | -- marked inline, so this function will be optimized to report only whether 93 | -- there is a match, and not construct a list of matches. We don't want this 94 | -- function be inline, to make sure that the conditions of the caller don't 95 | -- affect how this function is optimized. There is little to gain from 96 | -- additional inlining. 
The pragma is not an optimization in itself, rather it 97 | -- is a defence against fragile optimizer decisions. 98 | {-# NOINLINE containsAny #-} 99 | containsAny :: Searcher () -> Text -> Bool 100 | containsAny !searcher !text = 101 | let 102 | -- On the first match, return True immediately. 103 | f _acc _matchStart _matchEnd = BoyerMoore.Done True 104 | in 105 | any (\(automaton, ()) -> BoyerMoore.runText False f automaton text) (automata searcher) 106 | -- | Build a 'Searcher' that returns the needle's index in the needle list when it matches. 107 | 108 | buildNeedleIdSearcher :: [Text] -> Searcher Int 109 | buildNeedleIdSearcher !ns = 110 | buildWithValues $ zip ns [0..] 111 | 112 | -- | Like 'containsAny', but checks whether all needles match instead. 113 | -- Use 'buildNeedleIdSearcher' to get an appropriate 'Searcher'. 114 | {-# NOINLINE containsAll #-} 115 | containsAll :: Searcher Int -> Text -> Bool 116 | containsAll !searcher !text = 117 | let 118 | -- On the first match, return True immediately. 119 | f _acc _matchStart _matchEnd = BoyerMoore.Done True 120 | in 121 | all (\(automaton, _) -> BoyerMoore.runText False f automaton text) (automata searcher) 122 | -------------------------------------------------------------------------------- /alfred-margaret.cabal: -------------------------------------------------------------------------------- 1 | name: alfred-margaret 2 | version: 2.1.0.2 3 | synopsis: Fast Aho-Corasick string searching 4 | description: An efficient implementation of the Aho-Corasick 5 | string searching algorithm. 
6 | homepage: https://github.com/channable/alfred-margaret 7 | license: BSD3 8 | license-file: LICENSE 9 | author: The Alfred-Margaret authors 10 | maintainer: Ruud van Asseldonk , Fabian Thorand 11 | copyright: 2020 Channable 12 | category: Data, Text 13 | build-type: Simple 14 | extra-source-files: README.md 15 | , performance.png 16 | cabal-version: >=1.10 17 | tested-with: 18 | -- Nixpkgs unstable (Updated 2022-04-14) 19 | GHC == 8.8.4 20 | -- Nixpkgs unstable (Updated 2022-04-14) 21 | , GHC == 8.10.7 22 | -- Nixpkgs unstable (Updated 2022-04-14) 23 | , GHC == 9.0.2 24 | -- Nixpkgs unstable (Updated 2024-09-05) 25 | , GHC == 9.6.6 26 | 27 | source-repository head 28 | type: git 29 | location: https://github.com/channable/alfred-margaret 30 | 31 | Flag aeson 32 | Description: Enable aeson support 33 | Manual: False 34 | Default: True 35 | 36 | -- Compile this package with LLVM, rather than with the default code generator. 37 | -- LLVM produces about 20% faster code. 38 | Flag llvm 39 | Description: Compile with LLVM 40 | Manual: True 41 | -- Defaulting to false makes the package buildable by Hackage, 42 | -- allowing the documentation to be generated for us. 
43 | Default: False 44 | 45 | library 46 | hs-source-dirs: src 47 | exposed-modules: Data.Text.CaseSensitivity 48 | , Data.Text.Utf8 49 | , Data.Text.Utf8.Unlower 50 | , Data.Text.AhoCorasick.Automaton 51 | , Data.Text.AhoCorasick.Replacer 52 | , Data.Text.AhoCorasick.Searcher 53 | , Data.Text.AhoCorasick.Splitter 54 | , Data.Text.BoyerMoore.Automaton 55 | , Data.Text.BoyerMoore.Replacer 56 | , Data.Text.BoyerMoore.Searcher 57 | , Data.Text.BoyerMooreCI.Automaton 58 | , Data.Text.BoyerMooreCI.Replacer 59 | , Data.Text.BoyerMooreCI.Searcher 60 | other-modules: Data.Primitive.Extended 61 | build-depends: 62 | base >= 4.7 && < 5 63 | , containers >= 0.6 && < 0.8 64 | , deepseq >= 1.4 && < 1.6 65 | , hashable >= 1.4.0.2 && < 1.6 66 | , primitive >= 0.6.4 && < 0.10 67 | , text >= 2.0 && < 2.2 68 | , unordered-containers >= 0.2.9 && < 0.3 69 | , vector >= 0.12 && < 0.14 70 | ghc-options: -Wall -Wincomplete-record-updates -Wincomplete-uni-patterns -O2 71 | default-language: Haskell2010 72 | if flag(aeson) { 73 | -- Even an older version of aeson is fine since 74 | -- we only use it for instances 75 | build-depends: aeson >= 1.4.2 && < 3 76 | cpp-options: -DHAS_AESON 77 | } 78 | if flag(llvm) { 79 | ghc-options: -fllvm -optlo=-O3 -optlo=-tailcallelim 80 | } 81 | 82 | test-suite test-suite 83 | type: exitcode-stdio-1.0 84 | main-is: Main.hs 85 | other-modules: Data.Text.AhoCorasickSpec 86 | , Data.Text.BoyerMooreSpec 87 | , Data.Text.BoyerMooreCISpec 88 | , Data.Text.Utf8Spec 89 | , Data.Text.TestInstances 90 | hs-source-dirs: tests 91 | ghc-options: -Wall -Wincomplete-record-updates -Wno-orphans 92 | build-depends: base >= 4.7 && < 5 93 | , QuickCheck 94 | , alfred-margaret 95 | , deepseq 96 | , hspec 97 | , hspec-expectations 98 | , primitive 99 | , quickcheck-instances 100 | , text 101 | default-language: Haskell2010 102 | 103 | benchmark uvector-vs-tba 104 | type: exitcode-stdio-1.0 105 | main-is: Main.hs 106 | hs-source-dirs: bench/uvector-vs-tba 107 | ghc-options: 
-Wall -Wincomplete-record-updates -Wno-orphans 108 | build-depends: base >= 4.7 && < 5 109 | , alfred-margaret 110 | , vector 111 | , deepseq 112 | , criterion 113 | default-language: Haskell2010 114 | 115 | benchmark bm 116 | type: exitcode-stdio-1.0 117 | main-is: Main.hs 118 | hs-source-dirs: bench/bm 119 | ghc-options: -Wall -Wincomplete-record-updates -Wno-orphans 120 | build-depends: base >= 4.7 && < 5 121 | , alfred-margaret 122 | , deepseq 123 | , criterion 124 | , text 125 | default-language: Haskell2010 126 | 127 | executable dump-automaton 128 | main-is: Main.hs 129 | hs-source-dirs: 130 | app/dump-automaton 131 | build-depends: base 132 | , alfred-margaret 133 | default-language: Haskell2010 134 | -------------------------------------------------------------------------------- /src/Data/Text/AhoCorasick/Splitter.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE GHC2021 #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE BangPatterns #-} 10 | {-# LANGUAGE CPP #-} 11 | 12 | -- | Splitting strings using Aho–Corasick. 
13 | module Data.Text.AhoCorasick.Splitter 14 | ( Splitter 15 | , automaton 16 | , build 17 | , separator 18 | , split 19 | , splitIgnoreCase 20 | , splitReverse 21 | , splitReverseIgnoreCase 22 | ) where 23 | 24 | import GHC.Generics (Generic) 25 | import Control.DeepSeq (NFData (..)) 26 | import Data.Function (on) 27 | import Data.Hashable (Hashable (..)) 28 | import Data.List.NonEmpty (NonEmpty ((:|))) 29 | import Data.Text.Utf8 (Text) 30 | 31 | #if defined(HAS_AESON) 32 | import qualified Data.Aeson as AE 33 | #endif 34 | 35 | import qualified Data.List.NonEmpty as NonEmpty 36 | import qualified Data.Text as Text 37 | 38 | import Data.Text.AhoCorasick.Automaton (AcMachine) 39 | 40 | import qualified Data.Text.Utf8 as Utf8 41 | import qualified Data.Text.AhoCorasick.Automaton as Aho 42 | 43 | -------------------------------------------------------------------------------- 44 | -- Splitter 45 | 46 | -- | Build a splitter once, then use it many times! 47 | data Splitter = 48 | Splitter 49 | { splitterAutomaton :: AcMachine () -- INVARIANT: Exactly one needle. 50 | , splitterSeparator :: Text -- INVARIANT: Equivalent to needle. 51 | } 52 | deriving Generic 53 | 54 | #if defined(HAS_AESON) 55 | instance AE.ToJSON Splitter where 56 | toJSON = AE.toJSON . separator 57 | 58 | instance AE.FromJSON Splitter where 59 | parseJSON v = build <$> AE.parseJSON v 60 | #endif 61 | 62 | -- | Construct a splitter with a single separator. 63 | {-# INLINE build #-} 64 | build :: Text -> Splitter 65 | build sep = 66 | let !auto = Aho.build [(sep, ())] in 67 | Splitter auto sep 68 | 69 | -- | Get the automaton that would be used for finding separators. 70 | {-# INLINE automaton #-} 71 | automaton :: Splitter -> AcMachine () 72 | automaton = splitterAutomaton 73 | 74 | -- | What is the separator we are splitting on? 
75 | {-# INLINE separator #-} 76 | separator :: Splitter -> Text 77 | separator = splitterSeparator 78 | 79 | -- | Split the given string into strings separated by the separator. 80 | -- 81 | -- If the order of the results is not important, use the faster function 82 | -- 'splitReverse' instead. 83 | {-# INLINE split #-} 84 | split :: Splitter -> Text -> NonEmpty Text 85 | split = (NonEmpty.reverse .) . splitReverse 86 | 87 | -- | Split the given string into strings separated by the separator. 88 | -- 89 | -- If the order of the results is not important, use the faster function 90 | -- 'splitReverseIgnoreCase' instead. 91 | -- 92 | -- The separator is matched case-insensitively, but the splitter must have been 93 | -- constructed with a lowercase needle. 94 | {-# INLINE splitIgnoreCase #-} 95 | splitIgnoreCase :: Splitter -> Text -> NonEmpty Text 96 | splitIgnoreCase = (NonEmpty.reverse .) . splitReverseIgnoreCase 97 | 98 | -- | Like 'split', but return the substrings in reverse order. 99 | {-# INLINE splitReverse #-} 100 | splitReverse :: Splitter -> Text -> NonEmpty Text 101 | splitReverse s t = 102 | finalizeAccum t $ Aho.runText zeroAccum stepAccum' (automaton s) t 103 | where 104 | -- Case sensitive matching: separator length is in bytes. 105 | sepLength = Utf8.lengthUtf8 (separator s) 106 | stepAccum' accum (Aho.Match newFragmentStart _) = 107 | stepAccum t accum (newFragmentStart - sepLength) newFragmentStart 108 | 109 | 110 | -- | Like 'splitIgnoreCase', but return the substrings in reverse order. 111 | {-# INLINE splitReverseIgnoreCase #-} 112 | splitReverseIgnoreCase :: Splitter -> Text -> NonEmpty Text 113 | splitReverseIgnoreCase s t = 114 | finalizeAccum t $ Aho.runLower zeroAccum stepAccum' (automaton s) t 115 | where 116 | -- Case insensitive matching: separator length is in codepoints. 
117 | sepLength = Text.length (separator s) 118 | stepAccum' accum (Aho.Match newFragmentStart _) = 119 | -- We start at the last byte of the separator, and look backwards. 120 | let sepStart = Utf8.skipCodePointsBackwards t (newFragmentStart-1) (sepLength-1) in 121 | stepAccum t accum sepStart newFragmentStart 122 | 123 | -------------------------------------------------------------------------------- 124 | -- Fold 125 | 126 | -- | The accumulator is used as state when processing the matches from left to 127 | -- right. While the matches are fed to us ordered by end offset, all matches 128 | -- have the same length because there is only one needle. 129 | data Accum = 130 | Accum 131 | { accumResult :: ![Text] 132 | -- ^ Match-separated strings. 133 | , accumFragmentStart :: !Aho.CodeUnitIndex 134 | -- ^ First byte of current fragment (that is the non-separator part) 135 | } 136 | deriving Generic 137 | 138 | -- | Finalizing the accumulator does more than just 'accumResult', hence this 139 | -- is a separate function. 140 | {-# INLINE finalizeAccum #-} 141 | finalizeAccum :: Text -> Accum -> NonEmpty Text 142 | finalizeAccum hay (Accum res prevEnd) = 143 | -- Once we have processed all the matches, there is still the substring after 144 | -- the final match. This substring is always included in the result, even 145 | -- when there were no matches. Hence we can return a non-empty list. 146 | let !str = Utf8.unsafeSliceUtf8 prevEnd (Utf8.lengthUtf8 hay - prevEnd) hay in 147 | str :| res 148 | 149 | -- | The initial accumulator begins at the begin of the haystack. 150 | {-# INLINE zeroAccum #-} 151 | zeroAccum :: Accum 152 | zeroAccum = Accum { accumResult = [], accumFragmentStart = 0 } 153 | 154 | -- | Step the accumulator using the next match. Overlapping matches will be 155 | -- ignored. Overlapping matches may occur when the separator has a non-empty 156 | -- prefix that is also a suffix. 
157 | {-# INLINE stepAccum #-} 158 | stepAccum :: Text -> Accum -> Aho.CodeUnitIndex -> Aho.CodeUnitIndex -> Aho.Next Accum 159 | stepAccum hay acc@(Accum res fragmentStart) sepStart newFragmentStart 160 | 161 | -- When the match begins before the current offset, it overlaps a match that 162 | -- we processed before, and so we ignore it. 163 | | sepStart < fragmentStart = 164 | Aho.Step acc 165 | 166 | -- The match is behind the current offset, so we slice the haystack until the 167 | -- begin of the match and include that as a result. 168 | | otherwise = 169 | let !str = Utf8.unsafeSliceUtf8 fragmentStart (sepStart - fragmentStart) hay in 170 | Aho.Step acc { accumResult = str : res, accumFragmentStart = newFragmentStart } 171 | 172 | -------------------------------------------------------------------------------- 173 | -- Instances 174 | 175 | instance Eq Splitter where 176 | {-# INLINE (==) #-} 177 | (==) = (==) `on` separator 178 | 179 | instance Ord Splitter where 180 | {-# INLINE compare #-} 181 | compare = compare `on` separator 182 | 183 | instance Hashable Splitter where 184 | {-# INLINE hashWithSalt #-} 185 | hashWithSalt salt searcher = 186 | salt `hashWithSalt` separator searcher 187 | 188 | instance NFData Splitter where 189 | {-# INLINE rnf #-} 190 | rnf (Splitter searcher sepLength) = 191 | rnf searcher `seq` 192 | rnf sepLength 193 | 194 | instance Show Splitter where 195 | showsPrec p splitter = 196 | showParen (p > 10) $ 197 | showString "build " . 198 | showsPrec 11 (separator splitter) 199 | -------------------------------------------------------------------------------- /nix/sources.nix: -------------------------------------------------------------------------------- 1 | # This file has been generated by Niv. 2 | 3 | let 4 | 5 | # 6 | # The fetchers. fetch_<type> fetches specs of type <type>.
7 | # 8 | 9 | fetch_file = pkgs: name: spec: 10 | let 11 | name' = sanitizeName name + "-src"; 12 | in 13 | if spec.builtin or true then 14 | builtins_fetchurl { inherit (spec) url sha256; name = name'; } 15 | else 16 | pkgs.fetchurl { inherit (spec) url sha256; name = name'; }; 17 | 18 | fetch_tarball = pkgs: name: spec: 19 | let 20 | name' = sanitizeName name + "-src"; 21 | in 22 | if spec.builtin or true then 23 | builtins_fetchTarball { name = name'; inherit (spec) url sha256; } 24 | else 25 | pkgs.fetchzip { name = name'; inherit (spec) url sha256; }; 26 | 27 | fetch_git = name: spec: 28 | let 29 | ref = 30 | if spec ? ref then spec.ref else 31 | if spec ? branch then "refs/heads/${spec.branch}" else 32 | if spec ? tag then "refs/tags/${spec.tag}" else 33 | abort "In git source '${name}': Please specify `ref`, `tag` or `branch`!"; 34 | submodules = if spec ? submodules then spec.submodules else false; 35 | in 36 | builtins.fetchGit { url = spec.repo; inherit (spec) rev; inherit ref; } 37 | // (if builtins.compareVersions builtins.nixVersion "2.4" >= 0 then { inherit submodules; } else {}); 38 | 39 | fetch_local = spec: spec.path; 40 | 41 | fetch_builtin-tarball = name: throw 42 | ''[${name}] The niv type "builtin-tarball" is deprecated. You should instead use `builtin = true`. 43 | $ niv modify ${name} -a type=tarball -a builtin=true''; 44 | 45 | fetch_builtin-url = name: throw 46 | ''[${name}] The niv type "builtin-url" will soon be deprecated. You should instead use `builtin = true`. 
47 | $ niv modify ${name} -a type=file -a builtin=true''; 48 | 49 | # 50 | # Various helpers 51 | # 52 | 53 | # https://github.com/NixOS/nixpkgs/pull/83241/files#diff-c6f540a4f3bfa4b0e8b6bafd4cd54e8bR695 54 | sanitizeName = name: 55 | ( 56 | concatMapStrings (s: if builtins.isList s then "-" else s) 57 | ( 58 | builtins.split "[^[:alnum:]+._?=-]+" 59 | ((x: builtins.elemAt (builtins.match "\\.*(.*)" x) 0) name) 60 | ) 61 | ); 62 | 63 | # The set of packages used when specs are fetched using non-builtins. 64 | mkPkgs = sources: system: 65 | let 66 | sourcesNixpkgs = 67 | import (builtins_fetchTarball { inherit (sources.nixpkgs) url sha256; }) { inherit system; }; 68 | hasNixpkgsPath = builtins.any (x: x.prefix == "nixpkgs") builtins.nixPath; 69 | hasThisAsNixpkgsPath = <nixpkgs> == ./.; 70 | in 71 | if builtins.hasAttr "nixpkgs" sources 72 | then sourcesNixpkgs 73 | else if hasNixpkgsPath && ! hasThisAsNixpkgsPath then 74 | import <nixpkgs> {} 75 | else 76 | abort 77 | '' 78 | Please specify either <nixpkgs> (through -I or NIX_PATH=nixpkgs=...) or 79 | add a package called "nixpkgs" to your sources.json. 80 | ''; 81 | 82 | # The actual fetching function. 83 | fetch = pkgs: name: spec: 84 | 85 | if ! builtins.hasAttr "type" spec then 86 | abort "ERROR: niv spec ${name} does not have a 'type' attribute" 87 | else if spec.type == "file" then fetch_file pkgs name spec 88 | else if spec.type == "tarball" then fetch_tarball pkgs name spec 89 | else if spec.type == "git" then fetch_git name spec 90 | else if spec.type == "local" then fetch_local spec 91 | else if spec.type == "builtin-tarball" then fetch_builtin-tarball name 92 | else if spec.type == "builtin-url" then fetch_builtin-url name 93 | else 94 | abort "ERROR: niv spec ${name} has unknown type ${builtins.toJSON spec.type}"; 95 | 96 | # If the environment variable NIV_OVERRIDE_${name} is set, then use 97 | # the path directly as opposed to the fetched source.
98 | replace = name: drv: 99 | let 100 | saneName = stringAsChars (c: if isNull (builtins.match "[a-zA-Z0-9]" c) then "_" else c) name; 101 | ersatz = builtins.getEnv "NIV_OVERRIDE_${saneName}"; 102 | in 103 | if ersatz == "" then drv else 104 | # this turns the string into an actual Nix path (for both absolute and 105 | # relative paths) 106 | if builtins.substring 0 1 ersatz == "/" then /. + ersatz else /. + builtins.getEnv "PWD" + "/${ersatz}"; 107 | 108 | # Ports of functions for older nix versions 109 | 110 | # a Nix version of mapAttrs if the built-in doesn't exist 111 | mapAttrs = builtins.mapAttrs or ( 112 | f: set: with builtins; 113 | listToAttrs (map (attr: { name = attr; value = f attr set.${attr}; }) (attrNames set)) 114 | ); 115 | 116 | # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/lists.nix#L295 117 | range = first: last: if first > last then [] else builtins.genList (n: first + n) (last - first + 1); 118 | 119 | # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L257 120 | stringToCharacters = s: map (p: builtins.substring p 1 s) (range 0 (builtins.stringLength s - 1)); 121 | 122 | # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L269 123 | stringAsChars = f: s: concatStrings (map f (stringToCharacters s)); 124 | concatMapStrings = f: list: concatStrings (map f list); 125 | concatStrings = builtins.concatStringsSep ""; 126 | 127 | # https://github.com/NixOS/nixpkgs/blob/8a9f58a375c401b96da862d969f66429def1d118/lib/attrsets.nix#L331 128 | optionalAttrs = cond: as: if cond then as else {}; 129 | 130 | # fetchTarball version that is compatible between all the versions of Nix 131 | builtins_fetchTarball = { url, name ? 
null, sha256 }@attrs: 132 | let 133 | inherit (builtins) lessThan nixVersion fetchTarball; 134 | in 135 | if lessThan nixVersion "1.12" then 136 | fetchTarball ({ inherit url; } // (optionalAttrs (!isNull name) { inherit name; })) 137 | else 138 | fetchTarball attrs; 139 | 140 | # fetchurl version that is compatible between all the versions of Nix 141 | builtins_fetchurl = { url, name ? null, sha256 }@attrs: 142 | let 143 | inherit (builtins) lessThan nixVersion fetchurl; 144 | in 145 | if lessThan nixVersion "1.12" then 146 | fetchurl ({ inherit url; } // (optionalAttrs (!isNull name) { inherit name; })) 147 | else 148 | fetchurl attrs; 149 | 150 | # Create the final "sources" from the config 151 | mkSources = config: 152 | mapAttrs ( 153 | name: spec: 154 | if builtins.hasAttr "outPath" spec 155 | then abort 156 | "The values in sources.json should not have an 'outPath' attribute" 157 | else 158 | spec // { outPath = replace name (fetch config.pkgs name spec); } 159 | ) config.sources; 160 | 161 | # The "config" used by the fetchers 162 | mkConfig = 163 | { sourcesFile ? if builtins.pathExists ./sources.json then ./sources.json else null 164 | , sources ? if isNull sourcesFile then {} else builtins.fromJSON (builtins.readFile sourcesFile) 165 | , system ? builtins.currentSystem 166 | , pkgs ? mkPkgs sources system 167 | }: rec { 168 | # The sources, i.e. the attribute set of spec name to spec 169 | inherit sources; 170 | 171 | # The "pkgs" (evaluated nixpkgs) to use for e.g. 
non-builtin fetchers 172 | inherit pkgs; 173 | }; 174 | 175 | in 176 | mkSources (mkConfig {}) // { __functor = _: settings: mkSources (mkConfig settings); } 177 | -------------------------------------------------------------------------------- /src/Data/Text/AhoCorasick/Searcher.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2022 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE CPP #-} 9 | {-# LANGUAGE DeriveGeneric #-} 10 | {-# LANGUAGE FlexibleInstances #-} 11 | {-# LANGUAGE OverloadedStrings #-} 12 | {-# LANGUAGE TupleSections #-} 13 | 14 | module Data.Text.AhoCorasick.Searcher 15 | ( Searcher 16 | , automaton 17 | , build 18 | , buildNeedleIdSearcher 19 | , buildWithValues 20 | , caseSensitivity 21 | , containsAll 22 | , containsAny 23 | , mapSearcher 24 | , needles 25 | , numNeedles 26 | , setCaseSensitivity 27 | ) where 28 | 29 | import Control.DeepSeq (NFData) 30 | import Data.Bifunctor (second) 31 | import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) 32 | import GHC.Generics (Generic) 33 | 34 | #if defined(HAS_AESON) 35 | import Data.Aeson ((.:), (.=)) 36 | import qualified Data.Aeson as AE 37 | #endif 38 | 39 | import qualified Data.IntSet as IS 40 | 41 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 42 | import Data.Text.Utf8 (Text) 43 | 44 | import qualified Data.Text.AhoCorasick.Automaton as Aho 45 | 46 | -- | A set of needles with associated values, and an Aho-Corasick automaton to 47 | -- efficiently find those needles. 48 | -- 49 | -- INVARIANT: searcherAutomaton = Aho.build . searcherNeedles 50 | -- To enforce this invariant, the fields are not exposed from this module. 51 | -- There is a separate constructor function. 
52 | -- 53 | -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we 54 | -- can derive those for types that embed the searcher, without requiring the 55 | -- automaton itself to be Hashable or Eq, which would be both wasteful and 56 | -- tedious. Because the automaton is fully determined by the needles and 57 | -- associated values, it is sufficient to implement Eq and Hashable in terms of 58 | -- the needles only. 59 | -- 60 | -- We also use Hashed to cache the hash of the needles. 61 | data Searcher v = Searcher 62 | { searcherCaseSensitive :: CaseSensitivity 63 | , searcherNeedles :: Hashed [(Text, v)] 64 | , searcherNumNeedles :: Int 65 | , searcherAutomaton :: Aho.AcMachine v 66 | } deriving (Generic) 67 | 68 | #if defined(HAS_AESON) 69 | instance AE.ToJSON v => AE.ToJSON (Searcher v) where 70 | toJSON s = AE.object 71 | [ "needles" .= needles s 72 | , "caseSensitivity" .= caseSensitivity s 73 | ] 74 | 75 | instance (Hashable v, AE.FromJSON v) => AE.FromJSON (Searcher v) where 76 | parseJSON = AE.withObject "Searcher" $ \o -> buildWithValues <$> o .: "caseSensitivity" <*> o .: "needles" 77 | #endif 78 | 79 | instance Show (Searcher v) where 80 | show _ = "Searcher _ _ _" 81 | 82 | instance Hashable v => Hashable (Searcher v) where 83 | hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher 84 | {-# INLINE hashWithSalt #-} 85 | 86 | instance Eq v => Eq (Searcher v) where 87 | -- Since we store the length of the needle list anyway, 88 | -- we can use it to early out if there is a length mismatch. 89 | Searcher cx xs nx _ == Searcher cy ys ny _ = (nx, xs, cx) == (ny, ys, cy) 90 | {-# INLINE (==) #-} 91 | 92 | instance NFData v => NFData (Searcher v) 93 | 94 | -- NOTE: Although we could implement Semigroup for every v by just concatenating 95 | -- needle lists, we don't, because this might lead to unexpected results.
For 96 | -- example, if v is (Int, a) where the Int is a priority, combining two 97 | -- searchers might want to discard priorities, concatenate the needle lists, and 98 | -- reassign priorities, rather than concatenating the needle lists as-is and 99 | -- possibly having duplicate priorities in the resulting searcher. 100 | instance Semigroup (Searcher ()) where 101 | x <> y 102 | | caseSensitivity x == caseSensitivity y 103 | = buildWithValues (searcherCaseSensitive x) (needles x <> needles y) 104 | | otherwise = error "Combining searchers of different case sensitivity" 105 | {-# INLINE (<>) #-} 106 | 107 | -- | Builds the Searcher for a list of needles 108 | -- The caller is responsible that the needles are lower case in case the IgnoreCase 109 | -- is used for case sensitivity 110 | build :: CaseSensitivity -> [Text] -> Searcher () 111 | build case_ = buildWithValues case_ . fmap (, ()) 112 | 113 | -- | The caller is responsible that the needles are lower case in case the IgnoreCase 114 | -- is used for case sensitivity 115 | buildWithValues :: Hashable v => CaseSensitivity -> [(Text, v)] -> Searcher v 116 | {-# INLINABLE buildWithValues #-} 117 | buildWithValues case_ ns = 118 | Searcher case_ (hashed ns) (length ns) $ Aho.build ns 119 | 120 | -- | Modify the values associated with the needles. 121 | mapSearcher :: Hashable b => (a -> b) -> Searcher a -> Searcher b 122 | mapSearcher f searcher = searcher 123 | { searcherNeedles = hashed $ fmap (second f) $ needles searcher 124 | , searcherAutomaton = fmap f (searcherAutomaton searcher) 125 | } 126 | 127 | needles :: Searcher v -> [(Text, v)] 128 | needles = unhashed . 
searcherNeedles 129 | 130 | numNeedles :: Searcher v -> Int 131 | numNeedles = searcherNumNeedles 132 | 133 | automaton :: Searcher v -> Aho.AcMachine v 134 | automaton = searcherAutomaton 135 | 136 | caseSensitivity :: Searcher v -> CaseSensitivity 137 | caseSensitivity = searcherCaseSensitive 138 | 139 | -- | Updates the case sensitivity of the searcher. Does not change the 140 | -- capitalization of the needles. The caller should be certain that if IgnoreCase 141 | -- is passed, the needles are already lower case. 142 | setCaseSensitivity :: CaseSensitivity -> Searcher v -> Searcher v 143 | setCaseSensitivity case_ searcher = searcher{ 144 | searcherCaseSensitive = case_ 145 | } 146 | 147 | -- | Return whether the haystack contains any of the needles. 148 | -- Case sensitivity depends on the properties of the searcher 149 | -- This function is marked noinline as an inlining boundary. Aho.runText is 150 | -- marked inline, so this function will be optimized to report only whether 151 | -- there is a match, and not construct a list of matches. We don't want this 152 | -- function to be inline, to make sure that the conditions of the caller don't 153 | -- affect how this function is optimized. There is little to gain from 154 | -- additional inlining. The pragma is not an optimization in itself, rather it 155 | -- is a defence against fragile optimizer decisions. 156 | {-# NOINLINE containsAny #-} 157 | containsAny :: Searcher () -> Text -> Bool 158 | containsAny !searcher !text = 159 | let 160 | -- On the first match, return True immediately. 161 | f _acc _match = Aho.Done True 162 | in case caseSensitivity searcher of 163 | CaseSensitive -> Aho.runText False f (automaton searcher) text 164 | IgnoreCase -> Aho.runLower False f (automaton searcher) text 165 | 166 | -- | Build a 'Searcher' that returns the needle's index in the needle list when it matches.
167 | buildNeedleIdSearcher :: CaseSensitivity -> [Text] -> Searcher Int 168 | buildNeedleIdSearcher !case_ !ns = 169 | buildWithValues case_ $ zip ns [0..] 170 | 171 | -- | Returns whether the haystack contains all of the needles. 172 | -- This function expects the passed 'Searcher' to be constructed using 'buildNeedleIdSearcher'. 173 | containsAll :: Searcher Int -> Text -> Bool 174 | containsAll !searcher !haystack = 175 | let 176 | initial = IS.fromDistinctAscList [0..numNeedles searcher - 1] 177 | ac = automaton searcher 178 | 179 | f !acc (Aho.Match _index !needleId) 180 | | IS.null acc' = Aho.Done acc' 181 | | otherwise = Aho.Step acc' 182 | where 183 | !acc' = IS.delete needleId acc 184 | 185 | in IS.null $ case caseSensitivity searcher of 186 | CaseSensitive -> Aho.runText initial f ac haystack 187 | IgnoreCase -> Aho.runLower initial f ac haystack 188 | -------------------------------------------------------------------------------- /tests/Data/Text/Utf8Spec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module Data.Text.Utf8Spec where 5 | 6 | import Control.Exception (evaluate) 7 | import Control.Monad (forM_) 8 | import Test.Hspec (Spec, anyErrorCall, describe, it, shouldBe, shouldSatisfy, shouldThrow) 9 | import Test.Hspec.QuickCheck (prop) 10 | import Test.QuickCheck (Gen, choose, forAllShrink, shrink) 11 | 12 | import qualified Data.Char as Char 13 | 14 | import Data.Text.TestInstances () 15 | 16 | import qualified Data.Text.Utf8 as Utf8 17 | 18 | spec :: Spec 19 | spec = do 20 | describe "Properties of the BMP in UTF-8" $ do 21 | 22 | describe "Char.toLower" $ do 23 | 24 | {- 25 | it "does not generate common suffixes" $ do 26 | forM_ bmpCodepoints $ flip shouldSatisfy $ \cp -> 27 | let 28 | lowerCp = mapCp Char.toLower cp 29 | in 30 | cp == lowerCp || null (commonSuffix (Utf8.unicode2utf8 cp) (Utf8.unicode2utf8 lowerCp)) 31 
| -- Sadly, it "actually does" 32 | -} 33 | 34 | it "is idempotent" $ do 35 | forM_ bmpCodepoints $ flip shouldSatisfy $ \cp -> 36 | Char.toLower cp == Char.toLower (Char.toLower cp) 37 | 38 | describe "toLowerAscii" $ do 39 | 40 | it "is equivalent to Char.toLower on ASCII" $ do 41 | 42 | forM_ asciiCodepoints $ flip shouldSatisfy $ \cp -> 43 | Char.toLower cp == Utf8.toLowerAscii cp 44 | 45 | describe "lowerCodePoint" $ do 46 | 47 | prop "is equivalent to Char.toLower on all of Unicode" $ \c -> 48 | Utf8.lowerCodePoint c `shouldBe` Char.toLower c 49 | 50 | describe "unlowerCodePoint" $ do 51 | 52 | it "should return nothing if it's not a lower case of anything" $ do 53 | Utf8.unlowerCodePoint 'A' `shouldBe` "" 54 | Utf8.unlowerCodePoint 'ẞ' `shouldBe` "" 55 | 56 | it "should return itself if it doesn't have any casings" $ do 57 | Utf8.unlowerCodePoint '1' `shouldBe` "1" 58 | 59 | it "can return multiple values" $ do 60 | Utf8.unlowerCodePoint 'a' `shouldBe` "aA" 61 | Utf8.unlowerCodePoint 'ß' `shouldBe` "ẞß" 62 | Utf8.unlowerCodePoint 'i' `shouldBe` "İiI" 63 | 64 | describe "isCaseInvariant" $ do 65 | it "holds vacuously for empty texts" $ do 66 | Utf8.isCaseInvariant "" `shouldBe` True 67 | 68 | it "should be true for characters without upper/lower cases" $ do 69 | Utf8.isCaseInvariant "." `shouldBe` True 70 | Utf8.isCaseInvariant ".,;'123" `shouldBe` True 71 | Utf8.isCaseInvariant "💩" `shouldBe` True 72 | 73 | it "should be false if there are characters with upper/lower cases" $ do 74 | Utf8.isCaseInvariant "a" `shouldBe` False 75 | Utf8.isCaseInvariant "A.." `shouldBe` False 76 | Utf8.isCaseInvariant "ß." 
`shouldBe` False 77 | Utf8.isCaseInvariant "ẞ" `shouldBe` False 78 | Utf8.isCaseInvariant "İ" `shouldBe` False 79 | 80 | describe "dropWhile" $ do 81 | 82 | it "handles a simple example well" $ do 83 | Utf8.dropWhile (== 'b') "bbba" `shouldBe` "a" 84 | 85 | describe "slicing functions" $ do 86 | 87 | let 88 | -- | Example shown in section "Slicing Functions" in 'Data.Text.Utf8". 89 | slicingExample :: Utf8.Text 90 | slicingExample = Utf8.Text u8data 1 11 91 | where Utf8.Text u8data _ _ = Utf8.pack "ABCDEFGHIJKLMN" 92 | 93 | it "satisfies the example in Data.Text.Utf8" $ do 94 | let begin = Utf8.CodeUnitIndex 2 95 | let length_ = Utf8.CodeUnitIndex 6 96 | Utf8.unsafeSliceUtf8 begin length_ slicingExample `shouldBe` "DEFGHI" 97 | Utf8.unsafeCutUtf8 begin length_ slicingExample `shouldBe` ("BC", "JKL") 98 | 99 | prop "unsafeSliceUtf8 and unsafeCutUtf8 are complementary" $ 100 | forAllShrink (arbitrarySlicingIndices slicingExample) shrink $ \ (begin, length_) -> do 101 | let (prefix, suffix) = Utf8.unsafeCutUtf8 begin length_ slicingExample 102 | Utf8.concat [prefix, Utf8.unsafeSliceUtf8 begin length_ slicingExample, suffix] `shouldBe` slicingExample 103 | 104 | describe "Basic Text instances" $ do 105 | 106 | prop "Show Text behaves like Show String" $ \ (str :: String) -> do 107 | show (Utf8.pack str) `shouldBe` show str 108 | 109 | prop "Eq Text behaves like Eq String" $ \ (a :: String) (b :: String) -> do 110 | Utf8.pack a == Utf8.pack b `shouldBe` a == b 111 | 112 | prop "Ord Text behaves like Ord String" $ \ (a :: String) (b :: String) -> do 113 | compare (Utf8.pack a) (Utf8.pack b) `shouldBe` compare a b 114 | 115 | describe "skipCodePointsBackwards" $ do 116 | it "works with ascii" $ do 117 | Utf8.skipCodePointsBackwards "abcd" 3 0 `shouldBe` 3 118 | Utf8.skipCodePointsBackwards "abcd" 3 1 `shouldBe` 2 119 | Utf8.skipCodePointsBackwards "abcd" 3 2 `shouldBe` 1 120 | Utf8.skipCodePointsBackwards "abcd" 3 3 `shouldBe` 0 121 | 122 | it "moves to start of codepoint 
if you skip 0" $ do 123 | Utf8.skipCodePointsBackwards "💩💩" 0 0 `shouldBe` 0 124 | Utf8.skipCodePointsBackwards "💩💩" 1 0 `shouldBe` 0 125 | Utf8.skipCodePointsBackwards "💩💩" 2 0 `shouldBe` 0 126 | Utf8.skipCodePointsBackwards "💩💩" 3 0 `shouldBe` 0 127 | Utf8.skipCodePointsBackwards "💩💩" 4 0 `shouldBe` 4 128 | Utf8.skipCodePointsBackwards "💩💩" 5 0 `shouldBe` 4 129 | Utf8.skipCodePointsBackwards "💩💩" 6 0 `shouldBe` 4 130 | Utf8.skipCodePointsBackwards "💩💩" 7 0 `shouldBe` 4 131 | 132 | it "can skip 1 multi-byte codepoint" $ do 133 | Utf8.skipCodePointsBackwards "💩💩" 4 1 `shouldBe` 0 134 | Utf8.skipCodePointsBackwards "💩💩" 5 1 `shouldBe` 0 135 | Utf8.skipCodePointsBackwards "💩💩" 6 1 `shouldBe` 0 136 | Utf8.skipCodePointsBackwards "💩💩" 7 1 `shouldBe` 0 137 | 138 | it "can skip multiple multi-byte codepoint" $ do 139 | -- a, İ, ẞ and 💩 have byte lengths 1, 2, 3 and 4 140 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 0 `shouldBe` 15 -- stays at a 141 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 1 `shouldBe` 13 -- skips to İ 142 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 2 `shouldBe` 10 -- skips to ẞ 143 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 3 `shouldBe` 6 -- skips to 💩 144 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 4 `shouldBe` 3 -- skips to ẞ 145 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 5 `shouldBe` 1 -- skips to İ 146 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 6 `shouldBe` 0 -- skips to a 147 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 14 2 `shouldBe` 6 -- from İ to 💩 148 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 13 2 `shouldBe` 6 -- from İ to 💩 149 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 10 3 `shouldBe` 1 -- from ẞ to İ 150 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 9 3 `shouldBe` 0 -- from 💩 to a 151 | 152 | it "throws errors when you read out of bounds" $ do 153 | evaluate (Utf8.skipCodePointsBackwards "💩💩" 8 0) `shouldThrow` anyErrorCall 154 | evaluate (Utf8.skipCodePointsBackwards "💩💩" 7 2) `shouldThrow` anyErrorCall 155 | 156 | 157 | 158 | 
arbitrarySlicingIndices :: Utf8.Text -> Gen (Utf8.CodeUnitIndex, Utf8.CodeUnitIndex) 159 | arbitrarySlicingIndices example = do 160 | let exampleLength = Utf8.codeUnitIndex $ Utf8.lengthUtf8 example 161 | 162 | begin <- choose (0, exampleLength) 163 | length_ <- choose (0, exampleLength - begin) 164 | 165 | pure (Utf8.CodeUnitIndex begin, Utf8.CodeUnitIndex length_) 166 | 167 | asciiCodepoints :: [Char] 168 | asciiCodepoints = map Char.chr [0..0x7f] 169 | 170 | -- | The Basic Multilingual Plane (BMP) contains the Unicode code points 171 | -- 0x0000 through 0xFFFF. 172 | bmpCodepoints :: [Char] 173 | bmpCodepoints = map Char.chr [0..0xffff] 174 | 175 | commonSuffix :: Eq a => [a] -> [a] -> [a] 176 | commonSuffix list list' = reverse $ go (reverse list) (reverse list') 177 | where 178 | go (x:xs) (y:ys) 179 | | x == y = x : go xs ys 180 | go _ _ = [] 181 | -------------------------------------------------------------------------------- /.stylish-haskell.yaml: -------------------------------------------------------------------------------- 1 | # stylish-haskell configuration file 2 | # ================================== 3 | 4 | # The stylish-haskell tool is mainly configured by specifying steps. These steps 5 | # are a list, so they have an order, and one specific step may appear more than 6 | # once (if needed). Each file is processed by these steps in the given order. 7 | steps: 8 | # Convert some ASCII sequences to their Unicode equivalents. This is disabled 9 | # by default. 10 | # - unicode_syntax: 11 | # # In order to make this work, we also need to insert the UnicodeSyntax 12 | # # language pragma. If this flag is set to true, we insert it when it's 13 | # # not already present. You may want to disable it if you configure 14 | # # language extensions using some other method than pragmas. Default: 15 | # # true. 16 | # add_language_pragma: true 17 | 18 | # Align the right hand side of some elements. 
This is quite conservative 19 | # and only applies to statements where each element occupies a single 20 | # line. 21 | - simple_align: 22 | cases: false 23 | top_level_patterns: false 24 | records: false 25 | 26 | # Import cleanup 27 | - imports: 28 | # There are different ways we can align names and lists. 29 | # 30 | # - global: Align the import names and import list throughout the entire 31 | # file. 32 | # 33 | # - file: Like global, but don't add padding when there are no qualified 34 | # imports in the file. 35 | # 36 | # - group: Only align the imports per group (a group is formed by adjacent 37 | # import lines). 38 | # 39 | # - none: Do not perform any alignment. 40 | # 41 | # Default: global. 42 | align: none 43 | 44 | # The following options affect only import list alignment. 45 | # 46 | # List align has following options: 47 | # 48 | # - after_alias: Import list is aligned with end of import including 49 | # 'as' and 'hiding' keywords. 50 | # 51 | # > import qualified Data.List as List (concat, foldl, foldr, head, 52 | # > init, last, length) 53 | # 54 | # - with_alias: Import list is aligned with start of alias or hiding. 55 | # 56 | # > import qualified Data.List as List (concat, foldl, foldr, head, 57 | # > init, last, length) 58 | # 59 | # - new_line: Import list starts always on new line. 
60 | # 61 | # > import qualified Data.List as List 62 | # > (concat, foldl, foldr, head, init, last, length) 63 | # 64 | # Default: after_alias 65 | list_align: after_alias 66 | 67 | # Right-pad the module names to align imports in a group: 68 | # 69 | # - true: a little more readable 70 | # 71 | # > import qualified Data.List as List (concat, foldl, foldr, 72 | # > init, last, length) 73 | # > import qualified Data.List.Extra as List (concat, foldl, foldr, 74 | # > init, last, length) 75 | # 76 | # - false: diff-safe 77 | # 78 | # > import qualified Data.List as List (concat, foldl, foldr, init, 79 | # > last, length) 80 | # > import qualified Data.List.Extra as List (concat, foldl, foldr, 81 | # > init, last, length) 82 | # 83 | # Default: true 84 | pad_module_names: false 85 | 86 | # Long list align style takes effect when import is too long. This is 87 | # determined by 'columns' setting. 88 | # 89 | # - inline: This option will put as much specs on same line as possible. 90 | # 91 | # - new_line: Import list will start on new line. 92 | # 93 | # - new_line_multiline: Import list will start on new line when it's 94 | # short enough to fit to single line. Otherwise it'll be multiline. 95 | # 96 | # - multiline: One line per import list entry. 97 | # Type with constructor list acts like single import. 98 | # 99 | # > import qualified Data.Map as M 100 | # > ( empty 101 | # > , singleton 102 | # > , ... 103 | # > , delete 104 | # > ) 105 | # 106 | # Default: inline 107 | long_list_align: inline 108 | 109 | # Align empty list (importing instances) 110 | # 111 | # Empty list align has following options 112 | # 113 | # - inherit: inherit list_align setting 114 | # 115 | # - right_after: () is right after the module name: 116 | # 117 | # > import Vector.Instances () 118 | # 119 | # Default: inherit 120 | empty_list_align: inherit 121 | 122 | # List padding determines indentation of import list on lines after import. 123 | # This option affects 'long_list_align'. 
124 | # 125 | # - : constant value 126 | # 127 | # - module_name: align under start of module name. 128 | # Useful for 'file' and 'group' align settings. 129 | list_padding: 4 130 | 131 | # Separate lists option affects formatting of import list for type 132 | # or class. The only difference is single space between type and list 133 | # of constructors, selectors and class functions. 134 | # 135 | # - true: There is single space between Foldable type and list of it's 136 | # functions. 137 | # 138 | # > import Data.Foldable (Foldable (fold, foldl, foldMap)) 139 | # 140 | # - false: There is no space between Foldable type and list of it's 141 | # functions. 142 | # 143 | # > import Data.Foldable (Foldable(fold, foldl, foldMap)) 144 | # 145 | # Default: true 146 | separate_lists: true 147 | 148 | # Space surround option affects formatting of import lists on a single 149 | # line. The only difference is single space after the initial 150 | # parenthesis and a single space before the terminal parenthesis. 151 | # 152 | # - true: There is single space associated with the enclosing 153 | # parenthesis. 154 | # 155 | # > import Data.Foo ( foo ) 156 | # 157 | # - false: There is no space associated with the enclosing parenthesis 158 | # 159 | # > import Data.Foo (foo) 160 | # 161 | # Default: false 162 | space_surround: false 163 | 164 | # Language pragmas 165 | - language_pragmas: 166 | # We can generate different styles of language pragma lists. 167 | # 168 | # - vertical: Vertical-spaced language pragmas, one per line. 169 | # 170 | # - compact: A more compact style. 171 | # 172 | # - compact_line: Similar to compact, but wrap each line with 173 | # `{-#LANGUAGE #-}'. 174 | # 175 | # Default: vertical. 176 | style: vertical 177 | 178 | # Align affects alignment of closing pragma brackets. 179 | # 180 | # - true: Brackets are aligned in same column. 181 | # 182 | # - false: Brackets are not aligned together. 
There is only one space 183 | # between actual import and closing bracket. 184 | # 185 | # Default: true 186 | align: false 187 | 188 | # stylish-haskell can detect redundancy of some language pragmas. If this 189 | # is set to true, it will remove those redundant pragmas. Default: true. 190 | remove_redundant: true 191 | 192 | # Replace tabs by spaces. This is disabled by default. 193 | - tabs: 194 | # Number of spaces to use for each tab. Default: 8, as specified by the 195 | # Haskell report. 196 | spaces: 4 197 | 198 | # Remove trailing whitespace 199 | - trailing_whitespace: {} 200 | 201 | - module_header: 202 | indent: 4 203 | sort: true 204 | break_where: exports 205 | open_bracket: next_line 206 | 207 | # A common setting is the number of columns (parts of) code will be wrapped 208 | # to. Different steps take this into account. Default: 80. 209 | columns: 100 210 | 211 | # By default, line endings are converted according to the OS. You can override 212 | # preferred format here. 213 | # 214 | # - native: Native newline format. CRLF on Windows, LF on other OSes. 215 | # 216 | # - lf: Convert to LF ("\n"). 217 | # 218 | # - crlf: Convert to CRLF ("\r\n"). 219 | # 220 | # Default: native. 221 | newline: lf 222 | 223 | # Sometimes, language extensions are specified in a cabal file or from the 224 | # command line instead of using language pragmas in the file. stylish-haskell 225 | # needs to be aware of these, so it can parse the file correctly. 226 | # 227 | # No language extensions are enabled by default. 
228 | language_extensions: 229 | - FlexibleContexts 230 | # # StylishHaskell needs these extensions even when GHC does not in some cases 231 | # - MultiParamTypeClasses 232 | # - TemplateHaskell 233 | # - FlexibleContexts 234 | # - ExistentialQuantification 235 | -------------------------------------------------------------------------------- /tests/Data/Text/BoyerMooreCISpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Data.Text.BoyerMooreCISpec 5 | ( spec 6 | ) where 7 | 8 | 9 | import Control.Monad (forM_) 10 | import Test.Hspec (Spec, describe, it, shouldBe, parallel) 11 | import Test.Hspec.QuickCheck (modifyMaxSuccess, prop) 12 | import Data.Text (Text) 13 | import Test.QuickCheck (Arbitrary (arbitrary, shrink), forAllShrink) 14 | import Test.QuickCheck.Instances () 15 | 16 | import qualified Data.Char as Char 17 | import qualified Data.Text as Text 18 | import qualified Data.Text.Utf8 as Utf8 19 | import qualified Test.QuickCheck as QuickCheck 20 | 21 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 22 | import Data.Text.TestInstances (arbitraryAlphabet, arbitraryFragment, arbitraryNeedleHaystack, 23 | arbitraryNeedlesHaystack) 24 | 25 | import qualified Data.Text.AhoCorasick.Replacer as AhoReplacer 26 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMooreCI 27 | import qualified Data.Text.BoyerMooreCI.Searcher as Searcher 28 | import qualified Data.Text.BoyerMooreCI.Replacer as Replacer 29 | 30 | 31 | spec :: Spec 32 | spec = parallel $ modifyMaxSuccess (const 200) $ do 33 | 34 | describe "automaton" $ do 35 | 36 | it "works for some basic examples" $ do 37 | matchPositions "a" "abca" `shouldBe` [(0,0), (3,3)] 38 | matchPositions "a" "ABCA" `shouldBe` [(0,0), (3,3)] 39 | matchPositions "abc" "abca" `shouldBe` [(0,2)] 40 | matchPositions "abc" "ABCA" `shouldBe` [(0,2)] 41 | matchPositions "bc" "abca" `shouldBe` 
[(1,2)] 42 | matchPositions "bc" "ABCA" `shouldBe` [(1,2)] 43 | 44 | it "does not yield overlapping matches" $ do 45 | matchPositions "aba" "abababa" `shouldBe` [(0,2), (4,6)] 46 | matchPositions "aba" "ABaBaBA" `shouldBe` [(0,2), (4,6)] 47 | 48 | it "does not work with uppercase needles" $ do 49 | matchPositions "A" "aaaa" `shouldBe` [] 50 | matchPositions "A" "AAAA" `shouldBe` [] 51 | 52 | it "works with cyrillic characters" $ do 53 | -- Cyrillic characters are all two bytes. 54 | -- The match positions are byte indices (not char indices). 55 | matchPositions "п" "ипсум" `shouldBe` [(2,3)] 56 | matchPositions "п" "ИПСУМ" `shouldBe` [(2,3)] 57 | 58 | matchPositions "лорем" "Лорем" `shouldBe` [(0,9)] 59 | matchTexts "лорем" "Лорем" `shouldBe` ["Лорем"] 60 | 61 | matchPositions "лорем" "ЛОРЕМ" `shouldBe` [(0,9)] 62 | matchTexts "лорем" "ЛОРЕМ" `shouldBe` ["ЛОРЕМ"] 63 | 64 | -- This is an interesting case for badCharLookup, because the mismatch 65 | -- happens at "с" which is the first character in the needle. 
66 | matchTexts "сит" "итсит" `shouldBe` ["сит"] 67 | matchTexts "сит" "ИТСИТ" `shouldBe` ["СИТ"] 68 | 69 | it "works with mixed byte lengths" $ do 70 | -- Space is 1 byte 71 | matchTexts "сит" "Лор сит амет" `shouldBe` ["сит"] 72 | matchTexts "сит" "Лорем ипсум долор сит амет" `shouldBe` ["сит"] 73 | matchTexts "сит" "ЛОРЕМ ИПСУМ ДОЛОР СИТ АМЕТ" `shouldBe` ["СИТ"] 74 | 75 | matchTexts "💩b" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["💩b"] 76 | matchTexts "𝄞" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["𝄞","𝄞","𝄞","𝄞"] 77 | matchTexts "a" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["a","a","a"] 78 | 79 | it "works with ⱥ and ⱦ" $ do 80 | -- The letters ⱥ and ⱦ are 3 UTF8 bytes, but have unlowerings Ⱥ and Ⱦ of 2 bytes 81 | matchPositions "ⱥⱦⱥⱦⱥⱦ" "ⱥⱦⱥⱦⱥⱦ" `shouldBe` [(0, 17)] 82 | matchTexts "ⱥⱦⱥⱦⱥⱦ" "ⱥⱦⱥⱦⱥⱦ" `shouldBe` ["ⱥⱦⱥⱦⱥⱦ"] 83 | matchPositions "ⱥⱦⱥⱦⱥⱦ" "ȺȾȺȾȺȾ" `shouldBe` [(0, 11)] 84 | matchTexts "ⱥⱦⱥⱦⱥⱦ" "ȺȾȺȾȺȾ" `shouldBe` ["ȺȾȺȾȺȾ"] 85 | 86 | matchPositions "ⱥⱦⱥⱦⱥⱦ" "ȺⱦⱥȾⱥȾ" `shouldBe` [(0, 14)] 87 | matchTexts "ⱥⱦⱥⱦⱥⱦ" "ȺⱦⱥȾⱥȾ" `shouldBe` ["ȺⱦⱥȾⱥȾ"] 88 | 89 | describe "with a needle equal to the haystack" $ do 90 | 91 | it "reports a single match for a repeated character" $ 92 | forM_ [1..128] $ \n -> 93 | let needle = Text.replicate n "a" in 94 | matchPositions needle needle `shouldBe` [(0, Utf8.lengthUtf8 needle-1)] 95 | 96 | prop "reports a single match for any arbitrary text fragment" $ 97 | QuickCheck.forAll (arbitraryAlphabet >>= arbitraryFragment) $ \text -> 98 | let needle = Utf8.lowerUtf8 text in 99 | matchPositions needle text `shouldBe` [(0, Utf8.lengthUtf8 text-1)] 100 | 101 | describe "with sliced text (using nonzero internal offset)" $ do 102 | 103 | it "still reports offset relative to the text start" $ 104 | -- The match position should be relative to the start of the text "a". 105 | -- Even if this text is represented as a slice of "bbba" internally. 
106 | matchPositions "a" (Text.dropWhile (== 'b') "bbba") `shouldBe` [(0, 0)] 107 | 108 | it "matches ß and ẞ" $ do 109 | matchTexts "groß" "Großfräsmaschinenöffnungstür" `shouldBe` ["Groß"] 110 | matchTexts "groß" "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` ["GROẞ"] 111 | matchTexts "öffnung" "Großfräsmaschinenöffnungstür" `shouldBe` ["öffnung"] 112 | matchTexts "öffnung" "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` ["ÖFFNUNG"] 113 | 114 | 115 | describe "minimumSkipForCodePoint" $ 116 | it "should match the reference implementation" $ do 117 | forM_ [minBound..maxBound] $ \c -> 118 | BoyerMooreCI.minimumSkipForCodePoint c `shouldBe` refMinimumSkipForCodePoint c 119 | 120 | 121 | describe "Searcher" $ do 122 | describe "containsAny" $ do 123 | 124 | -- For the edge case where a needle is the empty string, 125 | -- 'Text.isInfixOf' and 'Searcher.containsAny' are different: 126 | -- 127 | -- @ 128 | -- Text.isInfixOf "" "abc" == True /= False == Searcher.containsAny (Searcher.build [""]) "abc" 129 | -- @ 130 | -- 131 | -- However, at this point we probably shouldn't break this property. 
132 | prop "is equivalent to disjunction of Text.isInfixOf calls*" $ do 133 | QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \(needles, haystack) -> do 134 | let 135 | lneedles = map Utf8.lowerUtf8 needles -- needles must be lowercase 136 | searcher = Searcher.build lneedles 137 | test needle = 138 | not (Text.null needle) && needle `Text.isInfixOf` (Utf8.lowerUtf8 haystack) 139 | Searcher.containsAny searcher haystack `shouldBe` any test lneedles 140 | 141 | describe "containsAll" $ do 142 | prop "is equivalent to conjunction of Text.isInfixOf calls*" $ do 143 | QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \(needles, haystack) -> do 144 | let 145 | lneedles = map Utf8.lowerUtf8 needles -- needles must be lowercase 146 | searcher = Searcher.buildNeedleIdSearcher lneedles 147 | test needle = 148 | not (Text.null needle) && needle `Text.isInfixOf` (Utf8.lowerUtf8 haystack) 149 | Searcher.containsAll searcher haystack `shouldBe` all test lneedles 150 | 151 | describe "Replacer" $ do 152 | describe "replaceSingleLimited" $ do 153 | 154 | prop "is equivalent to Aho-Corasick replacer with a single needle" $ 155 | forAllShrink arbitraryNeedleHaystack shrink $ \(needle, haystack) -> 156 | forAllShrink arbitrary shrink $ \replacement -> 157 | let 158 | lneedle = Utf8.lowerUtf8 needle 159 | expected = 160 | AhoReplacer.run (AhoReplacer.build IgnoreCase [(lneedle, replacement)]) haystack 161 | auto = BoyerMooreCI.buildAutomaton lneedle 162 | actual = Replacer.replaceSingleLimited auto replacement haystack maxBound 163 | in 164 | actual `shouldBe` Just expected 165 | 166 | -- Reference implementation for BoyerMooreCI.minimumSkipForCodePoint 167 | refMinimumSkipForCodePoint :: Utf8.CodePoint -> Utf8.CodeUnitIndex 168 | refMinimumSkipForCodePoint cp = 169 | let codePointLength = length . Utf8.unicode2utf8 . 
Char.ord in 170 | case Utf8.unlowerCodePoint cp of 171 | [] -> 172 | -- Input is upper case, so this is undefined behaviour but we match what the real 173 | -- implementation does: 174 | Utf8.CodeUnitIndex $ codePointLength cp 175 | ucs -> Utf8.CodeUnitIndex $ minimum $ map codePointLength ucs 176 | 177 | 178 | -- | Return indices of the first and last byte of every match 179 | matchPositions :: Text -> Text -> [(Utf8.CodeUnitIndex, Utf8.CodeUnitIndex)] 180 | matchPositions needle = 181 | let 182 | !automaton = BoyerMooreCI.buildAutomaton needle 183 | prependMatch matches matchStart matchEnd = 184 | BoyerMooreCI.Step ((matchStart, matchEnd) : matches) 185 | in 186 | \haystack -> reverse $ BoyerMooreCI.runText [] prependMatch automaton haystack 187 | 188 | positionText :: Text -> (Utf8.CodeUnitIndex, Utf8.CodeUnitIndex) -> Text 189 | positionText haystack (firstByte, lastByte) = 190 | let len = lastByte - firstByte + 1 -- length is 1 if firstByte==lastByte 191 | in Utf8.unsafeSliceUtf8 firstByte len haystack 192 | 193 | matchTexts :: Text -> Text -> [Text] 194 | matchTexts needle haystack = 195 | map (positionText haystack) $ matchPositions needle haystack 196 | -------------------------------------------------------------------------------- /tests/Data/Text/BoyerMooreSpec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} 2 | 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE ScopedTypeVariables #-} 5 | 6 | module Data.Text.BoyerMooreSpec 7 | ( spec 8 | ) where 9 | 10 | import Control.DeepSeq (rnf) 11 | import Control.Monad (forM_) 12 | import Data.Foldable (for_) 13 | import GHC.Stack (HasCallStack) 14 | import Prelude hiding (replicate) 15 | import Test.Hspec (Expectation, Spec, describe, it, parallel, shouldBe) 16 | import Test.Hspec.Expectations (shouldMatchList, shouldSatisfy) 17 | import Test.Hspec.QuickCheck (modifyMaxSuccess, prop) 18 | import Test.QuickCheck 
(Arbitrary (arbitrary, shrink), forAllShrink, (==>)) 19 | import Test.QuickCheck.Instances () 20 | 21 | import qualified Test.QuickCheck as QuickCheck 22 | 23 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 24 | import Data.Text.TestInstances (arbitraryNeedleHaystack) 25 | import Data.Text.Utf8 (Text) 26 | 27 | import qualified Data.Text.Utf8 as Text 28 | import qualified Data.Text.Utf8 as TextSearch 29 | import qualified Data.Text.Utf8 as Utf8 30 | import qualified Data.Text.AhoCorasick.Replacer as AhoReplacer 31 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 32 | import qualified Data.Text.BoyerMoore.Replacer as Replacer 33 | import qualified Data.Text.BoyerMoore.Searcher as Searcher 34 | 35 | -- | Test that for a single needle which equals the haystack, we find a single 36 | -- match. Does not apply to the empty needle. 37 | needleIsHaystackMatches :: HasCallStack => Text -> Expectation 38 | needleIsHaystackMatches needle = 39 | let 40 | prependMatch ms match = BoyerMoore.Step (Utf8.codeUnitIndex match : ms) 41 | matches = BoyerMoore.runText [] prependMatch (BoyerMoore.buildAutomaton needle) needle 42 | in 43 | matches `shouldBe` [0] 44 | 45 | boyerMatch :: Text -> Text -> [Int] 46 | boyerMatch needle haystack = 47 | let 48 | prependMatch matches match = BoyerMoore.Step (Utf8.codeUnitIndex match : matches) 49 | in 50 | BoyerMoore.runText [] prependMatch (BoyerMoore.buildAutomaton needle) haystack 51 | 52 | -- | Match without a payload, return only the match positions. 53 | matchEndPositions :: Text -> Text -> [Int] 54 | matchEndPositions needle haystack = 55 | let 56 | matches = boyerMatch needle haystack 57 | in 58 | fmap (Utf8.codeUnitIndex (Utf8.lengthUtf8 needle) +) matches 59 | 60 | -- | `matchEndPositions` implemented naively in terms of Text's functionality, 61 | -- which we assume to be correct. 
62 | naiveMatchPositions :: Text -> Text -> [Int] 63 | naiveMatchPositions needle haystack = 64 | map toEndPos $ TextSearch.indices needle haystack 65 | where 66 | toEndPos index = Utf8.codeUnitIndex (Utf8.lengthUtf8 needle) + index 67 | 68 | spec :: Spec 69 | spec = parallel $ modifyMaxSuccess (const 200) $ do 70 | describe "build" $ do 71 | prop "does not throw exceptions" $ \ (pat :: Text) -> 72 | rnf $ BoyerMoore.buildAutomaton pat 73 | 74 | describe "runText" $ do 75 | 76 | describe "when given a needle equal to the haystack" $ do 77 | 78 | it "reports a single match for a repeated character" $ 79 | forM_ [1..128] $ \n -> 80 | needleIsHaystackMatches $ Text.replicate n "a" 81 | 82 | it "reports a single match for non-BMP data" $ do 83 | -- Include a few code points outside of the Basic Multilingual Plane, 84 | -- which require multible code units to encode. 85 | needleIsHaystackMatches "\x000437b8suffix" 86 | needleIsHaystackMatches "aaa\359339aaa\95759aa\899256aa" 87 | 88 | prop "reports a single match for random needles" $ \needle -> 89 | not (Text.null needle) ==> needleIsHaystackMatches needle 90 | 91 | describe "when given a sliced text (with nonzero internal offset)" $ do 92 | 93 | it "still reports offset relative to the text start" $ 94 | -- The match position should be relative to the start of the text "a". 95 | -- Even if this text is represented as a slice of "bbba" internally. 96 | matchEndPositions "a" (Text.dropWhile (== 'b') "bbba") `shouldMatchList` [1] 97 | 98 | describe "when given non-ascii inputs" $ do 99 | 100 | -- We have a special lookup table for bad character shifts for 101 | -- the first 128 code units, which is always hit for ascii inputs. 102 | -- Also exercise the fallback code path with a different input. 103 | -- The code point é is encoded as two code units in UTF-8. 
104 | -- 0 7 13 105 | -- │ │ │ 106 | -- ▼ ▼ ▼ 107 | -- ┌───┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┐ 108 | -- │ é │c│l│a│i│r│e│c│l│a│i│r│ Code Points 109 | -- ├─┬─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ 110 | -- │ │ │ │ │ │ │ │ │ │ │ │ │ │ Code Units (Bytes) 111 | -- └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ 112 | it "reports a match if the haystack contains a character > U+7f" $ do 113 | matchEndPositions "eclair" "éclaireclair" `shouldMatchList` [13] 114 | matchEndPositions "éclair" "éclaireclair" `shouldMatchList` [7] 115 | matchEndPositions "éclair" "eclairéclair" `shouldMatchList` [13] 116 | 117 | it "reports the correct code unit index for complex characters" $ do 118 | -- Note that the index after the match is 4, even though there is 119 | -- only a single code point. U+1d11e is encoded as four code units: 120 | -- in UTF-8: 121 | -- 0 4 122 | -- │ │ 123 | -- ▼ ▼ 124 | -- ┌───────┐ 125 | -- │ 𝄞 │ Code Points 126 | -- ├─┬─┬─┬─┤ 127 | -- │ │ │ │ │ Code Units (Bytes) 128 | -- └─┴─┴─┴─┘ 129 | matchEndPositions "𝄞" "𝄞" `shouldMatchList` [4] 130 | 131 | -- A levitating woman in business suit with dark skin tone needs a 132 | -- whopping 5 code points to encode. The first two need 4 code units each to encode, 133 | -- the remaining three need 3 code units each for a total of 17 code units: 134 | -- 0 4 8 17 135 | -- │ │ │ │ 136 | -- ▼ ▼ ▼ ▼ 137 | -- ┌───────┬───────┬─────┬─────┬─────┐ 138 | -- │ 1 │ 2 │ 3 │ 4 │ 5 │ Code Points 139 | -- ├─┬─┬─┬─┼─┬─┬─┬─┼─┬─┬─┼─┬─┬─┼─┬─┬─┤ 140 | -- │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ Code Units (Bytes) 141 | -- └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ 142 | -- 1. U+1f574: man in business suit levitating (🕴) 143 | -- 2. U+1f3ff: emoji modifier Fitzpatrick type-6 144 | -- 3. U+200d: zero width joiner 145 | -- 4. U+2640: female sign (♀) 146 | -- 5. U+fe0f: variation selector-16 147 | -- A peculiar feature of Unicode emoji, is that the male levivating 148 | -- man in business suit with dark skin tone is a substring of the 149 | -- levivating woman in business suit. 
And the levivating man in 150 | -- business suit without particular skin tone is a substring of that. 151 | let 152 | examples = 153 | [ ("\x1f574\x1f3ff\x200d\x2640\xfe0f", 17) 154 | , ("\x1f574\x1f3ff", 8) 155 | , ("\x1f574", 4) 156 | ] 157 | for_ examples $ \(needle, endPos) -> 158 | matchEndPositions needle "\x1f574\x1f3ff\x200d\x2640\xfe0f" `shouldMatchList` [endPos] 159 | 160 | describe "when given empty needle" $ do 161 | 162 | it "does not report a match" $ do 163 | matchEndPositions "" "" `shouldMatchList` [] 164 | matchEndPositions "" "foo" `shouldMatchList` [] 165 | 166 | describe "kitchen sink" $ do 167 | it "kitchen sinks" $ do 168 | matchEndPositions "\"\SO]JL\"" "aaaaa\"\SO]JL\"" `shouldMatchList` [11] 169 | matchEndPositions "\"X]JL\"" "aaaaa\"X]JL\"" `shouldMatchList` [11] 170 | 171 | describe "when given random needles and haystacks" $ do 172 | 173 | prop "reports only infixes of the haystack" $ 174 | QuickCheck.forAllShrink arbitraryNeedleHaystack shrink $ \ (needle, haystack) -> 175 | let 176 | matches = boyerMatch needle haystack 177 | sliceMatch startPos len = Utf8.unsafeSliceUtf8 startPos len haystack 178 | in 179 | forM_ matches $ \pos -> do 180 | needle `shouldSatisfy` (`Text.isInfixOf` haystack) 181 | sliceMatch (Utf8.CodeUnitIndex pos) (Utf8.lengthUtf8 needle) `shouldBe` needle 182 | 183 | prop "reports all infixes of the haystack" $ 184 | QuickCheck.forAllShrink arbitraryNeedleHaystack shrink $ \ (needle, haystack) -> 185 | matchEndPositions needle haystack `shouldMatchList` naiveMatchPositions needle haystack 186 | 187 | describe "replaceSingleLimited" $ do 188 | 189 | prop "is equivalent to Aho-Corasick replacer with a single needle" $ 190 | forAllShrink arbitraryNeedleHaystack shrink $ \(needle, haystack) -> 191 | forAllShrink arbitrary shrink $ \replacement -> 192 | let 193 | expected = AhoReplacer.run (AhoReplacer.build CaseSensitive [(needle, replacement)]) haystack 194 | 195 | auto = BoyerMoore.buildAutomaton needle 196 | 197 | 
actual = Replacer.replaceSingleLimited auto replacement haystack maxBound 198 | in 199 | actual `shouldBe` Just expected 200 | 201 | describe "Searcher" $ do 202 | 203 | describe "containsAny" $ do 204 | 205 | -- For the edge case where a needle is the empty string, 206 | -- 'Text.isInfixOf' and 'Searcher.containsAny' are different: 207 | -- 208 | -- @ 209 | -- Text.isInfixOf "" "abc" == True /= False == Searcher.containsAny (Searcher.build [""]) "abc" 210 | -- @ 211 | -- 212 | -- However, at this point we probably shouldn't break this property. 213 | prop "is equivalent to disjunction of Text.isInfixOf calls*" $ \ (needles :: [Text]) (haystack :: Text) -> 214 | let 215 | searcher = Searcher.build needles 216 | test needle = 217 | not (Text.null needle) && needle `Text.isInfixOf` haystack 218 | in 219 | Searcher.containsAny searcher haystack `shouldBe` any test needles 220 | 221 | describe "containsAll" $ do 222 | prop "is equivalent to conjunction of Text.isInfixOf calls*" $ \ (needles :: [Text]) (haystack :: Text) -> 223 | let 224 | searcher = Searcher.buildNeedleIdSearcher needles 225 | test needle = 226 | not (Text.null needle) && needle `Text.isInfixOf` haystack 227 | in 228 | Searcher.containsAll searcher haystack `shouldBe` all test needles 229 | -------------------------------------------------------------------------------- /bench/bm/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Main 5 | ( main 6 | ) where 7 | 8 | import Criterion.Main (bgroup, bench, defaultMain, nf) 9 | 10 | import qualified Data.Text as Text 11 | import qualified Data.Text.AhoCorasick.Automaton as Aho 12 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 13 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMooreCI 14 | import qualified Data.Text.Utf8 as Utf8 15 | 16 | -- 17 | -- @ 18 | -- stack bench alfred-margaret:bm --ba '--output 
bm.html' 19 | -- @ 20 | 21 | main :: IO () 22 | main = defaultMain 23 | [ 24 | bgroup "Big needle, large haystack" $ 25 | benchVariants needle1 haystack1 26 | 27 | , bgroup "Big needle, small haystack" $ 28 | benchVariants needle2 haystack2 29 | 30 | , bgroup "Small needle, large haystack" $ 31 | benchVariants needle3 haystack3 32 | 33 | , bgroup "Small needle, small haystack" $ 34 | benchVariants needle4 haystack4 35 | 36 | , bgroup "Cyrillic, medium needle, large haystack" $ 37 | benchVariants needle5 haystack5 38 | 39 | ] 40 | where 41 | benchVariants needle haystack = 42 | [ bench "Count Aho" $ nf (ahoCount needle) haystack 43 | , bench "Count Aho CI" $ nf (ahociCount needle) haystack 44 | , bench "Count BoyerMoore" $ nf (bmCount needle) haystack 45 | , bench "Count BoyerMoore CI" $ nf (bmciCount needle) haystack 46 | , bench "Count Aho CI multineedle" $ nf (ahocimultiCount needle) haystack 47 | , bench "Count Text.count" $ nf (Text.count needle) haystack 48 | -- This naive case insensitive count is really slow. We leave it out 49 | -- because it throws off the plot scale. 50 | -- , bench "Count Text.count . lowerUtf8" $ nf (Text.count needle . Utf8.lowerUtf8) haystack 51 | , bench "Contains Aho" $ nf (ahoContains needle) haystack 52 | , bench "Contains Aho CI" $ nf (ahociContains needle) haystack 53 | , bench "Contains BoyerMoore" $ nf (bmContains needle) haystack 54 | , bench "Contains BoyerMoore CI" $ nf (bmciContains needle) haystack 55 | , bench "Contains Text.isInfixOf" $ nf (Text.isInfixOf needle) haystack 56 | ] 57 | 58 | 59 | -- | The NOINLINE annotation is used so that the needle doesn't get inlined at compile time. The 60 | -- function does get partially applied with the needle so that the automaton is constructed only 61 | -- once. 
62 | ahoCount :: Text.Text -> Text.Text -> Int 63 | {-# NOINLINE ahoCount #-} 64 | ahoCount needle = 65 | let 66 | !automaton = Aho.build [(needle, ())] 67 | onMatch !n _match = Aho.Step (n + 1) 68 | in 69 | -- Eta expansion of the haystack variable is necessary for proper inlining of runText 70 | \haystack -> Aho.runText 0 onMatch automaton haystack 71 | 72 | ahoContains :: Text.Text -> Text.Text -> Bool 73 | {-# NOINLINE ahoContains #-} 74 | ahoContains needle = 75 | let 76 | !automaton = Aho.build [(needle, ())] 77 | onMatch _ _match = Aho.Done True 78 | in 79 | \haystack -> Aho.runText False onMatch automaton haystack 80 | 81 | ahociCount :: Text.Text -> Text.Text -> Int 82 | {-# NOINLINE ahociCount #-} 83 | ahociCount needle = 84 | let 85 | !automaton = Aho.build [(needle, ())] 86 | onMatch !n _match = Aho.Step (n + 1) 87 | in 88 | \haystack -> Aho.runLower 0 onMatch automaton haystack 89 | 90 | 91 | ahocimultiCount :: Text.Text -> Text.Text -> Int 92 | {-# NOINLINE ahocimultiCount #-} 93 | ahocimultiCount needle = 94 | let 95 | !automaton = Aho.build $ map (\n -> (n, ())) $ Aho.needleCasings needle 96 | onMatch !n _match = Aho.Step (n + 1) 97 | in 98 | \haystack -> Aho.runText 0 onMatch automaton haystack 99 | 100 | ahociContains :: Text.Text -> Text.Text -> Bool 101 | {-# NOINLINE ahociContains #-} 102 | ahociContains needle = 103 | let 104 | !automaton = Aho.build [(needle, ())] 105 | onMatch _ _match = Aho.Done True 106 | in 107 | \haystack -> Aho.runLower False onMatch automaton haystack 108 | 109 | bmciCount :: Text.Text -> Text.Text -> Int 110 | {-# NOINLINE bmciCount #-} 111 | bmciCount needle = 112 | let 113 | !automaton = BoyerMooreCI.buildAutomaton needle 114 | onMatch !n _matchStart _matchEnd = BoyerMooreCI.Step (n + 1) 115 | in 116 | \haystack -> BoyerMooreCI.runText 0 onMatch automaton haystack 117 | 118 | bmciContains :: Text.Text -> Text.Text -> Bool 119 | {-# NOINLINE bmciContains #-} 120 | bmciContains needle = 121 | let 122 | !automaton = 
BoyerMooreCI.buildAutomaton needle 123 | onMatch _ _matchStart _matchEnd = BoyerMooreCI.Done True 124 | in 125 | \haystack -> BoyerMooreCI.runText False onMatch automaton haystack 126 | 127 | bmCount :: Text.Text -> Text.Text -> Int 128 | {-# NOINLINE bmCount #-} 129 | bmCount needle = 130 | let 131 | !automaton = BoyerMoore.buildAutomaton needle 132 | onMatch !n _match = BoyerMoore.Step (n + 1) 133 | in 134 | \haystack -> BoyerMoore.runText 0 onMatch automaton haystack 135 | 136 | bmContains :: Text.Text -> Text.Text -> Bool 137 | {-# NOINLINE bmContains #-} 138 | bmContains needle = 139 | let 140 | !automaton = BoyerMoore.buildAutomaton needle 141 | onMatch _ _match = BoyerMoore.Done True 142 | in 143 | \haystack -> BoyerMoore.runText False onMatch automaton haystack 144 | 145 | 146 | needle1 :: Text.Text 147 | needle1 = "necessitatibus" 148 | 149 | haystack1 :: Text.Text 150 | haystack1 = 151 | Text.unlines 152 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 153 | , "Qui quia totam non rerum eveniet sed tempora repellendus ab enim consequatur eum quaerat iste. Sed amet nihil sed voluptate aspernatur ut rerum facilis est officia earum aut molestiae tenetur non autem nulla. Qui deserunt necessitatibus ab accusamus doloremque non sint aspernatur." 154 | , "Et officia illum non quaerat obcaecati cum accusamus minus rem quae quis sed rerum omnis sed inventore quasi. Qui totam deserunt sit minima ullam sit debitis dolores. Est debitis explicabo ut temporibus corporis nam harum dolore est fuga numquam non exercitationem Quis cum amet fuga." 155 | , "Aut incidunt provident et sequi nulla est molestias perferendis. Hic exercitationem modi ex optio cumque nam voluptate debitis nam iste consequatur non nihil rerum ut accusantium nihil. 
Sed ullam maiores nobis dolorem sit galisum maiores eum reprehenderit maxime sed galisum placeat cum molestiae quia id similique velit. Aut quasi autem non illo reiciendis sit ullam tempora." 156 | , "Ut corporis exercitationem sed dicta autem ut voluptatem dolorem vel dolores dolores. Qui ipsam quisquam sed facere porro rem autem necessitatibus nam beatae quisquam. Quo voluptatem optio hic quod reprehenderit ut nostrum voluptatem." 157 | , "Et consequuntur quia vel unde laudantium non voluptatum magnam. Ut quam autem rem fugit quia ut assumenda error quam amet in omnis quia ut rerum soluta sed consequatur fuga. Ut blanditiis quia et facilis ratione aut blanditiis dolorum aut itaque excepturi eos iste incidunt qui blanditiis velit et magni autem." 158 | ] 159 | 160 | needle2 :: Text.Text 161 | needle2 = "necessitatibus" 162 | 163 | haystack2 :: Text.Text 164 | haystack2 = 165 | Text.unlines 166 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 167 | ] 168 | 169 | needle3 :: Text.Text 170 | needle3 = "sit" 171 | 172 | haystack3 :: Text.Text 173 | haystack3 = 174 | Text.unlines 175 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 176 | , "Qui quia totam non rerum eveniet sed tempora repellendus ab enim consequatur eum quaerat iste. Sed amet nihil sed voluptate aspernatur ut rerum facilis est officia earum aut molestiae tenetur non autem nulla. Qui deserunt necessitatibus ab accusamus doloremque non sint aspernatur." 177 | , "Et officia illum non quaerat obcaecati cum accusamus minus rem quae quis sed rerum omnis sed inventore quasi. Qui totam deserunt sit minima ullam sit debitis dolores. 
Est debitis explicabo ut temporibus corporis nam harum dolore est fuga numquam non exercitationem Quis cum amet fuga." 178 | , "Aut incidunt provident et sequi nulla est molestias perferendis. Hic exercitationem modi ex optio cumque nam voluptate debitis nam iste consequatur non nihil rerum ut accusantium nihil. Sed ullam maiores nobis dolorem sit galisum maiores eum reprehenderit maxime sed galisum placeat cum molestiae quia id similique velit. Aut quasi autem non illo reiciendis sit ullam tempora." 179 | , "Ut corporis exercitationem sed dicta autem ut voluptatem dolorem vel dolores dolores. Qui ipsam quisquam sed facere porro rem autem necessitatibus nam beatae quisquam. Quo voluptatem optio hic quod reprehenderit ut nostrum voluptatem." 180 | , "Et consequuntur quia vel unde laudantium non voluptatum magnam. Ut quam autem rem fugit quia ut assumenda error quam amet in omnis quia ut rerum soluta sed consequatur fuga. Ut blanditiis quia et facilis ratione aut blanditiis dolorum aut itaque excepturi eos iste incidunt qui blanditiis velit et magni autem." 181 | ] 182 | 183 | needle4 :: Text.Text 184 | needle4 = "sit" 185 | 186 | haystack4 :: Text.Text 187 | haystack4 = 188 | Text.unlines 189 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 190 | ] 191 | 192 | 193 | 194 | needle5 :: Text.Text 195 | needle5 = "сусципиантур" 196 | 197 | -- Cyrillic lipsum from https://generator.lorem-ipsum.info/_russian 198 | haystack5 :: Text.Text 199 | haystack5 = 200 | Text.unlines 201 | [ "Лорем ипсум долор сит амет, ин вис вирис маиорум инсоленс, алиа агам иудицабит цу нец, ут адмодум инцидеринт еффициантур цум. Вих те номинави диспутатиони, не дуо путент дицунт. Дебет репудиандае яуо ан, вих тамяуам репудиандае цонцлусионемяуе но, цу иус регионе урбанитас. 
Персиус дицерет форенсибус ат цум, еум яуаеяуе аппеллантур еу. Хас вивендо цонсететур те, ад хас меис пхаедрум, репудиаре демоцритум вих еу." 202 | , "Пер ут нулла минимум цонтентионес, сеа ат мунере алтера медиоцрем. Но вим лабитур цонсететур, нам ут ипсум фацилис феугаит. Не дицат феугиат аппареат нам, дуис адиписцинг ех еам. Ут еуисмод веритус интерпретарис усу. Сед цасе дицунт ехпетендис ех. Ад лудус урбанитас вим." 203 | , "Перфецто инсоленс ут меи, адмодум ментитум партиендо усу ат. Ин солеат еирмод аперири дуо, еи мазим фацилиси делицата сед. Дицо неглегентур ут хис, еи хис афферт аудире. Ат аццусата глориатур вел." 204 | , "Персиус омиттам при ад, цонгуе воцент репудиаре усу ет. Ат детрахит салутатус вим. Еа сеа яуис тациматес. Ид фацер дицтас еум, вери дицам еум еу, при ид тантас омиттам. Суавитате персецути еам те, ет яуот омиттантур ест." 205 | , "Еа тациматес евертитур дуо, меа ин вениам ассентиор. Еос ессе мунди аццусата цу. Ат партем медиоцрем иус, ан мел алии плацерат. Еам еи суммо ерант тинцидунт. Меа еа алиа сцрибентур." 206 | , "Еи вих малис омиттам сплендиде, ид витае ностер алияуандо еам, те новум репудиаре пер. Вис цу еррор алтерум садипсцинг, ан ерат фацете аудире ест. Меи ат виде ассуеверит. Вис либрис популо ид, ат хис симилияуе репудиандае, нибх салутатус ех сеа." 207 | , "Сеа пондерум адиписцинг ех, ет нонумы анциллае вих. Цу иллум ессент медиоцрем вис, еу мовет мунди еос, яуо зрил долорем партиендо но. Хис ат диам цоммодо демоцритум, вери цаусае инцидеринт усу ин. Темпорибус диссентиунт вис еа, ех цум цонсул поссим алтерум. Вим ин веро нулла, еиус реяуе ех меа." 208 | , "Омнес трацтатос цу еос. Пер ехерци модератиус еа. Нец оффендит сусципит ех. Пер ех постеа фацете, те еум еиус игнота легендос. Ат поссе воцибус пер, ин про лаудем сплендиде цотидиеяуе, но адхуц инермис аццоммодаре еос. Ин денияуе малуиссет еам, ин яуис доцтус нец." 209 | , "Идяуе елояуентиам еу яуи, витае интерессет не вис, хис диссентиет сусципиантур цу. 
Тибияуе елаборарет еам еа. Оратио вивендум цонсулату ид пер. Еа путант еррорибус сит, цум ад алияуид аццоммодаре. Алтерум делецтус алияуандо еу иус, ан иус поссит персиус тибияуе. Сеа не нумяуам фуиссет, ат виси аудире при. Суас граеце цопиосае дуо ан, но мунди опортере цонвенире яуи." 210 | , "Мел фугит путант опортере но, цопиосае оффициис еос еа. Вис инвидунт еффициенди ат, те луцилиус сцрибентур хис, вих не поссит вертерем. Дуо омнис дицат вирис еи, ет адхуц цетеро вис. Еа вим утинам инвенире, вих ин еяуидем ассентиор." 211 | ] 212 | -------------------------------------------------------------------------------- /src/Data/Text/AhoCorasick/Replacer.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE CPP #-} 9 | {-# LANGUAGE DeriveAnyClass #-} 10 | {-# LANGUAGE DeriveGeneric #-} 11 | {-# LANGUAGE DerivingStrategies #-} 12 | 13 | -- | Implements sequential string replacements based on the Aho-Corasick algorithm. 14 | module Data.Text.AhoCorasick.Replacer 15 | ( -- * State machine 16 | Needle 17 | , Payload (..) 18 | , Replacement 19 | , Replacer (..) 
20 | , replacerCaseSensitivity 21 | , build 22 | , compose 23 | , mapReplacement 24 | , run 25 | , runWithLimit 26 | , setCaseSensitivity 27 | ) where 28 | 29 | import Control.DeepSeq (NFData) 30 | import Data.Hashable (Hashable) 31 | import Data.List (sort) 32 | import Data.Maybe (fromJust) 33 | import GHC.Generics (Generic) 34 | 35 | #if defined(HAS_AESON) 36 | import qualified Data.Aeson as AE 37 | #endif 38 | 39 | import Data.Text.AhoCorasick.Searcher (Searcher) 40 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 41 | import Data.Text.Utf8 (CodeUnitIndex (..), Text) 42 | 43 | import qualified Data.Text as Text 44 | import qualified Data.Text.AhoCorasick.Automaton as Aho 45 | import qualified Data.Text.AhoCorasick.Searcher as Searcher 46 | import qualified Data.Text.Utf8 as Utf8 47 | 48 | -- | Descriptive type alias for strings to search for. 49 | type Needle = Text 50 | 51 | -- | Descriptive type alias for replacements. 52 | type Replacement = Text 53 | 54 | -- | Priority of a needle. Higher integers indicate higher priorities. 55 | -- Replacement order is such that all matches of priority p are replaced before 56 | -- replacing any matches of priority q where p > q. 57 | type Priority = Int 58 | 59 | data Payload = Payload 60 | { needlePriority :: {-# UNPACK #-} !Priority 61 | , needleLengthBytes :: {-# UNPACK #-} !CodeUnitIndex 62 | -- ^ Number of bytes is used for case sensitive matching 63 | , needleLengthCodePoints :: {-# UNPACK #-} !Int 64 | -- ^ For case insensitive matches, the byte length does not necessarily match the needle byte 65 | -- length. Due to our simple case folding the number of codepoints _does_ match, so we put that 66 | -- in the payload. It's less efficient because we have to scan backwards through the text to 67 | -- obtain the length of a match. 
68 | 69 | , needleReplacement :: !Replacement 70 | } 71 | #if defined(HAS_AESON) 72 | deriving (Eq, Generic, Hashable, NFData, Show, AE.FromJSON, AE.ToJSON) 73 | #else 74 | deriving (Eq, Generic, Hashable, NFData, Show) 75 | #endif 76 | 77 | -- | A state machine used for efficient replacements with many different needles. 78 | data Replacer = Replacer 79 | { replacerSearcher :: Searcher Payload 80 | } 81 | deriving stock (Show, Eq, Generic) 82 | #if defined(HAS_AESON) 83 | deriving (Hashable, NFData, AE.FromJSON, AE.ToJSON) 84 | #else 85 | deriving (Hashable, NFData) 86 | #endif 87 | 88 | -- | Build an Aho-Corasick automaton that can be used for performing fast 89 | -- sequential replaces. 90 | -- 91 | -- Case-insensitive matching performs per-letter language-agnostic lower-casing. 92 | -- Therefore, it will work in most cases, but not in languages where lower-casing 93 | -- depends on the context of the character in question. 94 | -- 95 | -- We need to revisit this algorithm when we want to implement full Unicode 96 | -- support. 97 | build :: CaseSensitivity -> [(Needle, Replacement)] -> Replacer 98 | build caseSensitivity replaces = Replacer searcher 99 | where 100 | searcher = Searcher.buildWithValues caseSensitivity $ zipWith mapNeedle [0..] replaces 101 | mapNeedle i (needle, replacement) = 102 | -- Note that we negate i: earlier needles have a higher priority. We 103 | -- could avoid it and define larger integers to be lower priority, but 104 | -- that made the terminology in this module very confusing. 105 | let needle' = case Searcher.caseSensitivity searcher of 106 | CaseSensitive -> needle 107 | IgnoreCase -> Utf8.lowerUtf8 needle 108 | -- Payload includes byte and code point lengths, so can still be used if we change case 109 | -- sensitivity later. 
110 | payload = Payload 111 | { needlePriority = (-i) 112 | , needleLengthBytes = Utf8.lengthUtf8 needle 113 | , needleLengthCodePoints = Text.length needle 114 | , needleReplacement = replacement 115 | } 116 | in (needle', payload) 117 | 118 | -- | Return the composition `replacer2` after `replacer1`, if they have the same 119 | -- case sensitivity. If the case sensitivity differs, Nothing is returned. 120 | compose :: Replacer -> Replacer -> Maybe Replacer 121 | compose (Replacer searcher1) (Replacer searcher2) 122 | | Searcher.caseSensitivity searcher1 /= Searcher.caseSensitivity searcher2 = Nothing 123 | | otherwise = 124 | let 125 | -- Replace the priorities of the second machine, so they all come after 126 | -- the first. 127 | renumber i (needle, Payload _ lenb lenc replacement) = (needle, Payload (-i) lenb lenc replacement) 128 | needles1 = Searcher.needles searcher1 129 | needles2 = Searcher.needles searcher2 130 | cs = Searcher.caseSensitivity searcher1 131 | searcher = Searcher.buildWithValues cs $ zipWith renumber [0..] (needles1 ++ needles2) 132 | in 133 | Just $ Replacer searcher 134 | 135 | -- | Modify the replacement of a replacer. It doesn't modify the needles. 136 | mapReplacement :: (Replacement -> Replacement) -> Replacer -> Replacer 137 | mapReplacement f replacer = replacer{ 138 | replacerSearcher = Searcher.mapSearcher 139 | (\p -> p {needleReplacement = f (needleReplacement p)}) 140 | (replacerSearcher replacer) 141 | } 142 | 143 | 144 | replacerCaseSensitivity :: Replacer -> CaseSensitivity 145 | replacerCaseSensitivity (Replacer searcher) = Searcher.caseSensitivity searcher 146 | 147 | 148 | -- | Updates the case sensitivity of the replacer. Does not change the 149 | -- capitilization of the needles. The caller should be certain that if IgnoreCase 150 | -- is passed, the needles are already lower case. 
151 | setCaseSensitivity :: CaseSensitivity -> Replacer -> Replacer 152 | setCaseSensitivity case_ (Replacer searcher) = 153 | Replacer (Searcher.setCaseSensitivity case_ searcher) 154 | 155 | 156 | -- A match collected while running replacements. It is isomorphic to the Match 157 | -- reported by the automaton, but the data is arranged in a more useful way: 158 | -- as the start index and length of the match, and the replacement. 159 | data Match = Match !CodeUnitIndex !CodeUnitIndex !Text deriving (Eq, Ord, Show, Generic) 160 | 161 | -- | Apply replacements of all matches. Assumes that the matches are ordered by 162 | -- match position, and that no matches overlap. 163 | replace :: [Match] -> Text -> Text 164 | replace matches haystack = Utf8.concat $ go 0 matches haystack 165 | where 166 | -- At every match, cut the string into three pieces, removing the match. 167 | -- Because a Text is a buffer pointer and (offset, length), cutting does not 168 | -- involve string copies. Only at the very end we piece together the strings 169 | -- again, so Text can allocate a buffer of the right length and memcpy the 170 | -- parts into the new target string. 171 | -- If `k` is a code unit index into the original text, then `k - offset` 172 | -- is an index into `remainder`. In other words, `offset` is the index into 173 | -- the original text where `remainder` starts. 174 | go :: CodeUnitIndex -> [Match] -> Text -> [Text] 175 | go !_offset [] remainder = [remainder] 176 | go !offset ((Match pos len replacement) : ms) remainder = 177 | let 178 | (prefix, suffix) = Utf8.unsafeCutUtf8 (pos - offset) len remainder 179 | in 180 | prefix : replacement : go (pos + len) ms suffix 181 | 182 | -- | Compute the length of the string resulting from applying the replacements. 
183 | replacementLength :: [Match] -> Text -> CodeUnitIndex 184 | replacementLength matches initial = go matches (Utf8.lengthUtf8 initial) 185 | where 186 | go [] !acc = acc 187 | go (Match _ matchLen repl : rest) !acc = go rest (acc - matchLen + Utf8.lengthUtf8 repl) 188 | 189 | -- | Given a list of matches sorted on start position, remove matches that start 190 | -- within an earlier match. 191 | removeOverlap :: [Match] -> [Match] 192 | removeOverlap matches = case matches of 193 | [] -> [] 194 | [m] -> [m] 195 | (m0@(Match pos0 len0 _) : m1@(Match pos1 _ _) : ms) -> 196 | if pos1 >= pos0 + len0 197 | then m0 : removeOverlap (m1:ms) 198 | else removeOverlap (m0:ms) 199 | 200 | run :: Replacer -> Text -> Text 201 | run replacer = fromJust . runWithLimit replacer maxBound 202 | 203 | {-# NOINLINE runWithLimit #-} 204 | runWithLimit :: Replacer -> CodeUnitIndex -> Text -> Maybe Text 205 | runWithLimit (Replacer searcher) maxLength = go initialThreshold 206 | where 207 | !automaton = Searcher.automaton searcher 208 | 209 | -- Priorities are 0 or lower, so an initial threshold of 1 keeps all 210 | -- matches. 211 | !initialThreshold = 1 212 | 213 | -- Needle priorities go from 0 for the highest priority to (-numNeedles + 1) 214 | -- for the lowest priority. That means that if we find a match with 215 | -- minPriority, we don't need to do another pass afterwards, because there 216 | -- are no remaining needles. 217 | !minPriority = 1 - Searcher.numNeedles searcher 218 | 219 | go :: Priority -> Text -> Maybe Text 220 | go !threshold haystack = 221 | let 222 | seed = (minBound :: Priority, []) 223 | matchesWithPriority = case Searcher.caseSensitivity searcher of 224 | CaseSensitive -> Aho.runText seed (prependMatch threshold haystack) automaton haystack 225 | IgnoreCase -> Aho.runLower seed (prependMatch threshold haystack) automaton haystack 226 | in 227 | case matchesWithPriority of 228 | -- No match at the given threshold, there is nothing left to do. 
229 | -- Return the input string unmodified. 230 | (_, []) -> Just haystack 231 | -- We found matches at priority p. Remove overlapping matches, then 232 | -- apply all replacements. Next, we need to go again, this time 233 | -- considering only needles with a lower priority than p. As an 234 | -- optimization (which matters mainly for the single needle case), 235 | -- if we find a match at the lowest priority, we don't need another 236 | -- pass. Note that if in `rawMatches` we find only matches of priority 237 | -- p > minPriority, then we do still need another pass, because the 238 | -- replacements could create new matches. 239 | (p, matches) 240 | | replacementLength matches haystack > maxLength -> Nothing 241 | | p == minPriority -> Just $ replace (removeOverlap $ sort matches) haystack 242 | | otherwise -> go p $ replace (removeOverlap $ sort matches) haystack 243 | 244 | -- When we iterate through all matches, keep track only of the matches with 245 | -- the highest priority: those are the ones that we will replace first. If we 246 | -- find multiple matches with that priority, remember all of them. If we find a 247 | -- match with lower priority, ignore it, because we already have a more 248 | -- important match. Also, if the priority is `threshold` or higher, ignore the 249 | -- match, so we can exclude matches if we already did a round of replacements 250 | -- for that priority. This way we don't have to build a new automaton after 251 | -- every round of replacements. 
252 | prependMatch 253 | :: Priority -> Text -> (Priority, [Match]) -> Aho.Match Payload -> Aho.Next (Priority, [Match]) 254 | {-# INLINE prependMatch #-} 255 | prependMatch !threshold haystack (!pBest, !matches) (Aho.Match pos (Payload pMatch lenb lenc replacement)) 256 | | pMatch < threshold && pMatch > pBest = 257 | Aho.Step (pMatch, [makeMatch haystack pos lenb lenc replacement]) 258 | | pMatch < threshold && pMatch == pBest = 259 | Aho.Step (pMatch, makeMatch haystack pos lenb lenc replacement : matches) 260 | | otherwise = Aho.Step (pBest, matches) 261 | 262 | -- Pos is the code unit index past the last code unit of the match, we have 263 | -- to find the first code unit. 264 | makeMatch :: Text -> CodeUnitIndex -> CodeUnitIndex -> Int -> Replacement -> Match 265 | {-# INLINE makeMatch #-} 266 | makeMatch = case Searcher.caseSensitivity searcher of 267 | -- Case sensitive: length is interpreted as number of bytes 268 | CaseSensitive -> \_ pos lenb _ replacement -> 269 | Match (pos - lenb) lenb replacement 270 | -- Case insensitive: length is interpreted as number of characters 271 | IgnoreCase -> \haystack pos _ lenc replacement -> 272 | -- We start at the last byte of the match, and look backwards. 273 | let start = Utf8.skipCodePointsBackwards haystack (pos-1) (lenc-1) in 274 | Match start (pos - start) replacement 275 | -------------------------------------------------------------------------------- /tests/Data/Text/AhoCorasickSpec.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2022 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 
6 | 7 | {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} 8 | 9 | {-# LANGUAGE BangPatterns #-} 10 | {-# LANGUAGE OverloadedStrings #-} 11 | {-# LANGUAGE ScopedTypeVariables #-} 12 | 13 | module Data.Text.AhoCorasickSpec where 14 | 15 | import Control.Monad (forM_) 16 | import Data.Foldable (foldl') 17 | import Data.List.NonEmpty (NonEmpty ((:|))) 18 | import Test.Hspec (Expectation, Spec, describe, it, shouldBe) 19 | import Test.Hspec.QuickCheck (modifyMaxSize, prop) 20 | import Test.QuickCheck (Arbitrary (arbitrary, shrink), forAll, forAllShrink) 21 | import Test.QuickCheck.Instances () 22 | 23 | import qualified Data.Text as T 24 | import qualified Test.QuickCheck.Gen as Gen 25 | 26 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 27 | import Data.Text.TestInstances () 28 | import Data.Text.Utf8 (Text) 29 | 30 | import qualified Data.Text.Utf8 as Text 31 | import qualified Data.Text.Utf8 as Utf8 32 | import qualified Data.Text.AhoCorasick.Automaton as Aho 33 | import qualified Data.Text.AhoCorasick.Replacer as Replacer 34 | import qualified Data.Text.AhoCorasick.Searcher as Searcher 35 | import qualified Data.Text.AhoCorasick.Splitter as Splitter 36 | 37 | spec :: Spec 38 | spec = do 39 | -- Ensure that helper functions are actually helping 40 | -- Examples are from https://en.wikipedia.org/wiki/UTF-8 41 | describe "IsString ByteArray" $ do 42 | 43 | it "encodes the dollar sign" $ utf8Test "$" [0x24] 44 | it "encodes the euro sign" $ utf8Test "€" [0xe2, 0x82, 0xac] 45 | it "encodes the pound sign" $ utf8Test "£" [0xc2, 0xa3] 46 | it "encodes Hwair" $ utf8Test "𐍈" [0xf0, 0x90, 0x8d, 0x88] 47 | it "encodes all of the above" $ utf8Test "$€£𐍈" [0x24, 0xe2, 0x82, 0xac, 0xc2, 0xa3, 0xf0, 0x90, 0x8d, 0x88] 48 | 49 | describe "runText" $ do 50 | 51 | describe "countMatches" $ do 52 | it "counts the right number of matches in a basic example" $ do 53 | countMatches Aho.CaseSensitive ["abc", "rst", "xyz"] "abcdefghijklmnopqrstuvwxyz" `shouldBe` 3 54 | 55 | it 
"counts the right number of matches in an example with 1-, 2-, 3- and 4-code unit code points" $ do 56 | countMatches Aho.CaseSensitive ["$", "£"] "$€£𐍈" `shouldBe` 2 57 | 58 | describe "runLower" $ do 59 | 60 | describe "countMatches" $ do 61 | it "counts the right number of matches in a basic example" $ do 62 | countMatches Aho.IgnoreCase ["abc", "rst", "xyz"] "abcdefghijklmnopqrstuvwxyz" `shouldBe` 3 63 | 64 | it "does not work with uppercase needles" $ do 65 | countMatches Aho.IgnoreCase ["ABC", "Rst", "xYZ"] "abcdefghijklmnopqrstuvwxyz" `shouldBe` 0 66 | 67 | it "works with characters that are not in ASCII" $ do 68 | countMatches Aho.IgnoreCase ["groß", "öffnung", "tür"] "Großfräsmaschinenöffnungstür" `shouldBe` 3 69 | countMatches Aho.IgnoreCase ["groß", "öffnung", "tür"] "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` 3 70 | 71 | modifyMaxSize (const 10) $ describe "Replacer" $ do 72 | 73 | describe "run" $ do 74 | let 75 | genHaystack = fmap Utf8.pack $ Gen.listOf $ Gen.frequency [(40, Gen.elements "abAB"), (1, pure 'İ'), (1, arbitrary)] 76 | 77 | -- needles may not be empty, because empty needles are filtered out in an I.ActionReplaceMultiple 78 | genNeedle = fmap Utf8.pack $ Gen.resize 3 $ Gen.listOf1 $ Gen.elements "abAB" 79 | genReplaces = Gen.listOf $ (,) <$> genNeedle <*> arbitrary 80 | shrinkReplaces = filter (not . any (\(needle, _) -> Utf8.null needle)) . shrink 81 | 82 | replace needles haystack = 83 | Replacer.run (Replacer.build Aho.CaseSensitive needles) haystack 84 | 85 | replaceIgnoreCase needles haystack = 86 | Replacer.run (Replacer.build Aho.IgnoreCase needles) haystack 87 | 88 | it "replaces all occurrences" $ do 89 | replace [("A", "B")] "AXAXB" `shouldBe` "BXBXB" 90 | replace [("A", "B"), ("X", "Y")] "AXAXB" `shouldBe` "BYBYB" 91 | replace [("aaa", ""), ("b", "c")] "aaabaaa" `shouldBe` "c" 92 | -- Have a few non-matching needles too. 
93 | replace [("A", "B"), ("Q", "r"), ("Z", "")] "AXAXB" `shouldBe` "BXBXB" 94 | 95 | it "replaces only non-overlapping matches" $ do 96 | replace [("aa", "zz"), ("bb", "w")] "aaabbb" `shouldBe` "zzawb" 97 | replace [("aaa", "")] "aaaaa" `shouldBe` "aa" 98 | 99 | it "replaces all occurrences in priority order" $ do 100 | replace [("A", ""), ("BBBB", "bingo")] "BBABB" `shouldBe` "bingo" 101 | replace [("BB", ""), ("BBBB", "bingo")] "BBBB" `shouldBe` "" 102 | 103 | it "replaces needles that contain a surrogate pair" $ 104 | replace [("\x1f574", "levitating man in business suit")] 105 | "the \x1f574" `shouldBe` "the levitating man in business suit" 106 | 107 | 108 | it "replaces all occurrences case-insensitively" $ do 109 | replaceIgnoreCase [("A", "B")] "AXAXB" `shouldBe` "BXBXB" 110 | replaceIgnoreCase [("A", "B")] "axaxb" `shouldBe` "BxBxb" 111 | replaceIgnoreCase [("a", "b")] "AXAXB" `shouldBe` "bXbXB" 112 | 113 | replaceIgnoreCase [("A", "B"), ("X", "Y")] "AXAXB" `shouldBe` "BYBYB" 114 | replaceIgnoreCase [("A", "B"), ("X", "Y")] "axaxb" `shouldBe` "BYBYb" 115 | replaceIgnoreCase [("a", "b"), ("x", "y")] "AXAXB" `shouldBe` "bybyB" 116 | 117 | it "matches replacements case-insensitively" $ 118 | replaceIgnoreCase [("foo", "BAR"), ("bar", "BAZ")] "Foo" `shouldBe` "BAZ" 119 | 120 | it "matches replacements case-insensitively for non-ascii characters" $ do 121 | replaceIgnoreCase [("éclair", "lightning")] "Éclair" `shouldBe` "lightning" 122 | -- Note: U+0319 is an uppercase alpha, which looks exactly like A, but it 123 | -- is a different code point. 
124 | replaceIgnoreCase [("å", "b")] "åÅÅ" `shouldBe` "bbb" 125 | replaceIgnoreCase [("k", "m")] "KkK" `shouldBe` "mmm" 126 | replaceIgnoreCase [("dz", "z")] "dzDzDZ" `shouldBe` "zzz" 127 | replaceIgnoreCase [("bèta", "α"), ("\x0391", "alpha")] "BÈTA" `shouldBe` "alpha" 128 | replaceIgnoreCase [("ßèta", "sseta")] "ßèta" `shouldBe` "sseta" 129 | replaceIgnoreCase [("ßèta", "sseta")] "ẞÈTA" `shouldBe` "sseta" 130 | 131 | it "matches surrogate pairs case-insensitively" $ do 132 | -- We can't lowercase a levivating man in business suit, but that should 133 | -- not affect whether we match it or not. 134 | replaceIgnoreCase [("\x1f574", "levitating man in business suit")] "the \x1f574" 135 | `shouldBe` "the levitating man in business suit" 136 | 137 | prop "satisfies (run . compose a b) == (run b (run a))" $ 138 | forAllShrink genHaystack shrink $ \haystack -> 139 | forAll arbitrary $ \case_ -> 140 | forAllShrink genReplaces shrinkReplaces $ \replaces1 -> 141 | forAllShrink genReplaces shrinkReplaces $ \replaces2 -> 142 | let 143 | rm1 = Replacer.build case_ replaces1 144 | rm2 = Replacer.build case_ replaces2 145 | Just rm12 = Replacer.compose rm1 rm2 146 | in 147 | Replacer.run rm2 (Replacer.run rm1 haystack) 148 | `shouldBe` Replacer.run rm12 haystack 149 | 150 | prop "is identity for empty needles" $ \case_ haystack -> 151 | let replacerId = Replacer.build case_ [] 152 | in Replacer.run replacerId haystack `shouldBe` haystack 153 | 154 | prop "is equivalent to sequential Text.replace calls" $ 155 | forAllShrink genHaystack shrink $ \haystack -> 156 | forAllShrink genReplaces shrinkReplaces $ \replaces -> 157 | let 158 | replacer = Replacer.build Aho.CaseSensitive replaces 159 | -- TODO: Remove conversions once we move to text-2.0 160 | replaceText agg (needle, replacement) = Utf8.pack $ T.unpack $ T.replace (T.pack $ Utf8.unpack needle) (T.pack $ Utf8.unpack replacement) (T.pack $ Utf8.unpack agg) 161 | expected = foldl' replaceText haystack replaces 162 | in 163 | 
Replacer.run replacer haystack `shouldBe` expected 164 | 165 | describe "Searcher" $ do 166 | 167 | describe "containsAny" $ do 168 | 169 | it "gives the right values for the examples in the README" $ do 170 | let needles = ["tshirt", "shirts", "shorts"] 171 | let searcher = Searcher.build Aho.CaseSensitive needles 172 | 173 | Searcher.containsAny searcher "short tshirts" `shouldBe` True 174 | Searcher.containsAny searcher "long shirt" `shouldBe` False 175 | Searcher.containsAny searcher "Short TSHIRTS" `shouldBe` False 176 | 177 | let searcher' = Searcher.build Aho.IgnoreCase needles 178 | 179 | Searcher.containsAny searcher' "Short TSHIRTS" `shouldBe` True 180 | 181 | it "works with the the first line of the illiad" $ do 182 | let illiad = "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ" 183 | needleSets = [(["μοι"], True), (["Ὀδυσεύς"], False)] 184 | 185 | forM_ needleSets $ \(needles, expectedResult) -> do 186 | let searcher = Searcher.build Aho.CaseSensitive needles 187 | Searcher.containsAny searcher illiad `shouldBe` expectedResult 188 | 189 | it "works with the the first line of the illiad (ignore case)" $ do 190 | let illiad = "ἌΝΔΡΑ ΜΟΙ ἜΝΝΕΠΕ, ΜΟΥ͂ΣΑ, ΠΟΛΎΤΡΟΠΟΝ, ὋΣ ΜΆΛΑ ΠΟΛΛᾺ" 191 | searcher = Searcher.build Aho.IgnoreCase ["μοι"] 192 | Searcher.containsAny searcher illiad `shouldBe` True 193 | 194 | describe "containsAll" $ do 195 | 196 | prop "never reports true for empty needles" $ \ (haystack :: Text) -> 197 | let 198 | searcher = Searcher.buildNeedleIdSearcher CaseSensitive [""] 199 | in 200 | Searcher.containsAll searcher haystack `shouldBe` False 201 | 202 | prop "is equivalent to sequential Text.isInfixOf calls for non-empty needles" $ \ (needles' :: [NonEmptyText]) (haystack :: Text) -> 203 | let 204 | needles = map unNonEmptyText needles' 205 | searcher = Searcher.buildNeedleIdSearcher CaseSensitive needles 206 | in 207 | Searcher.containsAll searcher haystack `shouldBe` all (`Text.isInfixOf` haystack) needles 208 | 209 | prop "is 
equivalent to sequential Text.isInfixOf calls for case-insensitive matching for non-empty needles" $ \ (needles' :: [NonEmptyText]) (haystack :: Text) -> 210 | let 211 | needles = map unNonEmptyText needles' 212 | 213 | lowerNeedles = map Utf8.lowerUtf8 needles 214 | lowerHaystack = Utf8.lowerUtf8 haystack 215 | 216 | searcher = Searcher.buildNeedleIdSearcher IgnoreCase lowerNeedles 217 | in 218 | Searcher.containsAll searcher haystack `shouldBe` all (`Text.isInfixOf` lowerHaystack) lowerNeedles 219 | 220 | describe "Splitter" $ do 221 | 222 | describe "split" $ do 223 | 224 | it "passes an example" $ do 225 | let separator = "bob" 226 | splitter = Splitter.build separator 227 | 228 | Splitter.split splitter "C++bobobCOBOLbobScala" `shouldBe` "C++" :| ["obCOBOL", "Scala"] 229 | Splitter.splitIgnoreCase splitter "C++bobobCOBOLbobScala" `shouldBe` "C++" :| ["obCOBOL", "Scala"] 230 | Splitter.splitIgnoreCase splitter "C++BOBOBCOBOLBOBSCALA" `shouldBe` "C++" :| ["OBCOBOL", "SCALA"] 231 | 232 | it "neatly splits the first line of the illiad" $ do 233 | let splitter = Splitter.build ", " 234 | 235 | Splitter.split splitter "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ" `shouldBe` 236 | "Ἄνδρα μοι ἔννεπε" :| ["Μοῦσα", "πολύτροπον", "ὃς μάλα πολλὰ"] 237 | Splitter.splitIgnoreCase splitter "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ" `shouldBe` 238 | "Ἄνδρα μοι ἔννεπε" :| ["Μοῦσα", "πολύτροπον", "ὃς μάλα πολλὰ"] 239 | 240 | it "splits on case insensitive needles" $ do 241 | -- The case variations of å have different byte lengths 242 | let splitter = Splitter.build "å" 243 | Splitter.splitIgnoreCase splitter "aaåbbÅccÅdd" `shouldBe` 244 | "aa" :| ["bb", "cc", "dd"] 245 | 246 | -- helpers 247 | 248 | utf8Test :: Utf8.Text -> [Utf8.CodeUnit] -> Expectation 249 | utf8Test str byteList = str `shouldBe` Utf8.fromByteList byteList 250 | 251 | -- From ./benchmark 252 | countMatches :: Aho.CaseSensitivity -> [Utf8.Text] -> Utf8.Text -> Int 253 | {-# NOINLINE 
countMatches #-} 254 | countMatches caseSensitivity needles haystack = case needles of 255 | [] -> 0 256 | _ -> 257 | let 258 | ac = Aho.build $ zip needles (repeat ()) 259 | onMatch !n _match = Aho.Step (n + 1) 260 | in 261 | Aho.runWithCase caseSensitivity 0 onMatch ac haystack 262 | 263 | -- | A newtype for generating non-empty 'Text' values. 264 | newtype NonEmptyText = NonEmptyText { unNonEmptyText :: Text } 265 | 266 | -- | Simply generates and packs non-empty @[Char]@ values. 267 | instance Arbitrary NonEmptyText where 268 | arbitrary = NonEmptyText . Text.pack <$> Gen.listOf1 arbitrary 269 | 270 | instance Show NonEmptyText where 271 | show = show . unNonEmptyText 272 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMoore/Automaton.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE CPP #-} 9 | {-# LANGUAGE DeriveAnyClass #-} 10 | {-# LANGUAGE DeriveGeneric #-} 11 | {-# LANGUAGE DerivingStrategies #-} 12 | {-# LANGUAGE ScopedTypeVariables #-} 13 | 14 | -- | An efficient implementation of the Boyer-Moore string search algorithm. 15 | -- http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140 16 | -- https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm 17 | -- 18 | -- This module contains a almost 1:1 translation from the C example code in the 19 | -- wikipedia article. 20 | -- 21 | -- The algorithm here can be potentially improved by including the Galil rule 22 | -- (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_Galil_rule) 23 | module Data.Text.BoyerMoore.Automaton 24 | ( Automaton 25 | , CaseSensitivity (..) 26 | , CodeUnitIndex (..) 27 | , Next (..) 
28 | , buildAutomaton 29 | , patternLength 30 | , patternText 31 | , runText 32 | ) where 33 | 34 | import Prelude hiding (length) 35 | 36 | import Control.DeepSeq (NFData) 37 | import Control.Monad (when) 38 | import Control.Monad.ST (runST) 39 | import Data.Hashable (Hashable (..)) 40 | import Data.Primitive.Extended 41 | ( Prim 42 | , PrimArray 43 | , indexPrimArray 44 | , newPrimArray 45 | , replicateMutablePrimArray 46 | , unsafeFreezePrimArray 47 | , writePrimArray 48 | ) 49 | import GHC.Generics (Generic) 50 | 51 | #if defined(HAS_AESON) 52 | import qualified Data.Aeson as AE 53 | #endif 54 | 55 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 56 | import Data.Text.Utf8 (CodeUnit, CodeUnitIndex (..), Text) 57 | 58 | import qualified Data.Text.Utf8 as Utf8 59 | 60 | data Next a 61 | = Done !a 62 | | Step !a 63 | 64 | -- | A Boyer-Moore automaton is based on lookup-tables that allow skipping through the haystack. 65 | -- This allows for sub-linear matching in some cases, as we do not have to look at every input 66 | -- character. 67 | -- 68 | -- NOTE: Unlike the AcMachine, a Boyer-Moore automaton only returns non-overlapping matches. 69 | -- This means that a Boyer-Moore automaton is not a 100% drop-in replacement for Aho-Corasick. 70 | -- 71 | -- Returning overlapping matches would degrade the performance to /O(nm)/ in pathological cases like 72 | -- finding @aaaa@ in @aaaaa....aaaaaa@ as for each match it would scan back the whole /m/ characters 73 | -- of the pattern. 74 | data Automaton = Automaton 75 | { automatonPattern :: !Text 76 | , automatonPatternHash :: !Int -- ^ Remember our own hash (similar to what 'Hashed' does but our 77 | -- fields are strict). 
78 | , automatonSuffixTable :: !SuffixTable 79 | , automatonBadCharTable :: !BadCharTable 80 | } 81 | deriving stock (Generic, Show) 82 | deriving anyclass (NFData) 83 | 84 | instance Hashable Automaton where 85 | hashWithSalt salt (Automaton _ patternHash _ _) = hashWithSalt salt patternHash 86 | 87 | instance Eq Automaton where 88 | (Automaton pat1 patHash1 _ _) == (Automaton pat2 patHash2 _ _) = 89 | patHash1 == patHash2 && pat1 == pat2 90 | 91 | #if defined(HAS_AESON) 92 | instance AE.FromJSON Automaton where 93 | parseJSON v = buildAutomaton <$> AE.parseJSON v 94 | 95 | instance AE.ToJSON Automaton where 96 | toJSON = AE.toJSON . automatonPattern 97 | #endif 98 | 99 | buildAutomaton :: Text -> Automaton 100 | buildAutomaton pattern = 101 | Automaton pattern (hash pattern) (buildSuffixTable pattern) (buildBadCharTable pattern) 102 | 103 | -- | Finds all matches in the text, calling the match callback with the *first* 104 | -- matched character of each match of the pattern. 105 | -- 106 | -- NOTE: This is unlike Aho-Corasick, which reports the index of the character 107 | -- right after a match. 108 | -- 109 | -- NOTE: In the UTF-16 version of this module, there is a function 'Data.Text.BoyerMoore.Automaton.runLower' 110 | -- which does lower-case matching. This function does not exist for the UTF-8 version since it is very 111 | -- tricky to skip code points going backwards without preprocessing the whole input first. 112 | -- 113 | -- NOTE: To get full advantage of inlining this function, you probably want to 114 | -- compile the compiling module with -fllvm and the same optimization flags as 115 | -- this module. 116 | runText :: forall a 117 | . 
a 118 | -> (a -> CodeUnitIndex -> Next a) 119 | -> Automaton 120 | -> Text 121 | -> a 122 | {-# INLINE runText #-} 123 | runText seed f automaton text 124 | | patLen == 0 = seed 125 | | otherwise = go seed (patLen - 1) 126 | where 127 | -- Use needle as identifier since pattern is potentially a keyword 128 | Automaton needle _ suffixTable badCharTable = automaton 129 | patLen = Utf8.lengthUtf8 needle 130 | stringLen = Utf8.lengthUtf8 text 131 | 132 | codeUnitAt = Utf8.unsafeIndexCodeUnit text 133 | 134 | {-# INLINE go #-} 135 | go result haystackIndex 136 | | haystackIndex < stringLen = matchLoop result haystackIndex (patLen - 1) 137 | | otherwise = result 138 | 139 | -- Compare the needle back-to-front with the haystack 140 | matchLoop result haystackIndex needleIndex 141 | | needleIndex >= 0 && codeUnitAt haystackIndex == Utf8.unsafeIndexCodeUnit needle needleIndex = 142 | -- Characters match, try the pair before 143 | matchLoop result (haystackIndex - 1) (needleIndex - 1) 144 | -- We found a match (all needle characters matched) 145 | | needleIndex < 0 = 146 | case f result (haystackIndex + 1) of 147 | Done final -> final 148 | -- `haystackIndex` now points to the character just before the match starts 149 | -- Adding `patLen` once points to the last character of the match, 150 | -- Adding `patLen` once more points to the earliest character where 151 | -- we can find a non-overlapping match. 152 | Step intermediate -> go intermediate (haystackIndex + 2 * patLen) 153 | -- We know it's not a match, the characters differ at the current position 154 | | otherwise = 155 | let 156 | -- The bad character table tells us how far we can advance to the right so that the 157 | -- character at the current position in the input string, where matching failed, 158 | -- is lined up with it's rightmost occurrence in the needle. 159 | -- Note: we could end up left of were we started, essentially never making progress, 160 | -- if we were to use this rule alone. 
161 | badCharSkip = badCharLookup badCharTable (codeUnitAt haystackIndex) 162 | suffixSkip = suffixLookup suffixTable needleIndex 163 | skip = max badCharSkip suffixSkip 164 | in 165 | go result (haystackIndex + skip) 166 | 167 | -- | Length of the matched pattern measured in UTF-8 code units (bytes). 168 | patternLength :: Automaton -> CodeUnitIndex 169 | patternLength = Utf8.lengthUtf8 . patternText 170 | 171 | -- | Return the pattern that was used to construct the automaton. 172 | patternText :: Automaton -> Text 173 | patternText = automatonPattern 174 | 175 | -- | The suffix table tells us for each character of the pattern how many characters we can 176 | -- jump ahead if the match fails at that point. 177 | newtype SuffixTable = SuffixTable (PrimArray Int) 178 | deriving stock (Generic, Show) 179 | deriving anyclass (NFData) 180 | 181 | -- | Lookup an entry in the suffix table. 182 | suffixLookup :: SuffixTable -> CodeUnitIndex -> CodeUnitIndex 183 | {-# INLINE suffixLookup #-} 184 | suffixLookup (SuffixTable table) = CodeUnitIndex . indexTable table . codeUnitIndex 185 | 186 | buildSuffixTable :: Text -> SuffixTable 187 | buildSuffixTable pattern = runST $ do 188 | let patLen = Utf8.lengthUtf8 pattern 189 | 190 | table <- newPrimArray $ codeUnitIndex patLen 191 | 192 | let 193 | -- Case 1: For each position of the pattern we record the shift that would align the pattern so 194 | -- that it starts at the longest suffix that is at the same time a prefix, if a mismatch would 195 | -- happen at that position. 196 | -- 197 | -- Suppose the length of the pattern is n, a mismatch occurs at position i in the pattern and j 198 | -- in the haystack, then we know that pattern[i+1..n] == haystack[j+1..j+n-i]. That is, we know 199 | -- that the part of the haystack that we already matched is a suffix of the pattern. 
200 | -- If the pattern happens to have a prefix that is equal to or a shorter suffix of that matched 201 | -- suffix, we can shift the pattern to the right so that the pattern starts at the longest 202 | -- suffix that we have seen that coincides with a prefix of the pattern. 203 | -- 204 | -- Consider the pattern `ababa`. Then we get 205 | -- 206 | -- p: 0 1 2 3 4 207 | -- Pattern: a b a b a 208 | -- lastPrefixIndex: 2 2 4 4 5 209 | -- table: 6 5 6 5 5 210 | init1 lastPrefixIndex p 211 | | p >= 0 = do 212 | let 213 | prefixIndex 214 | | isPrefix pattern (p + 1) = p + 1 215 | | otherwise = lastPrefixIndex 216 | writePrimArray table (codeUnitIndex p) (codeUnitIndex $ prefixIndex + patLen - 1 - p) 217 | init1 prefixIndex (p - 1) 218 | | otherwise = pure () 219 | 220 | -- Case 2: We also have to account for the fact that the matching suffix of the pattern might 221 | -- occur again somewhere within the pattern. In that case, we may not shift as far as if it was 222 | -- a prefix. That is why the `init2` loop is run after `init1`, potentially overwriting some 223 | -- entries with smaller shifts. 224 | init2 p 225 | | p < patLen - 1 = do 226 | let 227 | suffixLen = suffixLength pattern p 228 | when (Utf8.unsafeIndexCodeUnit pattern (p - suffixLen) /= Utf8.unsafeIndexCodeUnit pattern (patLen - 1 - suffixLen)) $ 229 | writePrimArray table (codeUnitIndex $ patLen - 1 - suffixLen) (codeUnitIndex $ patLen - 1 - p + suffixLen) 230 | init2 (p + 1) 231 | | otherwise = pure () 232 | 233 | init1 (patLen - 1) (patLen - 1) 234 | init2 0 235 | 236 | SuffixTable <$> unsafeFreezePrimArray table 237 | 238 | 239 | -- | The bad char table tells us how far we may skip ahead when encountering a certain character 240 | -- in the input string. For example, if there's a character that is not contained in the pattern at 241 | -- all, we can skip ahead until after that character. 
data BadCharTable = BadCharTable
  { badCharTableEntries :: {-# UNPACK #-} !(PrimArray Int)
    -- ^ Skip distances, indexed by code unit. The element type should be
    -- CodeUnitIndex, but there's no unboxed vector for that type, and defining
    -- it would be a lot of boilerplate.
  , badCharTablePatternLen :: CodeUnitIndex
    -- ^ Length of the pattern this table was built from.
  }
  deriving stock (Generic, Show)
  deriving anyclass (NFData)

-- | Number of entries in the fixed-size lookup-table of the bad char table:
-- one entry for every possible byte value.
badcharTableSize :: Int
{-# INLINE badcharTableSize #-}
badcharTableSize = 256

-- | Lookup an entry in the bad char table.
badCharLookup :: BadCharTable -> CodeUnit -> CodeUnitIndex
{-# INLINE badCharLookup #-}
badCharLookup (BadCharTable entries _patLen) cu =
  CodeUnitIndex $ indexTable entries (fromIntegral cu)

-- | True if the suffix of the @pattern@ starting from @pos@ is a prefix of the pattern
-- For example, @isPrefix \"aabbaa\" 4 == True@.
isPrefix :: Text -> CodeUnitIndex -> Bool
isPrefix needle pos = loop 0
  where
    -- Number of code units in the suffix that starts at @pos@.
    numSuffixUnits = Utf8.lengthUtf8 needle - pos
    -- Walk the suffix and the start of the needle in lockstep, bailing out on
    -- the first mismatching code unit.
    loop !i
      | i >= numSuffixUnits = True
      | Utf8.unsafeIndexCodeUnit needle i /= Utf8.unsafeIndexCodeUnit needle (pos + i) = False
      | otherwise = loop (i + 1)

-- | Length of the longest suffix of the pattern ending on @pos@.
-- For example, @suffixLength \"abaacbbaac\" 4 == 4@, because the substring \"baac\" ends at position
-- 4 and is at the same time the longest suffix that does so, having length 4.
280 | suffixLength :: Text -> CodeUnitIndex -> CodeUnitIndex 281 | suffixLength pattern pos = go 0 282 | where 283 | patLen = Utf8.lengthUtf8 pattern 284 | go i 285 | | Utf8.unsafeIndexCodeUnit pattern (pos - i) == Utf8.unsafeIndexCodeUnit pattern (patLen - 1 - i) && i < pos = go (i + 1) 286 | | otherwise = i 287 | 288 | buildBadCharTable :: Text -> BadCharTable 289 | buildBadCharTable pattern = runST $ do 290 | let patLen = Utf8.lengthUtf8 pattern 291 | 292 | -- Initialize table with the maximum skip distance, which is the length of the pattern. 293 | -- This applies to all characters that are not part of the pattern. 294 | asciiTable <- replicateMutablePrimArray badcharTableSize $ codeUnitIndex patLen 295 | 296 | let 297 | -- Fill the bad character table based on the rightmost occurrence of a character in the pattern. 298 | -- Note that there is also a variant of Boyer-Moore that records all positions (see Wikipedia, 299 | -- but that requires even more storage space). 300 | -- Also note that we exclude the last character of the pattern when building the table. 301 | -- This is because 302 | -- 303 | -- 1. If the last character does not occur anywhere else in the pattern and we encounter it 304 | -- during a mismatch, we can advance the pattern to just after that character: 305 | -- 306 | -- Haystack: aaadcdabcdbb 307 | -- Pattern: abcd 308 | -- 309 | -- In the above example, we would match `d` and `c`, but then fail because `d` != `b`. 310 | -- Since `d` only occurs at the very last position of the pattern, we can shift to 311 | -- 312 | -- Haystack: aaadcdabcdbb 313 | -- Pattern: abcd 314 | -- 315 | -- 2. 
If it does occur anywhere else in the pattern, we can only shift as far as it's necessary 316 | -- to align it with the haystack: 317 | -- 318 | -- Haystack: aaadddabcdbb 319 | -- Pattern: adcd 320 | -- 321 | -- We match `d`, and then there is a mismatch `d` != `c`, which allows us to shift only up to: 322 | 323 | -- Haystack: aaadddabcdbb 324 | -- Pattern: adcd 325 | fillTable !i 326 | -- for(i = 0; i < patLen - 1; i++) { 327 | | i < patLen - 1 = do 328 | let patChar = Utf8.unsafeIndexCodeUnit pattern i 329 | writePrimArray asciiTable (fromIntegral patChar) (codeUnitIndex $ patLen - 1 - i) 330 | fillTable (i + 1) 331 | | otherwise = pure () 332 | 333 | fillTable 0 334 | 335 | asciiTableFrozen <- unsafeFreezePrimArray asciiTable 336 | 337 | pure BadCharTable 338 | { badCharTableEntries = asciiTableFrozen 339 | , badCharTablePatternLen = patLen 340 | } 341 | 342 | 343 | -- Helper functions for easily toggling the safety of this module 344 | 345 | -- | Read from a lookup table at the specified index. 346 | indexTable :: Prim a => PrimArray a -> Int -> a 347 | {-# INLINE indexTable #-} 348 | indexTable = indexPrimArray 349 | -------------------------------------------------------------------------------- /src/Data/Text/Utf8.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2022 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE BinaryLiterals #-} 9 | {-# LANGUAGE CPP #-} 10 | {-# LANGUAGE DeriveGeneric #-} 11 | {-# LANGUAGE DerivingStrategies #-} 12 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 13 | {-# LANGUAGE MagicHash #-} 14 | {-# LANGUAGE NumericUnderscores #-} 15 | {-# LANGUAGE UnboxedTuples #-} 16 | 17 | -- | This module provides functions that allow treating 'Text' values as series of UTF-8 code units 18 | -- instead of characters. 
Any calls to 'Text' in @alfred-margaret@ go through this module. 19 | -- Therefore we re-export some 'Text' functions, e.g. 'Text.concat'. 20 | module Data.Text.Utf8 21 | ( CodePoint 22 | , CodeUnit 23 | , CodeUnitIndex (..) 24 | , Text (..) 25 | , fromByteList 26 | , isCaseInvariant 27 | , lengthUtf8 28 | , lowerCodePoint 29 | , unlowerCodePoint 30 | , lowerUtf8 31 | , toLowerAscii 32 | , unicode2utf8 33 | , unpackUtf8 34 | -- * Decoding 35 | -- 36 | -- $decoding 37 | , decode2 38 | , decode3 39 | , decode4 40 | , decodeUtf8 41 | -- * Indexing 42 | -- 43 | -- $indexing 44 | , indexCodeUnit 45 | , unsafeIndexCodePoint 46 | , unsafeIndexCodeUnit 47 | , skipCodePointsBackwards 48 | -- * Slicing Functions 49 | -- 50 | -- $slicingFunctions 51 | , unsafeCutUtf8 52 | , unsafeSliceUtf8 53 | -- * Functions on Arrays 54 | -- 55 | -- $functionsOnArrays 56 | , arrayContents 57 | , isArrayPinned 58 | , unsafeIndexCodePoint' 59 | , unsafeIndexCodeUnit' 60 | , BackwardsIter (..) 61 | , unsafeIndexEndOfCodePoint' 62 | , unsafeIndexAnywhereInCodePoint' 63 | 64 | -- * General Functions 65 | -- 66 | -- $generalFunctions 67 | , Text.concat 68 | , Text.dropWhile 69 | , Text.isInfixOf 70 | , Text.null 71 | , Text.pack 72 | , Text.replicate 73 | , Text.unpack 74 | , TextSearch.indices 75 | ) where 76 | 77 | import Control.DeepSeq (NFData) 78 | import Data.Bits (Bits (shiftL), shiftR, (.&.), (.|.)) 79 | import Data.Hashable (Hashable) 80 | import Data.Text.Internal (Text (..)) 81 | import Data.Word (Word8) 82 | import GHC.Generics (Generic) 83 | import Data.Primitive (ByteArray (ByteArray), Prim, byteArrayFromList) 84 | #if defined(HAS_AESON) 85 | import Data.Aeson (FromJSON, ToJSON) 86 | #endif 87 | import Data.Text.Utf8.Unlower (unlowerCodePoint) 88 | 89 | import qualified Data.Char as Char 90 | import qualified Data.Text as Text 91 | import qualified Data.Text.Array as TextArray 92 | import qualified Data.Text.Internal.Search as TextSearch 93 | import qualified Data.Text.Unsafe as 
TextUnsafe 94 | import qualified GHC.Exts as Exts 95 | 96 | -- | A UTF-8 code unit is a byte. A Unicode code point can be encoded as up to four code units. 97 | type CodeUnit = Word8 98 | 99 | -- | A Unicode code point. 100 | type CodePoint = Char 101 | 102 | -- | An index into the raw UTF-8 data of a `Text`. This is not the code point 103 | -- index as conventionally accepted by `Text`, so we wrap it to avoid confusing 104 | -- the two. Incorrect index manipulation can lead to surrogate pairs being 105 | -- sliced, so manipulate indices with care. This type is also used for lengths. 106 | newtype CodeUnitIndex = CodeUnitIndex 107 | { codeUnitIndex :: Int 108 | } 109 | deriving stock (Eq, Ord, Generic, Bounded) 110 | #if defined(HAS_AESON) 111 | deriving newtype (Show, Prim, Hashable, Num, NFData, FromJSON, ToJSON) 112 | #else 113 | deriving newtype (Show, Prim, Hashable, Num, NFData) 114 | #endif 115 | 116 | {-# INLINABLE unpackUtf8 #-} 117 | unpackUtf8 :: Text -> [CodeUnit] 118 | unpackUtf8 (Text u8data offset len) = 119 | let 120 | go _ 0 = [] 121 | go i n = unsafeIndexCodeUnit' u8data (CodeUnitIndex i) : go (i + 1) (n - 1) 122 | in 123 | go offset len 124 | 125 | -- | The return value of this function is not really an index. 126 | -- However the signature is supposed to make it clear that the length is returned in terms of code units, not code points. 127 | lengthUtf8 :: Text -> CodeUnitIndex 128 | lengthUtf8 (Text _ _ !len) = CodeUnitIndex len 129 | 130 | -- | Lower-case the ASCII code points A-Z and leave the rest of ASCII intact. 131 | {-# INLINE toLowerAscii #-} 132 | toLowerAscii :: Char -> Char 133 | toLowerAscii cp 134 | | Char.isAsciiUpper cp = Char.chr (Char.ord cp + 0x20) 135 | | otherwise = cp 136 | 137 | -- | Lowercase a 'Text' by applying 'lowerCodePoint' to each 'Char'. 
138 | {-# INLINE lowerUtf8 #-} 139 | lowerUtf8 :: Text -> Text 140 | lowerUtf8 = Text.map lowerCodePoint 141 | 142 | asciiCount :: Int 143 | asciiCount = 128 144 | 145 | {-# INLINE lowerCodePoint #-} 146 | -- | Lower-Case a UTF-8 codepoint. 147 | -- Uses 'toLowerAscii' for ASCII and 'Char.toLower' otherwise. 148 | lowerCodePoint :: Char -> Char 149 | lowerCodePoint cp 150 | | Char.ord cp < asciiCount = toLowerAscii cp 151 | | otherwise = Char.toLower cp 152 | 153 | -- | Convert a Unicode Code Point 'c' into a list of UTF-8 code units (bytes). 154 | unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a] 155 | {-# INLINE unicode2utf8 #-} 156 | unicode2utf8 c 157 | | c < 0x80 = [c] 158 | | c < 0x800 = [0xc0 .|. (c `shiftR` 6), 0x80 .|. (0x3f .&. c)] 159 | | c < 0x10000 = [0xe0 .|. (c `shiftR` 12), 0x80 .|. (0x3f .&. (c `shiftR` 6)), 0x80 .|. (0x3f .&. c)] 160 | | otherwise = [0xf0 .|. (c `shiftR` 18), 0x80 .|. (0x3f .&. (c `shiftR` 12)), 0x80 .|. (0x3f .&. (c `shiftR` 6)), 0x80 .|. (0x3f .&. c)] 161 | 162 | fromByteList :: [Word8] -> Text 163 | fromByteList byteList = Text (TextArray.ByteArray ba#) 0 (length byteList) 164 | where !(ByteArray ba#) = byteArrayFromList byteList 165 | 166 | -- | Return whether text has exactly one case variation, such that this function 167 | -- will not return true when Aho–Corasick would differentiate when doing 168 | -- case-insensitive matching. 169 | {-# INLINE isCaseInvariant #-} 170 | isCaseInvariant :: Text -> Bool 171 | isCaseInvariant = Text.all (\c -> unlowerCodePoint (lowerCodePoint c) == [c]) 172 | 173 | -- $decoding 174 | -- 175 | -- Functions that turns code unit sequences into code point sequences. 176 | 177 | -- | Decode a single UTF-8 code unit into its code point. 
178 | -- The given code unit should have the following format: 179 | -- 180 | -- > ┌───────────────┐ 181 | -- > │0 x x x x x x x│ 182 | -- > └───────────────┘ 183 | decode1 :: CodeUnit -> CodePoint 184 | decode1 cu0 = 185 | Char.chr $ fromIntegral cu0 186 | 187 | -- | Decode 2 UTF-8 code units into their code point. 188 | -- The given code units should have the following format: 189 | -- 190 | -- > ┌───────────────┬───────────────┐ 191 | -- > │1 1 0 x x x x x│1 0 x x x x x x│ 192 | -- > └───────────────┴───────────────┘ 193 | {-# INLINE decode2 #-} 194 | decode2 :: CodeUnit -> CodeUnit -> CodePoint 195 | decode2 cu0 cu1 = 196 | Char.chr $ (fromIntegral cu0 .&. 0x1f) `shiftL` 6 .|. fromIntegral cu1 .&. 0x3f 197 | 198 | -- | Decode 3 UTF-8 code units into their code point. 199 | -- The given code units should have the following format: 200 | -- 201 | -- > ┌───────────────┬───────────────┬───────────────┐ 202 | -- > │1 1 1 0 x x x x│1 0 x x x x x x│1 0 x x x x x x│ 203 | -- > └───────────────┴───────────────┴───────────────┘ 204 | {-# INLINE decode3 #-} 205 | decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint 206 | decode3 cu0 cu1 cu2 = 207 | Char.chr $ (fromIntegral cu0 .&. 0xf) `shiftL` 12 .|. (fromIntegral cu1 .&. 0x3f) `shiftL` 6 .|. (fromIntegral cu2 .&. 0x3f) 208 | 209 | -- | Decode 4 UTF-8 code units into their code point. 210 | -- The given code units should have the following format: 211 | -- 212 | -- > ┌───────────────┬───────────────┬───────────────┬───────────────┐ 213 | -- > │1 1 1 1 0 x x x│1 0 x x x x x x│1 0 x x x x x x│1 0 x x x x x x│ 214 | -- > └───────────────┴───────────────┴───────────────┴───────────────┘ 215 | {-# INLINE decode4 #-} 216 | decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint 217 | decode4 cu0 cu1 cu2 cu3 = 218 | Char.chr $ (fromIntegral cu0 .&. 0x7) `shiftL` 18 .|. (fromIntegral cu1 .&. 0x3f) `shiftL` 12 .|. (fromIntegral cu2 .&. 0x3f) `shiftL` 6 .|. (fromIntegral cu3 .&. 
0x3f) 219 | 220 | -- | Decode a list of UTF-8 code units into a list of code points. 221 | decodeUtf8 :: [CodeUnit] -> [CodePoint] 222 | decodeUtf8 [] = [] 223 | decodeUtf8 (cu0 : cus) | cu0 < 0xc0 = Char.chr (fromIntegral cu0) : decodeUtf8 cus 224 | decodeUtf8 (cu0 : cu1 : cus) | cu0 < 0xe0 = decode2 cu0 cu1 : decodeUtf8 cus 225 | decodeUtf8 (cu0 : cu1 : cu2 : cus) | cu0 < 0xf0 = decode3 cu0 cu1 cu2 : decodeUtf8 cus 226 | decodeUtf8 (cu0 : cu1 : cu2 : cu3 : cus) | cu0 < 0xf8 = decode4 cu0 cu1 cu2 cu3 : decodeUtf8 cus 227 | decodeUtf8 cus = error $ "Invalid UTF-8 input sequence at " ++ show (take 4 cus) 228 | 229 | -- $indexing 230 | -- 231 | -- 'Text' can be indexed by code units or code points. 232 | -- A 'CodePoint' is a 21-bit Unicode code point and can consist of up to four code units. 233 | -- A 'CodeUnit' is a single byte. 234 | 235 | -- | Does exactly the same thing as 'unsafeIndexCodePoint'', but on 'Text' values. 236 | {-# INLINE unsafeIndexCodePoint #-} 237 | unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) 238 | unsafeIndexCodePoint (Text !u8data !off !_len) !index = 239 | unsafeIndexCodePoint' u8data $ CodeUnitIndex off + index 240 | 241 | -- | Get the code unit at the given 'CodeUnitIndex'. 242 | -- Performs bounds checking. 243 | {-# INLINE indexCodeUnit #-} 244 | indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit 245 | indexCodeUnit !text !index 246 | | index < 0 || index >= lengthUtf8 text = error $ "Index out of bounds " ++ show index 247 | | otherwise = unsafeIndexCodeUnit text index 248 | 249 | {-# INLINE unsafeIndexCodeUnit #-} 250 | unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit 251 | unsafeIndexCodeUnit (Text !u8data !off !_len) !index = 252 | unsafeIndexCodeUnit' u8data $ CodeUnitIndex off + index 253 | 254 | -- | Scan backwards through the text until we've seen the specified number of codepoints. Assumes 255 | -- that the initial CodeUnitIndex is within a codepoint. 
256 | {-# INLINE skipCodePointsBackwards #-} 257 | skipCodePointsBackwards :: Text -> CodeUnitIndex -> Int -> CodeUnitIndex 258 | skipCodePointsBackwards (Text !u8data !off !len) !index0 !n0 259 | | index0 >= CodeUnitIndex len = error "Invalid use of skipCodePointsBackwards" 260 | | otherwise = loop (index0 + CodeUnitIndex off) n0 261 | where 262 | loop index n | atTrailingByte index = 263 | loop (index-1) n -- Don't exit before we're at a leading byte 264 | loop index 0 | index < 0 = 265 | -- Throw an error if we've read before the array (e.g. when the data was 266 | -- not valid UTF-8), this one-time check doesn't prevent undefined 267 | -- behaviour but may help you locate bugs. 268 | error "Invalid use of skipCodePointsBackwards" 269 | loop index 0 = 270 | index - CodeUnitIndex off 271 | loop index n = 272 | loop (index-1) (n-1) 273 | 274 | -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while 275 | -- the first byte can be 0xxxxxxx or 11yyyyyy. 276 | atTrailingByte !index = unsafeIndexCodeUnit' u8data index .&. 0b1100_0000 == 0b1000_0000 277 | 278 | -- $slicingFunctions 279 | -- 280 | -- 'unsafeCutUtf8' and 'unsafeSliceUtf8' are used to retrieve slices of 'Text' values. 281 | -- @unsafeSliceUtf8 begin length@ returns a substring of length @length@ starting at @begin@. 282 | -- @unsafeSliceUtf8 begin length@ returns a tuple of the "surrounding" substrings. 283 | -- 284 | -- They satisfy the following property: 285 | -- 286 | -- > let (prefix, suffix) = unsafeCutUtf8 begin length t 287 | -- > in concat [prefix, unsafeSliceUtf8 begin length t, suffix] == t 288 | -- 289 | -- The following diagram visualizes the relevant offsets for @begin = CodeUnitIndex 2@, @length = CodeUnitIndex 6@ and @t = \"BCDEFGHIJKL\"@. 
290 | -- 291 | -- > off off+len 292 | -- > │ │ 293 | -- > ▼ ▼ 294 | -- > ──┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬── 295 | -- > A│B│C│D│E│F│G│H│I│J│K│L│M│N 296 | -- > ──┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴── 297 | -- > ▲ ▲ 298 | -- > │ │ 299 | -- > off+begin off+begin+length 300 | -- > 301 | -- > unsafeSliceUtf8 begin length t == "DEFGHI" 302 | -- > unsafeCutUtf8 begin length t == ("BC", "JKL") 303 | -- 304 | -- The shown array is open at each end because in general, @t@ may be a slice as well. 305 | -- 306 | -- __WARNING__: As their name implies, these functions are not (necessarily) bounds-checked. Use at your own risk. 307 | 308 | unsafeCutUtf8 :: CodeUnitIndex -- ^ Starting position of substring. 309 | -> CodeUnitIndex -- ^ Length of substring. 310 | -> Text -- ^ Initial string. 311 | -> (Text, Text) 312 | unsafeCutUtf8 (CodeUnitIndex !begin) (CodeUnitIndex !len) !text = 313 | ( TextUnsafe.takeWord8 begin text 314 | , TextUnsafe.dropWord8 (begin + len) text 315 | ) 316 | 317 | unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text 318 | unsafeSliceUtf8 (CodeUnitIndex !begin) (CodeUnitIndex !len) !text = 319 | TextUnsafe.takeWord8 len $ TextUnsafe.dropWord8 begin text 320 | 321 | -- $functionsOnArrays 322 | -- 323 | -- Functions for working with 'TextArray.Array' values. 324 | 325 | -- | See 'Data.Primitive.isByteArrayPinned'. 326 | isArrayPinned :: TextArray.Array -> Bool 327 | isArrayPinned (TextArray.ByteArray ba#) = Exts.isTrue# (Exts.isByteArrayPinned# ba#) 328 | 329 | -- | See 'Data.Primitive.byteArrayContents'. 330 | arrayContents :: TextArray.Array -> Exts.Ptr Word8 331 | arrayContents (TextArray.ByteArray ba#) = Exts.Ptr (Exts.byteArrayContents# ba#) 332 | 333 | -- | Decode a code point at the given 'CodeUnitIndex'. 334 | -- Returns garbage if there is no valid code point at that position. 335 | -- Does not perform bounds checking. 336 | -- See 'decode2', 'decode3' and 'decode4' for the expected format of multi-byte code points. 
337 | unsafeIndexCodePoint' :: TextArray.Array -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) 338 | {-# INLINE unsafeIndexCodePoint' #-} 339 | unsafeIndexCodePoint' !u8data !idx = 340 | decodeN (cuAt 0) (cuAt 1) (cuAt 2) (cuAt 3) 341 | where 342 | cuAt !i = unsafeIndexCodeUnit' u8data $ idx + i 343 | 344 | decodeN :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, CodePoint) 345 | {-# INLINE decodeN #-} 346 | decodeN cu0 cu1 cu2 cu3 347 | | cu0 < 0xc0 = (1, decode1 cu0) 348 | | cu0 < 0xe0 = (2, decode2 cu0 cu1) 349 | | cu0 < 0xf0 = (3, decode3 cu0 cu1 cu2) 350 | | otherwise = (4, decode4 cu0 cu1 cu2 cu3) 351 | 352 | 353 | 354 | -- | Intermediate state when you're iterating backwards through a Utf8 text. 355 | data BackwardsIter = BackwardsIter 356 | { backwardsIterNext :: {-# UNPACK #-} !CodeUnitIndex 357 | -- ^ First byte to the left of the codepoint that we're focused on. This can 358 | -- be used with 'unsafeIndexEndOfCodePoint'' to find the next codepoint. 359 | , backwardsIterChar :: {-# UNPACK #-} !CodePoint 360 | -- ^ The codepoint that we're focused on 361 | , backwardsIterEndOfChar :: {-# UNPACK #-} !CodeUnitIndex 362 | -- ^ Points to the last byte of the codepoint that we're focused on 363 | } 364 | deriving Generic 365 | 366 | -- | Similar to unsafeIndexCodePoint', but assumes that the given index is the 367 | -- end of a utf8 codepoint. It returns the decoded code point and the index 368 | -- _before_ the code point. The resulting index could be passed directly to 369 | -- unsafeIndexEndOfCodePoint' again to decode the _previous_ code point. 370 | unsafeIndexEndOfCodePoint' :: TextArray.Array -> CodeUnitIndex -> BackwardsIter 371 | {-# INLINE unsafeIndexEndOfCodePoint' #-} 372 | unsafeIndexEndOfCodePoint' !u8data !idx = 373 | let 374 | cuAt !i = unsafeIndexCodeUnit' u8data $ idx - i 375 | -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while 376 | -- the first byte can be 0xxxxxxx or 11yyyyyy. 
377 | isFirstByte !cu = cu .&. 0b1100_0000 /= 0b1000_0000 378 | cu0 = cuAt 0 379 | in 380 | if isFirstByte cu0 381 | then BackwardsIter (idx - 1) (decode1 cu0) idx 382 | else 383 | let cu1 = cuAt 1 in 384 | if isFirstByte cu1 385 | then BackwardsIter (idx - 2) (decode2 cu1 cu0) idx 386 | else 387 | let cu2 = cuAt 2 in 388 | if isFirstByte cu2 389 | then BackwardsIter (idx - 3) (decode3 cu2 cu1 cu0) idx 390 | else 391 | let cu3 = cuAt 3 in 392 | if isFirstByte cu3 393 | then BackwardsIter (idx - 4) (decode4 cu3 cu2 cu1 cu0) idx 394 | else 395 | error "unsafeIndexEndOfCodePoint' could not find valid UTF8 codepoint" 396 | 397 | unsafeIndexAnywhereInCodePoint' :: TextArray.Array -> CodeUnitIndex -> BackwardsIter 398 | {-# INLINE unsafeIndexAnywhereInCodePoint' #-} 399 | unsafeIndexAnywhereInCodePoint' !u8data !idx = 400 | let 401 | cuAt !i = unsafeIndexCodeUnit' u8data $ idx + i 402 | -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while 403 | -- the first byte can be 0xxxxxxx or 11yyyyyy. 404 | isFirstByte !cu = cu .&. 
0b1100_0000 /= 0b1000_0000 405 | cu0 = cuAt 0 406 | 407 | makeBackwardsIter next (l, cp) = BackwardsIter next cp (next + l) 408 | in 409 | if isFirstByte cu0 410 | then makeBackwardsIter (idx - 1) $ decodeN cu0 (cuAt 1) (cuAt 2) (cuAt 3) 411 | else 412 | let cu00 = cuAt (-1) in 413 | if isFirstByte cu00 414 | then makeBackwardsIter (idx - 2) $ decodeN cu00 cu0 (cuAt 1) (cuAt 2) 415 | else 416 | let cu000 = cuAt (-2) in 417 | if isFirstByte cu000 418 | then makeBackwardsIter (idx - 3) $ decodeN cu000 cu00 cu0 (cuAt 1) 419 | else 420 | let cu0000 = cuAt (-3) in 421 | if isFirstByte cu0000 422 | then makeBackwardsIter (idx - 4) $ decodeN cu0000 cu000 cu00 cu0 423 | else 424 | error "unsafeIndexAnywhereInCodePoint' could not find valid UTF8 codepoint" 425 | 426 | {-# INLINE unsafeIndexCodeUnit' #-} 427 | unsafeIndexCodeUnit' :: TextArray.Array -> CodeUnitIndex -> CodeUnit 428 | unsafeIndexCodeUnit' !u8data (CodeUnitIndex !idx) = TextArray.unsafeIndex u8data idx 429 | 430 | -- $generalFunctions 431 | -- 432 | -- Re-exported from 'Text'. 433 | --------------------------------------------------------------------------------