├── benchmark ├── rust │ ├── rust-toolchain │ ├── Cargo.toml │ ├── Cargo.lock │ └── src │ │ └── main.rs ├── rust-ffi │ ├── libacbench │ │ ├── rust-toolchain │ │ ├── Cargo.toml │ │ ├── Cargo.lock │ │ └── src │ │ │ └── lib.rs │ ├── package.yaml │ └── app │ │ └── Main.hs ├── java │ ├── BUILD │ ├── WORKSPACE │ └── main.java ├── data-utf8 │ └── example.txt ├── data │ └── example.txt ├── .gitignore ├── haskell │ ├── package.yaml │ └── app │ │ └── Main.hs ├── default.nix ├── report.py ├── naive.py ├── benchmark.py └── README.md ├── nix ├── haskell-overlay.nix ├── ghc966-overlay.nix ├── nixpkgs-pinned.nix ├── haskell-dependencies.nix ├── sources.json └── sources.nix ├── Setup.hs ├── performance.png ├── .semaphore ├── install-nix.sha256 └── semaphore.yml ├── .gitignore ├── stack.yaml.lock ├── stack.yaml ├── tests ├── Main.hs └── Data │ └── Text │ ├── TestInstances.hs │ ├── Utf8Spec.hs │ ├── BoyerMooreCISpec.hs │ ├── BoyerMooreSpec.hs │ └── AhoCorasickSpec.hs ├── app └── dump-automaton │ └── Main.hs ├── src └── Data │ ├── Text │ ├── CaseSensitivity.hs │ ├── Utf8 │ │ └── Unlower.hs │ ├── BoyerMooreCI │ │ ├── Replacer.hs │ │ └── Searcher.hs │ ├── BoyerMoore │ │ ├── Replacer.hs │ │ ├── Searcher.hs │ │ └── Automaton.hs │ ├── AhoCorasick │ │ ├── Splitter.hs │ │ ├── Searcher.hs │ │ └── Replacer.hs │ └── Utf8.hs │ └── Primitive │ └── Extended.hs ├── stack-shell.nix ├── default.nix ├── LICENSE ├── benchmark.sh ├── bench ├── uvector-vs-tba │ └── Main.hs └── bm │ └── Main.hs ├── CHANGELOG.md ├── README.md ├── alfred-margaret.cabal └── .stylish-haskell.yaml /benchmark/rust/rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /nix/haskell-overlay.nix: -------------------------------------------------------------------------------- 1 | self: super: { 2 | 3 | } 4 | -------------------------------------------------------------------------------- 
/benchmark/rust-ffi/libacbench/rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/channable/alfred-margaret/HEAD/performance.png -------------------------------------------------------------------------------- /.semaphore/install-nix.sha256: -------------------------------------------------------------------------------- 1 | a2d0e4f6954a6295664994dc4e5492843b7de3e7e23e89a1df9e0820975d2fde install-nix-2.24.12 2 | -------------------------------------------------------------------------------- /benchmark/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "acbench-rust" 3 | version = "0.0.0" 4 | 5 | [dependencies] 6 | aho-corasick = "0.6.8" 7 | filebuffer = "0.4.0" 8 | memchr = "2.1.0" 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Stack build directory. 
2 | /.stack-work/ 3 | 4 | # Graph files generated by dot for debugging 5 | debug/*.png 6 | 7 | # Benchmark results 8 | *.stats 9 | *.results 10 | -------------------------------------------------------------------------------- /nix/ghc966-overlay.nix: -------------------------------------------------------------------------------- 1 | self: super: 2 | let 3 | haskellOverlay = import ./haskell-overlay.nix; 4 | in { 5 | ghc966Packages = super.haskell.packages.ghc966.extend haskellOverlay; 6 | } 7 | -------------------------------------------------------------------------------- /nix/nixpkgs-pinned.nix: -------------------------------------------------------------------------------- 1 | { overlays ? [] }: 2 | let 3 | sources = import ./sources.nix; 4 | in 5 | import sources.nixpkgs { 6 | overlays = [(import ./ghc966-overlay.nix)] ++ overlays; 7 | } 8 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/libacbench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "libacbench" 3 | version = "0.0.0" 4 | 5 | [dependencies] 6 | aho-corasick = "0.6.8" 7 | 8 | [lib] 9 | name = "libacbench" 10 | crate-type = ["staticlib"] -------------------------------------------------------------------------------- /benchmark/java/BUILD: -------------------------------------------------------------------------------- 1 | # NOTE: You can build a standalone jar with `bazel build :acbench_deploy.jar`. 2 | 3 | java_binary( 4 | name = "acbench", 5 | srcs = ["main.java"], 6 | deps = ["@hankcs_aho_corasick//jar"], 7 | main_class = "AcBench", 8 | ) 9 | -------------------------------------------------------------------------------- /stack.yaml.lock: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by Stack. 2 | # You should not edit this file by hand. 
3 | # For more information, please see the documentation at: 4 | # https://docs.haskellstack.org/en/stable/lock_files 5 | 6 | packages: [] 7 | snapshots: [] 8 | -------------------------------------------------------------------------------- /benchmark/java/WORKSPACE: -------------------------------------------------------------------------------- 1 | maven_jar( 2 | name = "hankcs_aho_corasick", 3 | artifact = "com.hankcs:aho-corasick-double-array-trie:1.2.0", 4 | sha1 = "7b75748bad8e193cc048a7fb0c71605e300b2286", 5 | sha1_src = "eaea57776bea89b6067fe75473445a878e4b3469", 6 | ) 7 | -------------------------------------------------------------------------------- /benchmark/data-utf8/example.txt: -------------------------------------------------------------------------------- 1 | Henk 2 | Piet 3 | Klaas 4 | Sjaak 5 | Marieke 6 | 7 | Henk eet een appel en Piet eet kaas. 8 | Klaas eet ook kaas. 9 | Kaas is baas. 10 | Mari en Marieke wandelen door het bos. 11 | De auto van Sjaak heeft geen trekhaak en die van Klaas ook niet. 12 | -------------------------------------------------------------------------------- /nix/haskell-dependencies.nix: -------------------------------------------------------------------------------- 1 | haskellPackages: 2 | with haskellPackages; [ 3 | aeson 4 | bytestring 5 | criterion 6 | hashable 7 | primitive 8 | text 9 | unordered-containers 10 | hspec 11 | hspec-expectations 12 | quickcheck-instances 13 | ] 14 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | resolver: ghc-9.6.6 2 | packages: 3 | - . 4 | - benchmark/haskell 5 | - benchmark/rust-ffi 6 | 7 | # Note: This section will be ignored by stack, on non-NixOS systems. 8 | # It can be explicitly enabled on non-NixOS systems by passing --nix. 9 | # On NixOS this section is needed to bring the non-Haskell dependencies 10 | # into scope. 
11 | nix: 12 | enable: true 13 | shell-file: stack-shell.nix 14 | path: ["nixpkgs=./nix/nixpkgs-pinned.nix"] 15 | -------------------------------------------------------------------------------- /benchmark/data/example.txt: -------------------------------------------------------------------------------- 1 | Henk 2 | Piet 3 | Klaas 4 | Sjaak 5 | Marieke 6 | 7 | Henk eet een appel en Piet eet kaas. 8 | Klaas eet ook kaas. 9 | Kaas is baas. 10 | Mari en Marieke wandelen door het bos. 11 | De auto van Sjaak heeft geen trekhaak en die van Klaas ook niet. 12 | -------------------------------------------------------------------------------- /benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | data/*.txt 2 | data-utf8/*.txt 3 | !data/example.txt 4 | !data-utf8/example.txt 5 | data/*.tar.xz 6 | *.aux 7 | *.hp 8 | *.pdf 9 | *.prof 10 | *.ps 11 | 12 | java/bazel-bin 13 | java/bazel-genfiles 14 | java/bazel-java 15 | java/bazel-out 16 | java/bazel-testlogs 17 | 18 | haskell/ac-bench.cabal 19 | haskell/.stack-work 20 | haskell-utf8/ac-bench-utf8.cabal 21 | haskell-utf8/.stack-work 22 | 23 | rust/target/ 24 | 25 | rust-ffi/ac-bench-ffi.cabal 26 | rust-ffi/.stack-work/ 27 | rust-ffi/libacbench/target/ 28 | -------------------------------------------------------------------------------- /tests/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Test.Hspec (describe, hspec) 4 | 5 | import Data.Text.AhoCorasickSpec as AhoCorasickSpec 6 | import Data.Text.BoyerMooreSpec as BoyerMooreSpec 7 | import Data.Text.BoyerMooreCISpec as BoyerMooreCISpec 8 | import Data.Text.Utf8Spec as Utf8Spec 9 | 10 | main :: IO () 11 | main = hspec $ do 12 | describe "Data.Text.AhoCorasick" AhoCorasickSpec.spec 13 | describe "Data.Text.BoyerMoore" BoyerMooreSpec.spec 14 | describe "Data.Text.BoyerMooreCI" BoyerMooreCISpec.spec 15 | describe "Data.Text.Utf8" Utf8Spec.spec 16 | 
-------------------------------------------------------------------------------- /app/dump-automaton/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Control.Monad (forM) 4 | import qualified Data.Text.Utf8 as Utf8 5 | import Data.Text.AhoCorasick.Automaton (debugBuildDot) 6 | import System.Environment (getArgs) 7 | import System.IO (hPrint, hPutStr, stderr) 8 | 9 | main = do 10 | args <- getArgs 11 | needles <- forM args $ \needle -> do 12 | hPutStr stderr $ needle ++ ": " 13 | let needleBytes = Utf8.unpackUtf8 $ Utf8.pack needle 14 | hPrint stderr needleBytes 15 | pure $ Utf8.pack needle 16 | 17 | let dot = debugBuildDot needles 18 | putStrLn dot 19 | -------------------------------------------------------------------------------- /src/Data/Text/CaseSensitivity.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE DeriveAnyClass #-} 3 | {-# LANGUAGE DeriveGeneric #-} 4 | {-# LANGUAGE DerivingStrategies #-} 5 | 6 | module Data.Text.CaseSensitivity where 7 | 8 | import Control.DeepSeq (NFData) 9 | import Data.Hashable (Hashable) 10 | import GHC.Generics (Generic) 11 | #if defined(HAS_AESON) 12 | import Data.Aeson (FromJSON, ToJSON) 13 | #endif 14 | data CaseSensitivity 15 | = CaseSensitive 16 | | IgnoreCase 17 | deriving stock (Eq, Generic, Show) 18 | #if defined(HAS_AESON) 19 | deriving anyclass (Hashable, NFData, FromJSON, ToJSON) 20 | #else 21 | deriving anyclass (Hashable, NFData) 22 | #endif 23 | -------------------------------------------------------------------------------- /nix/sources.json: -------------------------------------------------------------------------------- 1 | { 2 | "nixpkgs": { 3 | "branch": "nixpkgs-unstable", 4 | "description": "Nix Packages collection", 5 | "homepage": null, 6 | "owner": "NixOS", 7 | "repo": "nixpkgs", 8 | "rev": "b833ff01a0d694b910daca6e2ff4a3f26dee478c", 9 | "sha256": 
"1v3y9km48glcmgzk7h8s9sg5sgv1w86pyad973d981sk84a85cdl", 10 | "type": "tarball", 11 | "url": "https://github.com/NixOS/nixpkgs/archive/b833ff01a0d694b910daca6e2ff4a3f26dee478c.tar.gz", 12 | "url_template": "https://github.com///archive/.tar.gz", 13 | "version": "5e7fb7699c84da3420495e40459dfbff459c16e4" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /stack-shell.nix: -------------------------------------------------------------------------------- 1 | {}: 2 | let 3 | pkgs = import ./nix/nixpkgs-pinned.nix {}; 4 | haskellDependencies = import ./nix/haskell-dependencies.nix; 5 | 6 | libacbench = pkgs.rustPlatform.buildRustPackage rec { 7 | name = "libacbench"; 8 | src = ./benchmark/rust-ffi/libacbench; 9 | buildType = "release"; 10 | cargoLock = { 11 | lockFile = ./benchmark/rust-ffi/libacbench/Cargo.lock; 12 | }; 13 | }; 14 | in 15 | pkgs.haskell.lib.buildStackProject { 16 | name = "alfred-margaret"; 17 | ghc = pkgs.ghc966Packages.ghcWithPackages haskellDependencies; 18 | buildInputs = with pkgs; [ 19 | llvm_13 20 | zlib 21 | libacbench 22 | ]; 23 | } 24 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/libacbench/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "0.6.10" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "81ce3d38065e618af2d7b77e10c5ad9a069859b4be3c2250f674af3840d9c8a5" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "libacbench" 16 | version = "0.0.0" 17 | dependencies = [ 18 | "aho-corasick", 19 | ] 20 | 21 | [[package]] 22 | name = "memchr" 23 | version = "2.4.1" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 26 | -------------------------------------------------------------------------------- /benchmark/haskell/package.yaml: -------------------------------------------------------------------------------- 1 | name: ac-bench 2 | maintainer: Channable 3 | category: Other 4 | synopsis: Benchmark code for alfred-margaret 5 | description: Benchmark code for alfred-margaret 6 | version: '0' 7 | github: channable/alfred-margaret 8 | 9 | extra-source-files: 10 | - package.yaml 11 | 12 | library: {} 13 | 14 | dependencies: 15 | - base 16 | 17 | ghc-options: >- 18 | -Wall 19 | -Wincomplete-record-updates 20 | -Wincomplete-uni-patterns 21 | -Wpartial-fields 22 | -j4 23 | +RTS -A64m -RTS 24 | 25 | executables: 26 | ac-bench: 27 | dependencies: 28 | - alfred-margaret 29 | - base 30 | - bytestring 31 | - clock 32 | - deepseq 33 | - text 34 | ghc-options: 35 | - -rtsopts 36 | - -threaded 37 | - '"-with-rtsopts=-I0 -T -N -A32m -n4m -qg"' 38 | main: Main.hs 39 | source-dirs: app/ 40 | -------------------------------------------------------------------------------- /src/Data/Primitive/Extended.hs: -------------------------------------------------------------------------------- 1 | -- | "Data.Primitive" extended with extra definitions. 
2 | -- 3 | -- Based on the ".Extended Modules" pattern: 4 | -- https://jaspervdj.be/posts/2015-01-20-haskell-design-patterns-extended-modules.html 5 | module Data.Primitive.Extended 6 | ( module Data.Primitive 7 | , replicateMutablePrimArray 8 | ) 9 | where 10 | 11 | import Data.Primitive 12 | import Control.Monad.Primitive (PrimMonad(..)) 13 | 14 | -- | Like 'replicatePrimArray', but does not freeze the array afterwards and 15 | -- stays within a monadic context, so it can easily be mutated further. 16 | {-# INLINE replicateMutablePrimArray #-} 17 | replicateMutablePrimArray :: (Prim a, PrimMonad m) => Int -> a -> m (MutablePrimArray (PrimState m) a) 18 | replicateMutablePrimArray len value = do 19 | arr <- newPrimArray len 20 | setPrimArray arr 0 len value 21 | pure arr 22 | -------------------------------------------------------------------------------- /benchmark/default.nix: -------------------------------------------------------------------------------- 1 | let 2 | pinnedPkgs = import (builtins.fetchTarball { 3 | name = "ac-bench-nixpkgs"; 4 | url = "https://github.com/NixOS/nixpkgs/archive/1882c6b7368fd284ad01b0a5b5601ef136321292.tar.gz"; 5 | sha256 = "0zg7ak2mcmwzi2kg29g4v9fvbvs0viykjsg2pwaphm1fi13s7s0i"; 6 | }) {}; 7 | in 8 | { pkgs ? 
pinnedPkgs }: 9 | pkgs.mkShell { 10 | buildInputs = [ 11 | # For benchmark code 12 | (pkgs.python3.withPackages (pyPkgs: [ 13 | pyPkgs.clize 14 | pyPkgs.numpy 15 | ])) 16 | 17 | # For rust implementation 18 | pkgs.cargo 19 | 20 | # For java implementation (uses outdated bazel stuff) 21 | pkgs.jdk8 22 | pkgs.bazel_1 23 | 24 | # For Haskell implementation 25 | pkgs.stack 26 | pkgs.gmp 27 | # GHC 8.10.7 produces faster code with LLVM 9 than with LLVM 12 28 | pkgs.llvmPackages_9.llvm 29 | ]; 30 | } 31 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/package.yaml: -------------------------------------------------------------------------------- 1 | name: ac-bench-ffi 2 | maintainer: Channable 3 | category: Other 4 | synopsis: Benchmark code for alfred-margaret with Rust FFI 5 | description: Benchmark code for alfred-margaret with Rust FFI 6 | version: '0' 7 | github: channable/alfred-margaret 8 | 9 | extra-source-files: 10 | - package.yaml 11 | 12 | library: {} 13 | 14 | dependencies: 15 | - base 16 | 17 | ghc-options: >- 18 | -Wall 19 | -Wincomplete-record-updates 20 | -Wincomplete-uni-patterns 21 | -Wpartial-fields 22 | -j4 23 | +RTS -A64m -RTS 24 | 25 | executables: 26 | ac-bench-ffi: 27 | dependencies: 28 | - alfred-margaret 29 | - base 30 | - bytestring 31 | - clock 32 | - ghc-compact 33 | - deepseq 34 | - primitive 35 | - text 36 | ghc-options: 37 | - -rtsopts 38 | - -threaded 39 | - '"-with-rtsopts=-I0 -T -N -A32m -n4m -qg"' 40 | main: Main.hs 41 | source-dirs: app/ 42 | extra-libraries: 43 | - libacbench 44 | extra-lib-dirs: 45 | - libacbench/target/release -------------------------------------------------------------------------------- /benchmark/report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Report the results of a benchmark run recorded by benchmark.py. 5 | 6 | Usage: 7 | ./report.py PROGRAM0.stats PROGRAM1.stats ... 
8 | """ 9 | 10 | import sys 11 | import numpy as np 12 | 13 | def report_file(file_name: str) -> np.array: 14 | # When running the benchmark on a single file, using ndmin ensures that we actually 15 | # get a 2-dimensional array. 16 | data_ns = np.loadtxt(file_name, ndmin=2, dtype=int) 17 | num_files, num_iterations = data_ns.shape 18 | 19 | data_secs = data_ns / 1e9 20 | mean_times_by_file = np.mean(data_secs, axis=1) 21 | min_times_by_file = np.min(data_secs, axis=1) 22 | variance_by_file = np.var(data_secs, axis=1) 23 | 24 | total_mean_secs = np.sum(mean_times_by_file) 25 | total_min_secs = np.sum(min_times_by_file) 26 | total_variance = np.sum(variance_by_file) 27 | total_stdev = np.sqrt(total_variance) 28 | 29 | print(f'{file_name}:') 30 | print(f' mean time: {total_mean_secs:0.3f} ± {total_stdev:0.3f} seconds') 31 | print(f' min time: {total_min_secs:0.3f} seconds') 32 | 33 | 34 | for file_name in sys.argv[1:]: 35 | report_file(file_name) 36 | print() 37 | -------------------------------------------------------------------------------- /benchmark/naive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import time 5 | 6 | for file_name in sys.argv[1:]: 7 | with open(file_name, 'r', encoding='utf-16le') as f: 8 | needles = [] 9 | 10 | for line in f: 11 | if line == '\n': 12 | break 13 | needles.append(line[:-1]) 14 | 15 | haystack = f.read() 16 | num_matches = 0 17 | 18 | # Measure every input five times. 19 | for i in range(0, 5): 20 | # In Python 3.7 we would be able to use time.monotonic_ns ... 
21 | epoch_ns = int(time.monotonic() * 1e9) 22 | 23 | for needle in needles: 24 | start = 0 25 | while True: 26 | n = haystack.find(needle, start) 27 | if n != -1: 28 | start = n + 1 29 | num_matches += 1 30 | else: 31 | break 32 | 33 | duration_ns = int(time.monotonic() * 1e9) - epoch_ns 34 | print(f'{duration_ns}\t', end='') 35 | 36 | # Print the number of matches once per file to have reference output. 37 | if i == 0: 38 | print(num_matches, file=sys.stderr) 39 | 40 | # Print a newline, so we have one line of timings per input file. 41 | print() 42 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { 2 | pkgs ? import ./nix/nixpkgs-pinned.nix {}, 3 | # Use haskell-languager-server? 4 | hsTools ? false, 5 | # Use tools for the benchmarks in other languages (Cargo, Bazel, etc.)? 6 | benchTools ? false 7 | }: 8 | let 9 | haskellDependencies = import ./nix/haskell-dependencies.nix; 10 | 11 | paths = with pkgs; ( 12 | [ 13 | # Nix tooling 14 | niv 15 | nix-tree 16 | 17 | # Haskell tooling 18 | stack 19 | 20 | # Haskell dependencies 21 | (ghc966Packages.ghcWithPackages haskellDependencies) 22 | 23 | # Other 24 | llvm_13 25 | ] ++ 26 | # We don't use the overlay here because the tooling doesn't need it. 27 | # The advantage of doing so is that these packages are already available in a global cache. 
28 | lib.optionals hsTools (with haskell.packages.ghc966; [ 29 | haskell-language-server 30 | implicit-hie 31 | ]) ++ 32 | lib.optionals benchTools [ 33 | (python3.withPackages (pyPkgs: [ 34 | pyPkgs.clize 35 | pyPkgs.numpy 36 | ])) 37 | 38 | # For rust implementation 39 | cargo 40 | 41 | # For java implementation (uses outdated bazel stuff) 42 | jdk8 43 | bazel_1 44 | 45 | # For Haskell implementation 46 | gmp 47 | ] 48 | ); 49 | in 50 | pkgs.buildEnv { 51 | name = "alfred-margaret-env"; 52 | paths = paths; 53 | } 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Channable 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/libacbench/src/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate aho_corasick; 2 | 3 | use aho_corasick::{AcAutomaton, Automaton, Dense, Sparse}; 4 | 5 | #[repr(C)] 6 | #[derive(Debug)] 7 | pub struct U8Slice { 8 | ptr: *const u8, 9 | off: isize, 10 | len: isize, 11 | } 12 | 13 | impl U8Slice { 14 | fn into_slice<'a>(&self) -> &'a [u8] { 15 | slice_from_pointer(self.ptr, self.off, self.len) 16 | } 17 | } 18 | 19 | // https://doc.rust-lang.org/src/core/slice/raw.rs.html#87 20 | fn slice_from_pointer<'a, T>(ptr: *const T, off: isize, len: isize) -> &'a [T] { 21 | &(unsafe { &*std::ptr::slice_from_raw_parts(ptr, (off + len) as usize) })[off as usize..] 
22 | } 23 | 24 | #[no_mangle] 25 | pub extern "C" fn perform_ac( 26 | use_sparse: bool, 27 | num_needles: isize, 28 | needle_slices_: *const U8Slice, 29 | haystack_slice_: *const U8Slice, 30 | ) -> isize { 31 | let needle_slices = slice_from_pointer(needle_slices_, 0, num_needles); 32 | 33 | let mut needles: Vec<&[u8]> = Vec::with_capacity(num_needles as usize); 34 | for i in 0..num_needles as usize { 35 | needles.insert(i, needle_slices[i].into_slice()); 36 | } 37 | 38 | let haystack = slice_from_pointer(haystack_slice_, 0, 1)[0].into_slice(); 39 | 40 | let num_matches = if use_sparse { 41 | let automaton = AcAutomaton::<_, Sparse>::with_transitions(&needles[..]); 42 | automaton.find_overlapping(haystack).count() 43 | } else { 44 | let automaton = AcAutomaton::<_, Dense>::with_transitions(&needles[..]); 45 | automaton.find_overlapping(haystack).count() 46 | }; 47 | 48 | num_matches as isize 49 | } 50 | -------------------------------------------------------------------------------- /benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Usage: See benchmark/README.md 4 | 5 | # For bold output 6 | function echobold { 7 | tput bold 8 | echo "$@" 9 | tput sgr0 10 | } 11 | 12 | # Read argument 13 | benchcase=$1 14 | 15 | # Prepare variables 16 | revision=$(git rev-parse --short HEAD) 17 | prefix="$benchcase-$revision" 18 | 19 | echobold Selecting benchmark case... 
20 | 21 | # Set $exe and $datadir 22 | case $benchcase in 23 | python) 24 | exe=./benchmark/naive.py 25 | datadir=./benchmark/data 26 | ;; 27 | rust) 28 | exe=./benchmark/rust/target/release/acbench-rust 29 | datadir=./benchmark/data 30 | ;; 31 | java) 32 | exe=./benchmark/java/bazel-bin/acbench 33 | datadir=./benchmark/data 34 | ;; 35 | haskell) 36 | exe=$(stack path --local-install-root)/bin/ac-bench 37 | datadir=./benchmark/data-utf8 38 | ;; 39 | rust-ffi) 40 | exe=$(stack path --local-install-root)/bin/ac-bench-ffi 41 | datadir=./benchmark/data-utf8 42 | ;; 43 | *) 44 | echobold "Invalid benchmark case, please pass one of {python,rust,java,haskell,rust-ffi}" >&2 45 | exit 1 46 | esac 47 | 48 | # Absolute paths may help with debugging 49 | exe_absolute=$(readlink -f "$exe") 50 | datadir_absolute=$(readlink -f "$datadir") 51 | 52 | # Print some debugging information 53 | echobold "I will benchmark this executable:" 54 | echo "$exe_absolute" 55 | echobold "Using data from this directory:" 56 | echo "$datadir_absolute" 57 | echobold "And store the results in these files:" 58 | echo "$prefix.{stats,results}" 59 | 60 | # Run the benchmark 61 | echobold "Running the benchmark..." 62 | ./benchmark/benchmark.py "$exe" --data-directory "$datadir_absolute" --prefix "$prefix" 63 | echobold "Done!" 
-------------------------------------------------------------------------------- /.semaphore/semaphore.yml: -------------------------------------------------------------------------------- 1 | version: v1.0 2 | name: "Semaphore pipeline for alfred-margaret" 3 | 4 | agent: 5 | machine: 6 | type: f1-standard-2 7 | os_image: ubuntu2204 8 | 9 | # Automatically cancel jobs that are already running for all branches but master 10 | auto_cancel: 11 | running: 12 | when: "branch != 'master'" 13 | 14 | blocks: 15 | - name: Stack 16 | task: 17 | prologue: 18 | commands: 19 | # Checkout out the repository 20 | - checkout 21 | - git log --max-count 1 --oneline --no-decorate 22 | # Set up /nix with correct permissions 23 | - sudo mkdir /nix 24 | - sudo chown semaphore:semaphore /nix 25 | # Restore any nix cache that we can find 26 | - cache restore nix-store- 27 | # Install Nix 28 | - curl -o install-nix-2.24.12 https://releases.nixos.org/nix/nix-2.24.12/install 29 | - sha256sum --check .semaphore/install-nix.sha256 30 | 31 | - sh ./install-nix-2.24.12 --no-daemon 32 | # Enable `nix-command` feature, which `nix build` needs to build 33 | - sudo mkdir /etc/nix 34 | - echo 'experimental-features = nix-command' | sudo tee -a /etc/nix/nix.conf 35 | 36 | - source "$HOME/.nix-profile/etc/profile.d/nix.sh" 37 | # Configure cachix *without building the environment in default.nix* 38 | - nix shell -f nix/nixpkgs-pinned.nix cachix -c cachix use channable-public 39 | # Restore .stack-work and ~/.stack from cache for faster builds 40 | - cache restore home-stack-$SEMAPHORE_GIT_BRANCH 41 | - cache restore stack-work-$SEMAPHORE_GIT_BRANCH 42 | 43 | jobs: 44 | - name: Test alfred-margaret 45 | commands: 46 | - nix shell -f default.nix -c stack test alfred-margaret:test-suite 47 | 48 | epilogue: 49 | commands: 50 | # Fill caches 51 | - cache store home-stack-$SEMAPHORE_GIT_BRANCH ~/.stack 52 | - cache store stack-work-$SEMAPHORE_GIT_BRANCH .stack-work 53 | # Store a copy of the nix store. 
This will be refreshed daily, which 54 | # is more than sufficient for this repo. Semaphore's cache is faster 55 | # than Cachix. 56 | - "cache store nix-store-$(date -u -Idate) /nix" 57 | -------------------------------------------------------------------------------- /benchmark/rust/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "acbench-rust" 7 | version = "0.0.0" 8 | dependencies = [ 9 | "aho-corasick", 10 | "filebuffer", 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "aho-corasick" 16 | version = "0.6.8" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "68f56c7353e5a9547cbd76ed90f7bb5ffc3ba09d4ea9bd1d8c06c8b1142eeb5a" 19 | dependencies = [ 20 | "memchr", 21 | ] 22 | 23 | [[package]] 24 | name = "cfg-if" 25 | version = "0.1.5" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "0c4e7bb64a8ebb0d856483e1e682ea3422f883c5f5615a90d51a2c82fe87fdd3" 28 | 29 | [[package]] 30 | name = "filebuffer" 31 | version = "0.4.0" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "1b41bfe1d74263ea9d084be951077614b3b98b4e59a9dafab1467645a9e52305" 34 | dependencies = [ 35 | "libc", 36 | "winapi", 37 | ] 38 | 39 | [[package]] 40 | name = "libc" 41 | version = "0.2.43" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d" 44 | 45 | [[package]] 46 | name = "memchr" 47 | version = "2.1.0" 48 | source = "registry+https://github.com/rust-lang/crates.io-index" 49 | checksum = "4b3629fe9fdbff6daa6c33b90f7c08355c1aca05a3d01fa8063b822fcf185f3b" 50 | dependencies = [ 51 | "cfg-if", 52 | "libc", 53 | "version_check", 54 | ] 55 | 56 | [[package]] 57 | name = "version_check" 58 | version = 
"0.1.5" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | checksum = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" 61 | 62 | [[package]] 63 | name = "winapi" 64 | version = "0.3.6" 65 | source = "registry+https://github.com/rust-lang/crates.io-index" 66 | checksum = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0" 67 | dependencies = [ 68 | "winapi-i686-pc-windows-gnu", 69 | "winapi-x86_64-pc-windows-gnu", 70 | ] 71 | 72 | [[package]] 73 | name = "winapi-i686-pc-windows-gnu" 74 | version = "0.4.0" 75 | source = "registry+https://github.com/rust-lang/crates.io-index" 76 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 77 | 78 | [[package]] 79 | name = "winapi-x86_64-pc-windows-gnu" 80 | version = "0.4.0" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 83 | -------------------------------------------------------------------------------- /benchmark/java/main.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.lang.StringBuilder; 3 | import java.nio.charset.Charset; 4 | import java.nio.file.Files; 5 | import java.nio.file.Paths; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.TreeMap; 9 | 10 | import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie; 11 | import com.hankcs.algorithm.AhoCorasickDoubleArrayTrie.IHit; 12 | 13 | class MatchCounter implements IHit { 14 | public int count; 15 | 16 | public MatchCounter() { 17 | this.count = 0; 18 | } 19 | 20 | public void hit(int begin, int end, String value) { 21 | this.count++; 22 | } 23 | } 24 | 25 | class AcBench { 26 | public static void main(String[] args) { 27 | for (int i = 0; i < args.length; i++) { 28 | try { 29 | AcBench.processFile(args[i]); 30 | } catch (IOException ex) { 31 | 
System.out.println(ex.toString()); 32 | System.exit(1); 33 | } 34 | } 35 | } 36 | 37 | static void processFile(String filename) throws IOException { 38 | // Preparation step: read the file, needles first, then the haystack. 39 | Charset utf16le = Charset.forName("UTF-16LE"); 40 | List fileLines = Files.readAllLines(Paths.get(filename), utf16le); 41 | TreeMap needles = new TreeMap(); 42 | StringBuilder haystackBuilder = new StringBuilder(); 43 | 44 | boolean isNeedles = true; 45 | for (int i = 0; i < fileLines.size(); i++) { 46 | String line = fileLines.get(i); 47 | 48 | if (line.equals("")) { 49 | // A blank line signals the end of the needle section. 50 | isNeedles = false; 51 | continue; 52 | } 53 | 54 | if (isNeedles) { 55 | needles.put(line, line); 56 | } else { 57 | haystackBuilder.append(line); 58 | haystackBuilder.append('\n'); 59 | } 60 | } 61 | 62 | String haystack = haystackBuilder.toString(); 63 | 64 | // Now for the real work: build the automaton and print the number of matches. 65 | 66 | // Measure every input 5 times. 67 | for (int i = 0; i < 5; i++) { 68 | MatchCounter matchCounter = new MatchCounter(); 69 | 70 | long epoch = System.nanoTime(); 71 | if (needles.size() > 0) { 72 | AhoCorasickDoubleArrayTrie automaton = new AhoCorasickDoubleArrayTrie(); 73 | automaton.build(needles); 74 | automaton.parseText(haystack, matchCounter); 75 | } 76 | long durationNs = System.nanoTime() - epoch; 77 | 78 | if (i == 0) { 79 | // Only print count to stderr the first iteration, to check correctness. 80 | System.err.format("%d\n", matchCounter.count); 81 | } 82 | 83 | System.out.format("%d\t", durationNs); 84 | } 85 | 86 | // Print a newline, so we print one line of measurements per file. 
87 | System.out.println(""); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /bench/uvector-vs-tba/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | -- | Microbenchmark to measure the difference in 'TypedByteArray' and 'Data.Vector.Unboxed.Vector' indexing performance. 4 | -- The latter is a slice into a 'ByteArray' which means that every time you retrieve and element, an offset 5 | -- needs to be added to your index. 6 | -- 7 | -- To reproduce: 8 | -- 9 | -- @ 10 | -- stack bench alfred-margaret:uvector-vs-tba --ba '--output uvector-vs-tba.html' 11 | -- @ 12 | -- 13 | -- You can pass a greater @--time-limit@ (in the single quotes) to increase the number of iterations. 14 | -- 15 | -- NOTE: 'readUVector' and 'genUVector' are marked @NOINLINE@ to prevent GHC optimizing away the indexing addition. 16 | -- 'readTba' and 'genTba' are marked @NOINLINE@ as well for fairness, altough it shouldn't make a difference there. 17 | module Main where 18 | 19 | import Control.Monad.ST (runST) 20 | import Criterion.Main (Benchmark, bench, bgroup, defaultMain, nf) 21 | import Data.Foldable (for_) 22 | import Text.Printf (printf) 23 | 24 | import qualified Data.Vector.Unboxed as UVector 25 | import qualified Data.Vector.Unboxed.Mutable as UMVector 26 | 27 | import qualified Data.TypedByteArray as TBA 28 | 29 | main :: IO () 30 | main = defaultMain 31 | [ bgroup "tba" $ mkReadBenchs readTba genTba [7, 8] 32 | , bgroup "uvector" $ mkReadBenchs readUVector genUVector [7, 8] 33 | ] 34 | 35 | mkReadBenchs 36 | :: (Int -> a -> Int) -- ^ Function that reads from an array @n@ times. 37 | -> (Int -> a) -- ^ Function that constructs an array of length @n@. 38 | -> [Int] -- ^ Which powers of 10 to pass for @n@. 
39 | -> [Benchmark] 40 | mkReadBenchs readPattern gen powers = 41 | [ bench (printf "%d reads" n) $ nf (readPattern n) $ gen n 42 | | n <- map (10^) powers 43 | ] 44 | 45 | {-# NOINLINE readTba #-} 46 | readTba :: Int -> TBA.TypedByteArray Int -> Int 47 | readTba !n !arr = go 0 48 | where 49 | go !i 50 | | i >= n = 42 51 | | otherwise = go $ TBA.unsafeIndex arr i 52 | 53 | {-# NOINLINE readUVector #-} 54 | readUVector :: Int -> UVector.Vector Int -> Int 55 | readUVector !n !arr = go 0 56 | where 57 | go !i 58 | | i >= n = 42 59 | | otherwise = go $ UVector.unsafeIndex arr i 60 | 61 | -- NOTE: We should probably measure pseudo-random access time as well, e.g. by shuffling the generated arrays. 62 | 63 | {-# NOINLINE genTba #-} 64 | genTba :: Int -> TBA.TypedByteArray Int 65 | genTba n = runST $ do 66 | mutArr <- TBA.newTypedByteArray n 67 | for_ [0 .. n-1] $ \i -> TBA.writeTypedByteArray mutArr i $ i + 1 68 | TBA.unsafeFreezeTypedByteArray mutArr 69 | 70 | -- | Generate an unboxed vector @v@ such that @v[i] == i + 1@. 71 | {-# NOINLINE genUVector #-} 72 | genUVector :: Int -> UVector.Vector Int 73 | genUVector n = runST $ do 74 | mutArr <- UMVector.new n 75 | for_ [0 .. n-1] $ \i -> UMVector.write mutArr i $ i + 1 76 | UVector.unsafeFreeze mutArr 77 | -------------------------------------------------------------------------------- /benchmark/haskell/app/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# OPTIONS_GHC -fllvm -O2 -optlo=-O3 -optlo=-tailcallelim #-} 5 | 6 | -- | Benchmark for our Aho-Corasick implementation. 
7 | module Main where 8 | 9 | import Control.DeepSeq (force) 10 | import Control.Exception (evaluate) 11 | import Control.Monad (void, when) 12 | import Data.Foldable (for_, traverse_) 13 | import System.IO (hPrint, stderr, stdout) 14 | import Text.Printf (hPrintf) 15 | 16 | import qualified Data.ByteString as ByteString 17 | import qualified Data.Text.Encoding as Encoding 18 | import qualified System.Clock as Clock 19 | import qualified System.Environment as Env 20 | 21 | import Data.Text.Utf8 (CodeUnitIndex(..), Text (..)) 22 | 23 | import qualified Data.Text.Utf8 as Utf8 24 | import qualified Data.Text.AhoCorasick.Automaton as Aho 25 | 26 | readNeedleHaystackFile :: FilePath -> IO ([Text], Text) 27 | readNeedleHaystackFile path = do 28 | (Text u8data off len) <- Encoding.decodeUtf8 <$> ByteString.readFile path 29 | pure $ go u8data off len [] 30 | where 31 | go u8data off 0 needles = (reverse needles, Text u8data off 0) 32 | go u8data off len needles 33 | -- "line starts with newline char" ==> empty line, emit haystack as slice of u8data 34 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = (reverse needles, Text u8data (off + 1) (len - 1)) 35 | | otherwise = consumeNeedle u8data off len needles off 36 | 37 | consumeNeedle u8data off len needles needleStart 38 | -- Newline ==> emit needle as slice of u8data 39 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = go u8data (off + 1) (len - 1) $ Text u8data needleStart (off - needleStart) : needles 40 | | otherwise = consumeNeedle u8data (off + 1) (len - 1) needles needleStart 41 | 42 | main :: IO () 43 | main = Env.getArgs >>= traverse_ processFile 44 | 45 | processFile :: FilePath -> IO () 46 | processFile path = do 47 | (needles, haystack) <- readNeedleHaystackFile path 48 | 49 | void $ evaluate $ force needles 50 | void $ evaluate $ force haystack 51 | 52 | for_ [0 :: Int .. 
4] $ \i -> do 53 | (count, duration) <- acBench needles haystack 54 | when (i == 0) $ 55 | hPrint stderr count 56 | hPrintf stdout "%d\t" (Clock.toNanoSecs duration) 57 | hPrintf stdout "\n" 58 | 59 | acBench :: [Text] -> Text -> IO (Int, Clock.TimeSpec) 60 | {-# NOINLINE acBench #-} 61 | acBench needles haystack = do 62 | start <- Clock.getTime Clock.Monotonic 63 | matchCount <- evaluate $ countMatches needles haystack 64 | end <- Clock.getTime Clock.Monotonic 65 | pure (matchCount, Clock.diffTimeSpec start end) 66 | 67 | countMatches :: [Text] -> Text -> Int 68 | {-# NOINLINE countMatches #-} 69 | countMatches needles haystack = case needles of 70 | [] -> 0 71 | _ -> 72 | let 73 | ac = Aho.build $ zip needles (repeat ()) 74 | onMatch !n _match = Aho.Step (n + 1) 75 | in 76 | Aho.runText 0 onMatch ac haystack 77 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v2.1.0.2 - "All the bounds!" (2024-09-06) 4 | 5 | Publishing requires `cabal` to know the version bounds as well. 6 | 7 | ## v2.1.0.1 - "Higher bounds!" (2024-09-05) 8 | 9 | Tested on GHC 9.6.6. 10 | 11 | - Revise dependency bounds ([#62](https://github.com/channable/alfred-margaret/issues/62) thanks @Bodigrim) 12 | - Allow using primitive < 0.9 and vector < 0.14 ([#59](https://github.com/channable/alfred-margaret/pull/59) thanks @rampion) 13 | 14 | ## v2.1.0.0 - "All The Cases!" (2022-08-31) 15 | 16 | - Added a case-insensitive variant of the Boyer-Moore algorithm in the `Data.Text.BoyerMooreCI.*` modules. ([#47](https://github.com/channable/alfred-margaret/pull/47)) 17 | - Fixed a bug in the case-insensitive Aho-Corasick replacer where it would 18 | replace the wrong section of the haystack when the needle had a different 19 | byte-length than the matching part of the haystack. 
([#47](https://github.com/channable/alfred-margaret/pull/47)) 20 | - Allow mapping the payloads of Aho-Corasick automatons. ([#46](https://github.com/channable/alfred-margaret/pull/46)) 21 | 22 | ## v2.0.0.0 - "So Long Surrogates" (2022-05-02) 23 | 24 | Switched to text-2.0 which uses UTF-8 encoding internally. 25 | 26 | - Removed `Data.Text.Utf8.*` modules 27 | - Replaced `Data.Text.AhoCorasick.*` and `Data.Text.BoyerMoore.*` (previously using UTF-16) with the UTF-8 implementation 28 | 29 | ## v1.1.2.0 - "ByteArray Boogaloo" (2022-04-21) 30 | 31 | Added UTF-8 implementations on a mock `Text` type (in `Data.Text.Utf8`). 32 | 33 | - Added `Data.Text.Utf8*` modules 34 | - Moved `CaseSensitivity` to its own `Data.Text.CaseSensitivity` module. 35 | - Added the private module `Data.TypedByteArray` which contains thin wrappers over `ByteArray` and `MutableByteArray`. 36 | - Replaced uses of `Data.Vector.Unboxed.Vector` by `TypedByteArray`. 37 | 38 | ## v1.1.0.0 - "Moore Features" (2020-10-13) 39 | 40 | The most notable addition in this release is the implementation of the Boyer-Moore string search algorithm. 
41 | 42 | **Compatibility:** 43 | 44 | - Extracted the UTF-16 manipulation functions from `Data.Text.AhoCorasick.Automaton` into `Data.Text.Utf16` 45 | - Changed `Data.Text.AhoCorasick.Searcher.Searcher` to remember the case sensitivity used for constructing the searcher 46 | - Removed `Data.Text.AhoCorasick.Searcher.containsAnyIgnoreCase`, the correct implementation is now chosen by `containsAny` based on the case sensitivity of the searcher 47 | 48 | Other changes: 49 | 50 | - Added `Data.Text.AhoCorasick.Splitter` for splitting a lot of text using the same needle 51 | - Added `Data.Text.BoyerMoore.Automaton`, a UTF-16 implementation of Boyer-Moore 52 | - Added `Data.Text.BoyerMoore.Searcher` for searching for multiple needles at once using Boyer-Moore 53 | - Added `Data.Text.BoyerMoore.Replacer` for replacing text based on the Boyer-Moore search 54 | - Added optional `FromJSON`/`ToJSON` instances for most types (can be toggled via `aeson` cabal flag) 55 | 56 | ## v1.0.0.0 - "Initial Release" (2019-03-19) 57 | 58 | This is the initial open-source release. 
59 | 60 | - Added `Data.Text.AhoCorasick.Automaton`, a UTF-16 implementation of the Aho-Corasick search algorithm 61 | - Added `Data.Text.AhoCorasick.Searcher`, a bulk search abstraction based on Aho-Corasick 62 | - Added `Data.Text.AhoCorasick.Replacer`, a bulk replace abstraction based on Aho-Corasick 63 | -------------------------------------------------------------------------------- /src/Data/Text/Utf8/Unlower.hs: -------------------------------------------------------------------------------- 1 | 2 | module Data.Text.Utf8.Unlower 3 | ( 4 | unlowerCodePoint 5 | 6 | , printUnlowerings 7 | ) where 8 | 9 | import Control.Monad (forM_) 10 | 11 | import qualified Data.Char as Char 12 | import qualified Data.HashMap.Strict as HashMap 13 | import qualified Data.List as List 14 | 15 | 16 | -- | Inverse of Char.toLower/Utf8.lowerCodePoint 17 | -- 18 | -- Returns all the characters that have the given character as their lower case, for example: 19 | -- 20 | -- unlowerCodePoint 'a' == "aA" 21 | -- unlowerCodePoint 'A' == "" 22 | -- unlowerCodePoint '1' == "1" 23 | -- unlowerCodePoint 'i' == "İiI" 24 | -- unlowerCodePoint 'ß' == "ẞß" 25 | -- 26 | unlowerCodePoint :: Char -> [Char] 27 | unlowerCodePoint = 28 | \c -> maybe [c] id $ HashMap.lookup c unlowerings 29 | 30 | -- | This map contains all the unlowerings for which the result is not just a singleton with the 31 | -- input character. It's marked NOINLINE to make sure that it only gets constructed once. 32 | unlowerings :: HashMap.HashMap Char [Char] 33 | {-# NOINLINE unlowerings #-} 34 | unlowerings = 35 | HashMap.filterWithKey isNotId $ List.foldl' (flip addUnlowering) initialMap [minBound..maxBound] 36 | where 37 | initialMap = HashMap.fromList $ zip [minBound..maxBound] (repeat []) 38 | addUnlowering c hm = 39 | HashMap.insertWith (++) (Char.toLower c) [c] hm 40 | isNotId lc ucs = ucs /= [lc] 41 | 42 | 43 | -- | This function prints all the special cases of unlowerCodePoint where it's not @(pure . 
id)@: 44 | -- 45 | -- SPECIAL: i (105) -> İ (304) i (105) I (73) 46 | -- SPECIAL: k (107) -> K (8490) k (107) K (75) 47 | -- SPECIAL: ß (223) -> ẞ (7838) ß (223) 48 | -- SPECIAL: å (229) -> Å (8491) å (229) Å (197) 49 | -- SPECIAL: dž (454) -> dž (454) Dž (453) DŽ (452) 50 | -- SPECIAL: lj (457) -> lj (457) Lj (456) LJ (455) 51 | -- SPECIAL: nj (460) -> nj (460) Nj (459) NJ (458) 52 | -- SPECIAL: dz (499) -> dz (499) Dz (498) DZ (497) 53 | -- SPECIAL: θ (952) -> ϴ (1012) θ (952) Θ (920) 54 | -- SPECIAL: ω (969) -> Ω (8486) ω (969) Ω (937) 55 | -- [..] 56 | -- Inverse of Char.toUpper: a (97) -> a (97) A (65) 57 | -- Inverse of Char.toUpper: b (98) -> b (98) B (66) 58 | -- Inverse of Char.toUpper: c (99) -> c (99) C (67) 59 | -- [..] 60 | -- 61 | printUnlowerings :: IO () 62 | printUnlowerings = do 63 | 64 | let 65 | showCP :: Char -> String 66 | showCP c = case Char.ord c of 67 | co | co > 68000 -> show co -- Some RTL languages above these code points are annoying to print 68 | co -> c : " (" <> show co <> ")" 69 | 70 | showCPs :: [Char] -> String 71 | showCPs cs = List.intercalate " " (map showCP cs) 72 | 73 | isInverse (lc, ucs) = ucs == [lc, Char.toUpper lc] || ucs == [Char.toUpper lc, lc] 74 | isAlreadyUppercase (_, ucs) = ucs == [] 75 | isSpecial p = not (isInverse p) && not (isAlreadyUppercase p) 76 | 77 | lst :: [(Char, [Char])] 78 | lst = HashMap.toList unlowerings 79 | 80 | forM_ (filter isSpecial lst) $ \(lc, ucs) -> do 81 | putStrLn $ "SPECIAL: " <> showCP lc <> " -> " <> showCPs ucs 82 | 83 | forM_ (filter isAlreadyUppercase lst) $ \(lc, _) -> do 84 | putStrLn $ "Already uppercase (there is no unlowering): " <> showCP lc 85 | 86 | forM_ (filter isInverse lst) $ \(lc, ucs) -> do 87 | putStrLn $ "Inverse of Char.toUpper: " <> showCP lc <> " -> " <> List.intercalate " " (map showCP ucs) 88 | -------------------------------------------------------------------------------- /benchmark/benchmark.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import clize 4 | import os 5 | import os.path 6 | import subprocess 7 | import sys 8 | 9 | # clize turns this into a CLI where program is a required argument and prefix is a required option 10 | def benchmark(program, *, prefix, data_directory="data"): 11 | """ 12 | This script runs an executable 5 times and collects the running times it 13 | reports. Names of the .txt files in the data/ directory are provivided to the 14 | program on the command line. Expected output of the program is tab-separated 15 | times (with trailing tab) per run in nanoseconds on stdout, and the number of 16 | matches on stderr (only once, not per run). The output should be one line per 17 | input file. 18 | 19 | Example: 20 | ./benchmark.py ./naive.py --prefix python 21 | 22 | :param program: What to benchmark. 23 | :param prefix: Used for generating output files $prefix.{stats,results} 24 | :param data_directory: Directory containing the benchmark data files. 25 | """ 26 | 27 | # # Disable automatic CPU frequency scaling to get lower variance measurements. 
28 | with open('/sys/devices/system/cpu/cpu1/cpufreq/scaling_governor', 'r') as f: 29 | scaling_governor = next(f) 30 | 31 | if scaling_governor != 'performance\n': 32 | print('Please run the following to clock CPU frequency to its maximum:') 33 | print('echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor') 34 | sys.exit(1) 35 | 36 | input_file_names = [] 37 | for f in os.listdir(data_directory): 38 | file_name = os.path.join(data_directory, f) 39 | if os.path.isfile(file_name) and file_name.endswith('.txt'): 40 | input_file_names.append(os.path.abspath(file_name)) 41 | 42 | # Makes comparison of .results files easier 43 | input_file_names.sort() 44 | 45 | print(f'Found {len(input_file_names)} files to benchmark.') 46 | 47 | # Run the program to benchmark under taskset to lock it to CPU core 1. This 48 | # avoids variance due to CPU migrations. 49 | cmdline = ['taskset', '-c', '1', program] + input_file_names 50 | 51 | times = [] 52 | results = None 53 | 54 | num_rounds = 5 55 | for i in range(0, num_rounds): 56 | print(f'Round {i + 1} of {num_rounds}.') 57 | p = subprocess.run(cmdline, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 58 | 59 | if p.returncode != 0: 60 | print(f'Running {cmdline[:6]}... failed.') 61 | print(p.stdout.decode('utf-8')) 62 | print(p.stderr.decode('utf-8')) 63 | sys.exit(1) 64 | 65 | assert results is None or results == p.stderr, ( 66 | 'Program should have consistent output.\n' 67 | f'{results!r} != {p.stderr!r}' 68 | ) 69 | results = p.stderr 70 | 71 | # Paste outputs of the different runs together, like GNU Paste. 
72 | if len(times) == 0: 73 | times = p.stdout.splitlines() 74 | else: 75 | times = [acc + new for acc, new in zip(times, p.stdout.splitlines())] 76 | 77 | with open(f'{prefix}.stats', 'wb') as f: 78 | f.writelines(ts + b'\n' for ts in times) 79 | 80 | with open(f'{prefix}.results', 'wb') as f: 81 | f.write(results) 82 | 83 | print(f'Results written to {prefix}.{{stats,results}}.') 84 | 85 | if __name__ == '__main__': 86 | clize.run(benchmark) 87 | -------------------------------------------------------------------------------- /benchmark/rust/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate aho_corasick; 2 | extern crate filebuffer; 3 | extern crate memchr; 4 | 5 | use std::env; 6 | use std::io; 7 | 8 | use aho_corasick::{AcAutomaton, Automaton, Dense}; 9 | use filebuffer::FileBuffer; 10 | use memchr::memchr; 11 | 12 | /// Print the number of matches found in the haystack in the file, and timings. 13 | /// 14 | /// The file format consists of needles, one per line, followed by a blank line, 15 | /// followed by the haystack. The file is encoded as UTF-16 LE without byte 16 | /// order mark. 17 | fn process_file(file_name: String) -> io::Result<()> { 18 | let file = FileBuffer::open(file_name)?; 19 | 20 | let mut needles = Vec::with_capacity(1000); 21 | let mut start = 0; 22 | 23 | loop { 24 | // The UTF-16 LE newline sequence is an ascii newline (0x0a) and a null 25 | // byte. Locate the newline byte (0x0a) first, then check that it is 26 | // part of 0x0a, 0x00 with the proper alignment. 27 | let mut len = 0; 28 | loop { 29 | len += memchr(0x0a, &file[start + len..]).expect("Unexpected EOF before haystack"); 30 | if len % 2 != 0 { 31 | // We are looking for the first byte of the 16-bit integer, 32 | // not the second one. 33 | continue; 34 | } 35 | 36 | if let Some(0x00) = file.get(start + len + 1) { 37 | // The second byte is as expected, this really was a newline. 
38 | break; 39 | } 40 | 41 | // Skip over the 0x0a that is not a newline on the next memchr. 42 | len += 1; 43 | } 44 | 45 | // A blank line ends the needle section of the file, what follows is 46 | // haystack. 47 | if len == 0 { 48 | break; 49 | } 50 | 51 | // The newline byte is not part of the needle. 52 | let end = start + len; 53 | needles.push(&file[start..end]); 54 | 55 | // The next needle starts after the null byte after the newline byte. 56 | start = start + len + 2; 57 | } 58 | 59 | let haystack = &file[start..]; 60 | 61 | // Run the benchmark 5 times. 62 | for i in 0..5 { 63 | let epoch = std::time::Instant::now(); 64 | // NOTE: You can opt for aho_corasick::Dense or Sparse. Dense is more memory 65 | // efficient and Sparse is faster according to the docs, however, in my 66 | // measurements, Dense is faster. You can also leave off the .into_full() 67 | // which is also more memory efficient but slower according to the docs, 68 | // and also according to my measurements. 69 | // TODO: Re-enable .into_full() once https://github.com/BurntSushi/aho-corasick/issues/35 70 | // is fixed. 71 | let automaton = AcAutomaton::<_, Dense>::with_transitions(&needles[..]); 72 | let num_matches = automaton.find_overlapping(haystack).count(); 73 | 74 | let duration = epoch.elapsed(); 75 | 76 | // Print duration in nanoseconds, tab separated. 77 | print!("{}\t", duration.as_nanos()); 78 | 79 | // In the first iteration, print the match count to stderr, so we can 80 | // verify it against the reference implementation for correctness. 81 | if i == 0 { 82 | eprintln!("{}", num_matches); 83 | } 84 | } 85 | 86 | // Print a newline, so we print one line of measurements per file. 87 | println!(""); 88 | 89 | Ok(()) 90 | } 91 | 92 | fn main() { 93 | // Skip the program name. 
94 | for file_name in env::args().skip(1) { 95 | process_file(file_name).unwrap(); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMooreCI/Replacer.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE GHC2021 #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE DerivingStrategies #-} 10 | {-# LANGUAGE FlexibleInstances #-} 11 | 12 | module Data.Text.BoyerMooreCI.Replacer 13 | ( -- Replacer 14 | replaceSingleLimited 15 | ) where 16 | 17 | import GHC.Generics (Generic) 18 | import Data.Text.Utf8 (Text) 19 | import Data.Text.BoyerMooreCI.Automaton (Automaton, CodeUnitIndex) 20 | 21 | import qualified Data.Text.Utf8 as Text 22 | import qualified Data.Text.Utf8 as Utf8 23 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMoore 24 | 25 | -- | Replace all occurrences matched by the Boyer-Moore automaton 26 | -- with the given replacement text in some haystack. 27 | -- Performs case-sensitive replacement. 
28 | replaceSingleLimited 29 | :: Automaton -- ^ Matches the needles 30 | -> Text -- ^ Replacement string 31 | -> Text -- ^ Haystack 32 | -> CodeUnitIndex -- ^ Maximum number of code units in the returned text 33 | -> Maybe Text 34 | replaceSingleLimited needle replacement haystack maxLength 35 | | needleLength == 0 = Just $ if haystackLength == 0 then replacement else haystack 36 | | otherwise = finish $ BoyerMoore.runText initial foundMatch needle haystack 37 | where 38 | needleLength = BoyerMoore.patternLength needle 39 | haystackLength = Utf8.lengthUtf8 haystack 40 | replacementLength = Utf8.lengthUtf8 replacement 41 | 42 | initial = ReplaceState 43 | { rsChunks = [] 44 | , rsPreviousMatchEnd = 0 45 | , rsLength = 0 46 | } 47 | 48 | foundMatch rs matchStart matchEnd = 49 | let 50 | -- Slice the part of the haystack between the end of the previous match 51 | -- and the start of the current match 52 | haystackPartLength = matchStart - rsPreviousMatchEnd rs 53 | haystackPart = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 54 | 55 | -- Add the preceding part of the haystack and the replacement in reverse 56 | -- order to the chunk list (all chunks will be reversed at once in the final step). 57 | newChunks = replacement : haystackPart : rsChunks rs 58 | newLength = replacementLength + haystackPartLength + rsLength rs 59 | 60 | newState = ReplaceState 61 | { rsChunks = newChunks 62 | , rsPreviousMatchEnd = matchEnd + 1 63 | , rsLength = newLength 64 | } 65 | in 66 | if newLength > maxLength 67 | then BoyerMoore.Done newState 68 | else BoyerMoore.Step newState 69 | 70 | finish rs = 71 | let 72 | -- Slice the remaining part of the haystack from the end of the last match 73 | -- to the end of the haystack. 
74 | haystackPartLength = haystackLength - rsPreviousMatchEnd rs 75 | finalChunks 76 | = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 77 | : rsChunks rs 78 | finalLength = rsLength rs + haystackPartLength 79 | in 80 | if finalLength > maxLength 81 | then Nothing 82 | else Just $ Text.concat $ reverse finalChunks 83 | 84 | -- | Internal accumulator state for performing a replace while stepping an automaton 85 | data ReplaceState = ReplaceState 86 | { rsChunks :: [Text] 87 | -- ^ Chunks of the final text, in reverse order so that we can efficiently prepend 88 | , rsPreviousMatchEnd :: !CodeUnitIndex 89 | -- ^ Index one past the end of the last match. 90 | , rsLength :: !CodeUnitIndex 91 | -- ^ Length of the newly build string so far, measured in CodeUnits 92 | } 93 | deriving Generic 94 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMoore/Replacer.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE GHC2021 #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE DerivingStrategies #-} 10 | {-# LANGUAGE FlexibleInstances #-} 11 | 12 | module Data.Text.BoyerMoore.Replacer 13 | ( -- Replacer 14 | replaceSingleLimited 15 | ) where 16 | 17 | import GHC.Generics (Generic) 18 | import Data.Text.Utf8 (Text) 19 | import Data.Text.BoyerMoore.Automaton (Automaton, CodeUnitIndex) 20 | 21 | import qualified Data.Text.Utf8 as Text 22 | import qualified Data.Text.Utf8 as Utf8 23 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 24 | 25 | -- | Replace all occurrences matched by the Boyer-Moore automaton 26 | -- with the given replacement text in some haystack. 27 | -- Performs case-sensitive replacement. 
28 | replaceSingleLimited 29 | :: Automaton -- ^ Matches the needles 30 | -> Text -- ^ Replacement string 31 | -> Text -- ^ Haystack 32 | -> CodeUnitIndex -- ^ Maximum number of code units in the returned text 33 | -> Maybe Text 34 | replaceSingleLimited needle replacement haystack maxLength 35 | | needleLength == 0 = Just $ if haystackLength == 0 then replacement else haystack 36 | | otherwise = finish $ BoyerMoore.runText initial foundMatch needle haystack 37 | where 38 | needleLength = BoyerMoore.patternLength needle 39 | haystackLength = Utf8.lengthUtf8 haystack 40 | replacementLength = Utf8.lengthUtf8 replacement 41 | 42 | initial = ReplaceState 43 | { rsChunks = [] 44 | , rsPreviousMatchEnd = 0 45 | , rsLength = 0 46 | } 47 | 48 | foundMatch rs matchStart = 49 | let 50 | matchEnd = matchStart + needleLength 51 | 52 | -- Slice the part of the haystack between the end of the previous match 53 | -- and the start of the current match 54 | haystackPartLength = matchStart - rsPreviousMatchEnd rs 55 | haystackPart = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 56 | 57 | -- Add the preceding part of the haystack and the replacement in reverse 58 | -- order to the chunk list (all chunks will be reversed at once in the final step). 59 | newChunks = replacement : haystackPart : rsChunks rs 60 | newLength = replacementLength + haystackPartLength + rsLength rs 61 | 62 | newState = ReplaceState 63 | { rsChunks = newChunks 64 | , rsPreviousMatchEnd = matchEnd 65 | , rsLength = newLength 66 | } 67 | in 68 | if newLength > maxLength 69 | then BoyerMoore.Done newState 70 | else BoyerMoore.Step newState 71 | 72 | finish rs = 73 | let 74 | -- Slice the remaining part of the haystack from the end of the last match 75 | -- to the end of the haystack. 
76 | haystackPartLength = haystackLength - rsPreviousMatchEnd rs 77 | finalChunks 78 | = Utf8.unsafeSliceUtf8 (rsPreviousMatchEnd rs) haystackPartLength haystack 79 | : rsChunks rs 80 | finalLength = rsLength rs + haystackPartLength 81 | in 82 | if finalLength > maxLength 83 | then Nothing 84 | else Just $ Text.concat $ reverse finalChunks 85 | 86 | -- | Internal accumulator state for performing a replace while stepping an automaton 87 | data ReplaceState = ReplaceState 88 | { rsChunks :: [Text] 89 | -- ^ Chunks of the final text, in reverse order so that we can efficiently prepend 90 | , rsPreviousMatchEnd :: !CodeUnitIndex 91 | -- ^ Index one past the end of the last match. 92 | , rsLength :: !CodeUnitIndex 93 | -- ^ Length of the newly build string so far, measured in CodeUnits 94 | } 95 | deriving Generic 96 | -------------------------------------------------------------------------------- /tests/Data/Text/TestInstances.hs: -------------------------------------------------------------------------------- 1 | module Data.Text.TestInstances where 2 | 3 | import Data.Text.Internal (Text (..)) 4 | import Test.QuickCheck (Arbitrary (..), Gen) 5 | 6 | import qualified Data.Text as Text 7 | import qualified Data.Text.Array as TextArray 8 | import qualified Test.QuickCheck as QuickCheck 9 | import qualified Test.QuickCheck.Gen as Gen 10 | 11 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 12 | import qualified Data.Text.Utf8 as Utf8 13 | 14 | 15 | 16 | instance Arbitrary CaseSensitivity where 17 | arbitrary = Gen.elements [CaseSensitive, IgnoreCase] 18 | 19 | instance Arbitrary Utf8.CodeUnitIndex where 20 | arbitrary = fmap Utf8.CodeUnitIndex arbitrary 21 | 22 | 23 | 24 | -- | Copy the text such that it has an arbitrary offset, but the represented 25 | -- text stays the same. 
26 | arbitraryOffset :: Text -> Gen Text 27 | arbitraryOffset (Text sourceData sourceOffset sourceLength) = do 28 | QuickCheck.Positive destOffset <- arbitrary 29 | let destData = TextArray.run $ do 30 | arr <- TextArray.new (destOffset + sourceLength) 31 | TextArray.copyI sourceLength arr destOffset sourceData sourceOffset 32 | pure arr 33 | pure $ Text destData destOffset sourceLength 34 | 35 | 36 | -- | Generate random needles and haystacks, such that the needles have a 37 | -- reasonable probability of occuring in the haystack, which would hardly be the 38 | -- case if we just generated random texts for all of them. 39 | -- 40 | -- We do this by first generating a set of fragments, and then building the 41 | -- haystack and needles by combining these fragments. This way we also get a lot 42 | -- of partial matches where part of a needle does occur in the haystack, but the 43 | -- full needle does not, as well as needles with a shared prefix or suffix. This 44 | -- should fully stress the possible transitions in the search algorithms. 
45 | -- 46 | arbitraryNeedleHaystack :: Gen (Text, Text) 47 | arbitraryNeedleHaystack = do 48 | -- Generate a set of fragments, all within the same arbitrarily chosen alphabet 49 | alphabet <- arbitraryAlphabet 50 | fragments <- Gen.listOf1 $ Gen.resize 5 (arbitraryFragment alphabet) 51 | let 52 | genSmall = Gen.scale (`div` 3) $ Gen.listOf1 $ Gen.elements fragments 53 | genBig = Gen.scale (* 4) $ Gen.listOf1 $ Gen.elements fragments 54 | needle <- arbitraryOffset =<< fmap Text.concat genSmall 55 | haystack <- arbitraryOffset =<< fmap Text.concat genBig 56 | pure (needle, haystack) 57 | 58 | arbitraryNeedlesHaystack :: Gen ([Text], Text) 59 | arbitraryNeedlesHaystack = do 60 | -- Generate a set of fragments, all within the same arbitrarily chosen alphabet 61 | alphabet <- arbitraryAlphabet 62 | fragments <- Gen.listOf1 $ Gen.resize 5 (arbitraryFragment alphabet) 63 | let 64 | genSmall = Gen.scale (`div` 3) $ Gen.listOf1 $ Gen.elements fragments 65 | genBig = Gen.scale (* 4) $ Gen.listOf1 $ Gen.elements fragments 66 | needles <- Gen.listOf1 (arbitraryOffset =<< fmap Text.concat genSmall) 67 | haystack <- arbitraryOffset =<< fmap Text.concat genBig 68 | pure (needles, haystack) 69 | 70 | arbitraryFragment :: [Char] -> Gen Text 71 | arbitraryFragment alphabet = 72 | arbitraryOffset =<< Text.pack <$> Gen.listOf1 (Gen.elements alphabet) 73 | 74 | arbitraryAlphabet :: Gen [Char] 75 | arbitraryAlphabet = 76 | Gen.oneof [ pure simpleAlphabet 77 | , pure fancyAlphabet 78 | , randomAlphabet 79 | ] 80 | 81 | where 82 | 83 | simpleAlphabet = "abAB12" -- Some ascii, include numbers so that not everything has upper/lower cases 84 | 85 | fancyAlphabet = 86 | concat 87 | [ "яЯ" -- Cyrillic, two-byte characters 88 | , "åÅÅ" -- Å '\8491' and Å '\197' both have å '\229' as lower case 89 | ++ "𝄞💩" -- Four byte characters \119070 and \128169 90 | ++ "ßẞ" -- Note that ẞ '\7838' lower cases to ß '\223', but ß '\223' upper cases to ß '\223' 91 | ] 92 | 93 | randomAlphabet = sequence $ 
replicate 8 QuickCheck.arbitrary 94 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # benchmark 2 | 3 | This is the code used to benchmark `alfred-margaret`. 4 | The results of this benchmark are shown in this [Channable tech blog article](https://www.channable.com/tech/how-we-made-haskell-search-strings-as-fast-as-rust). 5 | 6 | ## Setup 7 | 8 | We use Nix to manage dependencies such as `cargo`, `bazel`, `stack` and the required python packages. 9 | In order to drop into a environment containing these, run: 10 | 11 | ``` 12 | # In the repository root (alfred-margaret folder) 13 | nix run --arg benchTools true -c $SHELL 14 | ``` 15 | 16 | The first time you do this will likely take a while unless you've already downloaded those packages (some 5 GiB). 17 | 18 | ## Running the Benchmarks 19 | 20 | Files in the `data` directory must follow this format: 21 | 22 | ``` 23 | Lorem 24 | sunt 25 | officia 26 | 27 | Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 28 | ``` 29 | 30 | The first part of this file defines the search terms (the "needles") `Lorem`, `sunt` and `officia`. 31 | The second part, after a blank line, defines the corpus to search in (the "haystack"). 32 | Currently, these files **must be encoded as UTF-16 Little-Endian without BOM**. 33 | 34 | Each run of `benchmark.sh` generates a pair of `.results` and `.stats` files. 
35 | 36 | ### Python 37 | 38 | Run the benchmark in the root of the repository: 39 | 40 | ``` 41 | ./benchmark.sh python 42 | ``` 43 | 44 | ### Rust 45 | 46 | Compile the Rust implementation by running Cargo in the `./benchmark/rust` directory: 47 | 48 | ``` 49 | cargo build --release 50 | ``` 51 | 52 | Run the benchmark using the compiled binary: 53 | 54 | ``` 55 | ./benchmark.sh rust 56 | ``` 57 | 58 | ### Java 59 | 60 | Compile the Java implementation by running Bazel in the `java` directory: 61 | 62 | ``` 63 | bazel build :acbench_deploy.jar 64 | ``` 65 | 66 | Run the benchmark using the script generated by Bazel: 67 | 68 | ``` 69 | ./benchmark.sh java 70 | ``` 71 | 72 | ### Haskell 73 | 74 | Compile the Haskell implementation by running Stack in the `haskell` directory: 75 | 76 | ``` 77 | stack build 78 | ``` 79 | 80 | Run the benchmark using the compiled binary: 81 | 82 | ``` 83 | ./benchmark.sh haskell 84 | ``` 85 | 86 | Note that you must first convert the data files for the benchmark into UTF-8 and put them in `data-utf8`. 87 | You can use `iconv` for this: 88 | 89 | ``` 90 | mkdir data-utf8 91 | for f in $(ls data); do echo $f; iconv -f UTF-16LE -t UTF-8 data/$f -o data-utf8/$f; done 92 | ``` 93 | 94 | ### Haskell, with Rust FFI 95 | 96 | Compile the static `libacbench` library using `cargo` in the `rust-ffi/libacbench` directory: 97 | 98 | ``` 99 | cargo build --release 100 | ``` 101 | 102 | Make sure to pass `--release`! Now run Stack in the `rust-ffi` directory: 103 | 104 | ``` 105 | stack build 106 | ``` 107 | 108 | Run the benchmark using the compiled binary: 109 | 110 | ``` 111 | ./benchmark.sh rust-ffi 112 | ``` 113 | 114 | Since this version uses the UTF-8 as well, you have to generate the UTF-8 data first as described in the previous section. 
115 | 116 | ## Inspecting the Results 117 | 118 | Once you have a bunch of `.stats` files, you can inspect the results using `report.py`: 119 | 120 | ``` 121 | ./report.py haskell.stats haskell-utf8.stats rust.stats java.stats python.stats 122 | # Will print something like (depending on your machine...) 123 | haskell.stats: 124 | mean time: 3.529 ± 0.010 seconds 125 | min time: 3.431 seconds 126 | 127 | haskell-utf8.stats: 128 | mean time: 3.526 ± 0.028 seconds 129 | min time: 3.392 seconds 130 | 131 | rust.stats: 132 | mean time: 4.771 ± 0.013 seconds 133 | min time: 4.685 seconds 134 | 135 | java.stats: 136 | mean time: 7.452 ± 0.090 seconds 137 | min time: 7.143 seconds 138 | 139 | python.stats: 140 | mean time: 11.370 ± 0.057 seconds 141 | min time: 10.973 seconds 142 | ``` 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alfred–Margaret 2 | 3 | Alfred–Margaret is a fast implementation of the Aho–Corasick string 4 | searching algorithm in Haskell. It powers many string-related operations 5 | in [Channable][channable]. 6 | 7 | The library is designed to work with the [`text`][text] package. It matches 8 | directly on the internal UTF-16 representation of `Text` for efficiency. See the 9 | [announcement blog post][blog-post] for a deeper dive into Aho–Corasick, and the 10 | optimizations that make this library fast. 11 | 12 | Alfred–Margaret is named after Alfred Aho and Margaret Corasick. 13 | 14 | ## Performance 15 | 16 | Running time to count all matches, in a real-world data set, 17 | comparing [a Java implementation][hankcs] and [a Rust implementation][burntsushi] 18 | against Alfred–Margaret, and against memcopy to establish a lower bound: 19 | 20 |

21 | 26 |

27 | 28 | For the full details of this benchmark, see 29 | [our announcement blog post][blog-post], which includes more details about the 30 | data set, the benchmark setup, and a few things to keep in mind when 31 | interpreting this graph. 32 | 33 | ### LLVM 34 | 35 | If you are using LLVM instead of the GHC backend, make sure to compare different versions. 36 | Using LLVM 9 instead of LLVM 12 with GHC 8.10.7 made a significant dent in benchmark times. 37 | For more information, see [this issue][llvm]. 38 | 39 | ## Example 40 | 41 | Check if a string contains one of the needles: 42 | 43 | ```haskell 44 | 45 | import qualified Data.Text.AhoCorasick.Automaton as Aho 46 | import qualified Data.Text.AhoCorasick.Searcher as Searcher 47 | 48 | searcher = Searcher.build Aho.CaseSensitive ["tshirt", "shirts", "shorts"] 49 | 50 | Searcher.containsAny searcher "short tshirts" 51 | > True 52 | 53 | Searcher.containsAny searcher "long shirt" 54 | > False 55 | 56 | Searcher.containsAny searcher "Short TSHIRTS" 57 | > False 58 | 59 | searcher' = Searcher.build Aho.IgnoreCase ["tshirt", "shirts", "shorts"] 60 | 61 | Searcher.containsAny searcher' "Short TSHIRTS" 62 | > True 63 | ``` 64 | 65 | Sequentially replace many needles: 66 | 67 | ```haskell 68 | import Data.Text.AhoCorasick.Automaton (CaseSensitivity (..)) 69 | import qualified Data.Text.AhoCorasick.Replacer as Replacer 70 | 71 | replacer = Replacer.build CaseSensitive [("tshirt", "banana"), ("shirt", "pear")] 72 | 73 | Replacer.run replacer "tshirts for sale" 74 | > "bananas for sale" 75 | 76 | Replacer.run replacer "tshirts and shirts for sale" 77 | > "bananas and pears for sale" 78 | 79 | Replacer.run replacer "sweatshirts and shirtshirts" 80 | > "sweabananas and shirbananas" 81 | 82 | Replacer.run replacer "sweatshirts and shirttshirts" 83 | > "sweabananas and pearbananas" 84 | ``` 85 | 86 | Get all matches, possibly overlapping: 87 | 88 | ```haskell 89 | import qualified Data.Text.AhoCorasick.Automaton as Aho 90 | 
91 | pairNeedleWithSelf text = (Aho.unpackUtf16 text, text) 92 | automaton = Aho.build $ fmap pairNeedleWithSelf ["tshirt", "shirts", "shorts"] 93 | allMatches = Aho.runText [] (\matches match -> Aho.Step (match : matches)) 94 | 95 | allMatches automaton "short tshirts" 96 | > [ Match {matchPos = CodeUnitIndex 13, matchValue = "shirts"} 97 | > , Match {matchPos = CodeUnitIndex 12, matchValue = "tshirt"} 98 | > ] 99 | 100 | allMatches automaton "sweatshirts and shirtshirts" 101 | > [ Match {matchPos = CodeUnitIndex 27, matchValue = "shirts"} 102 | > , Match {matchPos = CodeUnitIndex 26, matchValue = "tshirt"} 103 | > , Match {matchPos = CodeUnitIndex 22, matchValue = "shirts"} 104 | > , Match {matchPos = CodeUnitIndex 11, matchValue = "shirts"} 105 | > , Match {matchPos = CodeUnitIndex 10, matchValue = "tshirt"} 106 | > ] 107 | ``` 108 | 109 | ## License 110 | 111 | Alfred–Margaret is licensed under the 3-clause BSD license. 112 | 113 | [channable]: https://www.channable.com/ 114 | [blog-post]: https://tech.channable.com/posts/2019-03-13-how-we-made-haskell-search-strings-as-fast-as-rust.html 115 | [text]: https://github.com/haskell/text 116 | [hankcs]: https://github.com/hankcs/AhoCorasickDoubleArrayTrie/tree/v1.2.0 117 | [burntsushi]: https://github.com/BurntSushi/aho-corasick/tree/0.6.8 118 | [llvm]: https://github.com/channable/alfred-margaret/issues/24 119 | -------------------------------------------------------------------------------- /benchmark/rust-ffi/app/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | 3 | module Main where 4 | 5 | import GHC.Generics (Generic) 6 | import Control.DeepSeq (force) 7 | import Control.Exception (evaluate) 8 | import Control.Monad (void, when) 9 | import Data.Foldable (for_, traverse_) 10 | import Data.Word (Word8) 11 | import Foreign.C.Types (CBool (..), CSize (..)) 12 | import Foreign.Marshal (with, withArray) 13 | import Foreign.Ptr (Ptr, 
castPtr) 14 | import Foreign.Storable (Storable (sizeOf), alignment, peek, poke, pokeByteOff, sizeOf) 15 | import GHC.Compact (compact, getCompact) 16 | import System.IO (hPrint, stderr, stdout) 17 | import Text.Printf (hPrintf) 18 | 19 | import qualified Data.ByteString as ByteString 20 | import qualified Data.Text.Encoding as Encoding 21 | import qualified System.Clock as Clock 22 | import qualified System.Environment as Env 23 | 24 | import Data.Text.Utf8 (CodeUnitIndex (..), Text (..)) 25 | 26 | import qualified Data.Text.Utf8 as Utf8 27 | 28 | foreign import ccall unsafe "perform_ac" 29 | performAc :: CBool -> CSize -> Ptr U8Slice -> Ptr U8Slice -> IO CSize 30 | 31 | -- | A slice of 'Word8's that can be passed to FFI. 32 | -- The pointer should always point to pinned memory. 33 | -- Use 'fromText' for constructing 'U8Slice's to ensure this. 34 | data U8Slice = U8Slice (Ptr Word8) CSize CSize 35 | deriving Generic 36 | 37 | instance Storable U8Slice where 38 | sizeOf _ = sizeOf (undefined :: Ptr Word8) + 2 * sizeOf (undefined :: CSize) 39 | alignment _ = max (alignment (undefined :: Ptr Word8)) (alignment (undefined :: CSize)) 40 | peek _ptr = error "We only write U8Slices to pointer, never read them" 41 | poke ptr (U8Slice u8ptr off len) = do 42 | poke ptr' u8ptr 43 | pokeByteOff ptr' (sizeOf (undefined :: Ptr Word8)) off 44 | pokeByteOff ptr' (sizeOf (undefined :: Ptr Word8) + sizeOf (undefined :: CSize)) len 45 | where ptr' = castPtr ptr 46 | 47 | -- | Turn a 'Text' value into something that can be passed through FFI. 48 | -- The 'ByteArray' values inside must be pinned. 
49 | fromText :: Text -> U8Slice 50 | fromText (Text u8data off len) 51 | | Utf8.isArrayPinned u8data = U8Slice (Utf8.arrayContents u8data) (fromIntegral off) (fromIntegral len) 52 | | otherwise = error "ByteArray is not pinned" 53 | 54 | readNeedleHaystackFile :: FilePath -> IO ([Text], Text) 55 | readNeedleHaystackFile path = do 56 | (Text u8data off len) <- Encoding.decodeUtf8 <$> ByteString.readFile path 57 | pure $ go u8data off len [] 58 | where 59 | go u8data off 0 needles = (reverse needles, Text u8data off 0) 60 | go u8data off len needles 61 | -- "line starts with newline char" ==> empty line, emit haystack as slice of u8data 62 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = (reverse needles, Text u8data (off + 1) (len - 1)) 63 | | otherwise = consumeNeedle u8data off len needles off 64 | 65 | consumeNeedle u8data off len needles needleStart 66 | -- Newline ==> emit needle as slice of u8data 67 | | Utf8.unsafeIndexCodeUnit' u8data (CodeUnitIndex off) == 10 = go u8data (off + 1) (len - 1) $ Text u8data needleStart (off - needleStart) : needles 68 | | otherwise = consumeNeedle u8data (off + 1) (len - 1) needles needleStart 69 | 70 | main :: IO () 71 | main = Env.getArgs >>= traverse_ processFile 72 | 73 | processFile :: FilePath -> IO () 74 | processFile path = do 75 | (needles, haystack) <- getCompact <$> (compact =<< readNeedleHaystackFile path) 76 | 77 | void $ evaluate $ force needles 78 | void $ evaluate $ force haystack 79 | 80 | for_ [0 :: Int .. 
4] $ \i -> do 81 | (count, duration) <- acBench needles haystack 82 | when (i == 0) $ 83 | hPrint stderr count 84 | hPrintf stdout "%d\t" (Clock.toNanoSecs duration) 85 | hPrintf stdout "\n" 86 | 87 | acBench :: [Text] -> Text -> IO (Int, Clock.TimeSpec) 88 | acBench needles haystack = do 89 | start <- Clock.getTime Clock.Monotonic 90 | 91 | let numNeedles = fromIntegral $ length needles 92 | matchCount <- with (fromText haystack) $ \haystackSlice -> 93 | withArray (map fromText needles) $ \needleSlices -> do 94 | performAc useSparse numNeedles needleSlices haystackSlice 95 | 96 | end <- Clock.getTime Clock.Monotonic 97 | pure (fromIntegral matchCount, Clock.diffTimeSpec start end) 98 | where 99 | -- Whether to use the sparse or dense implementation. 100 | -- Set to @0@ to use the dense implementation. 101 | -- Set to @1@ to use the sparse implementation. 102 | useSparse :: CBool 103 | useSparse = 0 104 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMoore/Searcher.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 
6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE FlexibleInstances #-} 10 | 11 | module Data.Text.BoyerMoore.Searcher 12 | ( Searcher 13 | , automata 14 | , build 15 | , buildNeedleIdSearcher 16 | , buildWithValues 17 | , containsAll 18 | , containsAny 19 | , needles 20 | , numNeedles 21 | ) where 22 | 23 | 24 | import Control.DeepSeq (NFData) 25 | import Data.Bifunctor (first) 26 | import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) 27 | import GHC.Generics (Generic) 28 | 29 | import Data.Text.Utf8 (Text) 30 | import Data.Text.BoyerMoore.Automaton (Automaton) 31 | 32 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 33 | 34 | 35 | -- | A set of needles with associated values, and Boyer-Moore automata to 36 | -- efficiently find those needles. 37 | -- 38 | -- INVARIANT: searcherAutomaton = BoyerMoore.buildAutomaton . searcherNeedles 39 | -- To enforce this invariant, the fields are not exposed from this module. 40 | -- There is a separate constructor function. 41 | -- 42 | -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we 43 | -- can derive those for the types that embed the searcher, without 44 | -- requiring the automaton itself to be Hashable or Eq, which would be both 45 | -- wasteful and tedious. Because the automaton is fully determined by the 46 | -- needles and associated values, it is sufficient to implement Eq and Hashable 47 | -- in terms of the needles only. 48 | -- 49 | -- We also use Hashed to cache the hash of the needles. 
50 | data Searcher v = Searcher 51 | { searcherNeedles :: Hashed [(Text, v)] 52 | , searcherNumNeedles :: Int 53 | , searcherAutomata :: [(Automaton, v)] 54 | } deriving (Generic) 55 | 56 | instance Show (Searcher v) where 57 | show _ = "Searcher _ _ _" 58 | 59 | instance Hashable v => Hashable (Searcher v) where 60 | hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher 61 | {-# INLINE hashWithSalt #-} 62 | 63 | instance Eq v => Eq (Searcher v) where 64 | Searcher xs nx _ == Searcher ys ny _ = nx == ny && xs == ys 65 | {-# INLINE (==) #-} 66 | 67 | instance NFData v => NFData (Searcher v) 68 | 69 | -- | Builds the Searcher for a list of needles without values. 70 | -- This is useful for just checking whether the haystack contains the needles. 71 | build :: [Text] -> Searcher () 72 | {-# INLINABLE build #-} 73 | build = buildWithValues . flip zip (repeat ()) 74 | 75 | -- | Builds the Searcher for a list of needles. 76 | buildWithValues :: Hashable v => [(Text, v)] -> Searcher v 77 | {-# INLINABLE buildWithValues #-} 78 | buildWithValues ns = 79 | Searcher (hashed ns) (length ns) $ map (first BoyerMoore.buildAutomaton) ns 80 | 81 | needles :: Searcher v -> [(Text, v)] 82 | needles = unhashed . searcherNeedles 83 | 84 | automata :: Searcher v -> [(Automaton, v)] 85 | automata = searcherAutomata 86 | 87 | numNeedles :: Searcher v -> Int 88 | numNeedles = searcherNumNeedles 89 | 90 | -- | Return whether the haystack contains any of the needles. 91 | -- This function is marked noinline as an inlining boundary. BoyerMoore.runText is 92 | -- marked inline, so this function will be optimized to report only whether 93 | -- there is a match, and not construct a list of matches. We don't want this 94 | -- function be inline, to make sure that the conditions of the caller don't 95 | -- affect how this function is optimized. There is little to gain from 96 | -- additional inlining. 
The pragma is not an optimization in itself, rather it 97 | -- is a defence against fragile optimizer decisions. 98 | {-# NOINLINE containsAny #-} 99 | containsAny :: Searcher () -> Text -> Bool 100 | containsAny !searcher !text = 101 | let 102 | -- On the first match, return True immediately. 103 | f _acc _match = BoyerMoore.Done True 104 | in 105 | any (\(automaton, ()) -> BoyerMoore.runText False f automaton text) (automata searcher) 106 | -- | Build a 'Searcher' that returns the needle's index in the needle list when it matches. 107 | 108 | buildNeedleIdSearcher :: [Text] -> Searcher Int 109 | buildNeedleIdSearcher !ns = 110 | buildWithValues $ zip ns [0..] 111 | 112 | -- | Like 'containsAny', but checks whether all needles match instead. 113 | -- Use 'buildNeedleIdSearcher' to get an appropriate 'Searcher'. 114 | {-# NOINLINE containsAll #-} 115 | containsAll :: Searcher Int -> Text -> Bool 116 | containsAll !searcher !text = 117 | let 118 | -- On the first match, return True immediately. 119 | f _acc _match = BoyerMoore.Done True 120 | in 121 | all (\(automaton, _) -> BoyerMoore.runText False f automaton text) (automata searcher) 122 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMooreCI/Searcher.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 
6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE FlexibleInstances #-} 10 | 11 | module Data.Text.BoyerMooreCI.Searcher 12 | ( Searcher 13 | , automata 14 | , build 15 | , buildNeedleIdSearcher 16 | , buildWithValues 17 | , containsAll 18 | , containsAny 19 | , needles 20 | , numNeedles 21 | ) where 22 | 23 | 24 | import Control.DeepSeq (NFData) 25 | import Data.Bifunctor (first) 26 | import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) 27 | import GHC.Generics (Generic) 28 | 29 | import Data.Text.Utf8 (Text) 30 | import Data.Text.BoyerMooreCI.Automaton (Automaton) 31 | 32 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMoore 33 | 34 | 35 | -- | A set of needles with associated values, and Boyer-Moore automata to 36 | -- efficiently find those needles. 37 | -- 38 | -- INVARIANT: searcherAutomaton = BoyerMoore.buildAutomaton . searcherNeedles 39 | -- To enforce this invariant, the fields are not exposed from this module. 40 | -- There is a separate constructor function. 41 | -- 42 | -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we 43 | -- can derive those for the types that embed the searcher, without 44 | -- requiring the automaton itself to be Hashable or Eq, which would be both 45 | -- wasteful and tedious. Because the automaton is fully determined by the 46 | -- needles and associated values, it is sufficient to implement Eq and Hashable 47 | -- in terms of the needles only. 48 | -- 49 | -- We also use Hashed to cache the hash of the needles. 
50 | data Searcher v = Searcher 51 | { searcherNeedles :: Hashed [(Text, v)] 52 | , searcherNumNeedles :: Int 53 | , searcherAutomata :: [(Automaton, v)] 54 | } deriving (Generic) 55 | 56 | instance Show (Searcher v) where 57 | show _ = "Searcher _ _ _" 58 | 59 | instance Hashable v => Hashable (Searcher v) where 60 | hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher 61 | {-# INLINE hashWithSalt #-} 62 | 63 | instance Eq v => Eq (Searcher v) where 64 | Searcher xs nx _ == Searcher ys ny _ = nx == ny && xs == ys 65 | {-# INLINE (==) #-} 66 | 67 | instance NFData v => NFData (Searcher v) 68 | 69 | -- | Builds the Searcher for a list of needles without values. 70 | -- This is useful for just checking whether the haystack contains the needles. 71 | build :: [Text] -> Searcher () 72 | {-# INLINABLE build #-} 73 | build = buildWithValues . flip zip (repeat ()) 74 | 75 | -- | Builds the Searcher for a list of needles. 76 | buildWithValues :: Hashable v => [(Text, v)] -> Searcher v 77 | {-# INLINABLE buildWithValues #-} 78 | buildWithValues ns = 79 | Searcher (hashed ns) (length ns) $ map (first BoyerMoore.buildAutomaton) ns 80 | 81 | needles :: Searcher v -> [(Text, v)] 82 | needles = unhashed . searcherNeedles 83 | 84 | automata :: Searcher v -> [(Automaton, v)] 85 | automata = searcherAutomata 86 | 87 | numNeedles :: Searcher v -> Int 88 | numNeedles = searcherNumNeedles 89 | 90 | -- | Return whether the haystack contains any of the needles. 91 | -- This function is marked noinline as an inlining boundary. BoyerMoore.runText is 92 | -- marked inline, so this function will be optimized to report only whether 93 | -- there is a match, and not construct a list of matches. We don't want this 94 | -- function be inline, to make sure that the conditions of the caller don't 95 | -- affect how this function is optimized. There is little to gain from 96 | -- additional inlining. 
The pragma is not an optimization in itself, rather it 97 | -- is a defence against fragile optimizer decisions. 98 | {-# NOINLINE containsAny #-} 99 | containsAny :: Searcher () -> Text -> Bool 100 | containsAny !searcher !text = 101 | let 102 | -- On the first match, return True immediately. 103 | f _acc _matchStart _matchEnd = BoyerMoore.Done True 104 | in 105 | any (\(automaton, ()) -> BoyerMoore.runText False f automaton text) (automata searcher) 106 | -- | Build a 'Searcher' that returns the needle's index in the needle list when it matches. 107 | 108 | buildNeedleIdSearcher :: [Text] -> Searcher Int 109 | buildNeedleIdSearcher !ns = 110 | buildWithValues $ zip ns [0..] 111 | 112 | -- | Like 'containsAny', but checks whether all needles match instead. 113 | -- Use 'buildNeedleIdSearcher' to get an appropriate 'Searcher'. 114 | {-# NOINLINE containsAll #-} 115 | containsAll :: Searcher Int -> Text -> Bool 116 | containsAll !searcher !text = 117 | let 118 | -- On the first match, return True immediately. 119 | f _acc _matchStart _matchEnd = BoyerMoore.Done True 120 | in 121 | all (\(automaton, _) -> BoyerMoore.runText False f automaton text) (automata searcher) 122 | -------------------------------------------------------------------------------- /alfred-margaret.cabal: -------------------------------------------------------------------------------- 1 | name: alfred-margaret 2 | version: 2.1.0.2 3 | synopsis: Fast Aho-Corasick string searching 4 | description: An efficient implementation of the Aho-Corasick 5 | string searching algorithm. 
6 | homepage: https://github.com/channable/alfred-margaret 7 | license: BSD3 8 | license-file: LICENSE 9 | author: The Alfred-Margaret authors 10 | maintainer: Ruud van Asseldonk , Fabian Thorand 11 | copyright: 2020 Channable 12 | category: Data, Text 13 | build-type: Simple 14 | extra-source-files: README.md 15 | , performance.png 16 | cabal-version: >=1.10 17 | tested-with: 18 | -- Nixpkgs unstable (Updated 2022-04-14) 19 | GHC == 8.8.4 20 | -- Nixpkgs unstable (Updated 2022-04-14) 21 | , GHC == 8.10.7 22 | -- Nixpkgs unstable (Updated 2022-04-14) 23 | , GHC == 9.0.2 24 | -- Nixpkgs unstable (Updated 2024-09-05) 25 | , GHC == 9.6.6 26 | 27 | source-repository head 28 | type: git 29 | location: https://github.com/channable/alfred-margaret 30 | 31 | Flag aeson 32 | Description: Enable aeson support 33 | Manual: False 34 | Default: True 35 | 36 | -- Compile this package with LLVM, rather than with the default code generator. 37 | -- LLVM produces about 20% faster code. 38 | Flag llvm 39 | Description: Compile with LLVM 40 | Manual: True 41 | -- Defaulting to false makes the package buildable by Hackage, 42 | -- allowing the documentation to be generated for us. 
43 | Default: False 44 | 45 | library 46 | hs-source-dirs: src 47 | exposed-modules: Data.Text.CaseSensitivity 48 | , Data.Text.Utf8 49 | , Data.Text.Utf8.Unlower 50 | , Data.Text.AhoCorasick.Automaton 51 | , Data.Text.AhoCorasick.Replacer 52 | , Data.Text.AhoCorasick.Searcher 53 | , Data.Text.AhoCorasick.Splitter 54 | , Data.Text.BoyerMoore.Automaton 55 | , Data.Text.BoyerMoore.Replacer 56 | , Data.Text.BoyerMoore.Searcher 57 | , Data.Text.BoyerMooreCI.Automaton 58 | , Data.Text.BoyerMooreCI.Replacer 59 | , Data.Text.BoyerMooreCI.Searcher 60 | other-modules: Data.Primitive.Extended 61 | build-depends: 62 | base >= 4.7 && < 5 63 | , containers >= 0.6 && < 0.8 64 | , deepseq >= 1.4 && < 1.6 65 | , hashable >= 1.4.0.2 && < 1.6 66 | , primitive >= 0.6.4 && < 0.10 67 | , text >= 2.0 && < 2.2 68 | , unordered-containers >= 0.2.9 && < 0.3 69 | , vector >= 0.12 && < 0.14 70 | ghc-options: -Wall -Wincomplete-record-updates -Wincomplete-uni-patterns -O2 71 | default-language: Haskell2010 72 | if flag(aeson) { 73 | -- Even an older version of aeson is fine since 74 | -- we only use it for instances 75 | build-depends: aeson >= 1.4.2 && < 3 76 | cpp-options: -DHAS_AESON 77 | } 78 | if flag(llvm) { 79 | ghc-options: -fllvm -optlo=-O3 -optlo=-tailcallelim 80 | } 81 | 82 | test-suite test-suite 83 | type: exitcode-stdio-1.0 84 | main-is: Main.hs 85 | other-modules: Data.Text.AhoCorasickSpec 86 | , Data.Text.BoyerMooreSpec 87 | , Data.Text.BoyerMooreCISpec 88 | , Data.Text.Utf8Spec 89 | , Data.Text.TestInstances 90 | hs-source-dirs: tests 91 | ghc-options: -Wall -Wincomplete-record-updates -Wno-orphans 92 | build-depends: base >= 4.7 && < 5 93 | , QuickCheck 94 | , alfred-margaret 95 | , deepseq 96 | , hspec 97 | , hspec-expectations 98 | , primitive 99 | , quickcheck-instances 100 | , text 101 | default-language: Haskell2010 102 | 103 | benchmark uvector-vs-tba 104 | type: exitcode-stdio-1.0 105 | main-is: Main.hs 106 | hs-source-dirs: bench/uvector-vs-tba 107 | ghc-options: 
-Wall -Wincomplete-record-updates -Wno-orphans 108 | build-depends: base >= 4.7 && < 5 109 | , alfred-margaret 110 | , vector 111 | , deepseq 112 | , criterion 113 | default-language: Haskell2010 114 | 115 | benchmark bm 116 | type: exitcode-stdio-1.0 117 | main-is: Main.hs 118 | hs-source-dirs: bench/bm 119 | ghc-options: -Wall -Wincomplete-record-updates -Wno-orphans 120 | build-depends: base >= 4.7 && < 5 121 | , alfred-margaret 122 | , deepseq 123 | , criterion 124 | , text 125 | default-language: Haskell2010 126 | 127 | executable dump-automaton 128 | main-is: Main.hs 129 | hs-source-dirs: 130 | app/dump-automaton 131 | build-depends: base 132 | , alfred-margaret 133 | default-language: Haskell2010 134 | -------------------------------------------------------------------------------- /src/Data/Text/AhoCorasick/Splitter.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE GHC2021 #-} 8 | {-# LANGUAGE DeriveGeneric #-} 9 | {-# LANGUAGE BangPatterns #-} 10 | {-# LANGUAGE CPP #-} 11 | 12 | -- | Splitting strings using Aho–Corasick. 
13 | module Data.Text.AhoCorasick.Splitter 14 | ( Splitter 15 | , automaton 16 | , build 17 | , separator 18 | , split 19 | , splitIgnoreCase 20 | , splitReverse 21 | , splitReverseIgnoreCase 22 | ) where 23 | 24 | import GHC.Generics (Generic) 25 | import Control.DeepSeq (NFData (..)) 26 | import Data.Function (on) 27 | import Data.Hashable (Hashable (..)) 28 | import Data.List.NonEmpty (NonEmpty ((:|))) 29 | import Data.Text.Utf8 (Text) 30 | 31 | #if defined(HAS_AESON) 32 | import qualified Data.Aeson as AE 33 | #endif 34 | 35 | import qualified Data.List.NonEmpty as NonEmpty 36 | import qualified Data.Text as Text 37 | 38 | import Data.Text.AhoCorasick.Automaton (AcMachine) 39 | 40 | import qualified Data.Text.Utf8 as Utf8 41 | import qualified Data.Text.AhoCorasick.Automaton as Aho 42 | 43 | -------------------------------------------------------------------------------- 44 | -- Splitter 45 | 46 | -- | Build a splitter once, then use it many times! 47 | data Splitter = 48 | Splitter 49 | { splitterAutomaton :: AcMachine () -- INVARIANT: Exactly one needle. 50 | , splitterSeparator :: Text -- INVARIANT: Equivalent to needle. 51 | } 52 | deriving Generic 53 | 54 | #if defined(HAS_AESON) 55 | instance AE.ToJSON Splitter where 56 | toJSON = AE.toJSON . separator 57 | 58 | instance AE.FromJSON Splitter where 59 | parseJSON v = build <$> AE.parseJSON v 60 | #endif 61 | 62 | -- | Construct a splitter with a single separator. 63 | {-# INLINE build #-} 64 | build :: Text -> Splitter 65 | build sep = 66 | let !auto = Aho.build [(sep, ())] in 67 | Splitter auto sep 68 | 69 | -- | Get the automaton that would be used for finding separators. 70 | {-# INLINE automaton #-} 71 | automaton :: Splitter -> AcMachine () 72 | automaton = splitterAutomaton 73 | 74 | -- | What is the separator we are splitting on? 
75 | {-# INLINE separator #-} 76 | separator :: Splitter -> Text 77 | separator = splitterSeparator 78 | 79 | -- | Split the given string into strings separated by the separator. 80 | -- 81 | -- If the order of the results is not important, use the faster function 82 | -- 'splitReverse' instead. 83 | {-# INLINE split #-} 84 | split :: Splitter -> Text -> NonEmpty Text 85 | split = (NonEmpty.reverse .) . splitReverse 86 | 87 | -- | Split the given string into strings separated by the separator. 88 | -- 89 | -- If the order of the results is not important, use the faster function 90 | -- 'splitReverseIgnoreCase' instead. 91 | -- 92 | -- The separator is matched case-insensitively, but the splitter must have been 93 | -- constructed with a lowercase needle. 94 | {-# INLINE splitIgnoreCase #-} 95 | splitIgnoreCase :: Splitter -> Text -> NonEmpty Text 96 | splitIgnoreCase = (NonEmpty.reverse .) . splitReverseIgnoreCase 97 | 98 | -- | Like 'split', but return the substrings in reverse order. 99 | {-# INLINE splitReverse #-} 100 | splitReverse :: Splitter -> Text -> NonEmpty Text 101 | splitReverse s t = 102 | finalizeAccum t $ Aho.runText zeroAccum stepAccum' (automaton s) t 103 | where 104 | -- Case sensitive matching: separator length is in bytes. 105 | sepLength = Utf8.lengthUtf8 (separator s) 106 | stepAccum' accum (Aho.Match newFragmentStart _) = 107 | stepAccum t accum (newFragmentStart - sepLength) newFragmentStart 108 | 109 | 110 | -- | Like 'splitIgnoreCase', but return the substrings in reverse order. 111 | {-# INLINE splitReverseIgnoreCase #-} 112 | splitReverseIgnoreCase :: Splitter -> Text -> NonEmpty Text 113 | splitReverseIgnoreCase s t = 114 | finalizeAccum t $ Aho.runLower zeroAccum stepAccum' (automaton s) t 115 | where 116 | -- Case insensitive matching: separator length is in codepoints. 
117 | sepLength = Text.length (separator s) 118 | stepAccum' accum (Aho.Match newFragmentStart _) = 119 | -- We start at the last byte of the separator, and look backwards. 120 | let sepStart = Utf8.skipCodePointsBackwards t (newFragmentStart-1) (sepLength-1) in 121 | stepAccum t accum sepStart newFragmentStart 122 | 123 | -------------------------------------------------------------------------------- 124 | -- Fold 125 | 126 | -- | The accumulator is used as state when processing the matches from left to 127 | -- right. While the matches are fed to us ordered by end offset, all matches 128 | -- have the same length because there is only one needle. 129 | data Accum = 130 | Accum 131 | { accumResult :: ![Text] 132 | -- ^ Match-separated strings. 133 | , accumFragmentStart :: !Aho.CodeUnitIndex 134 | -- ^ First byte of current fragment (that is the non-separator part) 135 | } 136 | deriving Generic 137 | 138 | -- | Finalizing the accumulator does more than just 'accumResult', hence this 139 | -- is a separate function. 140 | {-# INLINE finalizeAccum #-} 141 | finalizeAccum :: Text -> Accum -> NonEmpty Text 142 | finalizeAccum hay (Accum res prevEnd) = 143 | -- Once we have processed all the matches, there is still the substring after 144 | -- the final match. This substring is always included in the result, even 145 | -- when there were no matches. Hence we can return a non-empty list. 146 | let !str = Utf8.unsafeSliceUtf8 prevEnd (Utf8.lengthUtf8 hay - prevEnd) hay in 147 | str :| res 148 | 149 | -- | The initial accumulator begins at the begin of the haystack. 150 | {-# INLINE zeroAccum #-} 151 | zeroAccum :: Accum 152 | zeroAccum = Accum { accumResult = [], accumFragmentStart = 0 } 153 | 154 | -- | Step the accumulator using the next match. Overlapping matches will be 155 | -- ignored. Overlapping matches may occur when the separator has a non-empty 156 | -- prefix that is also a suffix. 
157 | {-# INLINE stepAccum #-} 158 | stepAccum :: Text -> Accum -> Aho.CodeUnitIndex -> Aho.CodeUnitIndex -> Aho.Next Accum 159 | stepAccum hay acc@(Accum res fragmentStart) sepStart newFragmentStart 160 | 161 | -- When the match begins before the current offset, it overlaps a match that 162 | -- we processed before, and so we ignore it. 163 | | sepStart < fragmentStart = 164 | Aho.Step acc 165 | 166 | -- The match is behind the current offset, so we slice the haystack until the 167 | -- begin of the match and include that as a result. 168 | | otherwise = 169 | let !str = Utf8.unsafeSliceUtf8 fragmentStart (sepStart - fragmentStart) hay in 170 | Aho.Step acc { accumResult = str : res, accumFragmentStart = newFragmentStart } 171 | 172 | -------------------------------------------------------------------------------- 173 | -- Instances 174 | 175 | instance Eq Splitter where 176 | {-# INLINE (==) #-} 177 | (==) = (==) `on` separator 178 | 179 | instance Ord Splitter where 180 | {-# INLINE compare #-} 181 | compare = compare `on` separator 182 | 183 | instance Hashable Splitter where 184 | {-# INLINE hashWithSalt #-} 185 | hashWithSalt salt searcher = 186 | salt `hashWithSalt` separator searcher 187 | 188 | instance NFData Splitter where 189 | {-# INLINE rnf #-} 190 | rnf (Splitter searcher sepLength) = 191 | rnf searcher `seq` 192 | rnf sepLength 193 | 194 | instance Show Splitter where 195 | showsPrec p splitter = 196 | showParen (p > 10) $ 197 | showString "build " . 198 | showsPrec 11 (separator splitter) 199 | -------------------------------------------------------------------------------- /nix/sources.nix: -------------------------------------------------------------------------------- 1 | # This file has been generated by Niv. 2 | 3 | let 4 | 5 | # 6 | # The fetchers. fetch_<type> fetches specs of type <type>.
7 | # 8 | 9 | fetch_file = pkgs: name: spec: 10 | let 11 | name' = sanitizeName name + "-src"; 12 | in 13 | if spec.builtin or true then 14 | builtins_fetchurl { inherit (spec) url sha256; name = name'; } 15 | else 16 | pkgs.fetchurl { inherit (spec) url sha256; name = name'; }; 17 | 18 | fetch_tarball = pkgs: name: spec: 19 | let 20 | name' = sanitizeName name + "-src"; 21 | in 22 | if spec.builtin or true then 23 | builtins_fetchTarball { name = name'; inherit (spec) url sha256; } 24 | else 25 | pkgs.fetchzip { name = name'; inherit (spec) url sha256; }; 26 | 27 | fetch_git = name: spec: 28 | let 29 | ref = 30 | if spec ? ref then spec.ref else 31 | if spec ? branch then "refs/heads/${spec.branch}" else 32 | if spec ? tag then "refs/tags/${spec.tag}" else 33 | abort "In git source '${name}': Please specify `ref`, `tag` or `branch`!"; 34 | submodules = if spec ? submodules then spec.submodules else false; 35 | in 36 | builtins.fetchGit { url = spec.repo; inherit (spec) rev; inherit ref; } 37 | // (if builtins.compareVersions builtins.nixVersion "2.4" >= 0 then { inherit submodules; } else {}); 38 | 39 | fetch_local = spec: spec.path; 40 | 41 | fetch_builtin-tarball = name: throw 42 | ''[${name}] The niv type "builtin-tarball" is deprecated. You should instead use `builtin = true`. 43 | $ niv modify ${name} -a type=tarball -a builtin=true''; 44 | 45 | fetch_builtin-url = name: throw 46 | ''[${name}] The niv type "builtin-url" will soon be deprecated. You should instead use `builtin = true`. 
47 | $ niv modify ${name} -a type=file -a builtin=true''; 48 | 49 | # 50 | # Various helpers 51 | # 52 | 53 | # https://github.com/NixOS/nixpkgs/pull/83241/files#diff-c6f540a4f3bfa4b0e8b6bafd4cd54e8bR695 54 | sanitizeName = name: 55 | ( 56 | concatMapStrings (s: if builtins.isList s then "-" else s) 57 | ( 58 | builtins.split "[^[:alnum:]+._?=-]+" 59 | ((x: builtins.elemAt (builtins.match "\\.*(.*)" x) 0) name) 60 | ) 61 | ); 62 | 63 | # The set of packages used when specs are fetched using non-builtins. 64 | mkPkgs = sources: system: 65 | let 66 | sourcesNixpkgs = 67 | import (builtins_fetchTarball { inherit (sources.nixpkgs) url sha256; }) { inherit system; }; 68 | hasNixpkgsPath = builtins.any (x: x.prefix == "nixpkgs") builtins.nixPath; 69 | hasThisAsNixpkgsPath = <nixpkgs> == ./.; 70 | in 71 | if builtins.hasAttr "nixpkgs" sources 72 | then sourcesNixpkgs 73 | else if hasNixpkgsPath && ! hasThisAsNixpkgsPath then 74 | import <nixpkgs> {} 75 | else 76 | abort 77 | '' 78 | Please specify either <nixpkgs> (through -I or NIX_PATH=nixpkgs=...) or 79 | add a package called "nixpkgs" to your sources.json. 80 | ''; 81 | 82 | # The actual fetching function. 83 | fetch = pkgs: name: spec: 84 | 85 | if ! builtins.hasAttr "type" spec then 86 | abort "ERROR: niv spec ${name} does not have a 'type' attribute" 87 | else if spec.type == "file" then fetch_file pkgs name spec 88 | else if spec.type == "tarball" then fetch_tarball pkgs name spec 89 | else if spec.type == "git" then fetch_git name spec 90 | else if spec.type == "local" then fetch_local spec 91 | else if spec.type == "builtin-tarball" then fetch_builtin-tarball name 92 | else if spec.type == "builtin-url" then fetch_builtin-url name 93 | else 94 | abort "ERROR: niv spec ${name} has unknown type ${builtins.toJSON spec.type}"; 95 | 96 | # If the environment variable NIV_OVERRIDE_${name} is set, then use 97 | # the path directly as opposed to the fetched source.
98 | replace = name: drv: 99 | let 100 | saneName = stringAsChars (c: if isNull (builtins.match "[a-zA-Z0-9]" c) then "_" else c) name; 101 | ersatz = builtins.getEnv "NIV_OVERRIDE_${saneName}"; 102 | in 103 | if ersatz == "" then drv else 104 | # this turns the string into an actual Nix path (for both absolute and 105 | # relative paths) 106 | if builtins.substring 0 1 ersatz == "/" then /. + ersatz else /. + builtins.getEnv "PWD" + "/${ersatz}"; 107 | 108 | # Ports of functions for older nix versions 109 | 110 | # a Nix version of mapAttrs if the built-in doesn't exist 111 | mapAttrs = builtins.mapAttrs or ( 112 | f: set: with builtins; 113 | listToAttrs (map (attr: { name = attr; value = f attr set.${attr}; }) (attrNames set)) 114 | ); 115 | 116 | # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/lists.nix#L295 117 | range = first: last: if first > last then [] else builtins.genList (n: first + n) (last - first + 1); 118 | 119 | # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L257 120 | stringToCharacters = s: map (p: builtins.substring p 1 s) (range 0 (builtins.stringLength s - 1)); 121 | 122 | # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L269 123 | stringAsChars = f: s: concatStrings (map f (stringToCharacters s)); 124 | concatMapStrings = f: list: concatStrings (map f list); 125 | concatStrings = builtins.concatStringsSep ""; 126 | 127 | # https://github.com/NixOS/nixpkgs/blob/8a9f58a375c401b96da862d969f66429def1d118/lib/attrsets.nix#L331 128 | optionalAttrs = cond: as: if cond then as else {}; 129 | 130 | # fetchTarball version that is compatible between all the versions of Nix 131 | builtins_fetchTarball = { url, name ? 
null, sha256 }@attrs: 132 | let 133 | inherit (builtins) lessThan nixVersion fetchTarball; 134 | in 135 | if lessThan nixVersion "1.12" then 136 | fetchTarball ({ inherit url; } // (optionalAttrs (!isNull name) { inherit name; })) 137 | else 138 | fetchTarball attrs; 139 | 140 | # fetchurl version that is compatible between all the versions of Nix 141 | builtins_fetchurl = { url, name ? null, sha256 }@attrs: 142 | let 143 | inherit (builtins) lessThan nixVersion fetchurl; 144 | in 145 | if lessThan nixVersion "1.12" then 146 | fetchurl ({ inherit url; } // (optionalAttrs (!isNull name) { inherit name; })) 147 | else 148 | fetchurl attrs; 149 | 150 | # Create the final "sources" from the config 151 | mkSources = config: 152 | mapAttrs ( 153 | name: spec: 154 | if builtins.hasAttr "outPath" spec 155 | then abort 156 | "The values in sources.json should not have an 'outPath' attribute" 157 | else 158 | spec // { outPath = replace name (fetch config.pkgs name spec); } 159 | ) config.sources; 160 | 161 | # The "config" used by the fetchers 162 | mkConfig = 163 | { sourcesFile ? if builtins.pathExists ./sources.json then ./sources.json else null 164 | , sources ? if isNull sourcesFile then {} else builtins.fromJSON (builtins.readFile sourcesFile) 165 | , system ? builtins.currentSystem 166 | , pkgs ? mkPkgs sources system 167 | }: rec { 168 | # The sources, i.e. the attribute set of spec name to spec 169 | inherit sources; 170 | 171 | # The "pkgs" (evaluated nixpkgs) to use for e.g. 
non-builtin fetchers 172 | inherit pkgs; 173 | }; 174 | 175 | in 176 | mkSources (mkConfig {}) // { __functor = _: settings: mkSources (mkConfig settings); } 177 | -------------------------------------------------------------------------------- /src/Data/Text/AhoCorasick/Searcher.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2022 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE CPP #-} 9 | {-# LANGUAGE DeriveGeneric #-} 10 | {-# LANGUAGE FlexibleInstances #-} 11 | {-# LANGUAGE OverloadedStrings #-} 12 | {-# LANGUAGE TupleSections #-} 13 | 14 | module Data.Text.AhoCorasick.Searcher 15 | ( Searcher 16 | , automaton 17 | , build 18 | , buildNeedleIdSearcher 19 | , buildWithValues 20 | , caseSensitivity 21 | , containsAll 22 | , containsAny 23 | , mapSearcher 24 | , needles 25 | , numNeedles 26 | , setCaseSensitivity 27 | ) where 28 | 29 | import Control.DeepSeq (NFData) 30 | import Data.Bifunctor (second) 31 | import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) 32 | import GHC.Generics (Generic) 33 | 34 | #if defined(HAS_AESON) 35 | import Data.Aeson ((.:), (.=)) 36 | import qualified Data.Aeson as AE 37 | #endif 38 | 39 | import qualified Data.IntSet as IS 40 | 41 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 42 | import Data.Text.Utf8 (Text) 43 | 44 | import qualified Data.Text.AhoCorasick.Automaton as Aho 45 | 46 | -- | A set of needles with associated values, and an Aho-Corasick automaton to 47 | -- efficiently find those needles. 48 | -- 49 | -- INVARIANT: searcherAutomaton = Aho.build . searcherNeedles 50 | -- To enforce this invariant, the fields are not exposed from this module. 51 | -- There is a separate constructor function. 
52 | -- 53 | -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we 54 | -- can derive those for types that embed the searcher, without requiring the 55 | -- automaton itself to be Hashable or Eq, which would be both wasteful and 56 | -- tedious. Because the automaton is fully determined by the needles and 57 | -- associated values, it is sufficient to implement Eq and Hashable in terms of 58 | -- the needles only. 59 | -- 60 | -- We also use Hashed to cache the hash of the needles. 61 | data Searcher v = Searcher 62 | { searcherCaseSensitive :: CaseSensitivity 63 | , searcherNeedles :: Hashed [(Text, v)] 64 | , searcherNumNeedles :: Int 65 | , searcherAutomaton :: Aho.AcMachine v 66 | } deriving (Generic) 67 | 68 | #if defined(HAS_AESON) 69 | instance AE.ToJSON v => AE.ToJSON (Searcher v) where 70 | toJSON s = AE.object 71 | [ "needles" .= needles s 72 | , "caseSensitivity" .= caseSensitivity s 73 | ] 74 | 75 | instance (Hashable v, AE.FromJSON v) => AE.FromJSON (Searcher v) where 76 | parseJSON = AE.withObject "Searcher" $ \o -> buildWithValues <$> o .: "caseSensitivity" <*> o .: "needles" 77 | #endif 78 | 79 | instance Show (Searcher v) where 80 | show _ = "Searcher _ _ _" 81 | 82 | instance Hashable v => Hashable (Searcher v) where 83 | hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher 84 | {-# INLINE hashWithSalt #-} 85 | 86 | instance Eq v => Eq (Searcher v) where 87 | -- Since we store the length of the needle list anyway, 88 | -- we can use it to early out if there is a length mismatch. 89 | Searcher cx xs nx _ == Searcher cy ys ny _ = (nx, xs, cx) == (ny, ys, cy) 90 | {-# INLINE (==) #-} 91 | 92 | instance NFData v => NFData (Searcher v) 93 | 94 | -- NOTE: Although we could implement Semigroup for every v by just concatenating 95 | -- needle lists, we don't, because this might lead to unexpected results.
For 96 | -- example, if v is (Int, a) where the Int is a priority, combining two 97 | -- searchers might want to discard priorities, concatenate the needle lists, and 98 | -- reassign priorities, rather than concatenating the needle lists as-is and 99 | -- possibly having duplicate priorities in the resulting searcher. 100 | instance Semigroup (Searcher ()) where 101 | x <> y 102 | | caseSensitivity x == caseSensitivity y 103 | = buildWithValues (searcherCaseSensitive x) (needles x <> needles y) 104 | | otherwise = error "Combining searchers of different case sensitivity" 105 | {-# INLINE (<>) #-} 106 | 107 | -- | Builds the Searcher for a list of needles 108 | -- The caller is responsible that the needles are lower case in case the IgnoreCase 109 | -- is used for case sensitivity 110 | build :: CaseSensitivity -> [Text] -> Searcher () 111 | build case_ = buildWithValues case_ . fmap (, ()) 112 | 113 | -- | The caller is responsible that the needles are lower case in case the IgnoreCase 114 | -- is used for case sensitivity 115 | buildWithValues :: Hashable v => CaseSensitivity -> [(Text, v)] -> Searcher v 116 | {-# INLINABLE buildWithValues #-} 117 | buildWithValues case_ ns = 118 | Searcher case_ (hashed ns) (length ns) $ Aho.build ns 119 | 120 | -- | Modify the values associated with the needles. 121 | mapSearcher :: Hashable b => (a -> b) -> Searcher a -> Searcher b 122 | mapSearcher f searcher = searcher 123 | { searcherNeedles = hashed $ fmap (second f) $ needles searcher 124 | , searcherAutomaton = fmap f (searcherAutomaton searcher) 125 | } 126 | 127 | needles :: Searcher v -> [(Text, v)] 128 | needles = unhashed . 
searcherNeedles 129 | 130 | numNeedles :: Searcher v -> Int 131 | numNeedles = searcherNumNeedles 132 | 133 | automaton :: Searcher v -> Aho.AcMachine v 134 | automaton = searcherAutomaton 135 | 136 | caseSensitivity :: Searcher v -> CaseSensitivity 137 | caseSensitivity = searcherCaseSensitive 138 | 139 | -- | Updates the case sensitivity of the searcher. Does not change the 140 | -- capitalization of the needles. The caller should be certain that if IgnoreCase 141 | -- is passed, the needles are already lower case. 142 | setCaseSensitivity :: CaseSensitivity -> Searcher v -> Searcher v 143 | setCaseSensitivity case_ searcher = searcher{ 144 | searcherCaseSensitive = case_ 145 | } 146 | 147 | -- | Return whether the haystack contains any of the needles. 148 | -- Case sensitivity depends on the properties of the searcher 149 | -- This function is marked noinline as an inlining boundary. Aho.runText is 150 | -- marked inline, so this function will be optimized to report only whether 151 | -- there is a match, and not construct a list of matches. We don't want this 152 | -- function to be inline, to make sure that the conditions of the caller don't 153 | -- affect how this function is optimized. There is little to gain from 154 | -- additional inlining. The pragma is not an optimization in itself, rather it 155 | -- is a defence against fragile optimizer decisions. 156 | {-# NOINLINE containsAny #-} 157 | containsAny :: Searcher () -> Text -> Bool 158 | containsAny !searcher !text = 159 | let 160 | -- On the first match, return True immediately. 161 | f _acc _match = Aho.Done True 162 | in case caseSensitivity searcher of 163 | CaseSensitive -> Aho.runText False f (automaton searcher) text 164 | IgnoreCase -> Aho.runLower False f (automaton searcher) text 165 | 166 | -- | Build a 'Searcher' that returns the needle's index in the needle list when it matches.
167 | buildNeedleIdSearcher :: CaseSensitivity -> [Text] -> Searcher Int 168 | buildNeedleIdSearcher !case_ !ns = 169 | buildWithValues case_ $ zip ns [0..] 170 | 171 | -- | Returns whether the haystack contains all of the needles. 172 | -- This function expects the passed 'Searcher' to be constructed using 'buildNeedleIdSearcher'. 173 | containsAll :: Searcher Int -> Text -> Bool 174 | containsAll !searcher !haystack = 175 | let 176 | initial = IS.fromDistinctAscList [0..numNeedles searcher - 1] 177 | ac = automaton searcher 178 | 179 | f !acc (Aho.Match _index !needleId) 180 | | IS.null acc' = Aho.Done acc' 181 | | otherwise = Aho.Step acc' 182 | where 183 | !acc' = IS.delete needleId acc 184 | 185 | in IS.null $ case caseSensitivity searcher of 186 | CaseSensitive -> Aho.runText initial f ac haystack 187 | IgnoreCase -> Aho.runLower initial f ac haystack 188 | -------------------------------------------------------------------------------- /tests/Data/Text/Utf8Spec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module Data.Text.Utf8Spec where 5 | 6 | import Control.Exception (evaluate) 7 | import Control.Monad (forM_) 8 | import Test.Hspec (Spec, anyErrorCall, describe, it, shouldBe, shouldSatisfy, shouldThrow) 9 | import Test.Hspec.QuickCheck (prop) 10 | import Test.QuickCheck (Gen, choose, forAllShrink, shrink) 11 | 12 | import qualified Data.Char as Char 13 | 14 | import Data.Text.TestInstances () 15 | 16 | import qualified Data.Text.Utf8 as Utf8 17 | 18 | spec :: Spec 19 | spec = do 20 | describe "Properties of the BMP in UTF-8" $ do 21 | 22 | describe "Char.toLower" $ do 23 | 24 | {- 25 | it "does not generate common suffixes" $ do 26 | forM_ bmpCodepoints $ flip shouldSatisfy $ \cp -> 27 | let 28 | lowerCp = mapCp Char.toLower cp 29 | in 30 | cp == lowerCp || null (commonSuffix (Utf8.unicode2utf8 cp) (Utf8.unicode2utf8 lowerCp)) 31 
| -- Sadly, it "actually does" 32 | -} 33 | 34 | it "is idempotent" $ do 35 | forM_ bmpCodepoints $ flip shouldSatisfy $ \cp -> 36 | Char.toLower cp == Char.toLower (Char.toLower cp) 37 | 38 | describe "toLowerAscii" $ do 39 | 40 | it "is equivalent to Char.toLower on ASCII" $ do 41 | 42 | forM_ asciiCodepoints $ flip shouldSatisfy $ \cp -> 43 | Char.toLower cp == Utf8.toLowerAscii cp 44 | 45 | describe "lowerCodePoint" $ do 46 | 47 | prop "is equivalent to Char.toLower on all of Unicode" $ \c -> 48 | Utf8.lowerCodePoint c `shouldBe` Char.toLower c 49 | 50 | describe "unlowerCodePoint" $ do 51 | 52 | it "should return nothing if it's not a lower case of anything" $ do 53 | Utf8.unlowerCodePoint 'A' `shouldBe` "" 54 | Utf8.unlowerCodePoint 'ẞ' `shouldBe` "" 55 | 56 | it "should return itself if it doesn't have any casings" $ do 57 | Utf8.unlowerCodePoint '1' `shouldBe` "1" 58 | 59 | it "can return multiple values" $ do 60 | Utf8.unlowerCodePoint 'a' `shouldBe` "aA" 61 | Utf8.unlowerCodePoint 'ß' `shouldBe` "ẞß" 62 | Utf8.unlowerCodePoint 'i' `shouldBe` "İiI" 63 | 64 | describe "isCaseInvariant" $ do 65 | it "holds vacuously for empty texts" $ do 66 | Utf8.isCaseInvariant "" `shouldBe` True 67 | 68 | it "should be true for characters without upper/lower cases" $ do 69 | Utf8.isCaseInvariant "." `shouldBe` True 70 | Utf8.isCaseInvariant ".,;'123" `shouldBe` True 71 | Utf8.isCaseInvariant "💩" `shouldBe` True 72 | 73 | it "should be false if there are characters with upper/lower cases" $ do 74 | Utf8.isCaseInvariant "a" `shouldBe` False 75 | Utf8.isCaseInvariant "A.." `shouldBe` False 76 | Utf8.isCaseInvariant "ß." 
`shouldBe` False 77 | Utf8.isCaseInvariant "ẞ" `shouldBe` False 78 | Utf8.isCaseInvariant "İ" `shouldBe` False 79 | 80 | describe "dropWhile" $ do 81 | 82 | it "handles a simple example well" $ do 83 | Utf8.dropWhile (== 'b') "bbba" `shouldBe` "a" 84 | 85 | describe "slicing functions" $ do 86 | 87 | let 88 | -- | Example shown in section "Slicing Functions" in 'Data.Text.Utf8". 89 | slicingExample :: Utf8.Text 90 | slicingExample = Utf8.Text u8data 1 11 91 | where Utf8.Text u8data _ _ = Utf8.pack "ABCDEFGHIJKLMN" 92 | 93 | it "satisfies the example in Data.Text.Utf8" $ do 94 | let begin = Utf8.CodeUnitIndex 2 95 | let length_ = Utf8.CodeUnitIndex 6 96 | Utf8.unsafeSliceUtf8 begin length_ slicingExample `shouldBe` "DEFGHI" 97 | Utf8.unsafeCutUtf8 begin length_ slicingExample `shouldBe` ("BC", "JKL") 98 | 99 | prop "unsafeSliceUtf8 and unsafeCutUtf8 are complementary" $ 100 | forAllShrink (arbitrarySlicingIndices slicingExample) shrink $ \ (begin, length_) -> do 101 | let (prefix, suffix) = Utf8.unsafeCutUtf8 begin length_ slicingExample 102 | Utf8.concat [prefix, Utf8.unsafeSliceUtf8 begin length_ slicingExample, suffix] `shouldBe` slicingExample 103 | 104 | describe "Basic Text instances" $ do 105 | 106 | prop "Show Text behaves like Show String" $ \ (str :: String) -> do 107 | show (Utf8.pack str) `shouldBe` show str 108 | 109 | prop "Eq Text behaves like Eq String" $ \ (a :: String) (b :: String) -> do 110 | Utf8.pack a == Utf8.pack b `shouldBe` a == b 111 | 112 | prop "Ord Text behaves like Ord String" $ \ (a :: String) (b :: String) -> do 113 | compare (Utf8.pack a) (Utf8.pack b) `shouldBe` compare a b 114 | 115 | describe "skipCodePointsBackwards" $ do 116 | it "works with ascii" $ do 117 | Utf8.skipCodePointsBackwards "abcd" 3 0 `shouldBe` 3 118 | Utf8.skipCodePointsBackwards "abcd" 3 1 `shouldBe` 2 119 | Utf8.skipCodePointsBackwards "abcd" 3 2 `shouldBe` 1 120 | Utf8.skipCodePointsBackwards "abcd" 3 3 `shouldBe` 0 121 | 122 | it "moves to start of codepoint 
if you skip 0" $ do 123 | Utf8.skipCodePointsBackwards "💩💩" 0 0 `shouldBe` 0 124 | Utf8.skipCodePointsBackwards "💩💩" 1 0 `shouldBe` 0 125 | Utf8.skipCodePointsBackwards "💩💩" 2 0 `shouldBe` 0 126 | Utf8.skipCodePointsBackwards "💩💩" 3 0 `shouldBe` 0 127 | Utf8.skipCodePointsBackwards "💩💩" 4 0 `shouldBe` 4 128 | Utf8.skipCodePointsBackwards "💩💩" 5 0 `shouldBe` 4 129 | Utf8.skipCodePointsBackwards "💩💩" 6 0 `shouldBe` 4 130 | Utf8.skipCodePointsBackwards "💩💩" 7 0 `shouldBe` 4 131 | 132 | it "can skip 1 multi-byte codepoint" $ do 133 | Utf8.skipCodePointsBackwards "💩💩" 4 1 `shouldBe` 0 134 | Utf8.skipCodePointsBackwards "💩💩" 5 1 `shouldBe` 0 135 | Utf8.skipCodePointsBackwards "💩💩" 6 1 `shouldBe` 0 136 | Utf8.skipCodePointsBackwards "💩💩" 7 1 `shouldBe` 0 137 | 138 | it "can skip multiple multi-byte codepoint" $ do 139 | -- a, İ, ẞ and 💩 have byte lengths 1, 2, 3 and 4 140 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 0 `shouldBe` 15 -- stays at a 141 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 1 `shouldBe` 13 -- skips to İ 142 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 2 `shouldBe` 10 -- skips to ẞ 143 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 3 `shouldBe` 6 -- skips to 💩 144 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 4 `shouldBe` 3 -- skips to ẞ 145 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 5 `shouldBe` 1 -- skips to İ 146 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 15 6 `shouldBe` 0 -- skips to a 147 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 14 2 `shouldBe` 6 -- from İ to 💩 148 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 13 2 `shouldBe` 6 -- from İ to 💩 149 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 10 3 `shouldBe` 1 -- from ẞ to İ 150 | Utf8.skipCodePointsBackwards "aİẞ💩ẞİa" 9 3 `shouldBe` 0 -- from 💩 to a 151 | 152 | it "throws errors when you read out of bounds" $ do 153 | evaluate (Utf8.skipCodePointsBackwards "💩💩" 8 0) `shouldThrow` anyErrorCall 154 | evaluate (Utf8.skipCodePointsBackwards "💩💩" 7 2) `shouldThrow` anyErrorCall 155 | 156 | 157 | 158 | 
arbitrarySlicingIndices :: Utf8.Text -> Gen (Utf8.CodeUnitIndex, Utf8.CodeUnitIndex) 159 | arbitrarySlicingIndices example = do 160 | let exampleLength = Utf8.codeUnitIndex $ Utf8.lengthUtf8 example 161 | 162 | begin <- choose (0, exampleLength) 163 | length_ <- choose (0, exampleLength - begin) 164 | 165 | pure (Utf8.CodeUnitIndex begin, Utf8.CodeUnitIndex length_) 166 | 167 | asciiCodepoints :: [Char] 168 | asciiCodepoints = map Char.chr [0..0x7f] 169 | 170 | -- | The Basic Multilingual Plane (BMP) contains the Unicode code points 171 | -- 0x0000 through 0xFFFF. 172 | bmpCodepoints :: [Char] 173 | bmpCodepoints = map Char.chr [0..0xffff] 174 | 175 | commonSuffix :: Eq a => [a] -> [a] -> [a] 176 | commonSuffix list list' = reverse $ go (reverse list) (reverse list') 177 | where 178 | go (x:xs) (y:ys) 179 | | x == y = x : go xs ys 180 | go _ _ = [] 181 | -------------------------------------------------------------------------------- /.stylish-haskell.yaml: -------------------------------------------------------------------------------- 1 | # stylish-haskell configuration file 2 | # ================================== 3 | 4 | # The stylish-haskell tool is mainly configured by specifying steps. These steps 5 | # are a list, so they have an order, and one specific step may appear more than 6 | # once (if needed). Each file is processed by these steps in the given order. 7 | steps: 8 | # Convert some ASCII sequences to their Unicode equivalents. This is disabled 9 | # by default. 10 | # - unicode_syntax: 11 | # # In order to make this work, we also need to insert the UnicodeSyntax 12 | # # language pragma. If this flag is set to true, we insert it when it's 13 | # # not already present. You may want to disable it if you configure 14 | # # language extensions using some other method than pragmas. Default: 15 | # # true. 16 | # add_language_pragma: true 17 | 18 | # Align the right hand side of some elements. 
This is quite conservative 19 | # and only applies to statements where each element occupies a single 20 | # line. 21 | - simple_align: 22 | cases: false 23 | top_level_patterns: false 24 | records: false 25 | 26 | # Import cleanup 27 | - imports: 28 | # There are different ways we can align names and lists. 29 | # 30 | # - global: Align the import names and import list throughout the entire 31 | # file. 32 | # 33 | # - file: Like global, but don't add padding when there are no qualified 34 | # imports in the file. 35 | # 36 | # - group: Only align the imports per group (a group is formed by adjacent 37 | # import lines). 38 | # 39 | # - none: Do not perform any alignment. 40 | # 41 | # Default: global. 42 | align: none 43 | 44 | # The following options affect only import list alignment. 45 | # 46 | # List align has following options: 47 | # 48 | # - after_alias: Import list is aligned with end of import including 49 | # 'as' and 'hiding' keywords. 50 | # 51 | # > import qualified Data.List as List (concat, foldl, foldr, head, 52 | # > init, last, length) 53 | # 54 | # - with_alias: Import list is aligned with start of alias or hiding. 55 | # 56 | # > import qualified Data.List as List (concat, foldl, foldr, head, 57 | # > init, last, length) 58 | # 59 | # - new_line: Import list starts always on new line. 
60 | # 61 | # > import qualified Data.List as List 62 | # > (concat, foldl, foldr, head, init, last, length) 63 | # 64 | # Default: after_alias 65 | list_align: after_alias 66 | 67 | # Right-pad the module names to align imports in a group: 68 | # 69 | # - true: a little more readable 70 | # 71 | # > import qualified Data.List as List (concat, foldl, foldr, 72 | # > init, last, length) 73 | # > import qualified Data.List.Extra as List (concat, foldl, foldr, 74 | # > init, last, length) 75 | # 76 | # - false: diff-safe 77 | # 78 | # > import qualified Data.List as List (concat, foldl, foldr, init, 79 | # > last, length) 80 | # > import qualified Data.List.Extra as List (concat, foldl, foldr, 81 | # > init, last, length) 82 | # 83 | # Default: true 84 | pad_module_names: false 85 | 86 | # Long list align style takes effect when import is too long. This is 87 | # determined by 'columns' setting. 88 | # 89 | # - inline: This option will put as much specs on same line as possible. 90 | # 91 | # - new_line: Import list will start on new line. 92 | # 93 | # - new_line_multiline: Import list will start on new line when it's 94 | # short enough to fit to single line. Otherwise it'll be multiline. 95 | # 96 | # - multiline: One line per import list entry. 97 | # Type with constructor list acts like single import. 98 | # 99 | # > import qualified Data.Map as M 100 | # > ( empty 101 | # > , singleton 102 | # > , ... 103 | # > , delete 104 | # > ) 105 | # 106 | # Default: inline 107 | long_list_align: inline 108 | 109 | # Align empty list (importing instances) 110 | # 111 | # Empty list align has following options 112 | # 113 | # - inherit: inherit list_align setting 114 | # 115 | # - right_after: () is right after the module name: 116 | # 117 | # > import Vector.Instances () 118 | # 119 | # Default: inherit 120 | empty_list_align: inherit 121 | 122 | # List padding determines indentation of import list on lines after import. 123 | # This option affects 'long_list_align'. 
124 | # 125 | # - : constant value 126 | # 127 | # - module_name: align under start of module name. 128 | # Useful for 'file' and 'group' align settings. 129 | list_padding: 4 130 | 131 | # Separate lists option affects formatting of import list for type 132 | # or class. The only difference is single space between type and list 133 | # of constructors, selectors and class functions. 134 | # 135 | # - true: There is single space between Foldable type and list of it's 136 | # functions. 137 | # 138 | # > import Data.Foldable (Foldable (fold, foldl, foldMap)) 139 | # 140 | # - false: There is no space between Foldable type and list of it's 141 | # functions. 142 | # 143 | # > import Data.Foldable (Foldable(fold, foldl, foldMap)) 144 | # 145 | # Default: true 146 | separate_lists: true 147 | 148 | # Space surround option affects formatting of import lists on a single 149 | # line. The only difference is single space after the initial 150 | # parenthesis and a single space before the terminal parenthesis. 151 | # 152 | # - true: There is single space associated with the enclosing 153 | # parenthesis. 154 | # 155 | # > import Data.Foo ( foo ) 156 | # 157 | # - false: There is no space associated with the enclosing parenthesis 158 | # 159 | # > import Data.Foo (foo) 160 | # 161 | # Default: false 162 | space_surround: false 163 | 164 | # Language pragmas 165 | - language_pragmas: 166 | # We can generate different styles of language pragma lists. 167 | # 168 | # - vertical: Vertical-spaced language pragmas, one per line. 169 | # 170 | # - compact: A more compact style. 171 | # 172 | # - compact_line: Similar to compact, but wrap each line with 173 | # `{-#LANGUAGE #-}'. 174 | # 175 | # Default: vertical. 176 | style: vertical 177 | 178 | # Align affects alignment of closing pragma brackets. 179 | # 180 | # - true: Brackets are aligned in same column. 181 | # 182 | # - false: Brackets are not aligned together. 
There is only one space 183 | # between actual import and closing bracket. 184 | # 185 | # Default: true 186 | align: false 187 | 188 | # stylish-haskell can detect redundancy of some language pragmas. If this 189 | # is set to true, it will remove those redundant pragmas. Default: true. 190 | remove_redundant: true 191 | 192 | # Replace tabs by spaces. This is disabled by default. 193 | - tabs: 194 | # Number of spaces to use for each tab. Default: 8, as specified by the 195 | # Haskell report. 196 | spaces: 4 197 | 198 | # Remove trailing whitespace 199 | - trailing_whitespace: {} 200 | 201 | - module_header: 202 | indent: 4 203 | sort: true 204 | break_where: exports 205 | open_bracket: next_line 206 | 207 | # A common setting is the number of columns (parts of) code will be wrapped 208 | # to. Different steps take this into account. Default: 80. 209 | columns: 100 210 | 211 | # By default, line endings are converted according to the OS. You can override 212 | # preferred format here. 213 | # 214 | # - native: Native newline format. CRLF on Windows, LF on other OSes. 215 | # 216 | # - lf: Convert to LF ("\n"). 217 | # 218 | # - crlf: Convert to CRLF ("\r\n"). 219 | # 220 | # Default: native. 221 | newline: lf 222 | 223 | # Sometimes, language extensions are specified in a cabal file or from the 224 | # command line instead of using language pragmas in the file. stylish-haskell 225 | # needs to be aware of these, so it can parse the file correctly. 226 | # 227 | # No language extensions are enabled by default. 
228 | language_extensions: 229 | - FlexibleContexts 230 | # # StylishHaskell needs these extensions even when GHC does not in some cases 231 | # - MultiParamTypeClasses 232 | # - TemplateHaskell 233 | # - FlexibleContexts 234 | # - ExistentialQuantification 235 | -------------------------------------------------------------------------------- /tests/Data/Text/BoyerMooreCISpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Data.Text.BoyerMooreCISpec 5 | ( spec 6 | ) where 7 | 8 | 9 | import Control.Monad (forM_) 10 | import Test.Hspec (Spec, describe, it, shouldBe, parallel) 11 | import Test.Hspec.QuickCheck (modifyMaxSuccess, prop) 12 | import Data.Text (Text) 13 | import Test.QuickCheck (Arbitrary (arbitrary, shrink), forAllShrink) 14 | import Test.QuickCheck.Instances () 15 | 16 | import qualified Data.Char as Char 17 | import qualified Data.Text as Text 18 | import qualified Data.Text.Utf8 as Utf8 19 | import qualified Test.QuickCheck as QuickCheck 20 | 21 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 22 | import Data.Text.TestInstances (arbitraryAlphabet, arbitraryFragment, arbitraryNeedleHaystack, 23 | arbitraryNeedlesHaystack) 24 | 25 | import qualified Data.Text.AhoCorasick.Replacer as AhoReplacer 26 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMooreCI 27 | import qualified Data.Text.BoyerMooreCI.Searcher as Searcher 28 | import qualified Data.Text.BoyerMooreCI.Replacer as Replacer 29 | 30 | 31 | spec :: Spec 32 | spec = parallel $ modifyMaxSuccess (const 200) $ do 33 | 34 | describe "automaton" $ do 35 | 36 | it "works for some basic examples" $ do 37 | matchPositions "a" "abca" `shouldBe` [(0,0), (3,3)] 38 | matchPositions "a" "ABCA" `shouldBe` [(0,0), (3,3)] 39 | matchPositions "abc" "abca" `shouldBe` [(0,2)] 40 | matchPositions "abc" "ABCA" `shouldBe` [(0,2)] 41 | matchPositions "bc" "abca" `shouldBe` 
[(1,2)] 42 | matchPositions "bc" "ABCA" `shouldBe` [(1,2)] 43 | 44 | it "does not yield overlapping matches" $ do 45 | matchPositions "aba" "abababa" `shouldBe` [(0,2), (4,6)] 46 | matchPositions "aba" "ABaBaBA" `shouldBe` [(0,2), (4,6)] 47 | 48 | it "does not work with uppercase needles" $ do 49 | matchPositions "A" "aaaa" `shouldBe` [] 50 | matchPositions "A" "AAAA" `shouldBe` [] 51 | 52 | it "works with cyrillic characters" $ do 53 | -- Cyrillic characters are all two bytes. 54 | -- The match positions are byte indices (not char indices). 55 | matchPositions "п" "ипсум" `shouldBe` [(2,3)] 56 | matchPositions "п" "ИПСУМ" `shouldBe` [(2,3)] 57 | 58 | matchPositions "лорем" "Лорем" `shouldBe` [(0,9)] 59 | matchTexts "лорем" "Лорем" `shouldBe` ["Лорем"] 60 | 61 | matchPositions "лорем" "ЛОРЕМ" `shouldBe` [(0,9)] 62 | matchTexts "лорем" "ЛОРЕМ" `shouldBe` ["ЛОРЕМ"] 63 | 64 | -- This is an interesting case for badCharLookup, because the mismatch 65 | -- happens at "с" which is the first character in the needle. 
66 | matchTexts "сит" "итсит" `shouldBe` ["сит"] 67 | matchTexts "сит" "ИТСИТ" `shouldBe` ["СИТ"] 68 | 69 | it "works with mixed byte lengths" $ do 70 | -- Space is 1 byte 71 | matchTexts "сит" "Лор сит амет" `shouldBe` ["сит"] 72 | matchTexts "сит" "Лорем ипсум долор сит амет" `shouldBe` ["сит"] 73 | matchTexts "сит" "ЛОРЕМ ИПСУМ ДОЛОР СИТ АМЕТ" `shouldBe` ["СИТ"] 74 | 75 | matchTexts "💩b" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["💩b"] 76 | matchTexts "𝄞" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["𝄞","𝄞","𝄞","𝄞"] 77 | matchTexts "a" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["a","a","a"] 78 | 79 | it "works with ⱥ and ⱦ" $ do 80 | -- The letters ⱥ and ⱦ are 3 UTF8 bytes, but have unlowerings Ⱥ and Ⱦ of 2 bytes 81 | matchPositions "ⱥⱦⱥⱦⱥⱦ" "ⱥⱦⱥⱦⱥⱦ" `shouldBe` [(0, 17)] 82 | matchTexts "ⱥⱦⱥⱦⱥⱦ" "ⱥⱦⱥⱦⱥⱦ" `shouldBe` ["ⱥⱦⱥⱦⱥⱦ"] 83 | matchPositions "ⱥⱦⱥⱦⱥⱦ" "ȺȾȺȾȺȾ" `shouldBe` [(0, 11)] 84 | matchTexts "ⱥⱦⱥⱦⱥⱦ" "ȺȾȺȾȺȾ" `shouldBe` ["ȺȾȺȾȺȾ"] 85 | 86 | matchPositions "ⱥⱦⱥⱦⱥⱦ" "ȺⱦⱥȾⱥȾ" `shouldBe` [(0, 14)] 87 | matchTexts "ⱥⱦⱥⱦⱥⱦ" "ȺⱦⱥȾⱥȾ" `shouldBe` ["ȺⱦⱥȾⱥȾ"] 88 | 89 | describe "with a needle equal to the haystack" $ do 90 | 91 | it "reports a single match for a repeated character" $ 92 | forM_ [1..128] $ \n -> 93 | let needle = Text.replicate n "a" in 94 | matchPositions needle needle `shouldBe` [(0, Utf8.lengthUtf8 needle-1)] 95 | 96 | prop "reports a single match for any arbitrary text fragment" $ 97 | QuickCheck.forAll (arbitraryAlphabet >>= arbitraryFragment) $ \text -> 98 | let needle = Utf8.lowerUtf8 text in 99 | matchPositions needle text `shouldBe` [(0, Utf8.lengthUtf8 text-1)] 100 | 101 | describe "with sliced text (using nonzero internal offset)" $ do 102 | 103 | it "still reports offset relative to the text start" $ 104 | -- The match position should be relative to the start of the text "a". 105 | -- Even if this text is represented as a slice of "bbba" internally. 
106 | matchPositions "a" (Text.dropWhile (== 'b') "bbba") `shouldBe` [(0, 0)] 107 | 108 | it "matches ß and ẞ" $ do 109 | matchTexts "groß" "Großfräsmaschinenöffnungstür" `shouldBe` ["Groß"] 110 | matchTexts "groß" "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` ["GROẞ"] 111 | matchTexts "öffnung" "Großfräsmaschinenöffnungstür" `shouldBe` ["öffnung"] 112 | matchTexts "öffnung" "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` ["ÖFFNUNG"] 113 | 114 | 115 | describe "minimumSkipForCodePoint" $ 116 | it "should match the reference implementation" $ do 117 | forM_ [minBound..maxBound] $ \c -> 118 | BoyerMooreCI.minimumSkipForCodePoint c `shouldBe` refMinimumSkipForCodePoint c 119 | 120 | 121 | describe "Searcher" $ do 122 | describe "containsAny" $ do 123 | 124 | -- For the edge case where a needle is the empty string, 125 | -- 'Text.isInfixOf' and 'Searcher.containsAny' are different: 126 | -- 127 | -- @ 128 | -- Text.isInfixOf "" "abc" == True /= False == Searcher.containsAny (Searcher.build [""]) "abc" 129 | -- @ 130 | -- 131 | -- However, at this point we probably shouldn't break this property. 
132 | prop "is equivalent to disjunction of Text.isInfixOf calls*" $ do 133 | QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \(needles, haystack) -> do 134 | let 135 | lneedles = map Utf8.lowerUtf8 needles -- needles must be lowercase 136 | searcher = Searcher.build lneedles 137 | test needle = 138 | not (Text.null needle) && needle `Text.isInfixOf` (Utf8.lowerUtf8 haystack) 139 | Searcher.containsAny searcher haystack `shouldBe` any test lneedles 140 | 141 | describe "containsAll" $ do 142 | prop "is equivalent to conjunction of Text.isInfixOf calls*" $ do 143 | QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \(needles, haystack) -> do 144 | let 145 | lneedles = map Utf8.lowerUtf8 needles -- needles must be lowercase 146 | searcher = Searcher.buildNeedleIdSearcher lneedles 147 | test needle = 148 | not (Text.null needle) && needle `Text.isInfixOf` (Utf8.lowerUtf8 haystack) 149 | Searcher.containsAll searcher haystack `shouldBe` all test lneedles 150 | 151 | describe "Replacer" $ do 152 | describe "replaceSingleLimited" $ do 153 | 154 | prop "is equivalent to Aho-Corasick replacer with a single needle" $ 155 | forAllShrink arbitraryNeedleHaystack shrink $ \(needle, haystack) -> 156 | forAllShrink arbitrary shrink $ \replacement -> 157 | let 158 | lneedle = Utf8.lowerUtf8 needle 159 | expected = 160 | AhoReplacer.run (AhoReplacer.build IgnoreCase [(lneedle, replacement)]) haystack 161 | auto = BoyerMooreCI.buildAutomaton lneedle 162 | actual = Replacer.replaceSingleLimited auto replacement haystack maxBound 163 | in 164 | actual `shouldBe` Just expected 165 | 166 | -- Reference implementation for BoyerMooreCI.minimumSkipForCodePoint 167 | refMinimumSkipForCodePoint :: Utf8.CodePoint -> Utf8.CodeUnitIndex 168 | refMinimumSkipForCodePoint cp = 169 | let codePointLength = length . Utf8.unicode2utf8 . 
Char.ord in 170 | case Utf8.unlowerCodePoint cp of 171 | [] -> 172 | -- Input is upper case, so this is undefined behaviour but we match what the real 173 | -- implementation does: 174 | Utf8.CodeUnitIndex $ codePointLength cp 175 | ucs -> Utf8.CodeUnitIndex $ minimum $ map codePointLength ucs 176 | 177 | 178 | -- | Return indices of the first and last byte of every match 179 | matchPositions :: Text -> Text -> [(Utf8.CodeUnitIndex, Utf8.CodeUnitIndex)] 180 | matchPositions needle = 181 | let 182 | !automaton = BoyerMooreCI.buildAutomaton needle 183 | prependMatch matches matchStart matchEnd = 184 | BoyerMooreCI.Step ((matchStart, matchEnd) : matches) 185 | in 186 | \haystack -> reverse $ BoyerMooreCI.runText [] prependMatch automaton haystack 187 | 188 | positionText :: Text -> (Utf8.CodeUnitIndex, Utf8.CodeUnitIndex) -> Text 189 | positionText haystack (firstByte, lastByte) = 190 | let len = lastByte - firstByte + 1 -- length is 1 if firstByte==lastByte 191 | in Utf8.unsafeSliceUtf8 firstByte len haystack 192 | 193 | matchTexts :: Text -> Text -> [Text] 194 | matchTexts needle haystack = 195 | map (positionText haystack) $ matchPositions needle haystack 196 | -------------------------------------------------------------------------------- /tests/Data/Text/BoyerMooreSpec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} 2 | 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE ScopedTypeVariables #-} 5 | 6 | module Data.Text.BoyerMooreSpec 7 | ( spec 8 | ) where 9 | 10 | import Control.DeepSeq (rnf) 11 | import Control.Monad (forM_) 12 | import Data.Foldable (for_) 13 | import GHC.Stack (HasCallStack) 14 | import Prelude hiding (replicate) 15 | import Test.Hspec (Expectation, Spec, describe, it, parallel, shouldBe) 16 | import Test.Hspec.Expectations (shouldMatchList, shouldSatisfy) 17 | import Test.Hspec.QuickCheck (modifyMaxSuccess, prop) 18 | import Test.QuickCheck 
(Arbitrary (arbitrary, shrink), forAllShrink, (==>)) 19 | import Test.QuickCheck.Instances () 20 | 21 | import qualified Test.QuickCheck as QuickCheck 22 | 23 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 24 | import Data.Text.TestInstances (arbitraryNeedleHaystack) 25 | import Data.Text.Utf8 (Text) 26 | 27 | import qualified Data.Text.Utf8 as Text 28 | import qualified Data.Text.Utf8 as TextSearch 29 | import qualified Data.Text.Utf8 as Utf8 30 | import qualified Data.Text.AhoCorasick.Replacer as AhoReplacer 31 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 32 | import qualified Data.Text.BoyerMoore.Replacer as Replacer 33 | import qualified Data.Text.BoyerMoore.Searcher as Searcher 34 | 35 | -- | Test that for a single needle which equals the haystack, we find a single 36 | -- match. Does not apply to the empty needle. 37 | needleIsHaystackMatches :: HasCallStack => Text -> Expectation 38 | needleIsHaystackMatches needle = 39 | let 40 | prependMatch ms match = BoyerMoore.Step (Utf8.codeUnitIndex match : ms) 41 | matches = BoyerMoore.runText [] prependMatch (BoyerMoore.buildAutomaton needle) needle 42 | in 43 | matches `shouldBe` [0] 44 | 45 | boyerMatch :: Text -> Text -> [Int] 46 | boyerMatch needle haystack = 47 | let 48 | prependMatch matches match = BoyerMoore.Step (Utf8.codeUnitIndex match : matches) 49 | in 50 | BoyerMoore.runText [] prependMatch (BoyerMoore.buildAutomaton needle) haystack 51 | 52 | -- | Match without a payload, return only the match positions. 53 | matchEndPositions :: Text -> Text -> [Int] 54 | matchEndPositions needle haystack = 55 | let 56 | matches = boyerMatch needle haystack 57 | in 58 | fmap (Utf8.codeUnitIndex (Utf8.lengthUtf8 needle) +) matches 59 | 60 | -- | `matchEndPositions` implemented naively in terms of Text's functionality, 61 | -- which we assume to be correct. 
62 | naiveMatchPositions :: Text -> Text -> [Int] 63 | naiveMatchPositions needle haystack = 64 | map toEndPos $ TextSearch.indices needle haystack 65 | where 66 | toEndPos index = Utf8.codeUnitIndex (Utf8.lengthUtf8 needle) + index 67 | 68 | spec :: Spec 69 | spec = parallel $ modifyMaxSuccess (const 200) $ do 70 | describe "build" $ do 71 | prop "does not throw exceptions" $ \ (pat :: Text) -> 72 | rnf $ BoyerMoore.buildAutomaton pat 73 | 74 | describe "runText" $ do 75 | 76 | describe "when given a needle equal to the haystack" $ do 77 | 78 | it "reports a single match for a repeated character" $ 79 | forM_ [1..128] $ \n -> 80 | needleIsHaystackMatches $ Text.replicate n "a" 81 | 82 | it "reports a single match for non-BMP data" $ do 83 | -- Include a few code points outside of the Basic Multilingual Plane, 84 | -- which require multible code units to encode. 85 | needleIsHaystackMatches "\x000437b8suffix" 86 | needleIsHaystackMatches "aaa\359339aaa\95759aa\899256aa" 87 | 88 | prop "reports a single match for random needles" $ \needle -> 89 | not (Text.null needle) ==> needleIsHaystackMatches needle 90 | 91 | describe "when given a sliced text (with nonzero internal offset)" $ do 92 | 93 | it "still reports offset relative to the text start" $ 94 | -- The match position should be relative to the start of the text "a". 95 | -- Even if this text is represented as a slice of "bbba" internally. 96 | matchEndPositions "a" (Text.dropWhile (== 'b') "bbba") `shouldMatchList` [1] 97 | 98 | describe "when given non-ascii inputs" $ do 99 | 100 | -- We have a special lookup table for bad character shifts for 101 | -- the first 128 code units, which is always hit for ascii inputs. 102 | -- Also exercise the fallback code path with a different input. 103 | -- The code point é is encoded as two code units in UTF-8. 
104 | -- 0 7 13 105 | -- │ │ │ 106 | -- ▼ ▼ ▼ 107 | -- ┌───┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┐ 108 | -- │ é │c│l│a│i│r│e│c│l│a│i│r│ Code Points 109 | -- ├─┬─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤ 110 | -- │ │ │ │ │ │ │ │ │ │ │ │ │ │ Code Units (Bytes) 111 | -- └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ 112 | it "reports a match if the haystack contains a character > U+7f" $ do 113 | matchEndPositions "eclair" "éclaireclair" `shouldMatchList` [13] 114 | matchEndPositions "éclair" "éclaireclair" `shouldMatchList` [7] 115 | matchEndPositions "éclair" "eclairéclair" `shouldMatchList` [13] 116 | 117 | it "reports the correct code unit index for complex characters" $ do 118 | -- Note that the index after the match is 4, even though there is 119 | -- only a single code point. U+1d11e is encoded as four code units: 120 | -- in UTF-8: 121 | -- 0 4 122 | -- │ │ 123 | -- ▼ ▼ 124 | -- ┌───────┐ 125 | -- │ 𝄞 │ Code Points 126 | -- ├─┬─┬─┬─┤ 127 | -- │ │ │ │ │ Code Units (Bytes) 128 | -- └─┴─┴─┴─┘ 129 | matchEndPositions "𝄞" "𝄞" `shouldMatchList` [4] 130 | 131 | -- A levitating woman in business suit with dark skin tone needs a 132 | -- whopping 5 code points to encode. The first two need 4 code units each to encode, 133 | -- the remaining three need 3 code units each for a total of 17 code units: 134 | -- 0 4 8 17 135 | -- │ │ │ │ 136 | -- ▼ ▼ ▼ ▼ 137 | -- ┌───────┬───────┬─────┬─────┬─────┐ 138 | -- │ 1 │ 2 │ 3 │ 4 │ 5 │ Code Points 139 | -- ├─┬─┬─┬─┼─┬─┬─┬─┼─┬─┬─┼─┬─┬─┼─┬─┬─┤ 140 | -- │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ Code Units (Bytes) 141 | -- └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ 142 | -- 1. U+1f574: man in business suit levitating (🕴) 143 | -- 2. U+1f3ff: emoji modifier Fitzpatrick type-6 144 | -- 3. U+200d: zero width joiner 145 | -- 4. U+2640: female sign (♀) 146 | -- 5. U+fe0f: variation selector-16 147 | -- A peculiar feature of Unicode emoji, is that the male levivating 148 | -- man in business suit with dark skin tone is a substring of the 149 | -- levivating woman in business suit. 
And the levivating man in 150 | -- business suit without particular skin tone is a substring of that. 151 | let 152 | examples = 153 | [ ("\x1f574\x1f3ff\x200d\x2640\xfe0f", 17) 154 | , ("\x1f574\x1f3ff", 8) 155 | , ("\x1f574", 4) 156 | ] 157 | for_ examples $ \(needle, endPos) -> 158 | matchEndPositions needle "\x1f574\x1f3ff\x200d\x2640\xfe0f" `shouldMatchList` [endPos] 159 | 160 | describe "when given empty needle" $ do 161 | 162 | it "does not report a match" $ do 163 | matchEndPositions "" "" `shouldMatchList` [] 164 | matchEndPositions "" "foo" `shouldMatchList` [] 165 | 166 | describe "kitchen sink" $ do 167 | it "kitchen sinks" $ do 168 | matchEndPositions "\"\SO]JL\"" "aaaaa\"\SO]JL\"" `shouldMatchList` [11] 169 | matchEndPositions "\"X]JL\"" "aaaaa\"X]JL\"" `shouldMatchList` [11] 170 | 171 | describe "when given random needles and haystacks" $ do 172 | 173 | prop "reports only infixes of the haystack" $ 174 | QuickCheck.forAllShrink arbitraryNeedleHaystack shrink $ \ (needle, haystack) -> 175 | let 176 | matches = boyerMatch needle haystack 177 | sliceMatch startPos len = Utf8.unsafeSliceUtf8 startPos len haystack 178 | in 179 | forM_ matches $ \pos -> do 180 | needle `shouldSatisfy` (`Text.isInfixOf` haystack) 181 | sliceMatch (Utf8.CodeUnitIndex pos) (Utf8.lengthUtf8 needle) `shouldBe` needle 182 | 183 | prop "reports all infixes of the haystack" $ 184 | QuickCheck.forAllShrink arbitraryNeedleHaystack shrink $ \ (needle, haystack) -> 185 | matchEndPositions needle haystack `shouldMatchList` naiveMatchPositions needle haystack 186 | 187 | describe "replaceSingleLimited" $ do 188 | 189 | prop "is equivalent to Aho-Corasick replacer with a single needle" $ 190 | forAllShrink arbitraryNeedleHaystack shrink $ \(needle, haystack) -> 191 | forAllShrink arbitrary shrink $ \replacement -> 192 | let 193 | expected = AhoReplacer.run (AhoReplacer.build CaseSensitive [(needle, replacement)]) haystack 194 | 195 | auto = BoyerMoore.buildAutomaton needle 196 | 197 | 
actual = Replacer.replaceSingleLimited auto replacement haystack maxBound 198 | in 199 | actual `shouldBe` Just expected 200 | 201 | describe "Searcher" $ do 202 | 203 | describe "containsAny" $ do 204 | 205 | -- For the edge case where a needle is the empty string, 206 | -- 'Text.isInfixOf' and 'Searcher.containsAny' are different: 207 | -- 208 | -- @ 209 | -- Text.isInfixOf "" "abc" == True /= False == Searcher.containsAny (Searcher.build [""]) "abc" 210 | -- @ 211 | -- 212 | -- However, at this point we probably shouldn't break this property. 213 | prop "is equivalent to disjunction of Text.isInfixOf calls*" $ \ (needles :: [Text]) (haystack :: Text) -> 214 | let 215 | searcher = Searcher.build needles 216 | test needle = 217 | not (Text.null needle) && needle `Text.isInfixOf` haystack 218 | in 219 | Searcher.containsAny searcher haystack `shouldBe` any test needles 220 | 221 | describe "containsAll" $ do 222 | prop "is equivalent to conjunction of Text.isInfixOf calls*" $ \ (needles :: [Text]) (haystack :: Text) -> 223 | let 224 | searcher = Searcher.buildNeedleIdSearcher needles 225 | test needle = 226 | not (Text.null needle) && needle `Text.isInfixOf` haystack 227 | in 228 | Searcher.containsAll searcher haystack `shouldBe` all test needles 229 | -------------------------------------------------------------------------------- /bench/bm/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Main 5 | ( main 6 | ) where 7 | 8 | import Criterion.Main (bgroup, bench, defaultMain, nf) 9 | 10 | import qualified Data.Text as Text 11 | import qualified Data.Text.AhoCorasick.Automaton as Aho 12 | import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore 13 | import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMooreCI 14 | import qualified Data.Text.Utf8 as Utf8 15 | 16 | -- 17 | -- @ 18 | -- stack bench alfred-margaret:bm --ba '--output 
bm.html' 19 | -- @ 20 | 21 | main :: IO () 22 | main = defaultMain 23 | [ 24 | bgroup "Big needle, large haystack" $ 25 | benchVariants needle1 haystack1 26 | 27 | , bgroup "Big needle, small haystack" $ 28 | benchVariants needle2 haystack2 29 | 30 | , bgroup "Small needle, large haystack" $ 31 | benchVariants needle3 haystack3 32 | 33 | , bgroup "Small needle, small haystack" $ 34 | benchVariants needle4 haystack4 35 | 36 | , bgroup "Cyrillic, medium needle, large haystack" $ 37 | benchVariants needle5 haystack5 38 | 39 | ] 40 | where 41 | benchVariants needle haystack = 42 | [ bench "Count Aho" $ nf (ahoCount needle) haystack 43 | , bench "Count Aho CI" $ nf (ahociCount needle) haystack 44 | , bench "Count BoyerMoore" $ nf (bmCount needle) haystack 45 | , bench "Count BoyerMoore CI" $ nf (bmciCount needle) haystack 46 | , bench "Count Aho CI multineedle" $ nf (ahocimultiCount needle) haystack 47 | , bench "Count Text.count" $ nf (Text.count needle) haystack 48 | -- This naive case insensitive count is really slow. We leave it out 49 | -- because it throws off the plot scale. 50 | -- , bench "Count Text.count . lowerUtf8" $ nf (Text.count needle . Utf8.lowerUtf8) haystack 51 | , bench "Contains Aho" $ nf (ahoContains needle) haystack 52 | , bench "Contains Aho CI" $ nf (ahociContains needle) haystack 53 | , bench "Contains BoyerMoore" $ nf (bmContains needle) haystack 54 | , bench "Contains BoyerMoore CI" $ nf (bmciContains needle) haystack 55 | , bench "Contains Text.isInfixOf" $ nf (Text.isInfixOf needle) haystack 56 | ] 57 | 58 | 59 | -- | The NOINLINE annotation is used so that the needle doesn't get inlined at compile time. The 60 | -- function does get partially applied with the needle so that the automaton is constructed only 61 | -- once. 
62 | ahoCount :: Text.Text -> Text.Text -> Int 63 | {-# NOINLINE ahoCount #-} 64 | ahoCount needle = 65 | let 66 | !automaton = Aho.build [(needle, ())] 67 | onMatch !n _match = Aho.Step (n + 1) 68 | in 69 | -- Eta expansion of the haystack variable is necessary for proper inlining of runText 70 | \haystack -> Aho.runText 0 onMatch automaton haystack 71 | 72 | ahoContains :: Text.Text -> Text.Text -> Bool 73 | {-# NOINLINE ahoContains #-} 74 | ahoContains needle = 75 | let 76 | !automaton = Aho.build [(needle, ())] 77 | onMatch _ _match = Aho.Done True 78 | in 79 | \haystack -> Aho.runText False onMatch automaton haystack 80 | 81 | ahociCount :: Text.Text -> Text.Text -> Int 82 | {-# NOINLINE ahociCount #-} 83 | ahociCount needle = 84 | let 85 | !automaton = Aho.build [(needle, ())] 86 | onMatch !n _match = Aho.Step (n + 1) 87 | in 88 | \haystack -> Aho.runLower 0 onMatch automaton haystack 89 | 90 | 91 | ahocimultiCount :: Text.Text -> Text.Text -> Int 92 | {-# NOINLINE ahocimultiCount #-} 93 | ahocimultiCount needle = 94 | let 95 | !automaton = Aho.build $ map (\n -> (n, ())) $ Aho.needleCasings needle 96 | onMatch !n _match = Aho.Step (n + 1) 97 | in 98 | \haystack -> Aho.runText 0 onMatch automaton haystack 99 | 100 | ahociContains :: Text.Text -> Text.Text -> Bool 101 | {-# NOINLINE ahociContains #-} 102 | ahociContains needle = 103 | let 104 | !automaton = Aho.build [(needle, ())] 105 | onMatch _ _match = Aho.Done True 106 | in 107 | \haystack -> Aho.runLower False onMatch automaton haystack 108 | 109 | bmciCount :: Text.Text -> Text.Text -> Int 110 | {-# NOINLINE bmciCount #-} 111 | bmciCount needle = 112 | let 113 | !automaton = BoyerMooreCI.buildAutomaton needle 114 | onMatch !n _matchStart _matchEnd = BoyerMooreCI.Step (n + 1) 115 | in 116 | \haystack -> BoyerMooreCI.runText 0 onMatch automaton haystack 117 | 118 | bmciContains :: Text.Text -> Text.Text -> Bool 119 | {-# NOINLINE bmciContains #-} 120 | bmciContains needle = 121 | let 122 | !automaton = 
BoyerMooreCI.buildAutomaton needle 123 | onMatch _ _matchStart _matchEnd = BoyerMooreCI.Done True 124 | in 125 | \haystack -> BoyerMooreCI.runText False onMatch automaton haystack 126 | 127 | bmCount :: Text.Text -> Text.Text -> Int 128 | {-# NOINLINE bmCount #-} 129 | bmCount needle = 130 | let 131 | !automaton = BoyerMoore.buildAutomaton needle 132 | onMatch !n _match = BoyerMoore.Step (n + 1) 133 | in 134 | \haystack -> BoyerMoore.runText 0 onMatch automaton haystack 135 | 136 | bmContains :: Text.Text -> Text.Text -> Bool 137 | {-# NOINLINE bmContains #-} 138 | bmContains needle = 139 | let 140 | !automaton = BoyerMoore.buildAutomaton needle 141 | onMatch _ _match = BoyerMoore.Done True 142 | in 143 | \haystack -> BoyerMoore.runText False onMatch automaton haystack 144 | 145 | 146 | needle1 :: Text.Text 147 | needle1 = "necessitatibus" 148 | 149 | haystack1 :: Text.Text 150 | haystack1 = 151 | Text.unlines 152 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 153 | , "Qui quia totam non rerum eveniet sed tempora repellendus ab enim consequatur eum quaerat iste. Sed amet nihil sed voluptate aspernatur ut rerum facilis est officia earum aut molestiae tenetur non autem nulla. Qui deserunt necessitatibus ab accusamus doloremque non sint aspernatur." 154 | , "Et officia illum non quaerat obcaecati cum accusamus minus rem quae quis sed rerum omnis sed inventore quasi. Qui totam deserunt sit minima ullam sit debitis dolores. Est debitis explicabo ut temporibus corporis nam harum dolore est fuga numquam non exercitationem Quis cum amet fuga." 155 | , "Aut incidunt provident et sequi nulla est molestias perferendis. Hic exercitationem modi ex optio cumque nam voluptate debitis nam iste consequatur non nihil rerum ut accusantium nihil. 
Sed ullam maiores nobis dolorem sit galisum maiores eum reprehenderit maxime sed galisum placeat cum molestiae quia id similique velit. Aut quasi autem non illo reiciendis sit ullam tempora." 156 | , "Ut corporis exercitationem sed dicta autem ut voluptatem dolorem vel dolores dolores. Qui ipsam quisquam sed facere porro rem autem necessitatibus nam beatae quisquam. Quo voluptatem optio hic quod reprehenderit ut nostrum voluptatem." 157 | , "Et consequuntur quia vel unde laudantium non voluptatum magnam. Ut quam autem rem fugit quia ut assumenda error quam amet in omnis quia ut rerum soluta sed consequatur fuga. Ut blanditiis quia et facilis ratione aut blanditiis dolorum aut itaque excepturi eos iste incidunt qui blanditiis velit et magni autem." 158 | ] 159 | 160 | needle2 :: Text.Text 161 | needle2 = "necessitatibus" 162 | 163 | haystack2 :: Text.Text 164 | haystack2 = 165 | Text.unlines 166 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 167 | ] 168 | 169 | needle3 :: Text.Text 170 | needle3 = "sit" 171 | 172 | haystack3 :: Text.Text 173 | haystack3 = 174 | Text.unlines 175 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 176 | , "Qui quia totam non rerum eveniet sed tempora repellendus ab enim consequatur eum quaerat iste. Sed amet nihil sed voluptate aspernatur ut rerum facilis est officia earum aut molestiae tenetur non autem nulla. Qui deserunt necessitatibus ab accusamus doloremque non sint aspernatur." 177 | , "Et officia illum non quaerat obcaecati cum accusamus minus rem quae quis sed rerum omnis sed inventore quasi. Qui totam deserunt sit minima ullam sit debitis dolores. 
Est debitis explicabo ut temporibus corporis nam harum dolore est fuga numquam non exercitationem Quis cum amet fuga." 178 | , "Aut incidunt provident et sequi nulla est molestias perferendis. Hic exercitationem modi ex optio cumque nam voluptate debitis nam iste consequatur non nihil rerum ut accusantium nihil. Sed ullam maiores nobis dolorem sit galisum maiores eum reprehenderit maxime sed galisum placeat cum molestiae quia id similique velit. Aut quasi autem non illo reiciendis sit ullam tempora." 179 | , "Ut corporis exercitationem sed dicta autem ut voluptatem dolorem vel dolores dolores. Qui ipsam quisquam sed facere porro rem autem necessitatibus nam beatae quisquam. Quo voluptatem optio hic quod reprehenderit ut nostrum voluptatem." 180 | , "Et consequuntur quia vel unde laudantium non voluptatum magnam. Ut quam autem rem fugit quia ut assumenda error quam amet in omnis quia ut rerum soluta sed consequatur fuga. Ut blanditiis quia et facilis ratione aut blanditiis dolorum aut itaque excepturi eos iste incidunt qui blanditiis velit et magni autem." 181 | ] 182 | 183 | needle4 :: Text.Text 184 | needle4 = "sit" 185 | 186 | haystack4 :: Text.Text 187 | haystack4 = 188 | Text.unlines 189 | [ "Lorem ipsum dolor sit amet. Et sint voluptatibus est vero maxime vel explicabo reprehenderit non molestiae quisquam sit dolores facere qui cumque quibusdam 33 impedit deserunt! Aut libero harum et quis quasi qui cupiditate autem." 190 | ] 191 | 192 | 193 | 194 | needle5 :: Text.Text 195 | needle5 = "сусципиантур" 196 | 197 | -- Cyrillic lipsum from https://generator.lorem-ipsum.info/_russian 198 | haystack5 :: Text.Text 199 | haystack5 = 200 | Text.unlines 201 | [ "Лорем ипсум долор сит амет, ин вис вирис маиорум инсоленс, алиа агам иудицабит цу нец, ут адмодум инцидеринт еффициантур цум. Вих те номинави диспутатиони, не дуо путент дицунт. Дебет репудиандае яуо ан, вих тамяуам репудиандае цонцлусионемяуе но, цу иус регионе урбанитас. 
Персиус дицерет форенсибус ат цум, еум яуаеяуе аппеллантур еу. Хас вивендо цонсететур те, ад хас меис пхаедрум, репудиаре демоцритум вих еу." 202 | , "Пер ут нулла минимум цонтентионес, сеа ат мунере алтера медиоцрем. Но вим лабитур цонсететур, нам ут ипсум фацилис феугаит. Не дицат феугиат аппареат нам, дуис адиписцинг ех еам. Ут еуисмод веритус интерпретарис усу. Сед цасе дицунт ехпетендис ех. Ад лудус урбанитас вим." 203 | , "Перфецто инсоленс ут меи, адмодум ментитум партиендо усу ат. Ин солеат еирмод аперири дуо, еи мазим фацилиси делицата сед. Дицо неглегентур ут хис, еи хис афферт аудире. Ат аццусата глориатур вел." 204 | , "Персиус омиттам при ад, цонгуе воцент репудиаре усу ет. Ат детрахит салутатус вим. Еа сеа яуис тациматес. Ид фацер дицтас еум, вери дицам еум еу, при ид тантас омиттам. Суавитате персецути еам те, ет яуот омиттантур ест." 205 | , "Еа тациматес евертитур дуо, меа ин вениам ассентиор. Еос ессе мунди аццусата цу. Ат партем медиоцрем иус, ан мел алии плацерат. Еам еи суммо ерант тинцидунт. Меа еа алиа сцрибентур." 206 | , "Еи вих малис омиттам сплендиде, ид витае ностер алияуандо еам, те новум репудиаре пер. Вис цу еррор алтерум садипсцинг, ан ерат фацете аудире ест. Меи ат виде ассуеверит. Вис либрис популо ид, ат хис симилияуе репудиандае, нибх салутатус ех сеа." 207 | , "Сеа пондерум адиписцинг ех, ет нонумы анциллае вих. Цу иллум ессент медиоцрем вис, еу мовет мунди еос, яуо зрил долорем партиендо но. Хис ат диам цоммодо демоцритум, вери цаусае инцидеринт усу ин. Темпорибус диссентиунт вис еа, ех цум цонсул поссим алтерум. Вим ин веро нулла, еиус реяуе ех меа." 208 | , "Омнес трацтатос цу еос. Пер ехерци модератиус еа. Нец оффендит сусципит ех. Пер ех постеа фацете, те еум еиус игнота легендос. Ат поссе воцибус пер, ин про лаудем сплендиде цотидиеяуе, но адхуц инермис аццоммодаре еос. Ин денияуе малуиссет еам, ин яуис доцтус нец." 209 | , "Идяуе елояуентиам еу яуи, витае интерессет не вис, хис диссентиет сусципиантур цу. 
Тибияуе елаборарет еам еа. Оратио вивендум цонсулату ид пер. Еа путант еррорибус сит, цум ад алияуид аццоммодаре. Алтерум делецтус алияуандо еу иус, ан иус поссит персиус тибияуе. Сеа не нумяуам фуиссет, ат виси аудире при. Суас граеце цопиосае дуо ан, но мунди опортере цонвенире яуи." 210 | , "Мел фугит путант опортере но, цопиосае оффициис еос еа. Вис инвидунт еффициенди ат, те луцилиус сцрибентур хис, вих не поссит вертерем. Дуо омнис дицат вирис еи, ет адхуц цетеро вис. Еа вим утинам инвенире, вих ин еяуидем ассентиор." 211 | ] 212 | -------------------------------------------------------------------------------- /src/Data/Text/AhoCorasick/Replacer.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE CPP #-} 9 | {-# LANGUAGE DeriveAnyClass #-} 10 | {-# LANGUAGE DeriveGeneric #-} 11 | {-# LANGUAGE DerivingStrategies #-} 12 | 13 | -- | Implements sequential string replacements based on the Aho-Corasick algorithm. 14 | module Data.Text.AhoCorasick.Replacer 15 | ( -- * State machine 16 | Needle 17 | , Payload (..) 18 | , Replacement 19 | , Replacer (..) 
20 | , replacerCaseSensitivity 21 | , build 22 | , compose 23 | , mapReplacement 24 | , run 25 | , runWithLimit 26 | , setCaseSensitivity 27 | ) where 28 | 29 | import Control.DeepSeq (NFData) 30 | import Data.Hashable (Hashable) 31 | import Data.List (sort) 32 | import Data.Maybe (fromJust) 33 | import GHC.Generics (Generic) 34 | 35 | #if defined(HAS_AESON) 36 | import qualified Data.Aeson as AE 37 | #endif 38 | 39 | import Data.Text.AhoCorasick.Searcher (Searcher) 40 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 41 | import Data.Text.Utf8 (CodeUnitIndex (..), Text) 42 | 43 | import qualified Data.Text as Text 44 | import qualified Data.Text.AhoCorasick.Automaton as Aho 45 | import qualified Data.Text.AhoCorasick.Searcher as Searcher 46 | import qualified Data.Text.Utf8 as Utf8 47 | 48 | -- | Descriptive type alias for strings to search for. 49 | type Needle = Text 50 | 51 | -- | Descriptive type alias for replacements. 52 | type Replacement = Text 53 | 54 | -- | Priority of a needle. Higher integers indicate higher priorities. 55 | -- Replacement order is such that all matches of priority p are replaced before 56 | -- replacing any matches of priority q where p > q. 57 | type Priority = Int 58 | 59 | data Payload = Payload 60 | { needlePriority :: {-# UNPACK #-} !Priority 61 | , needleLengthBytes :: {-# UNPACK #-} !CodeUnitIndex 62 | -- ^ Number of bytes is used for case sensitive matching 63 | , needleLengthCodePoints :: {-# UNPACK #-} !Int 64 | -- ^ For case insensitive matches, the byte length does not necessarily match the needle byte 65 | -- length. Due to our simple case folding the number of codepoints _does_ match, so we put that 66 | -- in the payload. It's less efficient because we have to scan backwards through the text to 67 | -- obtain the length of a match. 
68 | 69 | , needleReplacement :: !Replacement 70 | } 71 | #if defined(HAS_AESON) 72 | deriving (Eq, Generic, Hashable, NFData, Show, AE.FromJSON, AE.ToJSON) 73 | #else 74 | deriving (Eq, Generic, Hashable, NFData, Show) 75 | #endif 76 | 77 | -- | A state machine used for efficient replacements with many different needles. 78 | data Replacer = Replacer 79 | { replacerSearcher :: Searcher Payload 80 | } 81 | deriving stock (Show, Eq, Generic) 82 | #if defined(HAS_AESON) 83 | deriving (Hashable, NFData, AE.FromJSON, AE.ToJSON) 84 | #else 85 | deriving (Hashable, NFData) 86 | #endif 87 | 88 | -- | Build an Aho-Corasick automaton that can be used for performing fast 89 | -- sequential replaces. 90 | -- 91 | -- Case-insensitive matching performs per-letter language-agnostic lower-casing. 92 | -- Therefore, it will work in most cases, but not in languages where lower-casing 93 | -- depends on the context of the character in question. 94 | -- 95 | -- We need to revisit this algorithm when we want to implement full Unicode 96 | -- support. 97 | build :: CaseSensitivity -> [(Needle, Replacement)] -> Replacer 98 | build caseSensitivity replaces = Replacer searcher 99 | where 100 | searcher = Searcher.buildWithValues caseSensitivity $ zipWith mapNeedle [0..] replaces 101 | mapNeedle i (needle, replacement) = 102 | -- Note that we negate i: earlier needles have a higher priority. We 103 | -- could avoid it and define larger integers to be lower priority, but 104 | -- that made the terminology in this module very confusing. 105 | let needle' = case Searcher.caseSensitivity searcher of 106 | CaseSensitive -> needle 107 | IgnoreCase -> Utf8.lowerUtf8 needle 108 | -- Payload includes byte and code point lengths, so can still be used if we change case 109 | -- sensitivity later. 
110 | payload = Payload 111 | { needlePriority = (-i) 112 | , needleLengthBytes = Utf8.lengthUtf8 needle 113 | , needleLengthCodePoints = Text.length needle 114 | , needleReplacement = replacement 115 | } 116 | in (needle', payload) 117 | 118 | -- | Return the composition `replacer2` after `replacer1`, if they have the same 119 | -- case sensitivity. If the case sensitivity differs, Nothing is returned. 120 | compose :: Replacer -> Replacer -> Maybe Replacer 121 | compose (Replacer searcher1) (Replacer searcher2) 122 | | Searcher.caseSensitivity searcher1 /= Searcher.caseSensitivity searcher2 = Nothing 123 | | otherwise = 124 | let 125 | -- Replace the priorities of the second machine, so they all come after 126 | -- the first. 127 | renumber i (needle, Payload _ lenb lenc replacement) = (needle, Payload (-i) lenb lenc replacement) 128 | needles1 = Searcher.needles searcher1 129 | needles2 = Searcher.needles searcher2 130 | cs = Searcher.caseSensitivity searcher1 131 | searcher = Searcher.buildWithValues cs $ zipWith renumber [0..] (needles1 ++ needles2) 132 | in 133 | Just $ Replacer searcher 134 | 135 | -- | Modify the replacement of a replacer. It doesn't modify the needles. 136 | mapReplacement :: (Replacement -> Replacement) -> Replacer -> Replacer 137 | mapReplacement f replacer = replacer{ 138 | replacerSearcher = Searcher.mapSearcher 139 | (\p -> p {needleReplacement = f (needleReplacement p)}) 140 | (replacerSearcher replacer) 141 | } 142 | 143 | 144 | replacerCaseSensitivity :: Replacer -> CaseSensitivity 145 | replacerCaseSensitivity (Replacer searcher) = Searcher.caseSensitivity searcher 146 | 147 | 148 | -- | Updates the case sensitivity of the replacer. Does not change the 149 | -- capitilization of the needles. The caller should be certain that if IgnoreCase 150 | -- is passed, the needles are already lower case. 
151 | setCaseSensitivity :: CaseSensitivity -> Replacer -> Replacer 152 | setCaseSensitivity case_ (Replacer searcher) = 153 | Replacer (Searcher.setCaseSensitivity case_ searcher) 154 | 155 | 156 | -- A match collected while running replacements. It is isomorphic to the Match 157 | -- reported by the automaton, but the data is arranged in a more useful way: 158 | -- as the start index and length of the match, and the replacement. 159 | data Match = Match !CodeUnitIndex !CodeUnitIndex !Text deriving (Eq, Ord, Show, Generic) 160 | 161 | -- | Apply replacements of all matches. Assumes that the matches are ordered by 162 | -- match position, and that no matches overlap. 163 | replace :: [Match] -> Text -> Text 164 | replace matches haystack = Utf8.concat $ go 0 matches haystack 165 | where 166 | -- At every match, cut the string into three pieces, removing the match. 167 | -- Because a Text is a buffer pointer and (offset, length), cutting does not 168 | -- involve string copies. Only at the very end we piece together the strings 169 | -- again, so Text can allocate a buffer of the right length and memcpy the 170 | -- parts into the new target string. 171 | -- If `k` is a code unit index into the original text, then `k - offset` 172 | -- is an index into `remainder`. In other words, `offset` is the index into 173 | -- the original text where `remainder` starts. 174 | go :: CodeUnitIndex -> [Match] -> Text -> [Text] 175 | go !_offset [] remainder = [remainder] 176 | go !offset ((Match pos len replacement) : ms) remainder = 177 | let 178 | (prefix, suffix) = Utf8.unsafeCutUtf8 (pos - offset) len remainder 179 | in 180 | prefix : replacement : go (pos + len) ms suffix 181 | 182 | -- | Compute the length of the string resulting from applying the replacements. 
183 | replacementLength :: [Match] -> Text -> CodeUnitIndex 184 | replacementLength matches initial = go matches (Utf8.lengthUtf8 initial) 185 | where 186 | go [] !acc = acc 187 | go (Match _ matchLen repl : rest) !acc = go rest (acc - matchLen + Utf8.lengthUtf8 repl) 188 | 189 | -- | Given a list of matches sorted on start position, remove matches that start 190 | -- within an earlier match. 191 | removeOverlap :: [Match] -> [Match] 192 | removeOverlap matches = case matches of 193 | [] -> [] 194 | [m] -> [m] 195 | (m0@(Match pos0 len0 _) : m1@(Match pos1 _ _) : ms) -> 196 | if pos1 >= pos0 + len0 197 | then m0 : removeOverlap (m1:ms) 198 | else removeOverlap (m0:ms) 199 | 200 | run :: Replacer -> Text -> Text 201 | run replacer = fromJust . runWithLimit replacer maxBound 202 | 203 | {-# NOINLINE runWithLimit #-} 204 | runWithLimit :: Replacer -> CodeUnitIndex -> Text -> Maybe Text 205 | runWithLimit (Replacer searcher) maxLength = go initialThreshold 206 | where 207 | !automaton = Searcher.automaton searcher 208 | 209 | -- Priorities are 0 or lower, so an initial threshold of 1 keeps all 210 | -- matches. 211 | !initialThreshold = 1 212 | 213 | -- Needle priorities go from 0 for the highest priority to (-numNeedles + 1) 214 | -- for the lowest priority. That means that if we find a match with 215 | -- minPriority, we don't need to do another pass afterwards, because there 216 | -- are no remaining needles. 217 | !minPriority = 1 - Searcher.numNeedles searcher 218 | 219 | go :: Priority -> Text -> Maybe Text 220 | go !threshold haystack = 221 | let 222 | seed = (minBound :: Priority, []) 223 | matchesWithPriority = case Searcher.caseSensitivity searcher of 224 | CaseSensitive -> Aho.runText seed (prependMatch threshold haystack) automaton haystack 225 | IgnoreCase -> Aho.runLower seed (prependMatch threshold haystack) automaton haystack 226 | in 227 | case matchesWithPriority of 228 | -- No match at the given threshold, there is nothing left to do. 
229 | -- Return the input string unmodified. 230 | (_, []) -> Just haystack 231 | -- We found matches at priority p. Remove overlapping matches, then 232 | -- apply all replacements. Next, we need to go again, this time 233 | -- considering only needles with a lower priority than p. As an 234 | -- optimization (which matters mainly for the single needle case), 235 | -- if we find a match at the lowest priority, we don't need another 236 | -- pass. Note that if in `rawMatches` we find only matches of priority 237 | -- p > minPriority, then we do still need another pass, because the 238 | -- replacements could create new matches. 239 | (p, matches) 240 | | replacementLength matches haystack > maxLength -> Nothing 241 | | p == minPriority -> Just $ replace (removeOverlap $ sort matches) haystack 242 | | otherwise -> go p $ replace (removeOverlap $ sort matches) haystack 243 | 244 | -- When we iterate through all matches, keep track only of the matches with 245 | -- the highest priority: those are the ones that we will replace first. If we 246 | -- find multiple matches with that priority, remember all of them. If we find a 247 | -- match with lower priority, ignore it, because we already have a more 248 | -- important match. Also, if the priority is `threshold` or higher, ignore the 249 | -- match, so we can exclude matches if we already did a round of replacements 250 | -- for that priority. This way we don't have to build a new automaton after 251 | -- every round of replacements. 
252 | prependMatch 253 | :: Priority -> Text -> (Priority, [Match]) -> Aho.Match Payload -> Aho.Next (Priority, [Match]) 254 | {-# INLINE prependMatch #-} 255 | prependMatch !threshold haystack (!pBest, !matches) (Aho.Match pos (Payload pMatch lenb lenc replacement)) 256 | | pMatch < threshold && pMatch > pBest = 257 | Aho.Step (pMatch, [makeMatch haystack pos lenb lenc replacement]) 258 | | pMatch < threshold && pMatch == pBest = 259 | Aho.Step (pMatch, makeMatch haystack pos lenb lenc replacement : matches) 260 | | otherwise = Aho.Step (pBest, matches) 261 | 262 | -- Pos is the code unit index past the last code unit of the match, we have 263 | -- to find the first code unit. 264 | makeMatch :: Text -> CodeUnitIndex -> CodeUnitIndex -> Int -> Replacement -> Match 265 | {-# INLINE makeMatch #-} 266 | makeMatch = case Searcher.caseSensitivity searcher of 267 | -- Case sensitive: length is interpreted as number of bytes 268 | CaseSensitive -> \_ pos lenb _ replacement -> 269 | Match (pos - lenb) lenb replacement 270 | -- Case insensitive: length is interpreted as number of characters 271 | IgnoreCase -> \haystack pos _ lenc replacement -> 272 | -- We start at the last byte of the match, and look backwards. 273 | let start = Utf8.skipCodePointsBackwards haystack (pos-1) (lenc-1) in 274 | Match start (pos - start) replacement 275 | -------------------------------------------------------------------------------- /tests/Data/Text/AhoCorasickSpec.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2022 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 
6 | 7 | {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} 8 | 9 | {-# LANGUAGE BangPatterns #-} 10 | {-# LANGUAGE OverloadedStrings #-} 11 | {-# LANGUAGE ScopedTypeVariables #-} 12 | 13 | module Data.Text.AhoCorasickSpec where 14 | 15 | import Control.Monad (forM_) 16 | import Data.Foldable (foldl') 17 | import Data.List.NonEmpty (NonEmpty ((:|))) 18 | import Test.Hspec (Expectation, Spec, describe, it, shouldBe) 19 | import Test.Hspec.QuickCheck (modifyMaxSize, prop) 20 | import Test.QuickCheck (Arbitrary (arbitrary, shrink), forAll, forAllShrink) 21 | import Test.QuickCheck.Instances () 22 | 23 | import qualified Data.Text as T 24 | import qualified Test.QuickCheck.Gen as Gen 25 | 26 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 27 | import Data.Text.TestInstances () 28 | import Data.Text.Utf8 (Text) 29 | 30 | import qualified Data.Text.Utf8 as Text 31 | import qualified Data.Text.Utf8 as Utf8 32 | import qualified Data.Text.AhoCorasick.Automaton as Aho 33 | import qualified Data.Text.AhoCorasick.Replacer as Replacer 34 | import qualified Data.Text.AhoCorasick.Searcher as Searcher 35 | import qualified Data.Text.AhoCorasick.Splitter as Splitter 36 | 37 | spec :: Spec 38 | spec = do 39 | -- Ensure that helper functions are actually helping 40 | -- Examples are from https://en.wikipedia.org/wiki/UTF-8 41 | describe "IsString ByteArray" $ do 42 | 43 | it "encodes the dollar sign" $ utf8Test "$" [0x24] 44 | it "encodes the euro sign" $ utf8Test "€" [0xe2, 0x82, 0xac] 45 | it "encodes the pound sign" $ utf8Test "£" [0xc2, 0xa3] 46 | it "encodes Hwair" $ utf8Test "𐍈" [0xf0, 0x90, 0x8d, 0x88] 47 | it "encodes all of the above" $ utf8Test "$€£𐍈" [0x24, 0xe2, 0x82, 0xac, 0xc2, 0xa3, 0xf0, 0x90, 0x8d, 0x88] 48 | 49 | describe "runText" $ do 50 | 51 | describe "countMatches" $ do 52 | it "counts the right number of matches in a basic example" $ do 53 | countMatches Aho.CaseSensitive ["abc", "rst", "xyz"] "abcdefghijklmnopqrstuvwxyz" `shouldBe` 3 54 | 55 | it 
"counts the right number of matches in an example with 1-, 2-, 3- and 4-code unit code points" $ do 56 | countMatches Aho.CaseSensitive ["$", "£"] "$€£𐍈" `shouldBe` 2 57 | 58 | describe "runLower" $ do 59 | 60 | describe "countMatches" $ do 61 | it "counts the right number of matches in a basic example" $ do 62 | countMatches Aho.IgnoreCase ["abc", "rst", "xyz"] "abcdefghijklmnopqrstuvwxyz" `shouldBe` 3 63 | 64 | it "does not work with uppercase needles" $ do 65 | countMatches Aho.IgnoreCase ["ABC", "Rst", "xYZ"] "abcdefghijklmnopqrstuvwxyz" `shouldBe` 0 66 | 67 | it "works with characters that are not in ASCII" $ do 68 | countMatches Aho.IgnoreCase ["groß", "öffnung", "tür"] "Großfräsmaschinenöffnungstür" `shouldBe` 3 69 | countMatches Aho.IgnoreCase ["groß", "öffnung", "tür"] "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` 3 70 | 71 | modifyMaxSize (const 10) $ describe "Replacer" $ do 72 | 73 | describe "run" $ do 74 | let 75 | genHaystack = fmap Utf8.pack $ Gen.listOf $ Gen.frequency [(40, Gen.elements "abAB"), (1, pure 'İ'), (1, arbitrary)] 76 | 77 | -- needles may not be empty, because empty needles are filtered out in an I.ActionReplaceMultiple 78 | genNeedle = fmap Utf8.pack $ Gen.resize 3 $ Gen.listOf1 $ Gen.elements "abAB" 79 | genReplaces = Gen.listOf $ (,) <$> genNeedle <*> arbitrary 80 | shrinkReplaces = filter (not . any (\(needle, _) -> Utf8.null needle)) . shrink 81 | 82 | replace needles haystack = 83 | Replacer.run (Replacer.build Aho.CaseSensitive needles) haystack 84 | 85 | replaceIgnoreCase needles haystack = 86 | Replacer.run (Replacer.build Aho.IgnoreCase needles) haystack 87 | 88 | it "replaces all occurrences" $ do 89 | replace [("A", "B")] "AXAXB" `shouldBe` "BXBXB" 90 | replace [("A", "B"), ("X", "Y")] "AXAXB" `shouldBe` "BYBYB" 91 | replace [("aaa", ""), ("b", "c")] "aaabaaa" `shouldBe` "c" 92 | -- Have a few non-matching needles too. 
93 | replace [("A", "B"), ("Q", "r"), ("Z", "")] "AXAXB" `shouldBe` "BXBXB" 94 | 95 | it "replaces only non-overlapping matches" $ do 96 | replace [("aa", "zz"), ("bb", "w")] "aaabbb" `shouldBe` "zzawb" 97 | replace [("aaa", "")] "aaaaa" `shouldBe` "aa" 98 | 99 | it "replaces all occurrences in priority order" $ do 100 | replace [("A", ""), ("BBBB", "bingo")] "BBABB" `shouldBe` "bingo" 101 | replace [("BB", ""), ("BBBB", "bingo")] "BBBB" `shouldBe` "" 102 | 103 | it "replaces needles that contain a surrogate pair" $ 104 | replace [("\x1f574", "levitating man in business suit")] 105 | "the \x1f574" `shouldBe` "the levitating man in business suit" 106 | 107 | 108 | it "replaces all occurrences case-insensitively" $ do 109 | replaceIgnoreCase [("A", "B")] "AXAXB" `shouldBe` "BXBXB" 110 | replaceIgnoreCase [("A", "B")] "axaxb" `shouldBe` "BxBxb" 111 | replaceIgnoreCase [("a", "b")] "AXAXB" `shouldBe` "bXbXB" 112 | 113 | replaceIgnoreCase [("A", "B"), ("X", "Y")] "AXAXB" `shouldBe` "BYBYB" 114 | replaceIgnoreCase [("A", "B"), ("X", "Y")] "axaxb" `shouldBe` "BYBYb" 115 | replaceIgnoreCase [("a", "b"), ("x", "y")] "AXAXB" `shouldBe` "bybyB" 116 | 117 | it "matches replacements case-insensitively" $ 118 | replaceIgnoreCase [("foo", "BAR"), ("bar", "BAZ")] "Foo" `shouldBe` "BAZ" 119 | 120 | it "matches replacements case-insensitively for non-ascii characters" $ do 121 | replaceIgnoreCase [("éclair", "lightning")] "Éclair" `shouldBe` "lightning" 122 | -- Note: U+0319 is an uppercase alpha, which looks exactly like A, but it 123 | -- is a different code point. 
124 | replaceIgnoreCase [("å", "b")] "åÅÅ" `shouldBe` "bbb" 125 | replaceIgnoreCase [("k", "m")] "KkK" `shouldBe` "mmm" 126 | replaceIgnoreCase [("dz", "z")] "dzDzDZ" `shouldBe` "zzz" 127 | replaceIgnoreCase [("bèta", "α"), ("\x0391", "alpha")] "BÈTA" `shouldBe` "alpha" 128 | replaceIgnoreCase [("ßèta", "sseta")] "ßèta" `shouldBe` "sseta" 129 | replaceIgnoreCase [("ßèta", "sseta")] "ẞÈTA" `shouldBe` "sseta" 130 | 131 | it "matches surrogate pairs case-insensitively" $ do 132 | -- We can't lowercase a levivating man in business suit, but that should 133 | -- not affect whether we match it or not. 134 | replaceIgnoreCase [("\x1f574", "levitating man in business suit")] "the \x1f574" 135 | `shouldBe` "the levitating man in business suit" 136 | 137 | prop "satisfies (run . compose a b) == (run b (run a))" $ 138 | forAllShrink genHaystack shrink $ \haystack -> 139 | forAll arbitrary $ \case_ -> 140 | forAllShrink genReplaces shrinkReplaces $ \replaces1 -> 141 | forAllShrink genReplaces shrinkReplaces $ \replaces2 -> 142 | let 143 | rm1 = Replacer.build case_ replaces1 144 | rm2 = Replacer.build case_ replaces2 145 | Just rm12 = Replacer.compose rm1 rm2 146 | in 147 | Replacer.run rm2 (Replacer.run rm1 haystack) 148 | `shouldBe` Replacer.run rm12 haystack 149 | 150 | prop "is identity for empty needles" $ \case_ haystack -> 151 | let replacerId = Replacer.build case_ [] 152 | in Replacer.run replacerId haystack `shouldBe` haystack 153 | 154 | prop "is equivalent to sequential Text.replace calls" $ 155 | forAllShrink genHaystack shrink $ \haystack -> 156 | forAllShrink genReplaces shrinkReplaces $ \replaces -> 157 | let 158 | replacer = Replacer.build Aho.CaseSensitive replaces 159 | -- TODO: Remove conversions once we move to text-2.0 160 | replaceText agg (needle, replacement) = Utf8.pack $ T.unpack $ T.replace (T.pack $ Utf8.unpack needle) (T.pack $ Utf8.unpack replacement) (T.pack $ Utf8.unpack agg) 161 | expected = foldl' replaceText haystack replaces 162 | in 163 | 
Replacer.run replacer haystack `shouldBe` expected 164 | 165 | describe "Searcher" $ do 166 | 167 | describe "containsAny" $ do 168 | 169 | it "gives the right values for the examples in the README" $ do 170 | let needles = ["tshirt", "shirts", "shorts"] 171 | let searcher = Searcher.build Aho.CaseSensitive needles 172 | 173 | Searcher.containsAny searcher "short tshirts" `shouldBe` True 174 | Searcher.containsAny searcher "long shirt" `shouldBe` False 175 | Searcher.containsAny searcher "Short TSHIRTS" `shouldBe` False 176 | 177 | let searcher' = Searcher.build Aho.IgnoreCase needles 178 | 179 | Searcher.containsAny searcher' "Short TSHIRTS" `shouldBe` True 180 | 181 | it "works with the the first line of the illiad" $ do 182 | let illiad = "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ" 183 | needleSets = [(["μοι"], True), (["Ὀδυσεύς"], False)] 184 | 185 | forM_ needleSets $ \(needles, expectedResult) -> do 186 | let searcher = Searcher.build Aho.CaseSensitive needles 187 | Searcher.containsAny searcher illiad `shouldBe` expectedResult 188 | 189 | it "works with the the first line of the illiad (ignore case)" $ do 190 | let illiad = "ἌΝΔΡΑ ΜΟΙ ἜΝΝΕΠΕ, ΜΟΥ͂ΣΑ, ΠΟΛΎΤΡΟΠΟΝ, ὋΣ ΜΆΛΑ ΠΟΛΛᾺ" 191 | searcher = Searcher.build Aho.IgnoreCase ["μοι"] 192 | Searcher.containsAny searcher illiad `shouldBe` True 193 | 194 | describe "containsAll" $ do 195 | 196 | prop "never reports true for empty needles" $ \ (haystack :: Text) -> 197 | let 198 | searcher = Searcher.buildNeedleIdSearcher CaseSensitive [""] 199 | in 200 | Searcher.containsAll searcher haystack `shouldBe` False 201 | 202 | prop "is equivalent to sequential Text.isInfixOf calls for non-empty needles" $ \ (needles' :: [NonEmptyText]) (haystack :: Text) -> 203 | let 204 | needles = map unNonEmptyText needles' 205 | searcher = Searcher.buildNeedleIdSearcher CaseSensitive needles 206 | in 207 | Searcher.containsAll searcher haystack `shouldBe` all (`Text.isInfixOf` haystack) needles 208 | 209 | prop "is 
equivalent to sequential Text.isInfixOf calls for case-insensitive matching for non-empty needles" $ \ (needles' :: [NonEmptyText]) (haystack :: Text) -> 210 | let 211 | needles = map unNonEmptyText needles' 212 | 213 | lowerNeedles = map Utf8.lowerUtf8 needles 214 | lowerHaystack = Utf8.lowerUtf8 haystack 215 | 216 | searcher = Searcher.buildNeedleIdSearcher IgnoreCase lowerNeedles 217 | in 218 | Searcher.containsAll searcher haystack `shouldBe` all (`Text.isInfixOf` lowerHaystack) lowerNeedles 219 | 220 | describe "Splitter" $ do 221 | 222 | describe "split" $ do 223 | 224 | it "passes an example" $ do 225 | let separator = "bob" 226 | splitter = Splitter.build separator 227 | 228 | Splitter.split splitter "C++bobobCOBOLbobScala" `shouldBe` "C++" :| ["obCOBOL", "Scala"] 229 | Splitter.splitIgnoreCase splitter "C++bobobCOBOLbobScala" `shouldBe` "C++" :| ["obCOBOL", "Scala"] 230 | Splitter.splitIgnoreCase splitter "C++BOBOBCOBOLBOBSCALA" `shouldBe` "C++" :| ["OBCOBOL", "SCALA"] 231 | 232 | it "neatly splits the first line of the illiad" $ do 233 | let splitter = Splitter.build ", " 234 | 235 | Splitter.split splitter "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ" `shouldBe` 236 | "Ἄνδρα μοι ἔννεπε" :| ["Μοῦσα", "πολύτροπον", "ὃς μάλα πολλὰ"] 237 | Splitter.splitIgnoreCase splitter "Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ" `shouldBe` 238 | "Ἄνδρα μοι ἔννεπε" :| ["Μοῦσα", "πολύτροπον", "ὃς μάλα πολλὰ"] 239 | 240 | it "splits on case insensitive needles" $ do 241 | -- The case variations of å have different byte lengths 242 | let splitter = Splitter.build "å" 243 | Splitter.splitIgnoreCase splitter "aaåbbÅccÅdd" `shouldBe` 244 | "aa" :| ["bb", "cc", "dd"] 245 | 246 | -- helpers 247 | 248 | utf8Test :: Utf8.Text -> [Utf8.CodeUnit] -> Expectation 249 | utf8Test str byteList = str `shouldBe` Utf8.fromByteList byteList 250 | 251 | -- From ./benchmark 252 | countMatches :: Aho.CaseSensitivity -> [Utf8.Text] -> Utf8.Text -> Int 253 | {-# NOINLINE 
countMatches #-} 254 | countMatches caseSensitivity needles haystack = case needles of 255 | [] -> 0 256 | _ -> 257 | let 258 | ac = Aho.build $ zip needles (repeat ()) 259 | onMatch !n _match = Aho.Step (n + 1) 260 | in 261 | Aho.runWithCase caseSensitivity 0 onMatch ac haystack 262 | 263 | -- | A newtype for generating non-empty 'Text' values. 264 | newtype NonEmptyText = NonEmptyText { unNonEmptyText :: Text } 265 | 266 | -- | Simply generates and packs non-empty @[Char]@ values. 267 | instance Arbitrary NonEmptyText where 268 | arbitrary = NonEmptyText . Text.pack <$> Gen.listOf1 arbitrary 269 | 270 | instance Show NonEmptyText where 271 | show = show . unNonEmptyText 272 | -------------------------------------------------------------------------------- /src/Data/Text/BoyerMoore/Automaton.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2019 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE CPP #-} 9 | {-# LANGUAGE DeriveAnyClass #-} 10 | {-# LANGUAGE DeriveGeneric #-} 11 | {-# LANGUAGE DerivingStrategies #-} 12 | {-# LANGUAGE ScopedTypeVariables #-} 13 | 14 | -- | An efficient implementation of the Boyer-Moore string search algorithm. 15 | -- http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140 16 | -- https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm 17 | -- 18 | -- This module contains a almost 1:1 translation from the C example code in the 19 | -- wikipedia article. 20 | -- 21 | -- The algorithm here can be potentially improved by including the Galil rule 22 | -- (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_Galil_rule) 23 | module Data.Text.BoyerMoore.Automaton 24 | ( Automaton 25 | , CaseSensitivity (..) 26 | , CodeUnitIndex (..) 27 | , Next (..) 
28 | , buildAutomaton 29 | , patternLength 30 | , patternText 31 | , runText 32 | ) where 33 | 34 | import Prelude hiding (length) 35 | 36 | import Control.DeepSeq (NFData) 37 | import Control.Monad (when) 38 | import Control.Monad.ST (runST) 39 | import Data.Hashable (Hashable (..)) 40 | import Data.Primitive.Extended 41 | ( Prim 42 | , PrimArray 43 | , indexPrimArray 44 | , newPrimArray 45 | , replicateMutablePrimArray 46 | , unsafeFreezePrimArray 47 | , writePrimArray 48 | ) 49 | import GHC.Generics (Generic) 50 | 51 | #if defined(HAS_AESON) 52 | import qualified Data.Aeson as AE 53 | #endif 54 | 55 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) 56 | import Data.Text.Utf8 (CodeUnit, CodeUnitIndex (..), Text) 57 | 58 | import qualified Data.Text.Utf8 as Utf8 59 | 60 | data Next a 61 | = Done !a 62 | | Step !a 63 | 64 | -- | A Boyer-Moore automaton is based on lookup-tables that allow skipping through the haystack. 65 | -- This allows for sub-linear matching in some cases, as we do not have to look at every input 66 | -- character. 67 | -- 68 | -- NOTE: Unlike the AcMachine, a Boyer-Moore automaton only returns non-overlapping matches. 69 | -- This means that a Boyer-Moore automaton is not a 100% drop-in replacement for Aho-Corasick. 70 | -- 71 | -- Returning overlapping matches would degrade the performance to /O(nm)/ in pathological cases like 72 | -- finding @aaaa@ in @aaaaa....aaaaaa@ as for each match it would scan back the whole /m/ characters 73 | -- of the pattern. 74 | data Automaton = Automaton 75 | { automatonPattern :: !Text 76 | , automatonPatternHash :: !Int -- ^ Remember our own hash (similar to what 'Hashed' does but our 77 | -- fields are strict). 
78 | , automatonSuffixTable :: !SuffixTable 79 | , automatonBadCharTable :: !BadCharTable 80 | } 81 | deriving stock (Generic, Show) 82 | deriving anyclass (NFData) 83 | 84 | instance Hashable Automaton where 85 | hashWithSalt salt (Automaton _ patternHash _ _) = hashWithSalt salt patternHash 86 | 87 | instance Eq Automaton where 88 | (Automaton pat1 patHash1 _ _) == (Automaton pat2 patHash2 _ _) = 89 | patHash1 == patHash2 && pat1 == pat2 90 | 91 | #if defined(HAS_AESON) 92 | instance AE.FromJSON Automaton where 93 | parseJSON v = buildAutomaton <$> AE.parseJSON v 94 | 95 | instance AE.ToJSON Automaton where 96 | toJSON = AE.toJSON . automatonPattern 97 | #endif 98 | 99 | buildAutomaton :: Text -> Automaton 100 | buildAutomaton pattern = 101 | Automaton pattern (hash pattern) (buildSuffixTable pattern) (buildBadCharTable pattern) 102 | 103 | -- | Finds all matches in the text, calling the match callback with the *first* 104 | -- matched character of each match of the pattern. 105 | -- 106 | -- NOTE: This is unlike Aho-Corasick, which reports the index of the character 107 | -- right after a match. 108 | -- 109 | -- NOTE: In the UTF-16 version of this module, there is a function 'Data.Text.BoyerMoore.Automaton.runLower' 110 | -- which does lower-case matching. This function does not exist for the UTF-8 version since it is very 111 | -- tricky to skip code points going backwards without preprocessing the whole input first. 112 | -- 113 | -- NOTE: To get full advantage of inlining this function, you probably want to 114 | -- compile the compiling module with -fllvm and the same optimization flags as 115 | -- this module. 116 | runText :: forall a 117 | . 
a 118 | -> (a -> CodeUnitIndex -> Next a) 119 | -> Automaton 120 | -> Text 121 | -> a 122 | {-# INLINE runText #-} 123 | runText seed f automaton text 124 | | patLen == 0 = seed 125 | | otherwise = go seed (patLen - 1) 126 | where 127 | -- Use needle as identifier since pattern is potentially a keyword 128 | Automaton needle _ suffixTable badCharTable = automaton 129 | patLen = Utf8.lengthUtf8 needle 130 | stringLen = Utf8.lengthUtf8 text 131 | 132 | codeUnitAt = Utf8.unsafeIndexCodeUnit text 133 | 134 | {-# INLINE go #-} 135 | go result haystackIndex 136 | | haystackIndex < stringLen = matchLoop result haystackIndex (patLen - 1) 137 | | otherwise = result 138 | 139 | -- Compare the needle back-to-front with the haystack 140 | matchLoop result haystackIndex needleIndex 141 | | needleIndex >= 0 && codeUnitAt haystackIndex == Utf8.unsafeIndexCodeUnit needle needleIndex = 142 | -- Characters match, try the pair before 143 | matchLoop result (haystackIndex - 1) (needleIndex - 1) 144 | -- We found a match (all needle characters matched) 145 | | needleIndex < 0 = 146 | case f result (haystackIndex + 1) of 147 | Done final -> final 148 | -- `haystackIndex` now points to the character just before the match starts 149 | -- Adding `patLen` once points to the last character of the match, 150 | -- Adding `patLen` once more points to the earliest character where 151 | -- we can find a non-overlapping match. 152 | Step intermediate -> go intermediate (haystackIndex + 2 * patLen) 153 | -- We know it's not a match, the characters differ at the current position 154 | | otherwise = 155 | let 156 | -- The bad character table tells us how far we can advance to the right so that the 157 | -- character at the current position in the input string, where matching failed, 158 | -- is lined up with it's rightmost occurrence in the needle. 159 | -- Note: we could end up left of were we started, essentially never making progress, 160 | -- if we were to use this rule alone. 
161 | badCharSkip = badCharLookup badCharTable (codeUnitAt haystackIndex) 162 | suffixSkip = suffixLookup suffixTable needleIndex 163 | skip = max badCharSkip suffixSkip 164 | in 165 | go result (haystackIndex + skip) 166 | 167 | -- | Length of the matched pattern measured in UTF-8 code units (bytes). 168 | patternLength :: Automaton -> CodeUnitIndex 169 | patternLength = Utf8.lengthUtf8 . patternText 170 | 171 | -- | Return the pattern that was used to construct the automaton. 172 | patternText :: Automaton -> Text 173 | patternText = automatonPattern 174 | 175 | -- | The suffix table tells us for each character of the pattern how many characters we can 176 | -- jump ahead if the match fails at that point. 177 | newtype SuffixTable = SuffixTable (PrimArray Int) 178 | deriving stock (Generic, Show) 179 | deriving anyclass (NFData) 180 | 181 | -- | Lookup an entry in the suffix table. 182 | suffixLookup :: SuffixTable -> CodeUnitIndex -> CodeUnitIndex 183 | {-# INLINE suffixLookup #-} 184 | suffixLookup (SuffixTable table) = CodeUnitIndex . indexTable table . codeUnitIndex 185 | 186 | buildSuffixTable :: Text -> SuffixTable 187 | buildSuffixTable pattern = runST $ do 188 | let patLen = Utf8.lengthUtf8 pattern 189 | 190 | table <- newPrimArray $ codeUnitIndex patLen 191 | 192 | let 193 | -- Case 1: For each position of the pattern we record the shift that would align the pattern so 194 | -- that it starts at the longest suffix that is at the same time a prefix, if a mismatch would 195 | -- happen at that position. 196 | -- 197 | -- Suppose the length of the pattern is n, a mismatch occurs at position i in the pattern and j 198 | -- in the haystack, then we know that pattern[i+1..n] == haystack[j+1..j+n-i]. That is, we know 199 | -- that the part of the haystack that we already matched is a suffix of the pattern. 
200 | -- If the pattern happens to have a prefix that is equal to or a shorter suffix of that matched 201 | -- suffix, we can shift the pattern to the right so that the pattern starts at the longest 202 | -- suffix that we have seen that coincides with a prefix of the pattern. 203 | -- 204 | -- Consider the pattern `ababa`. Then we get 205 | -- 206 | -- p: 0 1 2 3 4 207 | -- Pattern: a b a b a 208 | -- lastPrefixIndex: 2 2 4 4 5 209 | -- table: 6 5 6 5 5 210 | init1 lastPrefixIndex p 211 | | p >= 0 = do 212 | let 213 | prefixIndex 214 | | isPrefix pattern (p + 1) = p + 1 215 | | otherwise = lastPrefixIndex 216 | writePrimArray table (codeUnitIndex p) (codeUnitIndex $ prefixIndex + patLen - 1 - p) 217 | init1 prefixIndex (p - 1) 218 | | otherwise = pure () 219 | 220 | -- Case 2: We also have to account for the fact that the matching suffix of the pattern might 221 | -- occur again somewhere within the pattern. In that case, we may not shift as far as if it was 222 | -- a prefix. That is why the `init2` loop is run after `init1`, potentially overwriting some 223 | -- entries with smaller shifts. 224 | init2 p 225 | | p < patLen - 1 = do 226 | let 227 | suffixLen = suffixLength pattern p 228 | when (Utf8.unsafeIndexCodeUnit pattern (p - suffixLen) /= Utf8.unsafeIndexCodeUnit pattern (patLen - 1 - suffixLen)) $ 229 | writePrimArray table (codeUnitIndex $ patLen - 1 - suffixLen) (codeUnitIndex $ patLen - 1 - p + suffixLen) 230 | init2 (p + 1) 231 | | otherwise = pure () 232 | 233 | init1 (patLen - 1) (patLen - 1) 234 | init2 0 235 | 236 | SuffixTable <$> unsafeFreezePrimArray table 237 | 238 | 239 | -- | The bad char table tells us how far we may skip ahead when encountering a certain character 240 | -- in the input string. For example, if there's a character that is not contained in the pattern at 241 | -- all, we can skip ahead until after that character. 
data BadCharTable = BadCharTable
  { badCharTableEntries :: {-# UNPACK #-} !(PrimArray Int)
    -- ^ Skip distances, indexed by code unit. The element type should be
    -- CodeUnitIndex, but there's no unboxed vector for that type, and defining
    -- it would be a lot of boilerplate.
  , badCharTablePatternLen :: CodeUnitIndex
    -- ^ Length of the pattern this table was built from.
  }
  deriving stock (Generic, Show)
  deriving anyclass (NFData)

-- | Number of entries in the fixed-size lookup-table of the bad char table:
-- one entry for every possible byte value.
badcharTableSize :: Int
{-# INLINE badcharTableSize #-}
badcharTableSize = 256

-- | Lookup an entry in the bad char table.
badCharLookup :: BadCharTable -> CodeUnit -> CodeUnitIndex
{-# INLINE badCharLookup #-}
badCharLookup (BadCharTable entries _patLen) cu =
  CodeUnitIndex $ indexTable entries (fromIntegral cu)

-- | True if the suffix of the @pattern@ starting from @pos@ is a prefix of the pattern
-- For example, @isPrefix \"aabbaa\" 4 == True@.
isPrefix :: Text -> CodeUnitIndex -> Bool
isPrefix needle pos = loop 0
  where
    -- Number of code units in the suffix that starts at @pos@.
    numSuffixUnits = Utf8.lengthUtf8 needle - pos
    -- Walk the suffix and the start of the needle in lockstep, bailing out on
    -- the first mismatching code unit.
    loop !i
      | i >= numSuffixUnits = True
      | Utf8.unsafeIndexCodeUnit needle i /= Utf8.unsafeIndexCodeUnit needle (pos + i) = False
      | otherwise = loop (i + 1)

-- | Length of the longest suffix of the pattern ending on @pos@.
-- For example, @suffixLength \"abaacbbaac\" 4 == 4@, because the substring \"baac\" ends at position
-- 4 and is at the same time the longest suffix that does so, having length 4.
280 | suffixLength :: Text -> CodeUnitIndex -> CodeUnitIndex 281 | suffixLength pattern pos = go 0 282 | where 283 | patLen = Utf8.lengthUtf8 pattern 284 | go i 285 | | Utf8.unsafeIndexCodeUnit pattern (pos - i) == Utf8.unsafeIndexCodeUnit pattern (patLen - 1 - i) && i < pos = go (i + 1) 286 | | otherwise = i 287 | 288 | buildBadCharTable :: Text -> BadCharTable 289 | buildBadCharTable pattern = runST $ do 290 | let patLen = Utf8.lengthUtf8 pattern 291 | 292 | -- Initialize table with the maximum skip distance, which is the length of the pattern. 293 | -- This applies to all characters that are not part of the pattern. 294 | asciiTable <- replicateMutablePrimArray badcharTableSize $ codeUnitIndex patLen 295 | 296 | let 297 | -- Fill the bad character table based on the rightmost occurrence of a character in the pattern. 298 | -- Note that there is also a variant of Boyer-Moore that records all positions (see Wikipedia, 299 | -- but that requires even more storage space). 300 | -- Also note that we exclude the last character of the pattern when building the table. 301 | -- This is because 302 | -- 303 | -- 1. If the last character does not occur anywhere else in the pattern and we encounter it 304 | -- during a mismatch, we can advance the pattern to just after that character: 305 | -- 306 | -- Haystack: aaadcdabcdbb 307 | -- Pattern: abcd 308 | -- 309 | -- In the above example, we would match `d` and `c`, but then fail because `d` != `b`. 310 | -- Since `d` only occurs at the very last position of the pattern, we can shift to 311 | -- 312 | -- Haystack: aaadcdabcdbb 313 | -- Pattern: abcd 314 | -- 315 | -- 2. 
If it does occur anywhere else in the pattern, we can only shift as far as it's necessary 316 | -- to align it with the haystack: 317 | -- 318 | -- Haystack: aaadddabcdbb 319 | -- Pattern: adcd 320 | -- 321 | -- We match `d`, and then there is a mismatch `d` != `c`, which allows us to shift only up to: 322 | 323 | -- Haystack: aaadddabcdbb 324 | -- Pattern: adcd 325 | fillTable !i 326 | -- for(i = 0; i < patLen - 1; i++) { 327 | | i < patLen - 1 = do 328 | let patChar = Utf8.unsafeIndexCodeUnit pattern i 329 | writePrimArray asciiTable (fromIntegral patChar) (codeUnitIndex $ patLen - 1 - i) 330 | fillTable (i + 1) 331 | | otherwise = pure () 332 | 333 | fillTable 0 334 | 335 | asciiTableFrozen <- unsafeFreezePrimArray asciiTable 336 | 337 | pure BadCharTable 338 | { badCharTableEntries = asciiTableFrozen 339 | , badCharTablePatternLen = patLen 340 | } 341 | 342 | 343 | -- Helper functions for easily toggling the safety of this module 344 | 345 | -- | Read from a lookup table at the specified index. 346 | indexTable :: Prim a => PrimArray a -> Int -> a 347 | {-# INLINE indexTable #-} 348 | indexTable = indexPrimArray 349 | -------------------------------------------------------------------------------- /src/Data/Text/Utf8.hs: -------------------------------------------------------------------------------- 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching 2 | -- Copyright 2022 Channable 3 | -- 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the 5 | -- repository root. 6 | 7 | {-# LANGUAGE BangPatterns #-} 8 | {-# LANGUAGE BinaryLiterals #-} 9 | {-# LANGUAGE CPP #-} 10 | {-# LANGUAGE DeriveGeneric #-} 11 | {-# LANGUAGE DerivingStrategies #-} 12 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 13 | {-# LANGUAGE MagicHash #-} 14 | {-# LANGUAGE NumericUnderscores #-} 15 | {-# LANGUAGE UnboxedTuples #-} 16 | 17 | -- | This module provides functions that allow treating 'Text' values as series of UTF-8 code units 18 | -- instead of characters. 
Any calls to 'Text' in @alfred-margaret@ go through this module. 19 | -- Therefore we re-export some 'Text' functions, e.g. 'Text.concat'. 20 | module Data.Text.Utf8 21 | ( CodePoint 22 | , CodeUnit 23 | , CodeUnitIndex (..) 24 | , Text (..) 25 | , fromByteList 26 | , isCaseInvariant 27 | , lengthUtf8 28 | , lowerCodePoint 29 | , unlowerCodePoint 30 | , lowerUtf8 31 | , toLowerAscii 32 | , unicode2utf8 33 | , unpackUtf8 34 | -- * Decoding 35 | -- 36 | -- $decoding 37 | , decode2 38 | , decode3 39 | , decode4 40 | , decodeUtf8 41 | -- * Indexing 42 | -- 43 | -- $indexing 44 | , indexCodeUnit 45 | , unsafeIndexCodePoint 46 | , unsafeIndexCodeUnit 47 | , skipCodePointsBackwards 48 | -- * Slicing Functions 49 | -- 50 | -- $slicingFunctions 51 | , unsafeCutUtf8 52 | , unsafeSliceUtf8 53 | -- * Functions on Arrays 54 | -- 55 | -- $functionsOnArrays 56 | , arrayContents 57 | , isArrayPinned 58 | , unsafeIndexCodePoint' 59 | , unsafeIndexCodeUnit' 60 | , BackwardsIter (..) 61 | , unsafeIndexEndOfCodePoint' 62 | , unsafeIndexAnywhereInCodePoint' 63 | 64 | -- * General Functions 65 | -- 66 | -- $generalFunctions 67 | , Text.concat 68 | , Text.dropWhile 69 | , Text.isInfixOf 70 | , Text.null 71 | , Text.pack 72 | , Text.replicate 73 | , Text.unpack 74 | , TextSearch.indices 75 | ) where 76 | 77 | import Control.DeepSeq (NFData) 78 | import Data.Bits (Bits (shiftL), shiftR, (.&.), (.|.)) 79 | import Data.Hashable (Hashable) 80 | import Data.Text.Internal (Text (..)) 81 | import Data.Word (Word8) 82 | import GHC.Generics (Generic) 83 | import Data.Primitive (ByteArray (ByteArray), Prim, byteArrayFromList) 84 | #if defined(HAS_AESON) 85 | import Data.Aeson (FromJSON, ToJSON) 86 | #endif 87 | import Data.Text.Utf8.Unlower (unlowerCodePoint) 88 | 89 | import qualified Data.Char as Char 90 | import qualified Data.Text as Text 91 | import qualified Data.Text.Array as TextArray 92 | import qualified Data.Text.Internal.Search as TextSearch 93 | import qualified Data.Text.Unsafe as 
TextUnsafe 94 | import qualified GHC.Exts as Exts 95 | 96 | -- | A UTF-8 code unit is a byte. A Unicode code point can be encoded as up to four code units. 97 | type CodeUnit = Word8 98 | 99 | -- | A Unicode code point. 100 | type CodePoint = Char 101 | 102 | -- | An index into the raw UTF-8 data of a `Text`. This is not the code point 103 | -- index as conventionally accepted by `Text`, so we wrap it to avoid confusing 104 | -- the two. Incorrect index manipulation can lead to surrogate pairs being 105 | -- sliced, so manipulate indices with care. This type is also used for lengths. 106 | newtype CodeUnitIndex = CodeUnitIndex 107 | { codeUnitIndex :: Int 108 | } 109 | deriving stock (Eq, Ord, Generic, Bounded) 110 | #if defined(HAS_AESON) 111 | deriving newtype (Show, Prim, Hashable, Num, NFData, FromJSON, ToJSON) 112 | #else 113 | deriving newtype (Show, Prim, Hashable, Num, NFData) 114 | #endif 115 | 116 | {-# INLINABLE unpackUtf8 #-} 117 | unpackUtf8 :: Text -> [CodeUnit] 118 | unpackUtf8 (Text u8data offset len) = 119 | let 120 | go _ 0 = [] 121 | go i n = unsafeIndexCodeUnit' u8data (CodeUnitIndex i) : go (i + 1) (n - 1) 122 | in 123 | go offset len 124 | 125 | -- | The return value of this function is not really an index. 126 | -- However the signature is supposed to make it clear that the length is returned in terms of code units, not code points. 127 | lengthUtf8 :: Text -> CodeUnitIndex 128 | lengthUtf8 (Text _ _ !len) = CodeUnitIndex len 129 | 130 | -- | Lower-case the ASCII code points A-Z and leave the rest of ASCII intact. 131 | {-# INLINE toLowerAscii #-} 132 | toLowerAscii :: Char -> Char 133 | toLowerAscii cp 134 | | Char.isAsciiUpper cp = Char.chr (Char.ord cp + 0x20) 135 | | otherwise = cp 136 | 137 | -- | Lowercase a 'Text' by applying 'lowerCodePoint' to each 'Char'. 
138 | {-# INLINE lowerUtf8 #-} 139 | lowerUtf8 :: Text -> Text 140 | lowerUtf8 = Text.map lowerCodePoint 141 | 142 | asciiCount :: Int 143 | asciiCount = 128 144 | 145 | {-# INLINE lowerCodePoint #-} 146 | -- | Lower-Case a UTF-8 codepoint. 147 | -- Uses 'toLowerAscii' for ASCII and 'Char.toLower' otherwise. 148 | lowerCodePoint :: Char -> Char 149 | lowerCodePoint cp 150 | | Char.ord cp < asciiCount = toLowerAscii cp 151 | | otherwise = Char.toLower cp 152 | 153 | -- | Convert a Unicode Code Point 'c' into a list of UTF-8 code units (bytes). 154 | unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a] 155 | {-# INLINE unicode2utf8 #-} 156 | unicode2utf8 c 157 | | c < 0x80 = [c] 158 | | c < 0x800 = [0xc0 .|. (c `shiftR` 6), 0x80 .|. (0x3f .&. c)] 159 | | c < 0x10000 = [0xe0 .|. (c `shiftR` 12), 0x80 .|. (0x3f .&. (c `shiftR` 6)), 0x80 .|. (0x3f .&. c)] 160 | | otherwise = [0xf0 .|. (c `shiftR` 18), 0x80 .|. (0x3f .&. (c `shiftR` 12)), 0x80 .|. (0x3f .&. (c `shiftR` 6)), 0x80 .|. (0x3f .&. c)] 161 | 162 | fromByteList :: [Word8] -> Text 163 | fromByteList byteList = Text (TextArray.ByteArray ba#) 0 (length byteList) 164 | where !(ByteArray ba#) = byteArrayFromList byteList 165 | 166 | -- | Return whether text has exactly one case variation, such that this function 167 | -- will not return true when Aho–Corasick would differentiate when doing 168 | -- case-insensitive matching. 169 | {-# INLINE isCaseInvariant #-} 170 | isCaseInvariant :: Text -> Bool 171 | isCaseInvariant = Text.all (\c -> unlowerCodePoint (lowerCodePoint c) == [c]) 172 | 173 | -- $decoding 174 | -- 175 | -- Functions that turns code unit sequences into code point sequences. 176 | 177 | -- | Decode a single UTF-8 code unit into its code point. 
178 | -- The given code unit should have the following format: 179 | -- 180 | -- > ┌───────────────┐ 181 | -- > │0 x x x x x x x│ 182 | -- > └───────────────┘ 183 | decode1 :: CodeUnit -> CodePoint 184 | decode1 cu0 = 185 | Char.chr $ fromIntegral cu0 186 | 187 | -- | Decode 2 UTF-8 code units into their code point. 188 | -- The given code units should have the following format: 189 | -- 190 | -- > ┌───────────────┬───────────────┐ 191 | -- > │1 1 0 x x x x x│1 0 x x x x x x│ 192 | -- > └───────────────┴───────────────┘ 193 | {-# INLINE decode2 #-} 194 | decode2 :: CodeUnit -> CodeUnit -> CodePoint 195 | decode2 cu0 cu1 = 196 | Char.chr $ (fromIntegral cu0 .&. 0x1f) `shiftL` 6 .|. fromIntegral cu1 .&. 0x3f 197 | 198 | -- | Decode 3 UTF-8 code units into their code point. 199 | -- The given code units should have the following format: 200 | -- 201 | -- > ┌───────────────┬───────────────┬───────────────┐ 202 | -- > │1 1 1 0 x x x x│1 0 x x x x x x│1 0 x x x x x x│ 203 | -- > └───────────────┴───────────────┴───────────────┘ 204 | {-# INLINE decode3 #-} 205 | decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint 206 | decode3 cu0 cu1 cu2 = 207 | Char.chr $ (fromIntegral cu0 .&. 0xf) `shiftL` 12 .|. (fromIntegral cu1 .&. 0x3f) `shiftL` 6 .|. (fromIntegral cu2 .&. 0x3f) 208 | 209 | -- | Decode 4 UTF-8 code units into their code point. 210 | -- The given code units should have the following format: 211 | -- 212 | -- > ┌───────────────┬───────────────┬───────────────┬───────────────┐ 213 | -- > │1 1 1 1 0 x x x│1 0 x x x x x x│1 0 x x x x x x│1 0 x x x x x x│ 214 | -- > └───────────────┴───────────────┴───────────────┴───────────────┘ 215 | {-# INLINE decode4 #-} 216 | decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint 217 | decode4 cu0 cu1 cu2 cu3 = 218 | Char.chr $ (fromIntegral cu0 .&. 0x7) `shiftL` 18 .|. (fromIntegral cu1 .&. 0x3f) `shiftL` 12 .|. (fromIntegral cu2 .&. 0x3f) `shiftL` 6 .|. (fromIntegral cu3 .&. 
0x3f) 219 | 220 | -- | Decode a list of UTF-8 code units into a list of code points. 221 | decodeUtf8 :: [CodeUnit] -> [CodePoint] 222 | decodeUtf8 [] = [] 223 | decodeUtf8 (cu0 : cus) | cu0 < 0xc0 = Char.chr (fromIntegral cu0) : decodeUtf8 cus 224 | decodeUtf8 (cu0 : cu1 : cus) | cu0 < 0xe0 = decode2 cu0 cu1 : decodeUtf8 cus 225 | decodeUtf8 (cu0 : cu1 : cu2 : cus) | cu0 < 0xf0 = decode3 cu0 cu1 cu2 : decodeUtf8 cus 226 | decodeUtf8 (cu0 : cu1 : cu2 : cu3 : cus) | cu0 < 0xf8 = decode4 cu0 cu1 cu2 cu3 : decodeUtf8 cus 227 | decodeUtf8 cus = error $ "Invalid UTF-8 input sequence at " ++ show (take 4 cus) 228 | 229 | -- $indexing 230 | -- 231 | -- 'Text' can be indexed by code units or code points. 232 | -- A 'CodePoint' is a 21-bit Unicode code point and can consist of up to four code units. 233 | -- A 'CodeUnit' is a single byte. 234 | 235 | -- | Does exactly the same thing as 'unsafeIndexCodePoint'', but on 'Text' values. 236 | {-# INLINE unsafeIndexCodePoint #-} 237 | unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) 238 | unsafeIndexCodePoint (Text !u8data !off !_len) !index = 239 | unsafeIndexCodePoint' u8data $ CodeUnitIndex off + index 240 | 241 | -- | Get the code unit at the given 'CodeUnitIndex'. 242 | -- Performs bounds checking. 243 | {-# INLINE indexCodeUnit #-} 244 | indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit 245 | indexCodeUnit !text !index 246 | | index < 0 || index >= lengthUtf8 text = error $ "Index out of bounds " ++ show index 247 | | otherwise = unsafeIndexCodeUnit text index 248 | 249 | {-# INLINE unsafeIndexCodeUnit #-} 250 | unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit 251 | unsafeIndexCodeUnit (Text !u8data !off !_len) !index = 252 | unsafeIndexCodeUnit' u8data $ CodeUnitIndex off + index 253 | 254 | -- | Scan backwards through the text until we've seen the specified number of codepoints. Assumes 255 | -- that the initial CodeUnitIndex is within a codepoint. 
256 | {-# INLINE skipCodePointsBackwards #-} 257 | skipCodePointsBackwards :: Text -> CodeUnitIndex -> Int -> CodeUnitIndex 258 | skipCodePointsBackwards (Text !u8data !off !len) !index0 !n0 259 | | index0 >= CodeUnitIndex len = error "Invalid use of skipCodePointsBackwards" 260 | | otherwise = loop (index0 + CodeUnitIndex off) n0 261 | where 262 | loop index n | atTrailingByte index = 263 | loop (index-1) n -- Don't exit before we're at a leading byte 264 | loop index 0 | index < 0 = 265 | -- Throw an error if we've read before the array (e.g. when the data was 266 | -- not valid UTF-8), this one-time check doesn't prevent undefined 267 | -- behaviour but may help you locate bugs. 268 | error "Invalid use of skipCodePointsBackwards" 269 | loop index 0 = 270 | index - CodeUnitIndex off 271 | loop index n = 272 | loop (index-1) (n-1) 273 | 274 | -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while 275 | -- the first byte can be 0xxxxxxx or 11yyyyyy. 276 | atTrailingByte !index = unsafeIndexCodeUnit' u8data index .&. 0b1100_0000 == 0b1000_0000 277 | 278 | -- $slicingFunctions 279 | -- 280 | -- 'unsafeCutUtf8' and 'unsafeSliceUtf8' are used to retrieve slices of 'Text' values. 281 | -- @unsafeSliceUtf8 begin length@ returns a substring of length @length@ starting at @begin@. 282 | -- @unsafeSliceUtf8 begin length@ returns a tuple of the "surrounding" substrings. 283 | -- 284 | -- They satisfy the following property: 285 | -- 286 | -- > let (prefix, suffix) = unsafeCutUtf8 begin length t 287 | -- > in concat [prefix, unsafeSliceUtf8 begin length t, suffix] == t 288 | -- 289 | -- The following diagram visualizes the relevant offsets for @begin = CodeUnitIndex 2@, @length = CodeUnitIndex 6@ and @t = \"BCDEFGHIJKL\"@. 
290 | -- 291 | -- > off off+len 292 | -- > │ │ 293 | -- > ▼ ▼ 294 | -- > ──┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬── 295 | -- > A│B│C│D│E│F│G│H│I│J│K│L│M│N 296 | -- > ──┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴── 297 | -- > ▲ ▲ 298 | -- > │ │ 299 | -- > off+begin off+begin+length 300 | -- > 301 | -- > unsafeSliceUtf8 begin length t == "DEFGHI" 302 | -- > unsafeCutUtf8 begin length t == ("BC", "JKL") 303 | -- 304 | -- The shown array is open at each end because in general, @t@ may be a slice as well. 305 | -- 306 | -- __WARNING__: As their name implies, these functions are not (necessarily) bounds-checked. Use at your own risk. 307 | 308 | unsafeCutUtf8 :: CodeUnitIndex -- ^ Starting position of substring. 309 | -> CodeUnitIndex -- ^ Length of substring. 310 | -> Text -- ^ Initial string. 311 | -> (Text, Text) 312 | unsafeCutUtf8 (CodeUnitIndex !begin) (CodeUnitIndex !len) !text = 313 | ( TextUnsafe.takeWord8 begin text 314 | , TextUnsafe.dropWord8 (begin + len) text 315 | ) 316 | 317 | unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text 318 | unsafeSliceUtf8 (CodeUnitIndex !begin) (CodeUnitIndex !len) !text = 319 | TextUnsafe.takeWord8 len $ TextUnsafe.dropWord8 begin text 320 | 321 | -- $functionsOnArrays 322 | -- 323 | -- Functions for working with 'TextArray.Array' values. 324 | 325 | -- | See 'Data.Primitive.isByteArrayPinned'. 326 | isArrayPinned :: TextArray.Array -> Bool 327 | isArrayPinned (TextArray.ByteArray ba#) = Exts.isTrue# (Exts.isByteArrayPinned# ba#) 328 | 329 | -- | See 'Data.Primitive.byteArrayContents'. 330 | arrayContents :: TextArray.Array -> Exts.Ptr Word8 331 | arrayContents (TextArray.ByteArray ba#) = Exts.Ptr (Exts.byteArrayContents# ba#) 332 | 333 | -- | Decode a code point at the given 'CodeUnitIndex'. 334 | -- Returns garbage if there is no valid code point at that position. 335 | -- Does not perform bounds checking. 336 | -- See 'decode2', 'decode3' and 'decode4' for the expected format of multi-byte code points. 
337 | unsafeIndexCodePoint' :: TextArray.Array -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) 338 | {-# INLINE unsafeIndexCodePoint' #-} 339 | unsafeIndexCodePoint' !u8data !idx = 340 | decodeN (cuAt 0) (cuAt 1) (cuAt 2) (cuAt 3) 341 | where 342 | cuAt !i = unsafeIndexCodeUnit' u8data $ idx + i 343 | 344 | decodeN :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> (CodeUnitIndex, CodePoint) 345 | {-# INLINE decodeN #-} 346 | decodeN cu0 cu1 cu2 cu3 347 | | cu0 < 0xc0 = (1, decode1 cu0) 348 | | cu0 < 0xe0 = (2, decode2 cu0 cu1) 349 | | cu0 < 0xf0 = (3, decode3 cu0 cu1 cu2) 350 | | otherwise = (4, decode4 cu0 cu1 cu2 cu3) 351 | 352 | 353 | 354 | -- | Intermediate state when you're iterating backwards through a Utf8 text. 355 | data BackwardsIter = BackwardsIter 356 | { backwardsIterNext :: {-# UNPACK #-} !CodeUnitIndex 357 | -- ^ First byte to the left of the codepoint that we're focused on. This can 358 | -- be used with 'unsafeIndexEndOfCodePoint'' to find the next codepoint. 359 | , backwardsIterChar :: {-# UNPACK #-} !CodePoint 360 | -- ^ The codepoint that we're focused on 361 | , backwardsIterEndOfChar :: {-# UNPACK #-} !CodeUnitIndex 362 | -- ^ Points to the last byte of the codepoint that we're focused on 363 | } 364 | deriving Generic 365 | 366 | -- | Similar to unsafeIndexCodePoint', but assumes that the given index is the 367 | -- end of a utf8 codepoint. It returns the decoded code point and the index 368 | -- _before_ the code point. The resulting index could be passed directly to 369 | -- unsafeIndexEndOfCodePoint' again to decode the _previous_ code point. 370 | unsafeIndexEndOfCodePoint' :: TextArray.Array -> CodeUnitIndex -> BackwardsIter 371 | {-# INLINE unsafeIndexEndOfCodePoint' #-} 372 | unsafeIndexEndOfCodePoint' !u8data !idx = 373 | let 374 | cuAt !i = unsafeIndexCodeUnit' u8data $ idx - i 375 | -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while 376 | -- the first byte can be 0xxxxxxx or 11yyyyyy. 
377 | isFirstByte !cu = cu .&. 0b1100_0000 /= 0b1000_0000 378 | cu0 = cuAt 0 379 | in 380 | if isFirstByte cu0 381 | then BackwardsIter (idx - 1) (decode1 cu0) idx 382 | else 383 | let cu1 = cuAt 1 in 384 | if isFirstByte cu1 385 | then BackwardsIter (idx - 2) (decode2 cu1 cu0) idx 386 | else 387 | let cu2 = cuAt 2 in 388 | if isFirstByte cu2 389 | then BackwardsIter (idx - 3) (decode3 cu2 cu1 cu0) idx 390 | else 391 | let cu3 = cuAt 3 in 392 | if isFirstByte cu3 393 | then BackwardsIter (idx - 4) (decode4 cu3 cu2 cu1 cu0) idx 394 | else 395 | error "unsafeIndexEndOfCodePoint' could not find valid UTF8 codepoint" 396 | 397 | unsafeIndexAnywhereInCodePoint' :: TextArray.Array -> CodeUnitIndex -> BackwardsIter 398 | {-# INLINE unsafeIndexAnywhereInCodePoint' #-} 399 | unsafeIndexAnywhereInCodePoint' !u8data !idx = 400 | let 401 | cuAt !i = unsafeIndexCodeUnit' u8data $ idx + i 402 | -- Second, third and fourth bytes of a codepoint are always 10xxxxxx, while 403 | -- the first byte can be 0xxxxxxx or 11yyyyyy. 404 | isFirstByte !cu = cu .&. 
0b1100_0000 /= 0b1000_0000 405 | cu0 = cuAt 0 406 | 407 | makeBackwardsIter next (l, cp) = BackwardsIter next cp (next + l) 408 | in 409 | if isFirstByte cu0 410 | then makeBackwardsIter (idx - 1) $ decodeN cu0 (cuAt 1) (cuAt 2) (cuAt 3) 411 | else 412 | let cu00 = cuAt (-1) in 413 | if isFirstByte cu00 414 | then makeBackwardsIter (idx - 2) $ decodeN cu00 cu0 (cuAt 1) (cuAt 2) 415 | else 416 | let cu000 = cuAt (-2) in 417 | if isFirstByte cu000 418 | then makeBackwardsIter (idx - 3) $ decodeN cu000 cu00 cu0 (cuAt 1) 419 | else 420 | let cu0000 = cuAt (-3) in 421 | if isFirstByte cu0000 422 | then makeBackwardsIter (idx - 4) $ decodeN cu0000 cu000 cu00 cu0 423 | else 424 | error "unsafeIndexAnywhereInCodePoint' could not find valid UTF8 codepoint" 425 | 426 | {-# INLINE unsafeIndexCodeUnit' #-} 427 | unsafeIndexCodeUnit' :: TextArray.Array -> CodeUnitIndex -> CodeUnit 428 | unsafeIndexCodeUnit' !u8data (CodeUnitIndex !idx) = TextArray.unsafeIndex u8data idx 429 | 430 | -- $generalFunctions 431 | -- 432 | -- Re-exported from 'Text'. 433 | --------------------------------------------------------------------------------