├── stack-8.6.5.yaml
├── stack-8.8.4.yaml
├── stack-9.0.2.yaml
├── stack-9.2.8.yaml
├── stack-9.4.8.yaml
├── stack-8.10.7.yaml
├── .gitignore
├── stack.yaml
├── stack-9.6.6.yaml
├── stack-9.8.4.yaml
├── bench
    ├── ReadInteger.hs
    ├── Common.hs
    ├── FPBasic.hs
    ├── FPStateful.hs
    ├── Attoparsec.hs
    ├── Megaparsec.hs
    ├── Parsec.hs
    └── Bench.hs
├── src
    └── FlatParse
    │   ├── Common
    │       ├── Parser.hs
    │       ├── GHCExts.hs
    │       ├── Position.hs
    │       ├── Switch.hs
    │       ├── Assorted.hs
    │       └── Numbers.hs
    │   ├── Basic
    │       ├── Bytes.hs
    │       ├── Addr.hs
    │       ├── Switch.hs
    │       ├── Parser.hs
    │       ├── Text.hs
    │       ├── Integers.hs
    │       └── Base.hs
    │   ├── Stateful
    │       ├── Bytes.hs
    │       ├── Addr.hs
    │       ├── Switch.hs
    │       ├── Parser.hs
    │       ├── Text.hs
    │       ├── Integers.hs
    │       └── Base.hs
    │   ├── Examples
    │       └── BasicLambda
    │       │   ├── Parser.hs
    │       │   └── Lexer.hs
    │   └── Stateful.hs
├── LICENSE
├── flake.nix
├── .github
    └── workflows
    │   └── haskell.yml
├── flake.lock
├── package.yaml
├── flatparse.cabal
└── README.md


/stack-8.6.5.yaml:
--------------------------------------------------------------------------------
1 | 
2 | resolver: lts-14.27
3 | 
4 | packages:
5 | - .
6 | 
7 | ghc-options:
8 |     "$everything": -split-sections
9 | 


--------------------------------------------------------------------------------
/stack-8.8.4.yaml:
--------------------------------------------------------------------------------
1 | 
2 | resolver: lts-16.31
3 | 
4 | packages:
5 | - .
6 | 
7 | ghc-options:
8 |     "$everything": -split-sections
9 | 


--------------------------------------------------------------------------------
/stack-9.0.2.yaml:
--------------------------------------------------------------------------------
1 | 
2 | resolver: lts-19.33
3 | 
4 | packages:
5 | - .
6 | 
7 | ghc-options:
8 |     "$everything": -split-sections
9 | 


--------------------------------------------------------------------------------
/stack-9.2.8.yaml:
--------------------------------------------------------------------------------
1 | 
2 | resolver: lts-20.26
3 | 
4 | packages:
5 | - .
6 | 
7 | ghc-options:
8 |     "$everything": -split-sections
9 | 


--------------------------------------------------------------------------------
/stack-9.4.8.yaml:
--------------------------------------------------------------------------------
1 | 
2 | resolver: lts-21.25
3 | 
4 | packages:
5 | - .
6 | 
7 | ghc-options:
8 |     "$everything": -split-sections
9 | 


--------------------------------------------------------------------------------
/stack-8.10.7.yaml:
--------------------------------------------------------------------------------
1 | 
2 | resolver: lts-18.28
3 | 
4 | packages:
5 | - .
6 | 
7 | ghc-options:
8 |     "$everything": -split-sections
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.hi
 2 | *.o
 3 | ./dist
 4 | .stack-work/
 5 | *#
 6 | TAGS
 7 | #*.lock
 8 | *.dump-simpl
 9 | *.dump-cmm
10 | *.dump-stg
11 | dist-newstyle/
12 | 


--------------------------------------------------------------------------------
/stack.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # ghc 9.10.2
 3 | 
 4 | resolver: lts-24.7
 5 | 
 6 | packages:
 7 | - .
 8 | 
 9 | ghc-options:
10 |     "$everything": -split-sections
11 | 


--------------------------------------------------------------------------------
/stack-9.6.6.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # ghc 9.6.6
 3 | 
 4 | resolver: lts-22.41
 5 | 
 6 | packages:
 7 | - .
 8 | 
 9 | ghc-options:
10 |     "$everything": -split-sections
11 | 


--------------------------------------------------------------------------------
/stack-9.8.4.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # ghc 9.8.4
 3 | 
 4 | resolver: lts-23.25
 5 | 
 6 | packages:
 7 | - .
 8 | 
 9 | ghc-options:
10 |     "$everything": -split-sections
11 | 


--------------------------------------------------------------------------------
/bench/ReadInteger.hs:
--------------------------------------------------------------------------------
1 | 
2 | module ReadInteger where
3 | 
4 | import FlatParse.Basic as FPBasic
5 | 
6 | readInt     = runParser FPBasic.anyAsciiDecimalInt
7 | readInteger = runParser FPBasic.anyAsciiDecimalInteger
8 | 


--------------------------------------------------------------------------------
/bench/Common.hs:
--------------------------------------------------------------------------------
 1 | {-# language Strict #-}
 2 | 
 3 | module Common where
 4 | 
 5 | import qualified Data.ByteString as B
 6 | 
 7 | type Name = B.ByteString
 8 | data Tm = Var Name | App Tm Tm | Lam Name Tm | Let Name Tm Tm | Int Int | Add Tm Tm | Mul Tm Tm
 9 |   deriving Show
10 | 


--------------------------------------------------------------------------------
/src/FlatParse/Common/Parser.hs:
--------------------------------------------------------------------------------
 1 | -- | Common low-level parser definitions.
 2 | 
 3 | module FlatParse.Common.Parser
 4 |   (
 5 |   -- * Parser state token types
 6 |   -- $parser-state-token-types
 7 |     type PureMode
 8 |   , type IOMode
 9 |   , type STMode
10 |   ) where
11 | 
12 | import GHC.Exts
13 | import Data.Void ( Void )
14 | 
15 | {- $parser-state-token-types
16 | These type synonyms are used as parameters to @ParserT@. Different state tokens
17 | support different embedded effects.
18 | -}
19 | 
20 | -- TODO 2023-01-17 raehik: perhaps more/better explanation?
21 | 
22 | type PureMode = Proxy# Void
23 | type IOMode   = State# RealWorld
24 | type STMode s = State# s
25 | 


--------------------------------------------------------------------------------
/src/FlatParse/Common/GHCExts.hs:
--------------------------------------------------------------------------------
 1 | -- | 'GHC.Exts' compatibility wrapper.
 2 | 
 3 | {-# LANGUAGE KindSignatures #-}
 4 | {-# LANGUAGE DataKinds #-} -- needed for manual ZeroBitType def (unsure why)
 5 | {-# OPTIONS_GHC -Wno-dodgy-exports #-}
 6 | 
 7 | module FlatParse.Common.GHCExts
 8 |   ( module FlatParse.Common.GHCExts
 9 |   , module GHC.Exts
10 |   ) where
11 | 
12 | import GHC.Exts
13 | 
14 | #if !MIN_VERSION_base(4,17,0)
15 | {-
16 | GHC 9.4 clarified the story for types without runtime representations. These
17 | type synonyms are defined and used to simplify certain internal definitions
18 | (e.g. 'State#'). They are nicer than using the "expanded" type, so we define
19 | them here for older compilers.
20 | -}
21 | type ZeroBitRep = 'TupleRep ('[] :: [RuntimeRep])
22 | type ZeroBitType = TYPE ZeroBitRep
23 | #endif
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2021 András Kovács
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so,
 8 | subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
 1 | {
 2 |   inputs = {
 3 |     nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable";
 4 |     flake-parts.url = "github:hercules-ci/flake-parts";
 5 |     haskell-flake.url = "github:srid/haskell-flake";
 6 |   };
 7 |   outputs = inputs:
 8 |   let
 9 |     # simple devshell for non-dev compilers: really just want `cabal repl`
10 |     nondevDevShell = compiler: {
11 |       mkShellArgs.name = "${compiler}-flatparse";
12 |       hoogle = false;
13 |       tools = _: {
14 |         hlint = null;
15 |         haskell-language-server = null;
16 |         ghcid = null;
17 |       };
18 |     };
19 |   in
20 |     inputs.flake-parts.lib.mkFlake { inherit inputs; } {
21 |       systems = inputs.nixpkgs.lib.systems.flakeExposed;
22 |       imports = [ inputs.haskell-flake.flakeModule ];
23 |       perSystem = { self', pkgs, config, ... }: {
24 |         packages.default  = self'.packages.ghc96-flatparse;
25 |         devShells.default = self'.devShells.ghc96;
26 |         haskellProjects.ghc98 = {
27 |           # shouldn't work, pkgs aren't up to date and mine aren't 9.8 ready
28 |           basePackages = pkgs.haskell.packages.ghc98;
29 |           devShell = nondevDevShell "ghc98";
30 |         };
31 |         haskellProjects.ghc96 = {
32 |           basePackages = pkgs.haskell.packages.ghc96;
33 |           devShell.mkShellArgs.name = "ghc96-flatparse";
34 |           devShell.tools = _: {
35 |             haskell-language-server = null; # 2024-03-06: broken
36 |           };
37 |         };
38 |         haskellProjects.ghc94 = {
39 |           basePackages = pkgs.haskell.packages.ghc94;
40 |           devShell = nondevDevShell "ghc94";
41 |         };
42 |         haskellProjects.ghc92 = {
43 |           basePackages = pkgs.haskell.packages.ghc92;
44 |           devShell = nondevDevShell "ghc92";
45 |         };
46 |       };
47 |     };
48 | }
49 | 


--------------------------------------------------------------------------------
/bench/FPBasic.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module FPBasic (
 3 |     runSexp
 4 |   , runLongws
 5 |   , runNumcsv
 6 |   , runTm) where
 7 | 
 8 | import FlatParse.Basic
 9 | import Common
10 | import qualified Data.ByteString as B
11 | 
12 | ws, open, close, ident, sexp, src :: Parser () ()
13 | ws      = skipMany $(switch [| case _ of " " -> pure (); "\n" -> pure () |])
14 | open    = $(char '(') >> ws
15 | close   = $(char ')') >> ws
16 | ident   = skipSome (skipSatisfyAscii isLatinLetter) >> ws
17 | sexp    = branch open (skipSome sexp >> close) ident
18 | src     = sexp >> eof
19 | runSexp = runParser src
20 | 
21 | longw, longws :: Parser () ()
22 | longw     = $(string "thisisalongkeyword")
23 | longws    = skipSome (longw >> ws) >> eof
24 | runLongws = runParser longws
25 | 
26 | numeral, comma, numcsv :: Parser () ()
27 | numeral   = skipSome (skipSatisfyAscii isDigit) >> ws
28 | comma     = $(char ',') >> ws
29 | numcsv    = numeral >> skipMany (comma >> numeral) >> eof
30 | runNumcsv = runParser numcsv
31 | 
32 | 
33 | --------------------------------------------------------------------------------
34 | 
35 | ident' :: Parser () B.ByteString
36 | ident' = byteStringOf (skipSome (skipSatisfyAscii \c -> isLatinLetter c || isDigit c)) <* ws
37 | 
38 | equal = $(string "=") <* ws
39 | semi  = $(string ";") <* ws
40 | dot   = $(string ".") <* ws
41 | addOp = $(string "+") <* ws
42 | mulOp = $(string "*") <* ws
43 | parl  = $(string "(") <* ws
44 | parr  = $(string ")") <* ws
45 | 
46 | add :: Parser () Tm
47 | add = chainl Add mul (addOp *> mul)
48 | 
49 | mul :: Parser () Tm
50 | mul = chainl Mul spine (mulOp *> spine)
51 | 
52 | spine :: Parser () Tm
53 | spine = chainl App atom atom
54 | 
55 | atom :: Parser () Tm
56 | atom =
57 |         (Int <$> (anyAsciiDecimalInt <* ws))
58 |     <|> (Var <$> ident')
59 |     <|> (parl *> tm <* parr)
60 | 
61 | tm :: Parser () Tm
62 | tm = $(switch [| case _ of
63 |   "fun" -> do {ws; x <- ident'; dot; t <- tm; pure (Lam x t)}
64 |   "let" -> do {ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)}
65 |   _     -> add |])
66 | 
67 | runTm = runParser (ws *> tm <* eof)
68 | 


--------------------------------------------------------------------------------
/bench/FPStateful.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module FPStateful (
 3 |     runSexp
 4 |   , runLongws
 5 |   , runNumcsv
 6 |   , runTm) where
 7 | 
 8 | import FlatParse.Stateful
 9 | import Common
10 | import qualified Data.ByteString as B
11 | 
12 | ws, open, close, ident, sexp, src :: Parser () () ()
13 | ws      = skipMany $(switch [| case _ of " " -> pure (); "\n" -> pure () |])
14 | open    = $(char '(') >> ws
15 | close   = $(char ')') >> ws
16 | ident   = skipSome (skipSatisfyAscii isLatinLetter) >> ws
17 | sexp    = branch open (skipSome sexp >> close) ident
18 | src     = sexp >> eof
19 | runSexp = runParser src () 0
20 | 
21 | longw, longws :: Parser () () ()
22 | longw     = $(string "thisisalongkeyword")
23 | longws    = skipSome (longw >> ws) >> eof
24 | runLongws = runParser longws () 0
25 | 
26 | numeral, comma, numcsv :: Parser () () ()
27 | numeral   = skipSome (skipSatisfyAscii isDigit) >> ws
28 | comma     = $(char ',') >> ws
29 | numcsv    = numeral >> skipMany (comma >> numeral) >> eof
30 | runNumcsv = runParser numcsv () 0
31 | 
32 | 
33 | --------------------------------------------------------------------------------
34 | 
35 | ident' :: Parser () () B.ByteString
36 | ident' = byteStringOf (skipSome (skipSatisfyAscii \c -> isLatinLetter c || isDigit c)) <* ws
37 | 
38 | equal = $(string "=") <* ws
39 | semi  = $(string ";") <* ws
40 | dot   = $(string ".") <* ws
41 | addOp = $(string "+") <* ws
42 | mulOp = $(string "*") <* ws
43 | parl  = $(string "(") <* ws
44 | parr  = $(string ")") <* ws
45 | 
46 | add :: Parser () () Tm
47 | add = chainl Add mul (addOp *> mul)
48 | 
49 | mul :: Parser () () Tm
50 | mul = chainl Mul spine (mulOp *> spine)
51 | 
52 | spine :: Parser () () Tm
53 | spine = chainl App atom atom
54 | 
55 | atom :: Parser () () Tm
56 | atom =
57 |         (Int <$> (anyAsciiDecimalInt <* ws))
58 |     <|> (Var <$> ident')
59 |     <|> (parl *> tm <* parr)
60 | 
61 | tm :: Parser () () Tm
62 | tm = $(switch [| case _ of
63 |   "fun" -> do {ws; x <- ident'; dot; t <- tm; pure (Lam x t)}
64 |   "let" -> do {ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)}
65 |   _     -> add |])
66 | 
67 | runTm = runParser (ws *> tm <* eof) () 0
68 | 


--------------------------------------------------------------------------------
/src/FlatParse/Common/Position.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DerivingStrategies, DerivingVia #-}
 2 | 
 3 | module FlatParse.Common.Position
 4 |   ( Pos(..), endPos, addrToPos#, posToAddr#
 5 |   , Span(..), unsafeSlice, leftPos, rightPos
 6 |   ) where
 7 | 
 8 | import qualified Data.ByteString as B
 9 | import qualified Data.ByteString.Internal as B
10 | import GHC.Int
11 | import GHC.ForeignPtr ( ForeignPtr(..) )
12 | import GHC.Exts
13 | 
14 | -- | Byte offset counted backwards from the end of the buffer.
15 | --   Note: the `Ord` instance for `Pos` considers the earlier positions to be
16 | --   smaller.
17 | newtype Pos = Pos { unPos :: Int }
18 |     deriving stock (Show)
19 |     deriving Eq via Int
20 | 
21 | instance Ord Pos where
22 |   (<=) (Pos x) (Pos y) = y <= x
23 |   {-# inline (<=) #-}
24 |   compare (Pos x) (Pos y) = compare y x
25 |   {-# inline compare #-}
26 | 
27 | -- | A pair of positions.
28 | data Span = Span !Pos !Pos
29 |     deriving stock (Eq, Show)
30 | 
31 | -- | Very unsafe conversion between a primitive address and a position.  The
32 | --   first argument points to the end of the buffer, the second argument is
33 | --   being converted.
34 | addrToPos# :: Addr# -> Addr# -> Pos
35 | addrToPos# eob s = Pos (I# (minusAddr# eob s))
36 | {-# inline addrToPos# #-}
37 | 
38 | -- | Very unsafe conversion between a primitive address and a position.  The
39 | --   first argument points to the end of the buffer.
40 | posToAddr# :: Addr# -> Pos -> Addr#
41 | posToAddr# eob (Pos (I# n)) = unsafeCoerce# (minusAddr# eob (unsafeCoerce# n))
42 | {-# inline posToAddr# #-}
43 | 
44 | -- | Slice into a `B.ByteString` using a `Span`. The result is invalid if the `Span`
45 | --   is not a valid slice of the first argument.
46 | unsafeSlice :: B.ByteString -> Span -> B.ByteString
47 | unsafeSlice (B.PS (ForeignPtr addr fp) (I# start) (I# len))
48 |             (Span (Pos (I# o1)) (Pos (I# o2))) =
49 |   let end = addr `plusAddr#` start `plusAddr#` len
50 |   in B.PS (ForeignPtr (plusAddr# end (negateInt# o1)) fp) (I# 0#) (I# (o1 -# o2))
51 | {-# inline unsafeSlice #-}
52 | 
53 | -- | The end of the input.
54 | endPos :: Pos
55 | endPos = Pos 0
56 | {-# inline endPos #-}
57 | 
58 | leftPos :: Span -> Pos
59 | leftPos (Span p _) = p
60 | {-# inline leftPos #-}
61 | 
62 | rightPos :: Span -> Pos
63 | rightPos (Span _ p) = p
64 | {-# inline rightPos #-}
65 | 


--------------------------------------------------------------------------------
/bench/Attoparsec.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module Attoparsec (runSexp, runLongws, runNumcsv, runTm) where
 3 | 
 4 | import Control.Applicative
 5 | import Data.Attoparsec.ByteString.Char8
 6 | import Common
 7 | import qualified Data.ByteString as B
 8 | 
 9 | isLatinLetter :: Char -> Bool
10 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')
11 | 
12 | skipWhile1 :: (Char -> Bool) -> Parser ()
13 | skipWhile1 pred = satisfy pred *> skipWhile pred
14 | 
15 | ws, open, close, ident, sexp :: Parser ()
16 | ws      = skipWhile (\c -> c == ' ' || c == '\n')
17 | open    = char '(' >> ws
18 | close   = char ')' >> ws
19 | ident   = skipWhile1 isLatinLetter <* ws
20 | sexp    = (open *> skipMany1 sexp <* close) <|> ident
21 | runSexp = parseOnly sexp
22 | 
23 | longw, longws :: Parser ()
24 | longw     = () <$ string "thisisalongkeyword"
25 | longws    = skipMany1 (longw *> ws) <* endOfInput
26 | runLongws = parseOnly longws
27 | 
28 | numeral, comma, numcsv :: Parser ()
29 | numeral   = skipWhile1 (\c -> '0' <= c && c <= '9') >> ws
30 | comma     = char ',' >> ws
31 | numcsv    = numeral >> skipMany1 (comma >> numeral) >> endOfInput
32 | runNumcsv = parseOnly numcsv
33 | 
34 | 
35 | --------------------------------------------------------------------------------
36 | 
37 | ident' :: Parser B.ByteString
38 | ident' = takeWhile1 (\c -> isLatinLetter c || isDigit c) <* ws
39 | 
40 | equal = char '=' >> ws
41 | semi  = char ';' >> ws
42 | dot   = char '.' >> ws
43 | addOp = char '+' >> ws
44 | mulOp = char '*' >> ws
45 | parl  = char '(' >> ws
46 | parr  = char ')' >> ws
47 | 
48 | chainl :: (b -> a -> b) -> Parser b -> Parser a -> Parser b
49 | chainl f start elem = start >>= go where
50 |   go b = do {!a <- elem; go $! f b a} <|> pure b
51 | {-# inline chainl #-}
52 | 
53 | add :: Parser Tm
54 | add = chainl Add mul (addOp *> mul)
55 | 
56 | mul :: Parser Tm
57 | mul = chainl Mul spine (mulOp *> spine)
58 | 
59 | spine :: Parser Tm
60 | spine = chainl App atom atom
61 | 
62 | atom :: Parser Tm
63 | atom =
64 |         (Int <$> (decimal <* ws))
65 |     <|> (Var <$> ident')
66 |     <|> (parl *> tm <* parr)
67 | 
68 | tm :: Parser Tm
69 | tm =  (do {_ <- string "fun"; ws; x <- ident'; dot; t <- tm; pure (Lam x t)})
70 |   <|> (do {_ <- string "let"; ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)})
71 |   <|> add
72 | 
73 | runTm = parseOnly (ws *> tm <* endOfInput)
74 | 


--------------------------------------------------------------------------------
/.github/workflows/haskell.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 |   pull_request:
 7 |     types:
 8 |       - opened
 9 |       - synchronize
10 | jobs:
11 |   build:
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         ghc: ["8.6.5", "8.8.4", "8.10.7", "9.0.2", "9.2.8", "9.4.8", "9.6.7", "9.8.4", "9.10.2"]
16 |     env:
17 |       TESTCONFIG: "--enable-tests"
18 |       BENCHCONFIG: "--enable-tests"
19 |     steps:
20 |       - uses: actions/checkout@v4
21 |       - uses: haskell-actions/setup@v2
22 |         id: setup-haskell-cabal
23 |         with:
24 |           ghc-version: ${{ matrix.ghc }}
25 |       - run: cabal freeze $CONFIG
26 |       - uses: actions/cache@v4
27 |         with:
28 |           path: |
29 |             ${{ steps.setup-haskell-cabal.outputs.cabal-store }}
30 |             dist-newstyle
31 |           key: ${{ runner.os }}-${{ matrix.ghc }}-${{ hashFiles('cabal.project.freeze') }}
32 |           restore-keys: |
33 |             ${{ runner.os }}-${{ matrix.ghc }}-
34 |       - run: cabal build $CONFIG
35 |       - run: cabal haddock $CONFIG
36 |       - run: cabal sdist
37 |       - run: cabal bench
38 |       - run: cabal test
39 | 
40 |   wasm:
41 |     runs-on: ubuntu-latest
42 |     env:
43 |       GHC_WASM_META_REV: 7927129e42bcd6a54b9e06e26455803fa4878261
44 |     strategy:
45 |       matrix:
46 |         ghc: ['9.10', '9.12']
47 |       fail-fast: false
48 |     steps:
49 |     - name: Setup wasm32-wasi-ghc
50 |       run: |
51 |         cd $(mktemp -d)
52 |         curl -L https://gitlab.haskell.org/ghc/ghc-wasm-meta/-/archive/$GHC_WASM_META_REV/ghc-wasm-meta.tar.gz | tar xz --strip-components=1
53 |         ./setup.sh
54 |         ~/.ghc-wasm/add_to_github_path.sh
55 |       env:
56 |         FLAVOUR: ${{ matrix.ghc }}
57 |     - uses: actions/checkout@v4
58 |     - uses: actions/cache@v4
59 |       with:
60 |         path: |
61 |           ~/.ghc-wasm/.cabal/store
62 |         key: wasi-${{ runner.os }}-${{ env.GHC_WASM_META_REV }}-flavour-${{ matrix.ghc }}-${{ github.sha }}
63 |         restore-keys: |
64 |           wasi-${{ runner.os }}-${{ env.GHC_WASM_META_REV }}-flavour-${{ matrix.ghc }}-
65 |     - name: Build
66 |       run: |
67 |         wasm32-wasi-cabal build --enable-tests --enable-benchmarks
68 |     - name: Test
69 |       run: |
70 |         wasm32-wasi-cabal test --test-wrapper=wasmtime
71 | 


--------------------------------------------------------------------------------
/flake.lock:
--------------------------------------------------------------------------------
 1 | {
 2 |   "nodes": {
 3 |     "flake-parts": {
 4 |       "inputs": {
 5 |         "nixpkgs-lib": "nixpkgs-lib"
 6 |       },
 7 |       "locked": {
 8 |         "lastModified": 1709336216,
 9 |         "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
10 |         "owner": "hercules-ci",
11 |         "repo": "flake-parts",
12 |         "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
13 |         "type": "github"
14 |       },
15 |       "original": {
16 |         "owner": "hercules-ci",
17 |         "repo": "flake-parts",
18 |         "type": "github"
19 |       }
20 |     },
21 |     "haskell-flake": {
22 |       "locked": {
23 |         "lastModified": 1709467224,
24 |         "narHash": "sha256-xnPTkLMqq78BiTqt6WXj2TXLupclJi+NEH84HDbQSPc=",
25 |         "owner": "srid",
26 |         "repo": "haskell-flake",
27 |         "rev": "9173cc45aeb72b7e7adfe0e5a53a425fe439e3ca",
28 |         "type": "github"
29 |       },
30 |       "original": {
31 |         "owner": "srid",
32 |         "repo": "haskell-flake",
33 |         "type": "github"
34 |       }
35 |     },
36 |     "nixpkgs": {
37 |       "locked": {
38 |         "lastModified": 1709641832,
39 |         "narHash": "sha256-bzzRc3DiV8Cm/67HDa39pyBymqF45ISgUbXqjrMk2UE=",
40 |         "owner": "nixos",
41 |         "repo": "nixpkgs",
42 |         "rev": "bfa8b30043892dc2b660d403faa159bab7b65898",
43 |         "type": "github"
44 |       },
45 |       "original": {
46 |         "owner": "nixos",
47 |         "ref": "nixpkgs-unstable",
48 |         "repo": "nixpkgs",
49 |         "type": "github"
50 |       }
51 |     },
52 |     "nixpkgs-lib": {
53 |       "locked": {
54 |         "dir": "lib",
55 |         "lastModified": 1709237383,
56 |         "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
57 |         "owner": "NixOS",
58 |         "repo": "nixpkgs",
59 |         "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
60 |         "type": "github"
61 |       },
62 |       "original": {
63 |         "dir": "lib",
64 |         "owner": "NixOS",
65 |         "ref": "nixos-unstable",
66 |         "repo": "nixpkgs",
67 |         "type": "github"
68 |       }
69 |     },
70 |     "root": {
71 |       "inputs": {
72 |         "flake-parts": "flake-parts",
73 |         "haskell-flake": "haskell-flake",
74 |         "nixpkgs": "nixpkgs"
75 |       }
76 |     }
77 |   },
78 |   "root": "root",
79 |   "version": 7
80 | }
81 | 


--------------------------------------------------------------------------------
/bench/Megaparsec.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module Megaparsec (runSexp, runLongws, runNumcsv, runTm) where
 3 | 
 4 | import Control.Applicative
 5 | import qualified Data.ByteString as B
 6 | import Text.Megaparsec
 7 | import Text.Megaparsec.Byte.Lexer
 8 | import Data.Char
 9 | import Common
10 | 
11 | type Parser = Parsec () B.ByteString
12 | 
13 | isLatinLetter :: Char -> Bool
14 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')
15 | 
16 | char8 :: Char -> Parser ()
17 | char8 c = () <$ single (fromIntegral (ord c))
18 | 
19 | {-# inline skipWhile #-}
20 | skipWhile :: (Char -> Bool) -> Parser ()
21 | skipWhile f = () <$ takeWhileP Nothing (f . chr . fromIntegral)
22 | 
23 | {-# inline skipWhile1 #-}
24 | skipWhile1 :: (Char -> Bool) -> Parser ()
25 | skipWhile1 f = () <$ takeWhile1P Nothing (f . chr . fromIntegral)
26 | 
27 | {-# inline takeWhile1 #-}
28 | takeWhile1 :: (Char -> Bool) -> Parser B.ByteString
29 | takeWhile1 f = takeWhile1P Nothing (f . chr . fromIntegral)
30 | 
31 | ws, open, close, ident, sexp :: Parser ()
32 | ws      = skipWhile (\c -> c == ' ' || c == '\n')
33 | open    = char8 '(' >> ws
34 | close   = char8 ')' >> ws
35 | ident   = skipWhile1 isLatinLetter <* ws
36 | sexp    = (open *> skipSome sexp <* close) <|> ident
37 | runSexp = runParser sexp ""
38 | 
39 | longw, longws :: Parser ()
40 | longw     = () <$ chunk "thisisalongkeyword"
41 | longws    = skipSome (longw *> ws) <* eof
42 | runLongws = runParser longws ""
43 | 
44 | numeral, comma, numcsv :: Parser ()
45 | numeral   = skipWhile1 (\c -> '0' <= c && c <= '9') >> ws
46 | comma     = single (fromIntegral (ord ',')) >> ws
47 | numcsv    = numeral >> skipSome (comma >> numeral) >> eof
48 | runNumcsv = runParser numcsv ""
49 | 
50 | --------------------------------------------------------------------------------
51 | 
52 | ident' :: Parser B.ByteString
53 | ident' = takeWhile1 (\c -> isLatinLetter c || isDigit c) <* ws
54 | 
55 | equal = char8 '=' >> ws
56 | semi  = char8 ';' >> ws
57 | dot   = char8 '.' >> ws
58 | addOp = char8 '+' >> ws
59 | mulOp = char8 '*' >> ws
60 | parl  = char8 '(' >> ws
61 | parr  = char8 ')' >> ws
62 | 
63 | chainl :: (b -> a -> b) -> Parser b -> Parser a -> Parser b
64 | chainl f start elem = start >>= go where
65 |   go b = do {!a <- elem; go $! f b a} <|> pure b
66 | {-# inline chainl #-}
67 | 
68 | add :: Parser Tm
69 | add = chainl Add mul (addOp *> mul)
70 | 
71 | mul :: Parser Tm
72 | mul = chainl Mul spine (mulOp *> spine)
73 | 
74 | spine :: Parser Tm
75 | spine = chainl App atom atom
76 | 
77 | atom :: Parser Tm
78 | atom =
79 |         (Int <$> (decimal <* ws))
80 |     <|> (Var <$> ident')
81 |     <|> (parl *> tm <* parr)
82 | 
83 | tm :: Parser Tm
84 | tm =  (do {_ <- chunk "fun"; ws; x <- ident'; dot; t <- tm; pure (Lam x t)})
85 |   <|> (do {_ <- chunk "let"; ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)})
86 |   <|> add
87 | 
88 | runTm = runParser (ws *> tm <* eof) ""
89 | 


--------------------------------------------------------------------------------
/package.yaml:
--------------------------------------------------------------------------------
  1 | 
  2 | name:        flatparse
  3 | version:     0.5.3.1
  4 | license:     MIT
  5 | category:    Parsing
  6 | synopsis:    High-performance parsing from strict bytestrings
  7 | author:      András Kovács
  8 | maintainer:  puttamalac@gmail.com
  9 | copyright:   2021 András Kovács
 10 | bug-reports: https://github.com/AndrasKovacs/flatparse/issues
 11 | github:      AndrasKovacs/flatparse
 12 | 
 13 | description: |
 14 |   @Flatparse@ is a high-performance parsing library for strict bytestring input. See the README for more information:
 15 |   <https://github.com/AndrasKovacs/flatparse>.
 16 | 
 17 | tested-with:
 18 |   - GHC == 8.6.5
 19 |   - GHC == 8.8.4
 20 |   - GHC == 8.10.7
 21 |   - GHC == 9.0.2
 22 |   - GHC == 9.2.8
 23 |   - GHC == 9.4.8
 24 |   - GHC == 9.6.7
 25 |   - GHC == 9.8.4
 26 | 
 27 | flags:
 28 |   llvm:
 29 |     description: use llvm for building
 30 |     default: false
 31 |     manual : true
 32 |   dump:
 33 |     description: dump core, stg and cmm to files
 34 |     default: false
 35 |     manual: true
 36 | 
 37 | dependencies:
 38 |   - base >= 4.7 && < 5
 39 |   - utf8-string ^>= 1.0.2
 40 | 
 41 | extra-source-files:
 42 |   - README.md
 43 | 
 44 | default-extensions:
 45 |   - BangPatterns
 46 |   - BlockArguments
 47 |   - CPP
 48 |   - ExplicitNamespaces
 49 |   - LambdaCase
 50 |   - MagicHash
 51 |   - OverloadedStrings
 52 |   - PatternSynonyms
 53 |   - TemplateHaskell
 54 |   - TupleSections
 55 | 
 56 | ghc-options:
 57 |   - -Wall
 58 |   - -Wno-missing-signatures
 59 |   - -Wno-name-shadowing
 60 |   - -Wno-unused-binds
 61 |   - -Wno-unused-matches
 62 | 
 63 | when:
 64 |   - condition: flag(dump)
 65 |     ghc-options:
 66 |       - -ddump-simpl
 67 |       - -ddump-stg-final
 68 |       - -ddump-cmm
 69 |       - -dsuppress-all
 70 |       - -dno-suppress-type-signatures
 71 |       - -ddump-to-file
 72 | 
 73 |   - condition: flag(llvm)
 74 |     ghc-options:
 75 |       - -fllvm
 76 | 
 77 | library:
 78 |   source-dirs:
 79 |     - src
 80 |   dependencies:
 81 |     - bytestring
 82 |     - containers
 83 |     - integer-gmp
 84 |     - template-haskell
 85 |   ghc-options:
 86 |     - -O2
 87 | 
 88 | tests:
 89 |   spec:
 90 |     main: Test.hs
 91 |     source-dirs:
 92 |       - test
 93 |     dependencies:
 94 |       - HUnit
 95 |       - bytestring
 96 |       - flatparse
 97 |       - hspec
 98 |       - QuickCheck
 99 |       - quickcheck-instances
100 |     default-extensions:
101 |       - ExtendedDefaultRules
102 |     ghc-options:
103 |       - -O2
104 |       - -Wno-type-defaults
105 | 
106 | benchmarks:
107 |   bench:
108 |     source-dirs: bench
109 |     main: Bench.hs
110 |     dependencies:
111 |       - attoparsec
112 |       - bytestring
113 |       - flatparse
114 |       - integer-gmp
115 |       - megaparsec
116 |       - parsec
117 |       - primitive
118 |       - tasty-bench
119 |     ghc-options:
120 |       - -O2
121 | 


--------------------------------------------------------------------------------
/bench/Parsec.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module Parsec (runSexp, runLongws, runNumcsv, runTm) where
 3 | 
 4 | import Control.Monad
 5 | import Text.Parsec hiding (chainl, digit)
 6 | import Text.Parsec.ByteString
 7 | import qualified Data.ByteString as B
 8 | import Data.Char hiding (isDigit)
 9 | import Common
10 | 
11 | isLatinLetter :: Char -> Bool
12 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')
13 | 
14 | isDigit :: Char -> Bool
15 | isDigit c = '0' <= c && c <= '9'
16 | 
17 | ws, open, close, ident, sexp :: Parser ()
18 | ws      = skipMany (satisfy \c -> c == ' ' || c == '\n')
19 | open    = char '(' >> ws
20 | close   = char ')' >> ws
21 | ident   = skipMany1 (satisfy isLatinLetter) <* ws
22 | sexp    = (open *> skipMany1 sexp <* close) <|> ident
23 | runSexp = parse sexp ""
24 | 
25 | byteString :: B.ByteString -> Parser ()
26 | byteString b = do
27 |   i <- getInput
28 |   guard $ B.isPrefixOf b i
29 |   setInput $ B.drop (B.length b) i
30 | 
31 | longw, longws :: Parser ()
32 | longw     = () <$ byteString "thisisalongkeyword"
33 | longws    = skipMany1 (longw *> ws) <* eof
34 | runLongws = parse longws ""
35 | 
36 | numeral, comma, numcsv :: Parser ()
37 | numeral   = skipMany1 (satisfy \c -> '0' <= c && c <= '9') >> ws
38 | comma     = char ',' >> ws
39 | numcsv    = numeral >> skipMany1 (comma >> numeral) >> eof
40 | runNumcsv = parse numcsv ""
41 | 
42 | --------------------------------------------------------------------------------
43 | 
44 | {-# inline byteStringOf #-}
45 | byteStringOf :: Parser a -> Parser B.ByteString
46 | byteStringOf p = do
47 |   i <- getInput
48 |   _ <- p
49 |   i' <- getInput
50 |   pure $! B.take (B.length i - B.length i') i
51 | 
52 | ident' :: Parser B.ByteString
53 | ident' = byteStringOf (skipMany1 (satisfy \c -> isLatinLetter c || isDigit c)) <* ws
54 | 
55 | equal = char '=' >> ws
56 | semi  = char ';' >> ws
57 | dot   = char '.' >> ws
58 | addOp = char '+' >> ws
59 | mulOp = char '*' >> ws
60 | parl  = char '(' >> ws
61 | parr  = char ')' >> ws
62 | 
63 | chainl :: (b -> a -> b) -> Parser b -> Parser a -> Parser b
64 | chainl f start elem = start >>= go where
65 |   go b = do {!a <- elem; go $! f b a} <|> pure b
66 | {-# inline chainl #-}
67 | 
68 | digit :: Parser Int
69 | digit = digitToInt <$> satisfy isDigit
70 | 
71 | decimal :: Parser Int
72 | decimal = chainl (\x n -> 10*x + n) digit digit
73 | 
74 | add :: Parser Tm
75 | add = chainl Add mul (addOp *> mul)
76 | 
77 | mul :: Parser Tm
78 | mul = chainl Mul spine (mulOp *> spine)
79 | 
80 | spine :: Parser Tm
81 | spine = chainl App atom atom
82 | 
83 | atom :: Parser Tm
84 | atom =
85 |         (Int <$> (decimal <* ws))
86 |     <|> (Var <$> ident')
87 |     <|> (parl *> tm <* parr)
88 | 
89 | tm :: Parser Tm
90 | tm =  (do {byteString "fun"; ws; x <- ident'; dot; t <- tm; pure (Lam x t)})
91 |   <|> (do {byteString "let"; ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)})
92 |   <|> add
93 | 
94 | runTm = parse (ws *> tm <* eof) ""
95 | 


--------------------------------------------------------------------------------
/src/FlatParse/Basic/Bytes.hs:
--------------------------------------------------------------------------------
 1 | {- | Bytestring parsers.
 2 | 
 3 | Module dependency complications prevent us from placing these in
 4 | "FlatParse.Basic.Base".
 5 | -}
 6 | 
 7 | module FlatParse.Basic.Bytes
 8 |   ( bytes, bytesUnsafe
 9 |   ) where
10 | 
11 | import FlatParse.Basic.Parser
12 | import FlatParse.Basic.Base ( withEnsure )
13 | import FlatParse.Basic.Integers ( word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe )
14 | import qualified FlatParse.Common.Assorted as Common
15 | import Language.Haskell.TH
16 | import GHC.Exts
17 | import GHC.Word
18 | 
19 | -- | Read a sequence of bytes. This is a template function, you can use it as @$(bytes [3, 4, 5])@,
20 | --   for example, and the splice has type @Parser e ()@. For a non-TH variant see 'FlatParse.Basic.byteString'.
21 | bytes :: [Word] -> Q Exp
22 | bytes bs = do
23 |   let !len = length bs
24 |   [| withEnsure len $(bytesUnsafe bs) |]
25 | 
26 | -- | Template function, creates a @Parser e ()@ which unsafely parses a given
27 | --   sequence of bytes.
28 | --
29 | -- The caller must guarantee that the input has enough bytes.
30 | bytesUnsafe :: [Word] -> Q Exp
31 | bytesUnsafe bytes = do
32 |   let !(leading, w8s) = Common.splitBytes $ (fromIntegral :: Word -> Word64) <$> bytes
33 |       !scanw8s        = go w8s where
34 |                          go (w8:[] ) = [| word64Unsafe w8 |]
35 |                          go (w8:w8s) = [| word64Unsafe w8 >> $(go w8s) |]
36 |                          go []       = [| pure () |]
37 |   case w8s of
38 |     [] -> go leading
39 |           where
40 |             go (a:b:c:d:[]) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w |]
41 |             go (a:b:c:d:ws) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w >> $(go ws) |]
42 |             go (a:b:[])     = let !w = Common.packBytes [a, b]       in [| word16Unsafe w |]
43 |             go (a:b:ws)     = let !w = Common.packBytes [a, b]       in [| word16Unsafe w >> $(go ws) |]
44 |             go (a:[])       = [| word8Unsafe a |]
45 |             go []           = [| pure () |]
46 |     _  -> case leading of
47 | 
48 |       []              -> scanw8s
49 |       [a]             -> [| word8Unsafe a >> $scanw8s |]
50 |       ws@[a, b]       -> let !w = Common.packBytes ws in [| word16Unsafe w >> $scanw8s |]
51 |       ws@[a, b, c, d] -> let !w = Common.packBytes ws in [| word32Unsafe w >> $scanw8s |]
52 |       ws              -> let !w = Common.packBytes ws
53 |                              !l = length ws
54 |                          in [| scanPartial64# l w >> $scanw8s |]
55 | 
56 | scanPartial64# :: Int -> Word64 -> ParserT st e ()
57 | scanPartial64# (I# len) (W64# w) = ParserT \fp eob s st ->
58 |   case indexWord64OffAddr# s 0# of
59 |     w' -> case uncheckedIShiftL# (8# -# len) 3# of
60 |       sh -> case uncheckedShiftL64# w' sh of
61 |         w' -> case uncheckedShiftRL64# w' sh of
62 | #if MIN_VERSION_base(4,17,0)
63 |           w' -> case eqWord64# w w' of
64 | #else
65 |           w' -> case eqWord# w w' of
66 | #endif
67 |             1# -> OK#   st () (plusAddr# s len)
68 |             _  -> Fail# st
69 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful/Bytes.hs:
--------------------------------------------------------------------------------
 1 | {- | Bytestring parsers.
 2 | 
 3 | Module dependency complications prevent us from placing these in
 4 | "FlatParse.Stateful.Base".
 5 | -}
 6 | 
 7 | module FlatParse.Stateful.Bytes
 8 |   ( bytes, bytesUnsafe
 9 |   ) where
10 | 
11 | import FlatParse.Stateful.Parser
12 | import FlatParse.Stateful.Base ( withEnsure )
13 | import FlatParse.Stateful.Integers ( word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe )
14 | import qualified FlatParse.Common.Assorted as Common
15 | import Language.Haskell.TH
16 | import GHC.Exts
17 | import GHC.Word
18 | 
19 | -- | Read a sequence of bytes. This is a template function, you can use it as
20 | --   @$(bytes [3, 4, 5])@, for example, and the splice has type @Parser e
21 | --   ()@. For a non-TH variant see 'FlatParse.Stateful.byteString'.
22 | bytes :: [Word] -> Q Exp
23 | bytes bs = do
24 |   let !len = length bs
25 |   [| withEnsure len $(bytesUnsafe bs) |]
26 | 
27 | -- | Template function, creates a @Parser e ()@ which unsafely parses a given
28 | --   sequence of bytes.
29 | --
30 | -- The caller must guarantee that the input has enough bytes.
31 | bytesUnsafe :: [Word] -> Q Exp
32 | bytesUnsafe bytes = do
33 |   let !(leading, w8s) = Common.splitBytes $ (fromIntegral :: Word -> Word64) <$> bytes
34 |       !scanw8s        = go w8s where
35 |                          go (w8:[] ) = [| word64Unsafe w8 |]
36 |                          go (w8:w8s) = [| word64Unsafe w8 >> $(go w8s) |]
37 |                          go []       = [| pure () |]
38 |   case w8s of
39 |     [] -> go leading
40 |           where
41 |             go (a:b:c:d:[]) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w |]
42 |             go (a:b:c:d:ws) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w >> $(go ws) |]
43 |             go (a:b:[])     = let !w = Common.packBytes [a, b]       in [| word16Unsafe w |]
44 |             go (a:b:ws)     = let !w = Common.packBytes [a, b]       in [| word16Unsafe w >> $(go ws) |]
45 |             go (a:[])       = [| word8Unsafe a |]
46 |             go []           = [| pure () |]
47 |     _  -> case leading of
48 | 
49 |       []              -> scanw8s
50 |       [a]             -> [| word8Unsafe a >> $scanw8s |]
51 |       ws@[a, b]       -> let !w = Common.packBytes ws in [| word16Unsafe w >> $scanw8s |]
52 |       ws@[a, b, c, d] -> let !w = Common.packBytes ws in [| word32Unsafe w >> $scanw8s |]
53 |       ws              -> let !w = Common.packBytes ws
54 |                              !l = length ws
55 |                          in [| scanPartial64# l w >> $scanw8s |]
56 | 
57 | scanPartial64# :: Int -> Word64 -> ParserT st r e ()
58 | scanPartial64# (I# len) (W64# w) = ParserT \fp !r eob s n st ->
59 |   case indexWord64OffAddr# s 0# of
60 |     w' -> case uncheckedIShiftL# (8# -# len) 3# of
61 |       sh -> case uncheckedShiftL64# w' sh of
62 |         w' -> case uncheckedShiftRL64# w' sh of
63 | #if MIN_VERSION_base(4,17,0)
64 |           w' -> case eqWord64# w w' of
65 | #else
66 |           w' -> case eqWord# w w' of
67 | #endif
68 |             1# -> OK#   st () (plusAddr# s len) n
69 |             _  -> Fail# st
70 | 


--------------------------------------------------------------------------------
/bench/Bench.hs:
--------------------------------------------------------------------------------
 1 | {-# options_ghc -Wno-unused-imports #-}
 2 | 
 3 | module Main where
 4 | 
 5 | import Data.Primitive.ByteArray
 6 | import Test.Tasty.Bench
 7 | 
 8 | import qualified Data.ByteString.Char8 as B
 9 | 
10 | import qualified Attoparsec
11 | import qualified Megaparsec
12 | import qualified Parsec
13 | import qualified FPStateful
14 | import qualified FPBasic
15 | import qualified ReadInteger
16 | 
17 | import qualified Data.ByteString.UTF8
18 | import FlatParse.Common.Assorted (strToUtf8)
19 | import qualified FlatParse.Basic
20 | 
21 | import Common
22 | 
23 | sexpInp :: B.ByteString
24 | sexpInp =
25 |   B.concat $ "(" : replicate 33333 "(foo (foo (foo ((bar baza)))))" ++ [")"]
26 | 
27 | longwsInp :: B.ByteString
28 | longwsInp = B.concat $ replicate 55555 "thisisalongkeyword   "
29 | 
30 | numcsvInp :: B.ByteString
31 | numcsvInp = B.concat ("0" : [B.pack (",  " ++ show n) | n <- [1..100000::Int]])
32 | 
33 | readIntInp :: B.ByteString
34 | readIntInp = "12345678910"
35 | 
36 | longString :: String
37 | longString =
38 |   concat $ "(" : replicate 33333 "(foo (foo (foo ((bar baza)))))" ++ [")"]
39 | 
40 | tmInp :: B.ByteString
41 | tmInp = B.pack (unlines (do
42 |   x <- [0..3000::Int]
43 |   pure ("let x" ++ show x ++ " = fun f. fun g. fun x. fun y. f (f (f ((g x y + g x y) * g x y * g x y * 13500)));")
44 |   ++ ["x1000"]))
45 | 
46 | 
47 | main :: IO ()
48 | main = defaultMain [
49 | {-
50 |   bgroup "String -> UTF-8 ByteString" [
51 |     bench "utf8-string" $ whnf Data.ByteString.UTF8.toString sexpInp,
52 |     bench "fp" $ whnf FlatParse.Common.Assorted.utf8ToStr sexpInp
53 |   ],
54 |   bgroup "UTF-8 ByteString -> String" [
55 |     bench "utf8-string" $ whnf Data.ByteString.UTF8.fromString longString,
56 |     bench "fp" $ whnf FlatParse.Common.Assorted.strToUtf8 longString
57 |   ]
58 |  ]
59 | -}
60 |   bgroup "sexp" [
61 |     bench "fpbasic"     $ whnf FPBasic.runSexp    sexpInp,
62 |     bench "fpstateful"  $ whnf FPStateful.runSexp sexpInp,
63 |     bench "attoparsec"  $ whnf Attoparsec.runSexp sexpInp,
64 |     bench "megaparsec"  $ whnf Megaparsec.runSexp sexpInp,
65 |     bench "parsec"      $ whnf Parsec.runSexp     sexpInp
66 |   ],
67 | 
68 |   bgroup "long keyword" [
69 |     bench "fpbasic"    $ whnf FPBasic.runLongws    longwsInp,
70 |     bench "fpstateful" $ whnf FPStateful.runLongws longwsInp,
71 |     bench "attoparsec" $ whnf Attoparsec.runLongws longwsInp,
72 |     bench "megaparsec" $ whnf Megaparsec.runLongws longwsInp,
73 |     bench "parsec"     $ whnf Parsec.runLongws     longwsInp
74 |   ],
75 | 
76 |   bgroup "numeral csv" [
77 |     bench "fpbasic"    $ whnf FPBasic.runNumcsv    numcsvInp,
78 |     bench "fpstateful" $ whnf FPStateful.runNumcsv numcsvInp,
79 |     bench "attoparsec" $ whnf Attoparsec.runNumcsv numcsvInp,
80 |     bench "megaparsec" $ whnf Megaparsec.runNumcsv numcsvInp,
81 |     bench "parsec"     $ whnf Parsec.runNumcsv     numcsvInp
82 |   ],
83 | 
84 |   bgroup "lambda term" [
85 |     bench "fpbasic"    $ whnf FPBasic.runTm    tmInp,
86 |     bench "fpstateful" $ whnf FPStateful.runTm tmInp,
87 |     bench "attoparsec" $ whnf Attoparsec.runTm tmInp,
88 |     bench "megaparsec" $ whnf Megaparsec.runTm tmInp,
89 |     bench "parsec"     $ whnf Parsec.runTm tmInp
90 |   ],
91 | 
92 |   bgroup "readInt/readInteger" [
93 |     bench "readInt"      $ whnf ReadInteger.readInt     readIntInp,
94 |     bench "readInteger"  $ whnf ReadInteger.readInteger readIntInp
95 |     ]
96 |  ]
97 | 


--------------------------------------------------------------------------------
/src/FlatParse/Basic/Addr.hs:
--------------------------------------------------------------------------------
 1 | {- | Unsafe, highly dangerous parsing primitives using 'Addr#'.
 2 | 
 3 | Ensure to read the documentation before using any definitions from this module.
 4 | 
 5 | This module exports primitives useful for efficiently parsing binary files that
 6 | store data using an internal index.
 7 | 
 8 | Often, such indices describes records using a starting offset and a length.
 9 | Offsets are often relative to the file start, or some dynamic address in the
10 | file. This way, individual records can be read out efficiently (much faster than
11 | opening lots of small files!).
12 | 
13 | We may parse these in-place efficiently by adding record offsets to a base
14 | memory address somewhere in the input. This is also extremely unsafe, and easy
15 | to get catastrophically wrong. Thus, we provide as much utility as reasonable to
16 | enable performing such parsing safely. (That means CPS functions.)
17 | 
18 | Note that all definitions here should be considered unsafe. Any 'Int#' is not
19 | checked for positivity. You must perform any necessary checks when you obtain
20 | your offsets and lengths as 'Int#'. Failure to do so may result in undefined
21 | behaviour.
22 | -}
23 | 
24 | module FlatParse.Basic.Addr where
25 | 
26 | import FlatParse.Basic.Parser
27 | import FlatParse.Basic.Base ( takeUnsafe#, atSkipUnsafe#, lookahead )
28 | 
29 | import GHC.Exts
30 | 
31 | import qualified Data.ByteString as B
32 | 
33 | -- | Run a parser, passing it the current address the parser is at.
34 | --
35 | -- Useful for parsing offset-based data tables. For example, you may use this to
36 | -- save the base address to use together with various relative offsets.
37 | withAddr# :: (Addr# -> ParserT st e a) -> ParserT st e a
38 | withAddr# p = ParserT \fp eob s st -> runParserT# (p s) fp eob s st
39 | {-# inline withAddr# #-}
40 | 
41 | -- | @takeOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@
42 | --   bytes, reads @len#@ bytes into a 'ByteString', and restores the original
43 | --   address.
44 | --
45 | -- The 'Addr#' should be from 'withAddr#'.
46 | --
47 | -- Useful for parsing offset-based data tables. Ex: Your file contains an index
48 | -- storing @(OFFSET, LENGTH)@ entries where the offset is the byte position in
49 | -- the file. Begin with @'withAddr#' $ \tableBase# -> ...@, then read each entry
50 | -- like @'takeOffAddr#' tableBase# OFFSET LENGTH@.
51 | --
52 | -- Fails if you attempt to read outside the input.
53 | --
54 | -- Undefined behaviour if @offset#@ or @len#@ is negative.
55 | --
56 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops.
57 | takeOffAddr# :: Addr# -> Int# -> Int# -> ParserT st e B.ByteString
58 | takeOffAddr# addr# offset# len# = withOffAddr# addr# offset# (takeUnsafe# len#)
59 | {-# inline takeOffAddr# #-}
60 | 
61 | -- | @withOffAddr# addr# offset# p@ moves to @addr#@, skips @offset#@
62 | --   bytes, then runs the given parser @p@.
63 | --
64 | -- The 'Addr#' should be from 'withAddr#'.
65 | --
66 | -- Fails if you attempt to read outside the input.
67 | --
68 | -- Undefined behaviour if @offset#@ is negative.
69 | --
70 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops.
71 | withOffAddr# :: Addr# -> Int# -> ParserT st e a -> ParserT st e a
72 | withOffAddr# addr# offset# =
73 |     lookaheadFromAddr# addr# . atSkipUnsafe# offset#
74 | {-# inline withOffAddr# #-}
75 | 
76 | -- | 'lookahead', but specify the address to lookahead from.
77 | --
78 | -- The 'Addr#' should be from 'withAddr#'.
79 | lookaheadFromAddr# :: Addr# -> ParserT st e a -> ParserT st e a
80 | lookaheadFromAddr# s = lookahead . atAddr# s
81 | {-# inline lookaheadFromAddr# #-}
82 | 
83 | -- | Run a parser at the given address.
84 | --
85 | -- The 'Addr#' should be from 'withAddr#'.
86 | --
87 | -- This is a highly internal function -- you likely want 'lookaheadFromAddr#',
88 | -- which will reset the address after running the parser.
89 | atAddr# :: Addr# -> ParserT st e a -> ParserT st e a
90 | atAddr# s (ParserT p) = ParserT \fp eob _ st -> p fp eob s st
91 | {-# inline atAddr# #-}
92 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful/Addr.hs:
--------------------------------------------------------------------------------
 1 | {- | Unsafe, highly dangerous parsing primitives using 'Addr#'.
 2 | 
 3 | Ensure to read the documentation before using any definitions from this module.
 4 | 
 5 | This module exports primitives useful for efficiently parsing binary files that
 6 | store data using an internal index.
 7 | 
 8 | Often, such indices describes records using a starting offset and a length.
 9 | Offsets are often relative to the file start, or some dynamic address in the
10 | file. This way, individual records can be read out efficiently (much faster than
11 | opening lots of small files!).
12 | 
13 | We may parse these in-place efficiently by adding record offsets to a base
14 | memory address somewhere in the input. This is also extremely unsafe, and easy
15 | to get catastrophically wrong. Thus, we provide as much utility as reasonable to
16 | enable performing such parsing safely. (That means CPS functions.)
17 | 
18 | Note that all definitions here should be considered unsafe. Any 'Int#' is not
19 | checked for positivity. You must perform any necessary checks when you obtain
20 | your offsets and lengths as 'Int#'. Failure to do so may result in undefined
21 | behaviour.
22 | -}
23 | 
24 | module FlatParse.Stateful.Addr where
25 | 
26 | import FlatParse.Stateful.Parser
27 | import FlatParse.Stateful.Base ( takeUnsafe#, atSkipUnsafe#, lookahead )
28 | 
29 | import GHC.Exts
30 | 
31 | import qualified Data.ByteString as B
32 | 
33 | -- | Run a parser, passing it the current address the parser is at.
34 | --
35 | -- Useful for parsing offset-based data tables. For example, you may use this to
36 | -- save the base address to use together with various relative offsets.
37 | withAddr# :: (Addr# -> ParserT st r e a) -> ParserT st r e a
38 | withAddr# p = ParserT \fp !r eob s n st -> runParserT# (p s) fp r eob s n st
39 | {-# inline withAddr# #-}
40 | 
41 | -- | @takeOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@
42 | --   bytes, reads @len#@ bytes into a 'ByteString', and restores the original
43 | --   address.
44 | --
45 | -- The 'Addr#' should be from 'withAddr#'.
46 | --
47 | -- Useful for parsing offset-based data tables. Ex: Your file contains an index
48 | -- storing @(OFFSET, LENGTH)@ entries where the offset is the byte position in
49 | -- the file. Begin with @'withAddr#' $ \tableBase# -> ...@, then read each entry
50 | -- like @'takeOffAddr#' tableBase# OFFSET LENGTH@.
51 | --
52 | -- Fails if you attempt to read outside the input.
53 | --
54 | -- Undefined behaviour if @offset#@ or @len#@ is negative.
55 | --
56 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops.
57 | takeOffAddr# :: Addr# -> Int# -> Int# -> ParserT st r e B.ByteString
58 | takeOffAddr# addr# offset# len# = withOffAddr# addr# offset# (takeUnsafe# len#)
59 | {-# inline takeOffAddr# #-}
60 | 
61 | -- | @withOffAddr# addr# offset# p@ moves to @addr#@, skips @offset#@
62 | --   bytes, then runs the given parser @p@.
63 | --
64 | -- The 'Addr#' should be from 'withAddr#'.
65 | --
66 | -- Fails if you attempt to read outside the input.
67 | --
68 | -- Undefined behaviour if @offset#@ is negative.
69 | --
70 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops.
71 | withOffAddr# :: Addr# -> Int# -> ParserT st r e a -> ParserT st r e a
72 | withOffAddr# addr# offset# =
73 |     lookaheadFromAddr# addr# . atSkipUnsafe# offset#
74 | {-# inline withOffAddr# #-}
75 | 
76 | -- | 'lookahead', but specify the address to lookahead from.
77 | --
78 | -- The 'Addr#' should be from 'withAddr#'.
79 | lookaheadFromAddr# :: Addr# -> ParserT st r e a -> ParserT st r e a
80 | lookaheadFromAddr# s = lookahead . atAddr# s
81 | {-# inline lookaheadFromAddr# #-}
82 | 
83 | -- | Run a parser at the given address.
84 | --
85 | -- The 'Addr#' should be from 'withAddr#'.
86 | --
87 | -- This is a highly internal function -- you likely want 'lookaheadFromAddr#',
88 | -- which will reset the address after running the parser.
89 | atAddr# :: Addr# -> ParserT st r e a -> ParserT st r e a
90 | atAddr# s (ParserT p) = ParserT \fp !r eob _ n st -> p fp r eob s n st
91 | {-# inline atAddr# #-}
92 | 


--------------------------------------------------------------------------------
/src/FlatParse/Examples/BasicLambda/Parser.hs:
--------------------------------------------------------------------------------
  1 | {-|
  2 | This module contains a simple lambda calculus parser. This parser is not optimized for maximum
  3 | performance; instead it's written in a style which emulates the look and feel of conventional
  4 | monadic parsers. An optimized implementation would use low-level `switch` expressions more often.
  5 | -}
  6 | 
  7 | {-# language StrictData #-}
  8 | 
  9 | module FlatParse.Examples.BasicLambda.Parser where
 10 | 
 11 | import Data.Char (ord)
 12 | import qualified Data.ByteString as B
 13 | 
 14 | import FlatParse.Basic hiding (Parser, runParser, string, char, cut)
 15 | import FlatParse.Examples.BasicLambda.Lexer
 16 | 
 17 | --------------------------------------------------------------------------------
 18 | 
 19 | type Name = B.ByteString
 20 | 
 21 | {-|
 22 | A term in the language. The precedences of different constructs are the following, in decreasing
 23 | order of strength:
 24 | 
 25 | * Identifiers, literals and parenthesized expressions
 26 | * Function application (left assoc)
 27 | * Multiplication (left assoc)
 28 | * Addition (left assoc)
 29 | * Equality, less-than (non-assoc)
 30 | * @lam@, @let@, @if@ (right assoc)
 31 | 
 32 | -}
 33 | data Tm
 34 |   = Var Name        -- ^ @x@
 35 |   | App Tm Tm       -- ^ @t u@
 36 |   | Lam Name Tm     -- ^ @lam x. t@
 37 |   | Let Name Tm Tm  -- ^ @let x = t in u@
 38 |   | BoolLit Bool    -- ^ @true@ or @false@.
 39 |   | IntLit Int      -- ^ A positive `Int` literal.
 40 |   | If Tm Tm Tm     -- ^ @if t then u else v@
 41 |   | Add Tm Tm       -- ^ @t + u@
 42 |   | Mul Tm Tm       -- ^ @t * u@
 43 |   | Eq Tm Tm        -- ^ @t == u@
 44 |   | Lt Tm Tm        -- ^ @t < u@
 45 |   deriving Show
 46 | 
 47 | 
 48 | -- | Parse an identifier. This parser uses `isKeyword` to check that an identifier is not a
 49 | --   keyword.
 50 | ident :: Parser Name
 51 | ident = token $ byteStringOf $
 52 |   withSpan (identStartChar *> skipMany identChar) (\_ span -> fails (isKeyword span))
 53 | 
 54 | -- | Parse an identifier, throw a precise error on failure.
 55 | ident' :: Parser Name
 56 | ident' = ident `cut'` (Msg "identifier")
 57 | 
 58 | digit :: Parser Int
 59 | digit = (\c -> ord c - ord '0') <$> satisfyAscii isDigit
 60 | 
 61 | int :: Parser Int
 62 | int = token do
 63 |   (place, n) <- chainr (\n (!place, !acc) -> (place*10,acc+place*n)) digit (pure (1, 0))
 64 |   case place of
 65 |     1 -> empty
 66 |     _ -> pure n
 67 | 
 68 | -- | Parse a literal, identifier or parenthesized expression.
 69 | atom :: Parser Tm
 70 | atom =
 71 |        (Var           <$> ident)
 72 |    <|> (BoolLit True  <$  $(keyword "true"))
 73 |    <|> (BoolLit False <$  $(keyword "false"))
 74 |    <|> (IntLit        <$> int)
 75 |    <|> ($(symbol "(") *> tm' <* $(symbol' ")"))
 76 | 
 77 | atom' :: Parser Tm
 78 | atom' = atom
 79 |   `cut` [Msg "identifier", "true", "false", Msg "parenthesized expression", Msg "integer literal"]
 80 | 
 81 | -- | Parse an `App`-level expression.
 82 | app' :: Parser Tm
 83 | app' = chainl App atom' atom
 84 | 
 85 | -- | Parse a `Mul`-level expression.
 86 | mul' :: Parser Tm
 87 | mul' = chainl Mul app' ($(symbol "*") *> app')
 88 | 
 89 | -- | Parse an `Add`-level expression.
 90 | add' :: Parser Tm
 91 | add' = chainl Add mul' ($(symbol "+") *> mul')
 92 | 
 93 | -- | Parse an `FlatParse.Examples.BasicLambda.Parser.Eq` or `Lt`-level expression.
 94 | eqLt' :: Parser Tm
 95 | eqLt' =
 96 |   add' >>= \e1 ->
 97 |   branch $(symbol "==") (Eq e1 <$> add') $
 98 |   branch $(symbol "<")  (Lt e1 <$> add') $
 99 |   pure e1
100 | 
101 | -- | Parse a `Let`.
102 | pLet :: Parser Tm
103 | pLet = do
104 |   $(keyword "let")
105 |   x <- ident'
106 |   $(symbol' "=")
107 |   t <- tm'
108 |   $(keyword' "in")
109 |   u <- tm'
110 |   pure $ Let x t u
111 | 
112 | -- | Parse a `Lam`.
113 | lam :: Parser Tm
114 | lam = do
115 |   $(keyword "lam")
116 |   x <- ident'
117 |   $(symbol' ".")
118 |   t <- tm'
119 |   pure $ Lam x t
120 | 
121 | -- | Parse an `If`.
122 | pIf :: Parser Tm
123 | pIf = do
124 |   $(keyword "if")
125 |   t <- tm'
126 |   $(keyword' "then")
127 |   u <- tm'
128 |   $(keyword' "else")
129 |   v <- tm'
130 |   pure $ If t u v
131 | 
132 | -- | Parse any `Tm`.
133 | tm' :: Parser Tm
134 | tm' = (pLet <|> lam <|> pIf <|> eqLt') `cut` ["let", "lam", "if"]
135 | 
136 | -- | Parse a complete source file.
137 | src' :: Parser Tm
138 | src' = ws *> tm' <* eof `cut` [Msg "end of input (lexical error)"]
139 | 
140 | 
141 | -- Examples
142 | --------------------------------------------------------------------------------
143 | 
144 | -- testParser src' p1
145 | p1 = unlines [
146 |   "let f = lam x. lam y. x (x (x y)) in",
147 |   "let g = if f true then false else true in",
148 |   "let h = f x y + 200 in",
149 |   "f g g h"
150 |   ]
151 | 


--------------------------------------------------------------------------------
/src/FlatParse/Common/Switch.hs:
--------------------------------------------------------------------------------
  1 | module FlatParse.Common.Switch where
  2 | 
  3 | import Control.Monad (forM)
  4 | import Data.Foldable (foldl')
  5 | import Data.Map (Map)
  6 | import Language.Haskell.TH
  7 | 
  8 | import qualified Data.Map.Strict as M
  9 | 
 10 | import FlatParse.Common.Assorted
 11 | 
 12 | -- Switch trie compilation
 13 | --------------------------------------------------------------------------------
 14 | 
 15 | data Trie a = Branch !a !(Map Word (Trie a))
 16 |   deriving Show
 17 | 
 18 | type Rule = Maybe Int
 19 | 
 20 | nilTrie :: Trie Rule
 21 | nilTrie = Branch Nothing mempty
 22 | 
 23 | updRule :: Int -> Maybe Int -> Maybe Int
 24 | updRule rule = Just . maybe rule (min rule)
 25 | 
 26 | insert :: Int -> [Word] -> Trie Rule -> Trie Rule
 27 | insert rule = go where
 28 |   go [] (Branch rule' ts) =
 29 |     Branch (updRule rule rule') ts
 30 |   go (c:cs) (Branch rule' ts) =
 31 |     Branch rule' (M.alter (Just . maybe (go cs nilTrie) (go cs)) c ts)
 32 | 
 33 | listToTrie :: [(Int, String)] -> Trie Rule
 34 | listToTrie = foldl' (\t (!r, !s) -> insert r (charToBytes =<< s) t) nilTrie
 35 | 
 36 | -- | Decorate a trie with the minimum lengths of non-empty paths. This
 37 | --   is used later to place `ensure`.
 38 | mindepths :: Trie Rule -> Trie (Rule, Int)
 39 | mindepths (Branch rule ts) =
 40 |   if M.null ts then
 41 |     Branch (rule, 0) mempty
 42 |   else
 43 |     let !ts' = M.map mindepths ts
 44 |         !min = minimum (M.map (\(Branch (rule,d) _) -> maybe (d + 1) (\_ -> 1) rule) ts') in
 45 |     Branch (rule, min) ts'
 46 | 
 47 | data Trie' a
 48 |   = Branch' !a !(Map Word (Trie' a))
 49 |   | Path !a ![Word] !(Trie' a)
 50 |   deriving Show
 51 | 
 52 | -- | Compress linear paths.
 53 | pathify :: Trie (Rule, Int) -> Trie' (Rule, Int)
 54 | pathify (Branch a ts) = case M.toList ts of
 55 |   [] -> Branch' a mempty
 56 |   [(w, t)] -> case pathify t of
 57 |            Path (Nothing, _) ws t -> Path a (w:ws) t
 58 |            t                      -> Path a [w] t
 59 |   _   -> Branch' a (M.map pathify ts)
 60 | 
 61 | -- | Compute where to fall back after we exhausted a branch. If the branch is
 62 | --   empty, that means we've succeded at reading and we jump to the rhs rule.
 63 | fallbacks :: Trie' (Rule, Int) -> Trie' (Rule, Int, Int)
 64 | fallbacks = go Nothing 0  where
 65 |   go :: Rule -> Int -> Trie' (Rule, Int) -> Trie' (Rule, Int, Int)
 66 |   go !rule !n (Branch' (rule', d) ts)
 67 |     | M.null ts        = Branch' (rule', 0, d) mempty
 68 |     | Nothing <- rule' = Branch' (rule, n, d) (go rule (n + 1) <$> ts)
 69 |     | otherwise        = Branch' (rule', 0, d) (go rule' 1     <$> ts)
 70 |   go rule n (Path (rule', d) ws t)
 71 |     | Nothing <- rule' = Path (rule, n, d)  ws (go rule (n + length ws) t)
 72 |     | otherwise        = Path (rule', 0, d) ws (go rule' (length ws) t)
 73 | 
 74 | -- | Decorate with `ensure` invocations, represented as
 75 | --   `Maybe Int`.
 76 | ensureBytes :: Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int)
 77 | ensureBytes = go 0 where
 78 |   go :: Int -> Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int)
 79 |   go !res = \case
 80 |     Branch' (r, n, d) ts
 81 |       | M.null ts -> Branch' (r, n, Nothing) mempty
 82 |       | res < 1   -> Branch' (r, n, Just d ) (go (d   - 1) <$> ts)
 83 |       | otherwise -> Branch' (r, n, Nothing) (go (res - 1) <$> ts)
 84 |     Path (r, n, d) ws t -> case length ws of
 85 |       l | res < l   -> Path (r, n, Just d ) ws (go (d   - l) t)
 86 |         | otherwise -> Path (r, n, Nothing) ws (go (res - l) t)
 87 | 
 88 | compileTrie :: [(Int, String)] -> Trie' (Rule, Int, Maybe Int)
 89 | compileTrie = ensureBytes . fallbacks . pathify . mindepths . listToTrie
 90 | 
 91 | parseSwitch :: Q Exp -> Q ([(String, Exp)], Maybe Exp)
 92 | parseSwitch exp = exp >>= \case
 93 |   CaseE (UnboundVarE _) []    -> error "switch: empty clause list"
 94 |   CaseE (UnboundVarE _) cases -> do
 95 |     (!cases, !last) <- pure (init cases, last cases)
 96 |     !cases <- forM cases \case
 97 |       Match (LitP (StringL str)) (NormalB rhs) [] -> pure (str, rhs)
 98 |       _ -> error "switch: expected a match clause on a string literal"
 99 |     (!cases, !last) <- case last of
100 |       Match (LitP (StringL str)) (NormalB rhs) [] -> pure (cases ++ [(str, rhs)], Nothing)
101 |       Match WildP                (NormalB rhs) [] -> pure (cases, Just rhs)
102 |       _ -> error "switch: expected a match clause on a string literal or a wildcard"
103 |     pure (cases, last)
104 |   _ -> error "switch: expected a \"case _ of\" expression"
105 | 
106 | makeRawSwitch :: [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp
107 | makeRawSwitch branches deflt = do
108 |   branches <- forM branches $ \(s, body) -> do
109 |     body <- body
110 |     pure $ Match (LitP (StringL s)) (NormalB body) []
111 |   branches <- case deflt of
112 |     Nothing    -> pure branches
113 |     Just deflt -> do
114 |       deflt <- deflt
115 |       pure $ branches ++ [Match WildP (NormalB deflt) []]
116 |   pure $ CaseE (UnboundVarE (mkName "_")) branches
117 | 


--------------------------------------------------------------------------------
/flatparse.cabal:
--------------------------------------------------------------------------------
  1 | cabal-version: 1.12
  2 | 
  3 | -- This file has been generated from package.yaml by hpack version 0.38.1.
  4 | --
  5 | -- see: https://github.com/sol/hpack
  6 | 
  7 | name:           flatparse
  8 | version:        0.5.3.1
  9 | synopsis:       High-performance parsing from strict bytestrings
 10 | description:    @Flatparse@ is a high-performance parsing library for strict bytestring input. See the README for more information:
 11 |                 <https://github.com/AndrasKovacs/flatparse>.
 12 | category:       Parsing
 13 | homepage:       https://github.com/AndrasKovacs/flatparse#readme
 14 | bug-reports:    https://github.com/AndrasKovacs/flatparse/issues
 15 | author:         András Kovács
 16 | maintainer:     puttamalac@gmail.com
 17 | copyright:      2021 András Kovács
 18 | license:        MIT
 19 | license-file:   LICENSE
 20 | build-type:     Simple
 21 | tested-with:
 22 |     GHC == 8.6.5
 23 |   , GHC == 8.8.4
 24 |   , GHC == 8.10.7
 25 |   , GHC == 9.0.2
 26 |   , GHC == 9.2.8
 27 |   , GHC == 9.4.8
 28 |   , GHC == 9.6.7
 29 |   , GHC == 9.8.4
 30 | extra-source-files:
 31 |     README.md
 32 | 
 33 | source-repository head
 34 |   type: git
 35 |   location: https://github.com/AndrasKovacs/flatparse
 36 | 
 37 | flag dump
 38 |   description: dump core, stg and cmm to files
 39 |   manual: True
 40 |   default: False
 41 | 
 42 | flag llvm
 43 |   description: use llvm for building
 44 |   manual: True
 45 |   default: False
 46 | 
 47 | library
 48 |   exposed-modules:
 49 |       FlatParse.Basic
 50 |       FlatParse.Basic.Addr
 51 |       FlatParse.Basic.Base
 52 |       FlatParse.Basic.Bytes
 53 |       FlatParse.Basic.Integers
 54 |       FlatParse.Basic.Parser
 55 |       FlatParse.Basic.Switch
 56 |       FlatParse.Basic.Text
 57 |       FlatParse.Common.Assorted
 58 |       FlatParse.Common.GHCExts
 59 |       FlatParse.Common.Numbers
 60 |       FlatParse.Common.Parser
 61 |       FlatParse.Common.Position
 62 |       FlatParse.Common.Switch
 63 |       FlatParse.Examples.BasicLambda.Lexer
 64 |       FlatParse.Examples.BasicLambda.Parser
 65 |       FlatParse.Stateful
 66 |       FlatParse.Stateful.Addr
 67 |       FlatParse.Stateful.Base
 68 |       FlatParse.Stateful.Bytes
 69 |       FlatParse.Stateful.Integers
 70 |       FlatParse.Stateful.Parser
 71 |       FlatParse.Stateful.Switch
 72 |       FlatParse.Stateful.Text
 73 |   other-modules:
 74 |       Paths_flatparse
 75 |   hs-source-dirs:
 76 |       src
 77 |   default-extensions:
 78 |       BangPatterns
 79 |       BlockArguments
 80 |       CPP
 81 |       ExplicitNamespaces
 82 |       LambdaCase
 83 |       MagicHash
 84 |       OverloadedStrings
 85 |       PatternSynonyms
 86 |       TemplateHaskell
 87 |       TupleSections
 88 |   ghc-options: -Wall -Wno-missing-signatures -Wno-name-shadowing -Wno-unused-binds -Wno-unused-matches -O2
 89 |   build-depends:
 90 |       base >=4.7 && <5
 91 |     , bytestring
 92 |     , containers
 93 |     , integer-gmp
 94 |     , template-haskell
 95 |     , utf8-string >=1.0.2 && <1.1
 96 |   default-language: Haskell2010
 97 |   if flag(dump)
 98 |     ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file
 99 |   if flag(llvm)
100 |     ghc-options: -fllvm
101 | 
102 | test-suite spec
103 |   type: exitcode-stdio-1.0
104 |   main-is: Test.hs
105 |   other-modules:
106 |       Paths_flatparse
107 |   hs-source-dirs:
108 |       test
109 |   default-extensions:
110 |       BangPatterns
111 |       BlockArguments
112 |       CPP
113 |       ExplicitNamespaces
114 |       LambdaCase
115 |       MagicHash
116 |       OverloadedStrings
117 |       PatternSynonyms
118 |       TemplateHaskell
119 |       TupleSections
120 |       ExtendedDefaultRules
121 |   ghc-options: -Wall -Wno-missing-signatures -Wno-name-shadowing -Wno-unused-binds -Wno-unused-matches -O2 -Wno-type-defaults
122 |   build-depends:
123 |       HUnit
124 |     , QuickCheck
125 |     , base >=4.7 && <5
126 |     , bytestring
127 |     , flatparse
128 |     , hspec
129 |     , quickcheck-instances
130 |     , utf8-string >=1.0.2 && <1.1
131 |   default-language: Haskell2010
132 |   if flag(dump)
133 |     ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file
134 |   if flag(llvm)
135 |     ghc-options: -fllvm
136 | 
137 | benchmark bench
138 |   type: exitcode-stdio-1.0
139 |   main-is: Bench.hs
140 |   other-modules:
141 |       Attoparsec
142 |       Common
143 |       FPBasic
144 |       FPStateful
145 |       Megaparsec
146 |       Parsec
147 |       ReadInteger
148 |       Paths_flatparse
149 |   hs-source-dirs:
150 |       bench
151 |   default-extensions:
152 |       BangPatterns
153 |       BlockArguments
154 |       CPP
155 |       ExplicitNamespaces
156 |       LambdaCase
157 |       MagicHash
158 |       OverloadedStrings
159 |       PatternSynonyms
160 |       TemplateHaskell
161 |       TupleSections
162 |   ghc-options: -Wall -Wno-missing-signatures -Wno-name-shadowing -Wno-unused-binds -Wno-unused-matches -O2
163 |   build-depends:
164 |       attoparsec
165 |     , base >=4.7 && <5
166 |     , bytestring
167 |     , flatparse
168 |     , integer-gmp
169 |     , megaparsec
170 |     , parsec
171 |     , primitive
172 |     , tasty-bench
173 |     , utf8-string >=1.0.2 && <1.1
174 |   default-language: Haskell2010
175 |   if flag(dump)
176 |     ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file
177 |   if flag(llvm)
178 |     ghc-options: -fllvm
179 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful/Switch.hs:
--------------------------------------------------------------------------------
  1 | -- | Efficient literal branching using Template Haskell.
  2 | 
  3 | module FlatParse.Stateful.Switch
  4 |   ( switch, switchWithPost, rawSwitchWithPost
  5 |   ) where
  6 | 
  7 | import Control.Monad
  8 | import Data.Foldable
  9 | import Data.Map (Map)
 10 | import Language.Haskell.TH
 11 | 
 12 | import qualified Data.Map.Strict as M
 13 | 
 14 | import FlatParse.Common.Switch
 15 | import FlatParse.Stateful.Base ( ensure, skipBack, branch, failed )
 16 | import FlatParse.Stateful.Bytes ( bytesUnsafe )
 17 | import FlatParse.Stateful.Integers ( anyWord8Unsafe )
 18 | 
 19 | {-|
 20 | This is a template function which makes it possible to branch on a collection of string literals in
 21 | an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing
 22 | operations, which has optimized control flow, vectorized reads and grouped checking for needed input
 23 | bytes.
 24 | 
 25 | The syntax is slightly magical, it overloads the usual @case@ expression. An example:
 26 | 
 27 | @
 28 |     $(switch [| case _ of
 29 |         "foo" -> pure True
 30 |         "bar" -> pure False |])
 31 | @
 32 | 
 33 | The underscore is mandatory in @case _ of@. Each branch must be a string literal, but optionally
 34 | we may have a default case, like in
 35 | 
 36 | @
 37 |     $(switch [| case _ of
 38 |         "foo" -> pure 10
 39 |         "bar" -> pure 20
 40 |         _     -> pure 30 |])
 41 | @
 42 | 
 43 | All case right hand sides must be parsers with the same type. That type is also the type
 44 | of the whole `switch` expression.
 45 | 
 46 | A `switch` has longest match semantics, and the order of cases does not matter, except for
 47 | the default case, which may only appear as the last case.
 48 | 
 49 | If a `switch` does not have a default case, and no case matches the input, then it returns with
 50 | failure, \without\ having consumed any input. A fallthrough to the default case also does not
 51 | consume any input.
 52 | -}
 53 | switch :: Q Exp -> Q Exp
 54 | switch = switchWithPost Nothing
 55 | 
 56 | {-|
 57 | Switch expression with an optional first argument for performing a post-processing action after
 58 | every successful branch matching. For example, if we have @ws :: ParserT st r e ()@ for a
 59 | whitespace parser, we might want to consume whitespace after matching on any of the switch
 60 | cases. For that case, we can define a "lexeme" version of `switch` as follows.
 61 | 
 62 | @
 63 |   switch' :: Q Exp -> Q Exp
 64 |   switch' = switchWithPost (Just [| ws |])
 65 | @
 66 | 
 67 | Note that this @switch'@ function cannot be used in the same module it's defined in, because of the
 68 | stage restriction of Template Haskell.
 69 | -}
 70 | switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp
 71 | switchWithPost postAction exp = do
 72 |   !postAction <- sequence postAction
 73 |   (!cases, !fallback) <- parseSwitch exp
 74 |   genTrie $! genSwitchTrie' postAction cases fallback
 75 | 
 76 | -- | Version of `switchWithPost` without syntactic sugar. The second argument is the
 77 | --   list of cases, the third is the default case.
 78 | rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp
 79 | rawSwitchWithPost postAction cases fallback = do
 80 |   !postAction <- sequence postAction
 81 |   !cases <- forM cases \(str, rhs) -> (str,) <$> rhs
 82 |   !fallback <- sequence fallback
 83 |   genTrie $! genSwitchTrie' postAction cases fallback
 84 | 
 85 | #if MIN_VERSION_base(4,15,0)
 86 | mkDoE = DoE Nothing
 87 | {-# inline mkDoE #-}
 88 | #else
 89 | mkDoE = DoE
 90 | {-# inline mkDoE #-}
 91 | #endif
 92 | 
 93 | genTrie :: (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -> Q Exp
 94 | genTrie (rules, t) = do
 95 |   branches <- traverse (\e -> (,) <$> (newName "rule") <*> pure e) rules
 96 | 
 97 |   let ix m k = case M.lookup k m of
 98 |         Nothing -> error ("key not in map: " ++ show k)
 99 |         Just a  -> a
100 | 
101 |   let ensure' :: Maybe Int -> Maybe (Q Exp)
102 |       ensure' = fmap (\n -> [| ensure n |])
103 | 
104 |       fallback :: Rule -> Int ->  Q Exp
105 |       fallback rule 0 = pure $ VarE $ fst $ ix branches rule
106 |       fallback rule n = [| skipBack n >> $(pure $ VarE $ fst $ ix branches rule) |]
107 | 
108 |   let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp
109 |       go = \case
110 |         Branch' (r, n, alloc) ts
111 |           | M.null ts -> pure $ VarE $ fst $ branches M.! r
112 |           | otherwise -> do
113 |               !next         <- (traverse . traverse) go (M.toList ts)
114 |               !defaultCase  <- fallback r (n + 1)
115 | 
116 |               let cases = mkDoE $
117 |                     [BindS (VarP (mkName "c")) (VarE 'anyWord8Unsafe),
118 |                       NoBindS (CaseE (VarE (mkName "c"))
119 |                          (map (\(w, t) ->
120 |                                  Match (LitP (IntegerL (fromIntegral w)))
121 |                                        (NormalB t)
122 |                                        [])
123 |                               next
124 |                           ++ [Match WildP (NormalB defaultCase) []]))]
125 | 
126 |               case ensure' alloc of
127 |                 Nothing    -> pure cases
128 |                 Just alloc -> [| branch $alloc $(pure cases) $(fallback r n) |]
129 | 
130 |         Path (r, n, alloc) ws t ->
131 |           case ensure' alloc of
132 |             Nothing    -> [| branch $(bytesUnsafe ws) $(go t) $(fallback r n)|]
133 |             Just alloc -> [| branch ($alloc >> $(bytesUnsafe ws)) $(go t) $(fallback r n) |]
134 | 
135 |   letE
136 |     (map (\(x, rhs) -> valD (varP x) (normalB (pure rhs)) []) (Data.Foldable.toList branches))
137 |     (go t)
138 | 
139 | genSwitchTrie' :: Maybe Exp -> [(String, Exp)] -> Maybe Exp
140 |               -> (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int))
141 | genSwitchTrie' postAction cases fallback =
142 | 
143 |   let (!branches, !strings) = unzip do
144 |         (!i, (!str, !rhs)) <- zip [0..] cases
145 |         case postAction of
146 |           Nothing    -> pure ((Just i, rhs), (i, str))
147 |           Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str))
148 | 
149 |       !m    =  M.fromList ((Nothing, maybe (VarE 'failed) id fallback) : branches)
150 |       !trie = compileTrie strings
151 |   in (m , trie)
152 | 


--------------------------------------------------------------------------------
/src/FlatParse/Basic/Switch.hs:
--------------------------------------------------------------------------------
  1 | -- | Efficient literal branching using Template Haskell.
  2 | 
  3 | module FlatParse.Basic.Switch
  4 |   ( switch, switchWithPost, rawSwitchWithPost
  5 |   ) where
  6 | 
  7 | import Control.Monad
  8 | import Data.Foldable
  9 | import Data.Map (Map)
 10 | import Language.Haskell.TH
 11 | 
 12 | import qualified Data.Map.Strict as M
 13 | 
 14 | import FlatParse.Common.Switch
 15 | import FlatParse.Basic.Base ( ensure, skipBack, branch, failed )
 16 | import FlatParse.Basic.Bytes ( bytesUnsafe )
 17 | import FlatParse.Basic.Integers ( anyWord8Unsafe )
 18 | 
 19 | {-|
 20 | This is a template function which makes it possible to branch on a collection of string literals in
 21 | an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing
 22 | operations, which has optimized control flow, vectorized reads and grouped checking for needed input
 23 | bytes.
 24 | 
 25 | The syntax is slightly magical, it overloads the usual @case@ expression. An example:
 26 | 
 27 | @
 28 |     $(switch [| case _ of
 29 |         "foo" -> pure True
 30 |         "bar" -> pure False |])
 31 | @
 32 | 
 33 | The underscore is mandatory in @case _ of@. Each branch must be a string literal, but optionally
 34 | we may have a default case, like in
 35 | 
 36 | @
 37 |     $(switch [| case _ of
 38 |         "foo" -> pure 10
 39 |         "bar" -> pure 20
 40 |         _     -> pure 30 |])
 41 | @
 42 | 
 43 | All case right hand sides must be parsers with the same type. That type is also the type
 44 | of the whole `switch` expression.
 45 | 
 46 | A `switch` has longest match semantics, and the order of cases does not matter, except for
 47 | the default case, which may only appear as the last case.
 48 | 
 49 | If a `switch` does not have a default case, and no case matches the input, then it returns with
 50 | failure, \without\ having consumed any input. A fallthrough to the default case also does not
 51 | consume any input.
 52 | -}
 53 | switch :: Q Exp -> Q Exp
 54 | switch = switchWithPost Nothing
 55 | 
 56 | {-|
 57 | Switch expression with an optional first argument for performing a post-processing action after
 58 | every successful branch matching, not including the default branch. For example, if we have
 59 | @ws :: ParserT st e ()@ for a whitespace parser, we might want to consume whitespace after matching
 60 | on any of the switch cases. For that case, we can define a "lexeme" version of `switch` as
 61 | follows.
 62 | 
 63 | @
 64 |   switch' :: Q Exp -> Q Exp
 65 |   switch' = switchWithPost (Just [| ws |])
 66 | @
 67 | 
 68 | Note that this @switch'@ function cannot be used in the same module it's defined in, because of the
 69 | stage restriction of Template Haskell.
 70 | -}
 71 | switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp
 72 | switchWithPost postAction exp = do
 73 |   !postAction <- sequence postAction
 74 |   (!cases, !fallback) <- parseSwitch exp
 75 |   genTrie $! genSwitchTrie' postAction cases fallback
 76 | 
 77 | -- | Version of `switchWithPost` without syntactic sugar. The second argument is the
 78 | --   list of cases, the third is the default case.
 79 | rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp
 80 | rawSwitchWithPost postAction cases fallback = do
 81 |   !postAction <- sequence postAction
 82 |   !cases <- forM cases \(str, rhs) -> (str,) <$> rhs
 83 |   !fallback <- sequence fallback
 84 |   genTrie $! genSwitchTrie' postAction cases fallback
 85 | 
 86 | #if MIN_VERSION_base(4,15,0)
 87 | mkDoE = DoE Nothing
 88 | {-# inline mkDoE #-}
 89 | #else
 90 | mkDoE = DoE
 91 | {-# inline mkDoE #-}
 92 | #endif
 93 | 
 94 | genTrie :: (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -> Q Exp
 95 | genTrie (rules, t) = do
 96 |   branches <- traverse (\e -> (,) <$> (newName "rule") <*> pure e) rules
 97 | 
 98 |   let ix m k = case M.lookup k m of
 99 |         Nothing -> error ("key not in map: " ++ show k)
100 |         Just a  -> a
101 | 
102 |   let ensure' :: Maybe Int -> Maybe (Q Exp)
103 |       ensure' = fmap (\n -> [| ensure n |])
104 | 
105 |       fallback :: Rule -> Int ->  Q Exp
106 |       fallback rule 0 = pure $ VarE $ fst $ ix branches rule
107 |       fallback rule n = [| skipBack n >> $(pure $ VarE $ fst $ ix branches rule) |]
108 | 
109 |   let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp
110 |       go = \case
111 |         Branch' (r, n, alloc) ts
112 |           | M.null ts -> pure $ VarE $ fst $ branches M.! r
113 |           | otherwise -> do
114 |               !next         <- (traverse . traverse) go (M.toList ts)
115 |               !defaultCase  <- fallback r (n + 1)
116 | 
117 |               let cases = mkDoE $
118 |                     [BindS (VarP (mkName "c")) (VarE 'anyWord8Unsafe),
119 |                       NoBindS (CaseE (VarE (mkName "c"))
120 |                          (map (\(w, t) ->
121 |                                  Match (LitP (IntegerL (fromIntegral w)))
122 |                                        (NormalB t)
123 |                                        [])
124 |                               next
125 |                           ++ [Match WildP (NormalB defaultCase) []]))]
126 | 
127 |               case ensure' alloc of
128 |                 Nothing    -> pure cases
129 |                 Just alloc -> [| branch $alloc $(pure cases) $(fallback r n) |]
130 | 
131 |         Path (r, n, alloc) ws t ->
132 |           case ensure' alloc of
133 |             Nothing    -> [| branch $(bytesUnsafe ws) $(go t) $(fallback r n)|]
134 |             Just alloc -> [| branch ($alloc >> $(bytesUnsafe ws)) $(go t) $(fallback r n) |]
135 | 
136 |   letE
137 |     (map (\(x, rhs) -> valD (varP x) (normalB (pure rhs)) []) (Data.Foldable.toList branches))
138 |     (go t)
139 | 
140 | genSwitchTrie' :: Maybe Exp -> [(String, Exp)] -> Maybe Exp
141 |               -> (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int))
142 | genSwitchTrie' postAction cases fallback =
143 | 
144 |   let (!branches, !strings) = unzip do
145 |         (!i, (!str, !rhs)) <- zip [0..] cases
146 |         case postAction of
147 |           Nothing    -> pure ((Just i, rhs), (i, str))
148 |           Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str))
149 | 
150 |       !m    = M.fromList ((Nothing, maybe (VarE 'failed) id fallback) : branches)
151 |       !trie = compileTrie strings
152 |   in (m , trie)
153 | 


--------------------------------------------------------------------------------
/src/FlatParse/Basic/Parser.hs:
--------------------------------------------------------------------------------
  1 | -- | Minimal parser definition.
  2 | 
  3 | {-# LANGUAGE UnboxedTuples #-}
  4 | {-# LANGUAGE KindSignatures #-}
  5 | {-# LANGUAGE DataKinds #-} -- needed for manual ZeroBitType def (unsure why)
  6 | {-# LANGUAGE FlexibleInstances #-}
  7 | 
  8 | module FlatParse.Basic.Parser
  9 |   (
 10 |   -- * Parser
 11 |     ParserT(..)
 12 |   , Parser, ParserIO, ParserST
 13 |   , pureLazy
 14 | 
 15 |   -- * Result
 16 |   , type Res#
 17 |   , pattern OK#, pattern Err#, pattern Fail#
 18 | 
 19 |   -- ** Internal
 20 |   , type ResI#
 21 | 
 22 |   -- * Choice operator (defined with right associativity)
 23 |   , (<|>)
 24 |   ) where
 25 | 
 26 | import FlatParse.Common.GHCExts ( Addr#, unsafeCoerce#, ZeroBitType )
 27 | import FlatParse.Common.Parser
 28 | 
 29 | import GHC.ForeignPtr ( ForeignPtrContents )
 30 | 
 31 | import qualified Control.Applicative
 32 | import Control.Monad ( MonadPlus(..) )
 33 | import Control.Monad.IO.Class ( MonadIO(..) )
 34 | import GHC.IO ( IO(IO) )
 35 | 
 36 | -- | @ParserT st e a@ is a parser with a state token type @st@, an error type
 37 | --   @e@ and a return type @a@. The different state token types support
 38 | --   different embedded effects; see `Parser`, `ParserIO` and `ParserST` below.
 39 | newtype ParserT (st :: ZeroBitType) e a =
 40 |     ParserT { runParserT# :: ForeignPtrContents -> Addr# -> Addr# -> st -> Res# st e a }
 41 | 
 42 | -- | The type of pure parsers.
 43 | type Parser     = ParserT PureMode
 44 | 
 45 | -- | The type of parsers which can embed `IO` actions.
 46 | type ParserIO   = ParserT IOMode
 47 | 
 48 | -- | The type of parsers which can embed `ST` actions.
 49 | type ParserST s = ParserT (STMode s)
 50 | 
 51 | -- | You may lift IO actions into a 'ParserIO' using `liftIO`.
 52 | instance MonadIO (ParserIO e) where
 53 |   liftIO (IO a) = ParserT \fp eob s rw ->
 54 |     case a rw of (# rw', a #) -> OK# rw' a s
 55 |   {-# inline liftIO #-}
 56 | 
 57 | instance Functor (ParserT st e) where
 58 |   fmap f (ParserT g) = ParserT \fp eob s st -> case g fp eob s st of
 59 |     OK# st' a s -> let !b = f a in OK# st' b s
 60 |     x           -> unsafeCoerce# x
 61 |   {-# inline fmap #-}
 62 | 
 63 |   (<$) a' (ParserT g) = ParserT \fp eob s st -> case g fp eob s st of
 64 |     OK# st' _a s -> OK# st' a' s
 65 |     x           -> unsafeCoerce# x
 66 |   {-# inline (<$) #-}
 67 | 
 68 | instance Applicative (ParserT st e) where
 69 |   pure !a = ParserT \fp eob s st -> OK# st a s
 70 |   {-# inline pure #-}
 71 |   ParserT ff <*> ParserT fa = ParserT \fp eob s st -> case ff fp eob s st of
 72 |     OK# st' f s -> case fa fp eob s st' of
 73 |       OK# st'' a s -> let !b = f a in OK# st'' b s
 74 |       x            -> unsafeCoerce# x
 75 |     x           -> unsafeCoerce# x
 76 |   {-# inline (<*>) #-}
 77 |   ParserT fa <* ParserT fb = ParserT \fp eob s st -> case fa fp eob s st of
 78 |     OK# st' a s -> case fb fp eob s st' of
 79 |       OK# st'' _b s -> OK# st'' a s
 80 |       x             -> unsafeCoerce# x
 81 |     x           -> unsafeCoerce# x
 82 |   {-# inline (<*) #-}
 83 |   ParserT fa *> ParserT fb = ParserT \fp eob s st -> case fa fp eob s st of
 84 |     OK# st' _a s -> fb fp eob s st'
 85 |     x            -> unsafeCoerce# x
 86 |   {-# inline (*>) #-}
 87 | 
 88 | -- | Same as `pure` for `ParserT` except that it does not force the returned value.
 89 | pureLazy :: a -> ParserT st e a
 90 | pureLazy a = ParserT \fp eob s st -> OK# st a s
 91 | {-# inline pureLazy #-}
 92 | 
 93 | instance Monad (ParserT st e) where
 94 |   return = pure
 95 |   {-# inline return #-}
 96 |   ParserT fa >>= f = ParserT \fp eob s st -> case fa fp eob s st of
 97 |     OK# st' a s -> runParserT# (f a) fp eob s st'
 98 |     x           -> unsafeCoerce# x
 99 |   {-# inline (>>=) #-}
100 |   (>>) = (*>)
101 |   {-# inline (>>) #-}
102 | 
103 | -- | By default, parser choice `(<|>)` arbitrarily backtracks on parser failure.
104 | instance Control.Applicative.Alternative (ParserT st e) where
105 |   empty = ParserT \fp eob s st -> Fail# st
106 |   {-# inline empty #-}
107 | 
108 |   (<|>) = (<|>)
109 |   {-# inline (Control.Applicative.<|>) #-}
110 | 
111 |   many (ParserT f) = ParserT go where
112 |     go fp eob s st = case f fp eob s st of
113 |       OK# st a s -> case go fp eob s st of
114 |                       OK# st as s -> OK# st (a:as) s
115 |                       x           -> x
116 |       Fail# st  -> OK# st [] s
117 |       Err# st e -> Err# st e
118 |   {-# inline many #-}
119 | 
120 |   some p = (:) <$> p <*> Control.Applicative.many p
121 |   {-# inline some #-}
122 | 
123 | infixr 6 <|>
124 | -- | Choose between two parsers. If the first parser fails, try the second one,
125 | --   but if the first one throws an error, propagate the error. This operation
126 | --   can arbitrarily backtrack.
127 | --
128 | -- Note: this exported operator has different fixity than the same operator in
129 | -- `Control.Applicative`. Hide this operator if you want to use the
130 | -- `Alternative` version.
131 | (<|>) :: ParserT st e a -> ParserT st e a -> ParserT st e a
132 | (<|>) (ParserT f) (ParserT g) = ParserT \fp eob s st ->
133 |   case f fp eob s st of
134 |     Fail# st' -> g fp eob s st'
135 |     x         -> x
136 | {-# inline[1] (<|>) #-}
137 | 
138 | {-# RULES
139 | 
140 | "flatparse/reassoc-alt" forall l m r. (l <|> m) <|> r = l <|> (m <|> r)
141 | 
142 | #-}
143 | 
144 | instance MonadPlus (ParserT st e) where
145 |   mzero = Control.Applicative.empty
146 |   {-# inline mzero #-}
147 |   mplus = (<|>)
148 |   {-# inline mplus #-}
149 | 
150 | --------------------------------------------------------------------------------
151 | 
152 | -- | Primitive parser result wrapped with a state token.
153 | --
154 | -- You should rarely need to manipulate values of this type directly. Use the
155 | -- provided bidirectional pattern synonyms 'OK#', 'Fail#' and 'Err#'.
156 | type Res# (st :: ZeroBitType) e a =
157 |   (# st, ResI# e a #)
158 | 
159 | -- | Primitive parser result.
160 | type ResI# e a =
161 |   (#
162 |     (# a, Addr# #)
163 |   | (# #)
164 |   | (# e #)
165 |   #)
166 | 
167 | -- | 'Res#' constructor for a successful parse.
168 | --   Contains the return value and a pointer to the rest of the input buffer,
169 | --   plus a state token.
170 | pattern OK# :: (st :: ZeroBitType) -> a -> Addr# -> Res# st e a
171 | pattern OK# st a s = (# st, (# (# a, s #) | | #) #)
172 | 
173 | -- | 'Res#' constructor for recoverable failure.
174 | --   Contains only a state token.
175 | pattern Fail# :: (st :: ZeroBitType) -> Res# st e a
176 | pattern Fail# st = (# st, (# | (# #) | #) #)
177 | 
178 | -- | 'Res#' constructor for errors which are by default non-recoverable.
179 | --    Contains the error, plus a state token.
180 | pattern Err# :: (st :: ZeroBitType) -> e -> Res# st e a
181 | pattern Err# st e = (# st, (# | | (# e #) #) #)
182 | {-# complete OK#, Fail#, Err# #-}
183 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # flatparse
 2 | 
 3 | [![Hackage](https://img.shields.io/hackage/v/flatparse.svg)](https://hackage.haskell.org/package/flatparse)
 4 | ![CI](https://github.com/AndrasKovacs/flatparse/actions/workflows/haskell.yml/badge.svg)
 5 | 
 6 | `flatparse` is a high-performance parsing library, supporting parsing for __programming languages__, __human-readable data__ and __machine-readable data__. The "flat" in the name refers to the `ByteString` parsing input, which has pinned contiguous data, and also to the library internals, which avoids indirections and heap allocations whenever possible. `flatparse` is generally __lower-level__ than `parsec`-style libraries, but it is possible to build higher-level features (such as source spans, hints, indentation parsing) on top of it, without making any compromises in performance.
 7 | 
 8 | ### LLVM
 9 | 
10 | It is advised to build with [`-fllvm`
11 | option](https://downloads.haskell.org/~ghc/latest/docs/html/users_guide/codegens.html#llvm-code-generator-fllvm)
12 | when using this package, since that can result in significant speedups (20-40%
13 | from what I've seen). Additionally, you can enable `-fllvm` for `flatparse`
14 | specifically by enabling the `llvm` package flag. However, this has minor
15 | impact, since almost all parser code will be typically inlined into modules
16 | outside `flatparse`, and compiled there.
17 | 
18 | ## Features and non-features
19 | 
20 | * __Excellent performance__. On microbenchmarks, `flatparse` is 2-10 times faster than `attoparsec` or `megaparsec`. On examples with heavier use of source positions and spans and/or indentation parsing, you can expect a bigger gap between `megaparsec` and `flatparse`. Compile times and executable sizes are also significantly better with `flatparse` than with `megaparsec` or `attoparsec`. `flatparse` internals make liberal use of unboxed tuples and GHC primops. As a result, pure validators (parsers returning `()`) in `flatparse` are not difficult to implement with zero heap allocation.
21 | * __No incremental parsing__, and __only strict `ByteString`__ is supported as input. However, it can be still useful to convert from `Text`, `String` or other types to `ByteString`, and then use `flatparse` for parsing, since `flatparse` performance usually more than makes up for the conversion costs.
22 | * __Only little-endian systems are currently supported as the host machine__. This may change in the future. However, `flatparse` does include primitive integer parsers with specific endianness.
23 | * __Support for fast source location handling, indentation parsing and informative error messages__. `flatparse` provides a low-level interface to these. Batteries are _not included_, but it should be possible for users to build custom solutions, which are more sophisticated, but still as fast as possible. In my experience, the included batteries in other libraries often come with major unavoidable overheads, and often we still have to extend existing machinery in order to scale to production features.
24 | * The __backtracking model__ of `flatparse` is different to parsec libraries, and is more close to the [nom](https://github.com/Geal/nom) library in Rust. The idea is that _parser failure_ is distinguished from _parsing error_. The former is used for control flow, and we can backtrack from it. The latter is used for unrecoverable errors, and by default it's propagated to the top. `flatparse` does not track whether parsers have consumed inputs. In my experience, what we really care about is the failure/error distinction, and in `parsec` or `megaparsec` the consumed/non-consumed separation is often muddled and discarded in larger parser implementations. By default, basic `flatparse` parsers can fail but can not throw errors, with the exception of the specifically error-throwing operations. Hence, `flatparse` users have to be mindful about grammar, and explicitly insert errors where it is known that the input can't be valid.
25 | 
26 | `flatparse` comes in two flavors: [`FlatParse.Basic`][basic] and [`FlatParse.Stateful`][stateful]. Both support a custom error type. Also, both come in three modes, where we can respectively run `IO` actions, `ST` actions, or no side effects. The modes are selected by a state token type parameter on the parser types.
27 | 
28 | * [`FlatParse.Basic`][basic] only supports the above features. If you don't need
29 |   indentation parsing, this is sufficient.
30 | * [`FlatParse.Stateful`][stateful] additionally supports a built-in `Int` worth
31 |   of internal state and an additional custom reader environment. This can
32 |   support a wide range of indentation parsing features. There is a moderate
33 |   overhead in performance and code size compared to `Basic`. In microbenchmarks
34 |   and small parsers, the performance difference between `Basic` and `Stateful`
35 |   is more up to the whims of GHC and LLVM, and is a bit more "random".
36 | 
37 | ## Tutorial
38 | 
39 | Informative tutorials are work in progress. See [`src/FlatParse/Examples`](src/FlatParse/Examples)
40 | for a lexer/parser example with acceptably good error messages.
41 | 
42 | ## Contribution
43 | 
44 | Pull requests are welcome. I'm fairly quick to add PR authors as collaborators.
45 | 
46 | ## Some benchmarks
47 | 
48 | Execution times below. See source code in [bench](bench). Compiled with GHC 9.10.2 `-O2 -fllvm` with
49 | `flatparse-0.5.3.1`. Executed on AMD 9800X3D CPU. Uses `lts-24.7` Stackage snapshot for the involved
50 | packages.
51 | 
52 | |      benchmark              |  runtime   |
53 | |-----------------------------|-------------
54 | |sexp/fpbasic | 1.80 ms|
55 | |sexp/fpstateful | 1.25 ms|
56 | |sexp/attoparsec | 10.2 ms|
57 | |sexp/megaparsec | 6.92 ms|
58 | |sexp/parsec | 39.9 ms|
59 | |long keyword/fpbasic | 0.054 ms |
60 | |long keyword/fpstateful | 0.062 ms |
61 | |long keyword/attoparsec | 0.308 ms |
62 | |long keyword/megaparsec | 0.687 ms |
63 | |long keyword/parsec | 3.50 ms |
64 | |numeral csv/fpbasic | 0.540 ms |
65 | |numeral csv/fpstateful | 0.504 ms |
66 | |numeral csv/attoparsec | 3.17 ms |
67 | |numeral csv/megaparsec | 1.09 ms |
68 | |numeral csv/parsec | 13.8 ms |
69 | |lambda term/fpbasic | 1.52 ms|
70 | |lambda term/fpstateful | 1.56 ms|
71 | |lambda term/attoparsec | 4.94 ms|
72 | |lambda term/megaparsec | 5.35 ms|
73 | |lambda term/parsec | 17.7 ms|
74 | 
75 | Object file sizes for each module containing the `s-exp`, `long keyword`, `numeral csv` and `lambda term` benchmarks.
76 | 
77 | | library    | object file size (bytes) |
78 | | -------    | ------------------------ |
79 | | fpbasic    |  71088                   |
80 | | fpstateful |  73576                   |
81 | | attoparsec |  242816                  |
82 | | megaparsec |  402984                  |
83 | | parsec     |  329008                  |
84 | 
85 | [basic]: https://hackage.haskell.org/package/flatparse/docs/FlatParse-Basic.html
86 | [stateful]: https://hackage.haskell.org/package/flatparse/docs/FlatParse-Stateful.html
87 | 


--------------------------------------------------------------------------------
/src/FlatParse/Examples/BasicLambda/Lexer.hs:
--------------------------------------------------------------------------------
  1 | 
  2 | {-|
  3 | This module contains lexer and error message primitives for a simple lambda calculus parser. It
  4 | demonstrates a simple but decently informative implementation of error message propagation.
  5 | -}
  6 | 
  7 | {-# language StrictData #-}
  8 | 
  9 | module FlatParse.Examples.BasicLambda.Lexer where
 10 | 
 11 | import FlatParse.Basic hiding (Parser, runParser, string, char, cut)
 12 | 
 13 | import qualified FlatParse.Basic as FP
 14 | import qualified Data.ByteString as B
 15 | import Language.Haskell.TH
 16 | 
 17 | import Data.String
 18 | import qualified Data.Set as S
 19 | 
 20 | import qualified Data.ByteString.UTF8 as UTF8
 21 | 
 22 | --------------------------------------------------------------------------------
 23 | 
 24 | -- | An expected item which is displayed in error messages.
 25 | data Expected
 26 |   = Msg String  -- ^ An error message.
 27 |   | Lit String  -- ^ A literal expected thing.
 28 |   deriving (Eq, Show, Ord)
 29 | 
 30 | instance IsString Expected where fromString = Lit
 31 | 
 32 | -- | A parsing error.
 33 | data Error
 34 |   = Precise Pos Expected     -- ^ A precisely known error, like leaving out "in" from "let".
 35 |   | Imprecise Pos [Expected] -- ^ An imprecise error, when we expect a number of different things,
 36 |                              --   but parse something else.
 37 |   deriving Show
 38 | 
 39 | errorPos :: Error -> Pos
 40 | errorPos (Precise p _)   = p
 41 | errorPos (Imprecise p _) = p
 42 | 
 43 | -- | Merge two errors. Inner errors (which were thrown at points with more consumed inputs)
 44 | --   are preferred. If errors are thrown at identical input positions, we prefer precise errors
 45 | --   to imprecise ones.
 46 | --
 47 | --   The point of prioritizing inner and precise errors is to suppress the deluge of "expected"
 48 | --   items, and instead try to point to a concrete issue to fix.
 49 | merge :: Error -> Error -> Error
 50 | merge e e' = case (errorPos e, errorPos e') of
 51 |   (p, p') | p < p' -> e'
 52 |   (p, p') | p > p' -> e
 53 |   (p, p')          -> case (e, e') of
 54 |     (Precise{}      , _               ) -> e
 55 |     (_              , Precise{}       ) -> e'
 56 |     (Imprecise _ es , Imprecise _ es' ) -> Imprecise p (es ++ es')
 57 | {-# noinline merge #-} -- merge is "cold" code, so we shouldn't inline it.
 58 | 
 59 | type Parser = FP.Parser Error
 60 | 
 61 | -- | Pretty print an error. The `B.ByteString` input is the source file. The offending line from the
 62 | --   source is displayed in the output.
 63 | prettyError :: B.ByteString -> Error -> String
 64 | prettyError b e =
 65 | 
 66 |   let pos :: Pos
 67 |       pos      = case e of Imprecise pos e -> pos
 68 |                            Precise pos e   -> pos
 69 |       ls       = FP.linesUtf8 b
 70 |       (l, c)   = case FP.posLineCols b [pos] of
 71 |                    x: _ -> x
 72 |                    _    -> error "impossible"
 73 |       line     = if l < length ls then ls !! l else ""
 74 |       linum    = show l
 75 |       lpad     = map (const ' ') linum
 76 | 
 77 |       expected (Lit s) = show s
 78 |       expected (Msg s) = s
 79 | 
 80 |       err (Precise _ e)    = expected e
 81 |       err (Imprecise _ es) = imprec $ S.toList $ S.fromList es
 82 | 
 83 |       imprec :: [Expected] -> String
 84 |       imprec []     = error "impossible"
 85 |       imprec [e]    = expected e
 86 |       imprec (e:es) = expected e ++ go es where
 87 |         go []     = ""
 88 |         go [e]    = " or " ++ expected e
 89 |         go (e:es) = ", " ++ expected e ++ go es
 90 | 
 91 |   in show l ++ ":" ++ show c ++ ":\n" ++
 92 |      lpad   ++ "|\n" ++
 93 |      linum  ++ "| " ++ line ++ "\n" ++
 94 |      lpad   ++ "| " ++ replicate c ' ' ++ "^\n" ++
 95 |      "parse error: expected " ++
 96 |      err e
 97 | 
 98 | -- | Imprecise cut: we slap a list of items on inner errors.
 99 | cut :: Parser a -> [Expected] -> Parser a
100 | cut p es = do
101 |   pos <- getPos
102 |   FP.cutting p (Imprecise pos es) merge
103 | 
104 | -- | Precise cut: we propagate at most a single error.
105 | cut' :: Parser a -> Expected -> Parser a
106 | cut' p e = do
107 |   pos <- getPos
108 |   FP.cutting p (Precise pos e) merge
109 | 
110 | runParser :: Parser a -> B.ByteString -> Result Error a
111 | runParser = FP.runParser
112 | 
113 | -- | Run parser, print pretty error on failure.
114 | testParser :: Show a => Parser a -> String -> IO ()
115 | testParser p str = case UTF8.fromString str of
116 |   b -> case runParser p b of
117 |     Err e  -> putStrLn $ prettyError b e
118 |     OK a _ -> print a
119 |     Fail   -> putStrLn "uncaught parse error"
120 | 
121 | -- | Parse a line comment.
122 | lineComment :: Parser ()
123 | lineComment =
124 |   withOption anyWord8
125 |     (\case 10 -> ws
126 |            _  -> lineComment)
127 |     (pure ())
128 | 
129 | -- | Parse a potentially nested multiline comment.
130 | multilineComment :: Parser ()
131 | multilineComment = go (1 :: Int) where
132 |   go 0 = ws
133 |   go n = $(switch [| case _ of
134 |     "-}" -> go (n - 1)
135 |     "{-" -> go (n + 1)
136 |     _    -> branch anyWord8 (go n) (pure ()) |])
137 | 
138 | -- | Consume whitespace.
139 | ws :: Parser ()
140 | ws = $(switch [| case _ of
141 |   " "  -> ws
142 |   "\n" -> ws
143 |   "\t" -> ws
144 |   "\r" -> ws
145 |   "--" -> lineComment
146 |   "{-" -> multilineComment
147 |   _    -> pure () |])
148 | 
149 | -- | Consume whitespace after running a parser.
150 | token :: Parser a -> Parser a
151 | token p = p <* ws
152 | {-# inline token #-}
153 | 
154 | -- | Read a starting character of an identifier.
155 | identStartChar :: Parser Char
156 | identStartChar = satisfyAscii isLatinLetter
157 | {-# inline identStartChar #-}
158 | 
159 | -- | Read a non-starting character of an identifier.
160 | identChar :: Parser Char
161 | identChar = satisfyAscii (\c -> isLatinLetter c || isDigit c)
162 | {-# inline identChar #-}
163 | 
164 | -- | Check whether a `Span` contains exactly a keyword. Does not change parsing state.
165 | isKeyword :: Span -> Parser ()
166 | isKeyword span = inSpan span do
167 |   $(FP.switch [| case _ of
168 |       "lam"   -> pure ()
169 |       "let"   -> pure ()
170 |       "in"    -> pure ()
171 |       "if"    -> pure ()
172 |       "then"  -> pure ()
173 |       "else"  -> pure ()
174 |       "true"  -> pure ()
175 |       "false" -> pure ()  |])
176 |   eof
177 | 
178 | -- | Parse a non-keyword string.
179 | symbol :: String -> Q Exp
180 | symbol str = [| token $(FP.string str) |]
181 | 
182 | -- | Parser a non-keyword string, throw precise error on failure.
183 | symbol' :: String -> Q Exp
184 | symbol' str = [| $(symbol str) `cut'` Lit str |]
185 | 
186 | -- | Parse a keyword string.
187 | keyword :: String -> Q Exp
188 | keyword str = [| token ($(FP.string str) `notFollowedBy` identChar) |]
189 | 
190 | -- | Parse a keyword string, throw precise error on failure.
191 | keyword' :: String -> Q Exp
192 | keyword' str = [| $(keyword str) `cut'` Lit str |]
193 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful/Parser.hs:
--------------------------------------------------------------------------------
  1 | -- | Minimal parser definition.
  2 | 
  3 | {-# LANGUAGE UnboxedTuples #-}
  4 | {-# LANGUAGE KindSignatures #-}
  5 | {-# LANGUAGE DataKinds #-} -- needed for manual ZeroBitType def (unsure why)
  6 | {-# LANGUAGE FlexibleInstances #-}
  7 | 
  8 | module FlatParse.Stateful.Parser
  9 |   (
 10 |   -- * Parser
 11 |     ParserT(..)
 12 |   , Parser, ParserIO, ParserST
 13 |   , pureLazy
 14 | 
 15 |   -- ** Result
 16 |   , type Res#
 17 |   , pattern OK#, pattern Err#, pattern Fail#
 18 | 
 19 |   -- *** Internal
 20 |   , type ResI#
 21 | 
 22 |   -- * Choice operator (defined with right associativity)
 23 |   , (<|>)
 24 |   ) where
 25 | 
 26 | import FlatParse.Common.GHCExts ( Addr#, unsafeCoerce#, ZeroBitType, Int# )
 27 | import FlatParse.Common.Parser
 28 | 
 29 | import GHC.ForeignPtr ( ForeignPtrContents )
 30 | 
 31 | import qualified Control.Applicative
 32 | import Control.Monad ( MonadPlus(..) )
 33 | import Control.Monad.IO.Class ( MonadIO(..) )
 34 | import GHC.IO ( IO(IO) )
 35 | 
 36 | -- | @ParserT st r e a@ is a parser with a state token type @st@, a reader
 37 | --   environment @r@, an error type @e@ and a return type @a@. The different
 38 | --   state token types support different embedded effects; see `Parser`,
 39 | --   `ParserIO` and `ParserST` below.
 40 | newtype ParserT (st :: ZeroBitType) r e a =
 41 |     ParserT { runParserT# :: ForeignPtrContents -> r -> Addr# -> Addr# -> Int# -> st -> Res# st e a }
 42 | 
 43 | -- | The type of pure parsers.
 44 | type Parser     = ParserT PureMode
 45 | 
 46 | -- | The type of parsers which can embed `IO` actions.
 47 | type ParserIO   = ParserT IOMode
 48 | 
 49 | -- | The type of parsers which can embed `ST` actions.
 50 | type ParserST s = ParserT (STMode s)
 51 | 
 52 | -- | You may lift IO actions into a 'ParserIO' using `liftIO`.
 53 | instance MonadIO (ParserT IOMode r e) where
 54 |   liftIO (IO a) = ParserT \fp !r eob s n rw ->
 55 |     case a rw of (# rw', a #) -> OK# rw' a s n
 56 |   {-# inline liftIO #-}
 57 | 
 58 | instance Functor (ParserT st r e) where
 59 |   fmap f (ParserT g) = ParserT \fp !r eob s n st -> case g fp r eob s n st of
 60 |     OK# st' a s n -> let !b = f a in OK# st' b s n
 61 |     x             -> unsafeCoerce# x
 62 |   {-# inline fmap #-}
 63 | 
 64 |   (<$) a' (ParserT g) = ParserT \fp !r eob s n st -> case g fp r eob s n st of
 65 |     OK# st' _a s n -> OK# st' a' s n
 66 |     x              -> unsafeCoerce# x
 67 |   {-# inline (<$) #-}
 68 | 
 69 | instance Applicative (ParserT st r e) where
 70 |   pure !a = ParserT \_fp !_r _eob s n st -> OK# st a s n
 71 |   {-# inline pure #-}
 72 |   ParserT ff <*> ParserT fa = ParserT \fp !r eob s n st -> case ff fp r eob s n st of
 73 |     OK# st' f s n -> case fa fp r eob s n st' of
 74 |       OK# st'' a s n -> let b = f a in OK# st'' b s n
 75 |       x              -> unsafeCoerce# x
 76 |     x             -> unsafeCoerce# x
 77 |   {-# inline (<*>) #-}
 78 |   ParserT fa <* ParserT fb = ParserT \fp !r eob s n st -> case fa fp r eob s n st of
 79 |     OK# st' a s n -> case fb fp r eob s n st' of
 80 |       OK# st'' _b s n -> OK# st'' a s n
 81 |       x               -> unsafeCoerce# x
 82 |     x             -> unsafeCoerce# x
 83 |   {-# inline (<*) #-}
 84 |   ParserT fa *> ParserT fb = ParserT \fp !r eob s n st -> case fa fp r eob s n st of
 85 |     OK# st' _a s n -> fb fp r eob s n st'
 86 |     x              -> unsafeCoerce# x
 87 |   {-# inline (*>) #-}
 88 | 
 89 | -- | Same as `pure` for `ParserT` except that it does not force the returned value.
 90 | pureLazy :: a -> ParserT st r e a
 91 | pureLazy a = ParserT \_fp !_r _eob s n st -> OK# st a s n
 92 | {-# inline pureLazy #-}
 93 | 
 94 | instance Monad (ParserT st r e) where
 95 |   return = pure
 96 |   {-# inline return #-}
 97 |   ParserT fa >>= f = ParserT \fp !r eob s n st -> case fa fp r eob s n st of
 98 |     OK# st' a s n -> runParserT# (f a) fp r eob s n st'
 99 |     x             -> unsafeCoerce# x
100 |   {-# inline (>>=) #-}
101 |   (>>) = (*>)
102 |   {-# inline (>>) #-}
103 | 
104 | instance Control.Applicative.Alternative (ParserT st r e) where
105 |   empty = ParserT \fp !r eob s n st -> Fail# st
106 |   {-# inline empty #-}
107 | 
108 |   (<|>) = (<|>)
109 |   {-# inline (Control.Applicative.<|>) #-}
110 | 
111 |   many (ParserT f) = ParserT go where
112 |     go fp !r eob s n st =
113 |         case f fp r eob s n st of
114 |           OK# st a s n ->
115 |             case go fp r eob s n st of
116 |               OK# st as s n -> OK# st (a:as) s n
117 |               x             -> x
118 |           Fail# st'    -> OK# st [] s n
119 |           Err# st' e   -> Err# st e
120 |   {-# inline many #-}
121 | 
122 |   some p = (:) <$> p <*> Control.Applicative.many p
123 |   {-# inline some #-}
124 | 
125 | infixr 6 <|>
126 | -- | Choose between two parsers. If the first parser fails, try the second one,
127 | --   but if the first one throws an error, propagate the error. This operation
128 | --   can arbitrarily backtrack.
129 | --
130 | -- Note: this exported operator has different fixity than the same operator in
131 | -- `Control.Applicative`. Hide this operator if you want to use the
132 | -- `Alternative` version.
133 | (<|>) :: ParserT st r e a -> ParserT st r e a -> ParserT st r e a
134 | (<|>) (ParserT f) (ParserT g) = ParserT \fp !r eob s n st ->
135 |   case f fp r eob s n st of
136 |     Fail# st' -> g fp r eob s n st'
137 |     x         -> x
138 | {-# inline[1] (<|>) #-}
139 | 
140 | {-# RULES
141 | 
142 | "flatparse/reassoc-alt" forall l m r. (l <|> m) <|> r = l <|> (m <|> r)
143 | 
144 | #-}
145 | 
146 | instance MonadPlus (ParserT st r e) where
147 |   mzero = Control.Applicative.empty
148 |   {-# inline mzero #-}
149 |   mplus = (<|>)
150 |   {-# inline mplus #-}
151 | 
152 | --------------------------------------------------------------------------------
153 | 
154 | -- | Primitive parser result wrapped with a state token.
155 | --
156 | -- You should rarely need to manipulate values of this type directly. Use the
157 | -- provided bidirectional pattern synonyms 'OK#', 'Fail#' and 'Err#'.
158 | type Res# (st :: ZeroBitType) e a =
159 |   (# st, ResI# e a #)
160 | 
161 | -- | Primitive parser result.
162 | type ResI# e a =
163 |   (#
164 |     (# a, Addr#, Int# #)
165 |   | (# #)
166 |   | (# e #)
167 |   #)
168 | 
169 | -- | 'Res#' constructor for a successful parse.
170 | --   Contains the return value, a pointer to the rest of the input buffer, and
171 | --   the next 'Int' state, plus a state token.
172 | pattern OK# :: (st :: ZeroBitType) -> a -> Addr# -> Int# -> Res# st e a
173 | pattern OK# st a s n = (# st, (# (# a, s, n #) | | #) #)
174 | 
175 | -- | 'Res#' constructor for recoverable failure.
176 | --   Contains only a state token.
177 | pattern Fail# :: (st :: ZeroBitType) -> Res# st e a
178 | pattern Fail# st = (# st, (# | (# #) | #) #)
179 | 
180 | -- | 'Res#' constructor for errors which are by default non-recoverable.
181 | --    Contains the error, plus a state token.
182 | pattern Err# :: (st :: ZeroBitType) -> e -> Res# st e a
183 | pattern Err# st e = (# st, (# | | (# e #) #) #)
184 | {-# complete OK#, Fail#, Err# #-}
185 | 


--------------------------------------------------------------------------------
/src/FlatParse/Common/Assorted.hs:
--------------------------------------------------------------------------------
  1 | module FlatParse.Common.Assorted
  2 |   (
  3 |   -- * Compatibility
  4 |     shortInteger
  5 | 
  6 |   -- * 'Char' predicates
  7 |   , isDigit, isLatinLetter, isGreekLetter
  8 | 
  9 |   -- * Other
 10 |   , packBytes, splitBytes
 11 | 
 12 |   -- * UTF-8 conversions
 13 |   , charToBytes, strToBytes
 14 |   , strToUtf8, utf8ToStr
 15 | 
 16 |   -- * Shortcuts
 17 |   , derefChar8#
 18 | 
 19 |   -- * Boxed integer coercions
 20 |   -- $boxed-integer-coercion
 21 |   , word16ToInt16
 22 |   , word32ToInt32
 23 |   , word64ToInt64
 24 | 
 25 |   -- * Helpers
 26 |   , withPosInt#, withIntUnwrap#
 27 | 
 28 |   -- * Bit manipulation
 29 |   , zbytel, zbytel'intermediate, zbytel'toIdx
 30 |   , zbyter, zbyter'intermediate, zbyter'toIdx
 31 |   ) where
 32 | 
 33 | import Data.Bits
 34 | import Data.Char ( ord )
 35 | import Data.Foldable (foldl')
 36 | import GHC.Exts
 37 | 
 38 | import qualified Data.ByteString as B
 39 | 
 40 | import Data.Word
 41 | import Data.Int
 42 | 
 43 | #if MIN_VERSION_base(4,15,0)
 44 | import GHC.Num.Integer (Integer(..))
 45 | #else
 46 | import GHC.Integer.GMP.Internals (Integer(..))
 47 | #endif
 48 | 
 49 | import qualified Data.ByteString.UTF8 as UTF8
 50 | 
 51 | 
 52 | -- Compatibility
 53 | --------------------------------------------------------------------------------
 54 | 
 55 | shortInteger :: Int# -> Integer
 56 | #if MIN_VERSION_base(4,15,0)
 57 | shortInteger = IS
 58 | #else
 59 | shortInteger = S#
 60 | #endif
 61 | {-# inline shortInteger #-}
 62 | 
 63 | 
 64 | -- Char predicates
 65 | --------------------------------------------------------------------------------
 66 | 
 67 | -- | @isDigit c = \'0\' <= c && c <= \'9\'@
 68 | isDigit :: Char -> Bool
 69 | isDigit c = '0' <= c && c <= '9'
 70 | {-# inline isDigit #-}
 71 | 
 72 | -- | @isLatinLetter c = (\'A\' <= c && c <= \'Z\') || (\'a\' <= c && c <= \'z\')@
 73 | isLatinLetter :: Char -> Bool
 74 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')
 75 | {-# inline isLatinLetter #-}
 76 | 
 77 | -- | @isGreekLetter c = (\'Α\' <= c && c <= \'Ω\') || (\'α\' <= c && c <= \'ω\')@
 78 | isGreekLetter :: Char -> Bool
 79 | isGreekLetter c = ('Α' <= c && c <= 'Ω') || ('α' <= c && c <= 'ω')
 80 | {-# inline isGreekLetter #-}
 81 | 
 82 | -- UTF conversions
 83 | --------------------------------------------------------------------------------
 84 | 
 85 | packBytes :: [Word64] -> Word64
 86 | packBytes = fst . foldl' go (0, 0) where
 87 |   go (acc, shift) w | shift == 64 = error "packBytes: too many bytes"
 88 |   go (acc, shift) w = (unsafeShiftL (fromIntegral w) shift .|. acc, shift+8)
 89 | 
 90 | -- TODO chunks into 8-bytes for 64-bit performance
 91 | splitBytes :: [Word64] -> ([Word64], [Word64])
 92 | splitBytes ws = case quotRem (length ws) 8 of
 93 |   (0, _) -> (ws, [])
 94 |   (_, r) -> (as, chunk8s bs) where
 95 |               (as, bs) = splitAt r ws
 96 |               chunk8s [] = []
 97 |               chunk8s ws = let (as, bs) = splitAt 8 ws in
 98 |                            packBytes as : chunk8s bs
 99 | 
100 | -- | Shortcut for 'indexCharOffAddr# addr# 0#'.
101 | derefChar8# :: Addr# -> Char#
102 | derefChar8# addr# = indexCharOffAddr# addr# 0#
103 | {-# inline derefChar8# #-}
104 | 
105 | --------------------------------------------------------------------------------
106 | 
107 | {- $boxed-integer-coercion
108 | 
109 | These functions should be no-ops. They correspond to the similarly-named GHC 9.4
110 | primops which work on unboxed integers.
111 | -}
112 | 
113 | -- | Coerce a 'Word16' to 'Int16'.
114 | word16ToInt16 :: Word16 -> Int16
115 | word16ToInt16 = fromIntegral
116 | {-# inline word16ToInt16 #-}
117 | 
118 | -- | Coerce a 'Word32' to 'Int32'.
119 | word32ToInt32 :: Word32 -> Int32
120 | word32ToInt32 = fromIntegral
121 | {-# inline word32ToInt32 #-}
122 | 
123 | -- | Coerce a 'Word64' to 'Int64'.
124 | word64ToInt64 :: Word64 -> Int64
125 | word64ToInt64 = fromIntegral
126 | {-# inline word64ToInt64 #-}
127 | 
128 | --------------------------------------------------------------------------------
129 | 
130 | -- | Assert for the given 'Int#' that @n >= 0@.
131 | --
132 | -- Throws a runtime error if given a negative integer.
133 | withPosInt# :: Int# -> r -> r
134 | withPosInt# n# r = case n# >=# 0# of
135 |   1# -> r
136 |   _  -> error "FlatParse.Basic.Base.withPosInt#: negative integer"
137 | {-# inline withPosInt# #-}
138 | 
139 | -- | Unwrap the 'Int#' from an 'Int' and apply it to the given function.
140 | withIntUnwrap# :: (Int# -> r) -> Int -> r
141 | withIntUnwrap# f (I# i#) = f i#
142 | {-# inline withIntUnwrap# #-}
143 | 
144 | --------------------------------------------------------------------------------
145 | 
146 | charToBytes :: Char -> [Word]
147 | charToBytes c'
148 |     | c <= 0x7f     = [fromIntegral c]
149 |     | c <= 0x7ff    = [0xc0 .|. y, 0x80 .|. z]
150 |     | c <= 0xffff   = [0xe0 .|. x, 0x80 .|. y, 0x80 .|. z]
151 |     | c <= 0x10ffff = [0xf0 .|. w, 0x80 .|. x, 0x80 .|. y, 0x80 .|. z]
152 |     | otherwise = error "Not a valid Unicode code point"
153 |   where
154 |     c = ord c'
155 |     z = fromIntegral (c                 .&. 0x3f)
156 |     y = fromIntegral (unsafeShiftR c 6  .&. 0x3f)
157 |     x = fromIntegral (unsafeShiftR c 12 .&. 0x3f)
158 |     w = fromIntegral (unsafeShiftR c 18 .&. 0x7)
159 | 
160 | strToBytes :: String -> [Word]
161 | strToBytes = concatMap charToBytes
162 | {-# inline strToBytes #-}
163 | 
164 | -- | Convert an UTF8-encoded `String` to a `B.ByteString`.
165 | strToUtf8 :: String -> B.ByteString
166 | strToUtf8 = UTF8.fromString
167 | {-# inline strToUtf8 #-}
168 | 
169 | -- | Convert a `B.ByteString` to an UTF8-encoded `String`.
170 | utf8ToStr :: B.ByteString -> String
171 | utf8ToStr = UTF8.toString
172 | {-# inline utf8ToStr #-}
173 | 
174 | --------------------------------------------------------------------------------
175 | 
176 | -- | Index of leftmost null byte, or (number of bytes in type) if not present.
177 | --
178 | -- Adapted from Hacker's Delight 6-1. Useful in big-endian environments.
179 | zbytel :: (FiniteBits a, Num a) => a -> Int
180 | zbytel = zbytel'toIdx . zbytel'intermediate
181 | {-# inline zbytel #-}
182 | 
183 | -- | bit mangling, returns 0 for inputs without a null byte
184 | --
185 | -- Separating allows us to skip some index calculation if there was no null byte.
186 | zbytel'intermediate :: (FiniteBits a, Num a) => a -> a
187 | zbytel'intermediate a =
188 |     let a' = (a .&. mask) + mask
189 |     in  complement (a' .|. a .|. mask)
190 |   where
191 |     mask = 0x7F7F7F7F7F7F7F7F
192 | {-# inline zbytel'intermediate #-}
193 | 
194 | -- | bit mangling, turns intermediate value into an index
195 | --
196 | -- Separating allows us to skip some index calculation if there was no null byte.
197 | zbytel'toIdx :: (FiniteBits a, Num a) => a -> Int
198 | zbytel'toIdx a = countLeadingZeros a `unsafeShiftR` 3
199 | {-# inline zbytel'toIdx #-}
200 | 
201 | -- | Index of rightmost null byte, or (number of bytes in type) if not present
202 | --
203 | -- Adapted from Hacker's Delight 6-1. Useful in little-endian environments.
204 | zbyter :: (FiniteBits a, Num a) => a -> Int
205 | zbyter = zbyter'toIdx . zbyter'intermediate
206 | {-# inline zbyter #-}
207 | 
208 | -- | bit mangling, returns 0 for inputs without a null byte
209 | --
210 | -- Separating allows us to skip some index calculation if there was no null byte.
211 | zbyter'intermediate :: (FiniteBits a, Num a) => a -> a
212 | zbyter'intermediate a = (a - 0x0101010101010101) .&. (complement a) .&. 0x8080808080808080
213 | {-# inline zbyter'intermediate #-}
214 | 
215 | -- | bit mangling, turns intermediate value into an index
216 | --
217 | -- Separating allows us to skip some index calculation if there was no null byte.
218 | zbyter'toIdx :: (FiniteBits a, Num a) => a -> Int
219 | zbyter'toIdx a = countTrailingZeros a `unsafeShiftR` 3
220 | {-# inline zbyter'toIdx #-}
221 | 


--------------------------------------------------------------------------------
/src/FlatParse/Common/Numbers.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE UnboxedTuples, BinaryLiterals #-}
  2 | 
  3 | module FlatParse.Common.Numbers where
  4 | 
  5 | import FlatParse.Common.Assorted ( shortInteger )
  6 | import Language.Haskell.TH.Syntax (lift)
  7 | 
  8 | import GHC.Exts
  9 | import GHC.ForeignPtr
 10 | 
 11 | import qualified Data.ByteString.Char8 as BC8
 12 | import qualified Data.ByteString.Internal as B
 13 | 
 14 | -- | Parse a non-empty ASCII decimal digit sequence as a 'Word'.
 15 | --   Fails on overflow.
 16 | anyAsciiDecimalWord# :: Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #)
 17 | anyAsciiDecimalWord# eob s = case anyAsciiDecimalWord_# 0## eob s of
 18 |     (# | (# n, s' #) #) | 0# <- eqAddr# s s'
 19 |                         -> (# | (# n, s' #) #)
 20 |     _                   -> (# (# #) | #)
 21 | {-# inline anyAsciiDecimalWord# #-}
 22 | 
 23 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'.
 24 | --   Fails on overflow.
 25 | anyAsciiDecimalInt# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #)
 26 | anyAsciiDecimalInt# eob s = case anyAsciiDecimalWord_# 0## eob s of
 27 |     (# | (# n, s' #) #) | 0# <- eqAddr# s s'
 28 |                         , 1# <- leWord# n (int2Word# (unI# maxBound))
 29 |                         -> (# | (# word2Int# n, s' #) #)
 30 |     _                   -> (# (##) | #)
 31 | {-# inline anyAsciiDecimalInt# #-}
 32 | 
 33 | anyAsciiDecimalWord_# :: Word# -> Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #)
 34 | anyAsciiDecimalWord_# acc eob s = case eqAddr# s eob of
 35 |   1# -> (# | (# acc, s #) #)
 36 |   _  -> case indexWord8OffAddr# s 0# of
 37 | #if MIN_VERSION_base(4,16,0)
 38 |     w | 1# <- leWord8# (wordToWord8# 0x30##) w
 39 |       , 1# <- leWord8# w (wordToWord8# 0x39##)
 40 |       -> case timesWord2# acc 10## of
 41 |           (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x30##) of
 42 | #else
 43 |     w | 1# <- leWord# 0x30## w
 44 |       , 1# <- leWord# w 0x39##
 45 |       -> case timesWord2# acc 10## of
 46 |           (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x30##) of
 47 | #endif
 48 |             (# q, 0# #) -> anyAsciiDecimalWord_# q eob (s `plusAddr#` 1#)
 49 |             _           -> (# (##) | #)
 50 |           _             -> (# (##) | #)
 51 |     _ -> (# | (# acc, s #) #)
 52 | 
 53 | --------------------------------------------------------------------------------
 54 | 
 55 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'.
 56 | --   May overflow.
 57 | anyAsciiDecimalIntOverflow# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #)
 58 | anyAsciiDecimalIntOverflow# eob s = case anyAsciiDecimalIntOverflow_# 0# eob s of
 59 |     (# n, s' #) | 0# <- eqAddr# s s'
 60 |                 -> (# | (# n, s' #) #)
 61 | 
 62 |                 | otherwise
 63 |                 -> (# (##) | #)
 64 | {-# inline anyAsciiDecimalIntOverflow# #-}
 65 | 
 66 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Integer'.
 67 | anyAsciiDecimalInteger# :: ForeignPtrContents -> Addr# -> Addr# -> (# (##) | (# Integer, Addr# #) #)
 68 | anyAsciiDecimalInteger# fp eob s = case anyAsciiDecimalIntOverflow_# 0# eob s of
 69 |   (# n, s' #)
 70 |     | 1# <- eqAddr# s s'            -> (# (##) | #)
 71 | 
 72 |     -- Simple heuristic, using the largest number of digits that can be in an
 73 |     -- 'Int#', such that we can use the 'IS' constructor.
 74 |     | 1# <- minusAddr# s' s <=# maxDigitsInt -> (# | (# shortInteger n, s' #) #)
 75 |     | otherwise -> case BC8.readInteger (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) of
 76 |         Nothing     -> (# (##) | #)
 77 |         Just (i, _) -> (# | (# i, s' #) #)
 78 |   where
 79 |     maxDigitsInt :: Int#
 80 |     !(I# maxDigitsInt) = $(let p e = 10 ^ e - 1 <= toInteger (maxBound :: Int)
 81 |                            in  lift $ last $ takeWhile p [0 :: Int ..])
 82 | {-# inline anyAsciiDecimalInteger# #-}
 83 | 
 84 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'.
 85 | --   May overflow.
 86 | anyAsciiDecimalIntOverflow_# :: Int# -> Addr# -> Addr# -> (# Int#, Addr# #)
 87 | anyAsciiDecimalIntOverflow_# acc eob s = case eqAddr# s eob of
 88 |   1# -> (# acc, s #)
 89 |   _  -> case indexWord8OffAddr# s 0# of
 90 | #if MIN_VERSION_base(4,16,0)
 91 |     w | 1# <- leWord8# (wordToWord8# 0x30##) w, 1# <- leWord8# w (wordToWord8# 0x39##) ->
 92 |       anyAsciiDecimalIntOverflow_# (mul10# acc +# (word2Int# (word8ToWord# w) -# 0x30#)) eob (plusAddr# s 1#)
 93 | #else
 94 |     w | 1# <- leWord# 0x30## w, 1# <- leWord# w 0x39## ->
 95 |       anyAsciiDecimalIntOverflow_# (mul10# acc +# (word2Int# w -# 0x30#)) eob (plusAddr# s 1#)
 96 | #endif
 97 |     _ -> (# acc, s #)
 98 | 
 99 | --------------------------------------------------------------------------------
100 | 
101 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
102 | --   'Word'.
103 | --   Fails on overflow.
104 | anyAsciiHexWord# :: Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #)
105 | anyAsciiHexWord# eob s = case anyAsciiHexWord_# 0## eob s of
106 |     (# | (# n, s' #) #) | 0# <- eqAddr# s s'
107 |                         -> (# | (# n, s' #) #)
108 |     _                   -> (# (# #) | #)
109 | {-# inline anyAsciiHexWord# #-}
110 | 
111 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
112 | --   positive 'Int'.
113 | --   Fails on overflow.
114 | anyAsciiHexInt# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #)
115 | anyAsciiHexInt# eob s = case anyAsciiHexWord_# 0## eob s of
116 |     (# | (# n, s' #) #) | 0# <- eqAddr# s s'
117 |                         , 1# <- leWord# n (int2Word# (unI# maxBound))
118 |                         -> (# | (# word2Int# n, s' #) #)
119 | 
120 |                         | otherwise
121 |                         -> (# (##) | #)
122 |     (# (##) | #)        -> (# (##) | #)
123 | {-# inline anyAsciiHexInt# #-}
124 | 
125 | anyAsciiHexWord_# :: Word# -> Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #)
126 | anyAsciiHexWord_# acc eob s = case eqAddr# s eob of
127 |   1# -> (# | (# acc, s #) #)
128 |   _  -> case indexWord8OffAddr# s 0# of
129 | #if MIN_VERSION_base(4,16,0)
130 |     w | 1# <- leWord8# (wordToWord8# 0x30##) w
131 |       , 1# <- leWord8# w (wordToWord8# 0x39##)
132 |       -> case timesWord2# acc 16## of
133 |           (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x30##) of
134 | #else
135 |     w | 1# <- leWord# 0x30## w
136 |       , 1# <- leWord# w 0x39##
137 |       -> case timesWord2# acc 16## of
138 |           (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x30##) of
139 | #endif
140 |             (# q, 0# #) -> anyAsciiHexWord_# q eob (s `plusAddr#` 1#)
141 |             _           -> (# (##) | #)
142 |           _             -> (# (##) | #)
143 | #if MIN_VERSION_base(4,16,0)
144 |       | 1# <- leWord8# (wordToWord8# 0x41##) w
145 |       , 1# <- leWord8# w (wordToWord8# 0x46##)
146 |       -> case timesWord2# acc 16## of
147 |           (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x37##) of
148 | #else
149 |       | 1# <- leWord# 0x41## w
150 |       , 1# <- leWord# w 0x46##
151 |       -> case timesWord2# acc 16## of
152 |           (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x37##) of
153 | #endif
154 |             (# q, 0# #) -> anyAsciiHexWord_# q eob (s `plusAddr#` 1#)
155 |             _           -> (# (##) | #)
156 |           _             -> (# (##) | #)
157 | #if MIN_VERSION_base(4,16,0)
158 |       | 1# <- leWord8# (wordToWord8# 0x61##) w
159 |       , 1# <- leWord8# w (wordToWord8# 0x66##)
160 |       -> case timesWord2# acc 16## of
161 | 
162 |           (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x57##) of
163 | #else
164 |       | 1# <- leWord# 0x61## w
165 |       , 1# <- leWord# w 0x66##
166 |       -> case timesWord2# acc 16## of
167 | 
168 |           (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x57##) of
169 | #endif
170 |             (# q, 0# #) -> anyAsciiHexWord_# q eob (s `plusAddr#` 1#)
171 |             _           -> (# (##) | #)
172 |           _             -> (# (##) | #)
173 |     _ -> (# | (# acc, s #) #)
174 | 
175 | --------------------------------------------------------------------------------
176 | -- Zigzag encoding
177 | -- See: https://hackage.haskell.org/package/zigzag-0.0.1.0/docs/src/Data.Word.Zigzag.html
178 | 
179 | fromZigzagNative :: Word -> Int
180 | fromZigzagNative (W# w#) = I# (fromZigzagNative# w#)
181 | {-# inline fromZigzagNative #-}
182 | 
183 | -- GHC should optimize to this, but to be sure, here it is
184 | fromZigzagNative# :: Word# -> Int#
185 | fromZigzagNative# w# =
186 |     word2Int# ((w# `uncheckedShiftRL#` 1#) `xor#` (not# (w# `and#` 1##) `plusWord#` 1##))
187 | {-# inline fromZigzagNative# #-}
188 | 
189 | toZigzagNative :: Int -> Word
190 | toZigzagNative (I# i#) = W# (toZigzagNative# i#)
191 | {-# inline toZigzagNative #-}
192 | 
193 | -- GHC should optimize to this, but to be sure, here it is
194 | toZigzagNative# :: Int# -> Word#
195 | toZigzagNative# i# = toZigzagNative'# (int2Word# i#)
196 | {-# inline toZigzagNative# #-}
197 | 
198 | -- GHC should optimize to this, but to be sure, here it is
199 | toZigzagNative'# :: Word# -> Word#
200 | toZigzagNative'# w# =
201 |     (w# `uncheckedShiftL#` 1#) `xor#` (w# `uncheckedShiftRL#` 63#)
202 | {-# inline toZigzagNative'# #-}
203 | 
204 | --------------------------------------------------------------------------------
205 | 
206 | -- | protobuf style (LE, redundant, on continues)
207 | anyVarintProtobuf# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #)
208 | 
209 | #if MIN_VERSION_base(4,16,0)
210 | 
211 | anyVarintProtobuf# end# = go 0# 0#
212 |   where
213 |     word8ToInt# :: Word8# -> Int#
214 |     word8ToInt# w8# = word2Int# (word8ToWord# w8#)
215 |     {-# inline word8ToInt# #-}
216 |     go :: Int# -> Int# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #)
217 |     go i# n# s# = case eqAddr# s# end# of
218 |       1# -> (# (##) | #)
219 |       _  ->
220 |         let w8# = indexWord8OffAddr# s# 0#
221 |             w8'# = word8ToInt# (w8# `andWord8#` (wordToWord8# 0b01111111##))
222 |             i'# = i# `orI#` (w8'# `uncheckedIShiftL#` n#)
223 |             s'# = s# `plusAddr#` 1#
224 |             n'# = n# +# 7#
225 |         in  case w8# `geWord8#` wordToWord8# 0b10000000## of
226 |               1# -> go i'# n'# s'#
227 |               _  -> (# | (# i'#, s'#, n'# #) #)
228 | 
229 | #else
230 | 
231 | anyVarintProtobuf# end# = go 0# 0#
232 |   where
233 |     go :: Int# -> Int# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #)
234 |     go i# n# s# = case eqAddr# s# end# of
235 |       1# -> (# (##) | #)
236 |       _  ->
237 |         let w8# = indexWord8OffAddr# s# 0#
238 |             w8'# = word2Int# (w8# `and#` 0b01111111##)
239 |             i'# = i# `orI#` (w8'# `uncheckedIShiftL#` n#)
240 |             s'# = s# `plusAddr#` 1#
241 |             n'# = n# +# 7#
242 |         in  case w8# `geWord#` 0b10000000## of
243 |               1# -> go i'# n'# s'#
244 |               _  -> (# | (# i'#, s'#, n'# #) #)
245 | 
246 | #endif
247 | 
248 | --------------------------------------------------------------------------------
249 | 
250 | unI# :: Int -> Int#
251 | unI# (I# i) = i
252 | {-# inline unI# #-}
253 | 
254 | mul10# :: Int# -> Int#
255 | mul10# n = uncheckedIShiftL# n 3# +# uncheckedIShiftL# n 1#
256 | {-# inline mul10# #-}
257 | 


--------------------------------------------------------------------------------
/src/FlatParse/Basic/Text.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE UnboxedTuples #-}
  2 | 
  3 | -- | Parsers for textual data (UTF-8, ASCII).
  4 | 
  5 | module FlatParse.Basic.Text
  6 |   (
  7 |   -- * UTF-8
  8 |     char, string
  9 |   , anyChar, skipAnyChar
 10 |   , satisfy, skipSatisfy
 11 |   , fusedSatisfy, skipFusedSatisfy
 12 |   , takeLine
 13 |   , takeRestString
 14 | 
 15 |   -- * ASCII
 16 |   , anyAsciiChar, skipAnyAsciiChar
 17 |   , satisfyAscii, skipSatisfyAscii
 18 | 
 19 |   -- ** ASCII-encoded numbers
 20 |   , anyAsciiDecimalWord
 21 |   , anyAsciiDecimalInt
 22 |   , anyAsciiDecimalInteger
 23 |   , anyAsciiHexWord
 24 |   , anyAsciiHexInt
 25 | 
 26 |   -- * Debugging parsers
 27 |   , traceLine
 28 |   , traceRest
 29 |   ) where
 30 | 
 31 | import FlatParse.Basic.Parser
 32 | import FlatParse.Basic.Base ( withEnsure1, lookahead, eof, branch )
 33 | import FlatParse.Basic.Bytes ( bytes )
 34 | 
 35 | import FlatParse.Common.GHCExts
 36 | 
 37 | import Language.Haskell.TH
 38 | import qualified FlatParse.Common.Numbers as Common
 39 | import qualified FlatParse.Common.Assorted as Common
 40 | 
 41 | -- | Parse any single Unicode character encoded using UTF-8 as a 'Char'.
 42 | anyChar :: ParserT st e Char
 43 | anyChar = ParserT \fp eob buf st -> case eqAddr# eob buf of
 44 |   1# -> Fail# st
 45 |   _  -> case Common.derefChar8# buf of
 46 |     c1 -> case c1 `leChar#` '\x7F'# of
 47 |       1# -> OK# st (C# c1) (plusAddr# buf 1#)
 48 |       _  -> case eqAddr# eob (plusAddr# buf 1#) of
 49 |         1# -> Fail# st
 50 |         _ -> case indexCharOffAddr# buf 1# of
 51 |           c2 -> case c1 `leChar#` '\xDF'# of
 52 |             1# ->
 53 |               let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#`
 54 |                           (ord# c2 -# 0x80#)
 55 |               in OK# st (C# (chr# resc)) (plusAddr# buf 2#)
 56 |             _ -> case eqAddr# eob (plusAddr# buf 2#) of
 57 |               1# -> Fail# st
 58 |               _  -> case indexCharOffAddr# buf 2# of
 59 |                 c3 -> case c1 `leChar#` '\xEF'# of
 60 |                   1# ->
 61 |                     let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#`
 62 |                                ((ord# c2 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
 63 |                                 (ord# c3 -# 0x80#)
 64 |                     in OK# st (C# (chr# resc)) (plusAddr# buf 3#)
 65 |                   _ -> case eqAddr# eob (plusAddr# buf 3#) of
 66 |                     1# -> Fail# st
 67 |                     _  -> case indexCharOffAddr# buf 3# of
 68 |                       c4 ->
 69 |                         let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#`
 70 |                                    ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#`
 71 |                                    ((ord# c3 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
 72 |                                     (ord# c4 -# 0x80#)
 73 |                         in OK# st (C# (chr# resc)) (plusAddr# buf 4#)
 74 | {-# inline anyChar #-}
 75 | 
 76 | -- | Skip any single Unicode character encoded using UTF-8.
 77 | skipAnyChar :: ParserT st e ()
 78 | skipAnyChar = ParserT \fp eob buf st -> case eqAddr# eob buf of
 79 |   1# -> Fail# st
 80 |   _  -> case Common.derefChar8# buf of
 81 |     c1 -> case c1 `leChar#` '\x7F'# of
 82 |       1# -> OK# st () (plusAddr# buf 1#)
 83 |       _  ->
 84 |         let buf' =
 85 |               case c1 `leChar#` '\xDF'# of
 86 |                 1# -> plusAddr# buf 2#
 87 |                 _  -> case c1 `leChar#` '\xEF'# of
 88 |                     1# -> plusAddr# buf 3#
 89 |                     _ ->  plusAddr# buf 4#
 90 |         in case leAddr# buf' eob of
 91 |              1# -> OK#   st () buf'
 92 |              _  -> Fail# st
 93 | {-# inline skipAnyChar #-}
 94 | 
 95 | withSatisfy
 96 |     :: (Char -> Bool) -> (Char -> ParserT st e r) -> ParserT st e r
 97 | withSatisfy f p = ParserT \fp eob s st ->
 98 |     case runParserT# anyChar fp eob s st of
 99 |       -- This is OK# unfolded, to silence incomplete pattern warnings
100 |       -- in GHC <= 8.8.4.
101 |       (# st, (# (# c, s #) | | #) #) | f c -> runParserT# (p c) fp eob s st
102 | 
103 |       (# st, _ #)                          -> Fail# st
104 | {-# inline withSatisfy #-}
105 | 
106 | -- | Parse a UTF-8 'Char' for which a predicate holds.
107 | satisfy :: (Char -> Bool) -> ParserT st e Char
108 | satisfy f = withSatisfy f pure
109 | {-# inline satisfy #-}
110 | 
111 | -- | Skip a UTF-8 `Char` for which a predicate holds.
112 | skipSatisfy :: (Char -> Bool) -> ParserT st e ()
113 | skipSatisfy f = withSatisfy f (\_ -> pure ())
114 | {-# inline skipSatisfy #-}
115 | 
116 | withSatisfyAscii :: (Char -> Bool) -> (Char -> ParserT st e r) -> ParserT st e r
117 | withSatisfyAscii f p = withEnsure1 $ ParserT \fp eob s st ->
118 |     case Common.derefChar8# s of
119 |       c1 | f (C# c1) -> runParserT# (p (C# c1)) fp eob (plusAddr# s 1#) st
120 |          | otherwise -> Fail# st
121 | {-# inline withSatisfyAscii #-}
122 | 
123 | -- | Parse an ASCII `Char` for which a predicate holds.
124 | --
125 | -- Assumption: the predicate must only return 'True' for ASCII-range characters.
126 | -- Otherwise this function might read a 128-255 range byte, thereby breaking
127 | -- UTF-8 decoding.
128 | satisfyAscii :: (Char -> Bool) -> ParserT st e Char
129 | satisfyAscii f = withSatisfyAscii f pure
130 | {-# inline satisfyAscii #-}
131 | 
132 | -- | Skip an ASCII `Char` for which a predicate holds. Assumption: the predicate
133 | --   must only return `True` for ASCII-range characters.
134 | skipSatisfyAscii :: (Char -> Bool) -> ParserT st e ()
135 | skipSatisfyAscii f = withSatisfyAscii f (\_ -> pure ())
136 | {-# inline skipSatisfyAscii #-}
137 | 
138 | -- | This is a variant of `satisfy` which allows more optimization. We can pick four testing
139 | --   functions for the four cases for the possible number of bytes in the UTF-8 character. So in
140 | --   @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with
141 | --   @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups.
142 | --
143 | --   For example, if we want to accept any letter, the naive solution would be to use
144 | --   `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We
145 | --   can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the
146 | --   `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of
147 | --   all cases without accessing the character table.
148 | fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st e Char
149 | fusedSatisfy f1 f2 f3 f4 = ParserT \fp eob buf st ->
150 |     case eqAddr# eob buf of
151 |       1# -> Fail# st
152 |       _  -> case Common.derefChar8# buf of
153 |         c1 -> case c1 `leChar#` '\x7F'# of
154 |           1# | f1 (C# c1) -> OK#   st (C# c1) (plusAddr# buf 1#)
155 |              | otherwise  -> Fail# st
156 |           _  -> case eqAddr# eob (plusAddr# buf 1#) of
157 |             1# -> Fail# st
158 |             _ -> case indexCharOffAddr# buf 1# of
159 |               c2 -> case c1 `leChar#` '\xDF'# of
160 |                 1# ->
161 |                   let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#`
162 |                                        (ord# c2 -# 0x80#)))
163 |                   in case f2 resc of
164 |                        True -> OK#   st resc (plusAddr# buf 2#)
165 |                        _    -> Fail# st
166 |                 _ -> case eqAddr# eob (plusAddr# buf 2#) of
167 |                   1# -> Fail# st
168 |                   _  -> case indexCharOffAddr# buf 2# of
169 |                     c3 -> case c1 `leChar#` '\xEF'# of
170 |                       1# ->
171 |                         let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#`
172 |                                              ((ord# c2 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
173 |                                              (ord# c3 -# 0x80#)))
174 |                         in case f3 resc of
175 |                              True -> OK#   st resc (plusAddr# buf 3#)
176 |                              _    -> Fail# st
177 |                       _ -> case eqAddr# eob (plusAddr# buf 3#) of
178 |                         1# -> Fail# st
179 |                         _  -> case indexCharOffAddr# buf 3# of
180 |                           c4 ->
181 |                             let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#`
182 |                                                  ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#`
183 |                                                  ((ord# c3 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
184 |                                                   (ord# c4 -# 0x80#)))
185 |                             in case f4 resc of
186 |                                  True -> OK#   st resc (plusAddr# buf 4#)
187 |                                  _    -> Fail# st
188 | {-# inline fusedSatisfy #-}
189 | 
190 | -- | Skipping variant of `fusedSatisfy`.
191 | skipFusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e ()
192 | skipFusedSatisfy f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4
193 | {-# inline skipFusedSatisfy #-}
194 | 
195 | --------------------------------------------------------------------------------
196 | 
197 | -- | Parse a non-empty ASCII decimal digit sequence as a 'Word'.
198 | --   Fails on overflow.
199 | anyAsciiDecimalWord :: ParserT st e Word
200 | anyAsciiDecimalWord = ParserT \fp eob s st ->
201 |     case Common.anyAsciiDecimalWord# eob s of
202 |       (# | (# w, s' #) #) -> OK#   st (W# w) s'
203 |       (# (##) | #)        -> Fail# st
204 | {-# inline anyAsciiDecimalWord #-}
205 | 
206 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'.
207 | --   Fails on overflow.
208 | anyAsciiDecimalInt :: ParserT st e Int
209 | anyAsciiDecimalInt = ParserT \fp eob s st ->
210 |     case Common.anyAsciiDecimalInt# eob s of
211 |       (# | (# n, s' #) #) -> OK#   st (I# n) s'
212 |       (# (##) | #)        -> Fail# st
213 | {-# inline anyAsciiDecimalInt #-}
214 | 
215 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Integer'.
216 | anyAsciiDecimalInteger :: ParserT st e Integer
217 | anyAsciiDecimalInteger = ParserT \fp eob s st ->
218 |     case Common.anyAsciiDecimalInteger# fp eob s of
219 |       (# | (# i, s' #) #) -> OK#   st i s'
220 |       (# (##) | #)        -> Fail# st
221 | {-# inline anyAsciiDecimalInteger #-}
222 | 
223 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
224 | --   'Word'.
225 | --   Fails on overflow.
226 | anyAsciiHexWord :: ParserT st e Word
227 | anyAsciiHexWord = ParserT \fp eob s st ->
228 |     case Common.anyAsciiHexWord# eob s of
229 |       (# | (# w, s' #) #) -> OK#   st (W# w) s'
230 |       (# (##) | #)        -> Fail# st
231 | {-# inline anyAsciiHexWord #-}
232 | 
233 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
234 | --   positive 'Int'.
235 | --   Fails on overflow.
236 | anyAsciiHexInt :: ParserT st e Int
237 | anyAsciiHexInt = ParserT \fp eob s st ->
238 |     case Common.anyAsciiHexInt# eob s of
239 |       (# | (# n, s' #) #) -> OK#   st (I# n) s'
240 |       (# (##) | #)        -> Fail# st
241 | {-# inline anyAsciiHexInt #-}
242 | 
243 | --------------------------------------------------------------------------------
244 | 
245 | -- | Parse any single ASCII character (a single byte) as a 'Char'.
246 | --
247 | -- More efficient than 'anyChar' for ASCII-only input.
248 | anyAsciiChar :: ParserT st e Char
249 | anyAsciiChar = withEnsure1 $ ParserT \fp eob buf st ->
250 |     case Common.derefChar8# buf of
251 |       c1 -> case c1 `leChar#` '\x7F'# of
252 |               1# -> OK#   st (C# c1) (plusAddr# buf 1#)
253 |               _  -> Fail# st
254 | {-# inline anyAsciiChar #-}
255 | 
256 | -- | Skip any single ASCII character (a single byte).
257 | --
258 | -- More efficient than 'skipAnyChar' for ASCII-only input.
259 | skipAnyAsciiChar :: ParserT st e ()
260 | skipAnyAsciiChar = () <$ anyAsciiChar
261 | {-# inline skipAnyAsciiChar #-}
262 | 
263 | --------------------------------------------------------------------------------
264 | 
265 | -- | Parse a UTF-8 character literal. This is a template function, you can use it as
266 | --   @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@.
267 | char :: Char -> Q Exp
268 | char c = string [c]
269 | 
270 | -- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@,
271 | --   for example, and the splice has type @Parser e ()@.
272 | string :: String -> Q Exp
273 | string str = bytes (Common.strToBytes str)
274 | 
275 | --------------------------------------------------------------------------------
276 | 
277 | -- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding,
278 | --   throws an error if the encoding is invalid.
279 | takeLine :: ParserT st e String
280 | takeLine = branch eof (pure "") do
281 |   c <- anyChar
282 |   case c of
283 |     '\n' -> pure ""
284 |     _    -> (c:) <$> takeLine
285 | 
286 | -- | Parse the rest of the current line as a `String`, but restore the parsing state.
287 | --   Assumes UTF-8 encoding. This can be used for debugging.
288 | traceLine :: ParserT st e String
289 | traceLine = lookahead takeLine
290 | 
291 | -- | Take the rest of the input as a `String`. Assumes UTF-8 encoding.
292 | takeRestString :: ParserT st e String
293 | takeRestString = branch eof (pure "") do
294 |   c <- anyChar
295 |   cs <- takeRestString
296 |   pure (c:cs)
297 | 
298 | -- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding.
299 | --   This can be used for debugging.
300 | traceRest :: ParserT st e String
301 | traceRest = lookahead takeRestString
302 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful/Text.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE UnboxedTuples #-}
  2 | 
  3 | -- | Parsers and textual data (UTF-8, ASCII).
  4 | 
  5 | module FlatParse.Stateful.Text
  6 |   (
  7 |   -- * UTF-8
  8 |     char, string
  9 |   , anyChar, skipAnyChar
 10 |   , satisfy, skipSatisfy
 11 |   , fusedSatisfy, skipFusedSatisfy
 12 |   , takeLine
 13 |   , takeRestString
 14 | 
 15 |   -- * ASCII
 16 |   , anyAsciiChar, skipAnyAsciiChar
 17 |   , satisfyAscii, skipSatisfyAscii
 18 | 
 19 |   -- ** ASCII-encoded numbers
 20 |   , anyAsciiDecimalWord
 21 |   , anyAsciiDecimalInt
 22 |   , anyAsciiDecimalInteger
 23 |   , anyAsciiHexWord
 24 |   , anyAsciiHexInt
 25 | 
 26 |   -- * Debugging parsers
 27 |   , traceLine
 28 |   , traceRest
 29 |   ) where
 30 | 
 31 | import FlatParse.Stateful.Parser
 32 | import FlatParse.Stateful.Base ( withEnsure1, lookahead, eof, branch )
 33 | import FlatParse.Stateful.Bytes ( bytes )
 34 | 
 35 | import FlatParse.Common.GHCExts
 36 | 
 37 | import Language.Haskell.TH
 38 | import qualified FlatParse.Common.Numbers as Common
 39 | import qualified FlatParse.Common.Assorted as Common
 40 | 
 41 | -- | Parse any single Unicode character encoded using UTF-8 as a 'Char'.
 42 | anyChar :: ParserT st r e Char
 43 | anyChar = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of
 44 |   1# -> Fail# st
 45 |   _  -> case Common.derefChar8# buf of
 46 |     c1 -> case c1 `leChar#` '\x7F'# of
 47 |       1# -> OK# st (C# c1) (plusAddr# buf 1#) n
 48 |       _  -> case eqAddr# eob (plusAddr# buf 1#) of
 49 |         1# -> Fail# st
 50 |         _ -> case indexCharOffAddr# buf 1# of
 51 |           c2 -> case c1 `leChar#` '\xDF'# of
 52 |             1# ->
 53 |               let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#`
 54 |                           (ord# c2 -# 0x80#)
 55 |               in OK# st (C# (chr# resc)) (plusAddr# buf 2#) n
 56 |             _ -> case eqAddr# eob (plusAddr# buf 2#) of
 57 |               1# -> Fail# st
 58 |               _  -> case indexCharOffAddr# buf 2# of
 59 |                 c3 -> case c1 `leChar#` '\xEF'# of
 60 |                   1# ->
 61 |                     let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#`
 62 |                                ((ord# c2 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
 63 |                                 (ord# c3 -# 0x80#)
 64 |                     in OK# st (C# (chr# resc)) (plusAddr# buf 3#) n
 65 |                   _ -> case eqAddr# eob (plusAddr# buf 3#) of
 66 |                     1# -> Fail# st
 67 |                     _  -> case indexCharOffAddr# buf 3# of
 68 |                       c4 ->
 69 |                         let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#`
 70 |                                    ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#`
 71 |                                    ((ord# c3 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
 72 |                                     (ord# c4 -# 0x80#)
 73 |                         in OK# st (C# (chr# resc)) (plusAddr# buf 4#) n
 74 | {-# inline anyChar #-}
 75 | 
 76 | -- | Skip any single Unicode character encoded using UTF-8.
 77 | skipAnyChar :: ParserT st r e ()
 78 | skipAnyChar = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of
 79 |   1# -> Fail# st
 80 |   _  -> case Common.derefChar8# buf of
 81 |     c1 -> case c1 `leChar#` '\x7F'# of
 82 |       1# -> OK# st () (plusAddr# buf 1#) n
 83 |       _  ->
 84 |         let buf' =
 85 |               case c1 `leChar#` '\xDF'# of
 86 |                 1# -> plusAddr# buf 2#
 87 |                 _  -> case c1 `leChar#` '\xEF'# of
 88 |                     1# -> plusAddr# buf 3#
 89 |                     _ ->  plusAddr# buf 4#
 90 |         in case leAddr# buf' eob of
 91 |              1# -> OK#   st () buf' n
 92 |              _  -> Fail# st
 93 | {-# inline skipAnyChar #-}
 94 | 
 95 | withSatisfy
 96 |     :: (Char -> Bool) -> (Char -> ParserT st r e ret) -> ParserT st r e ret
 97 | withSatisfy f p = ParserT \fp !r eob s n st ->
 98 |     case runParserT# anyChar fp r eob s n st of
 99 |       -- This is OK# unfolded, to silence incomplete pattern warnings
100 |       -- in GHC <= 8.8.4.
101 |       (# st, (# (# c, s, n #) | | #) #) | f c -> runParserT# (p c) fp r eob s n st
102 |       (# st, _ #)                             -> Fail# st
103 | {-# inline withSatisfy #-}
104 | 
105 | -- | Parse a UTF-8 'Char' for which a predicate holds.
106 | satisfy :: (Char -> Bool) -> ParserT st r e Char
107 | satisfy f = withSatisfy f pure
108 | {-# inline satisfy #-}
109 | 
110 | -- | Skip a UTF-8 `Char` for which a predicate holds.
111 | skipSatisfy :: (Char -> Bool) -> ParserT st r e ()
112 | skipSatisfy f = withSatisfy f (\_ -> pure ())
113 | {-# inline skipSatisfy #-}
114 | 
115 | withSatisfyAscii
116 |     :: (Char -> Bool) -> (Char -> ParserT st r e ret) -> ParserT st r e ret
117 | withSatisfyAscii f p = withEnsure1 $ ParserT \fp !r eob s n st ->
118 |     case Common.derefChar8# s of
119 |       c1 | f (C# c1) -> runParserT# (p (C# c1)) fp r eob (plusAddr# s 1#) n st
120 |          | otherwise -> Fail# st
121 | {-#  inline withSatisfyAscii #-}
122 | 
123 | -- | Parse an ASCII 'Char' for which a predicate holds.
124 | --
125 | -- Assumption: the predicate must only return 'True' for ASCII-range characters.
126 | -- Otherwise this function might read a 128-255 range byte, thereby breaking
127 | -- UTF-8 decoding.
128 | satisfyAscii :: (Char -> Bool) -> ParserT st r e Char
129 | satisfyAscii f = withSatisfyAscii f pure
130 | {-#  inline satisfyAscii #-}
131 | 
132 | -- | Skip an ASCII `Char` for which a predicate holds. Assumption: the predicate
133 | --   must only return `True` for ASCII-range characters.
134 | skipSatisfyAscii :: (Char -> Bool) -> ParserT st r e ()
135 | skipSatisfyAscii f = withSatisfyAscii f (\_ -> pure ())
136 | {-#  inline skipSatisfyAscii #-}
137 | 
138 | -- | This is a variant of `satisfy` which allows more optimization. We can pick four testing
139 | --   functions for the four cases for the possible number of bytes in the UTF-8 character. So in
140 | --   @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with
141 | --   @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups.
142 | --
143 | --   For example, if we want to accept any letter, the naive solution would be to use
144 | --   `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We
145 | --   can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the
146 | --   `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of
147 | --   all cases without accessing the character table.
148 | fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st r e Char
149 | fusedSatisfy f1 f2 f3 f4 = ParserT \fp !r eob buf n st ->
150 |     case eqAddr# eob buf of
151 |       1# -> Fail# st
152 |       _  -> case Common.derefChar8# buf of
153 |         c1 -> case c1 `leChar#` '\x7F'# of
154 |           1# | f1 (C# c1) -> OK#   st (C# c1) (plusAddr# buf 1#) n
155 |              | otherwise  -> Fail# st
156 |           _  -> case eqAddr# eob (plusAddr# buf 1#) of
157 |             1# -> Fail# st
158 |             _ -> case indexCharOffAddr# buf 1# of
159 |               c2 -> case c1 `leChar#` '\xDF'# of
160 |                 1# ->
161 |                   let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#`
162 |                                        (ord# c2 -# 0x80#)))
163 |                   in case f2 resc of
164 |                        True -> OK#   st resc (plusAddr# buf 2#) n
165 |                        _    -> Fail# st
166 |                 _ -> case eqAddr# eob (plusAddr# buf 2#) of
167 |                   1# -> Fail# st
168 |                   _  -> case indexCharOffAddr# buf 2# of
169 |                     c3 -> case c1 `leChar#` '\xEF'# of
170 |                       1# ->
171 |                         let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#`
172 |                                              ((ord# c2 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
173 |                                              (ord# c3 -# 0x80#)))
174 |                         in case f3 resc of
175 |                              True -> OK#   st resc (plusAddr# buf 3#) n
176 |                              _    -> Fail# st
177 |                       _ -> case eqAddr# eob (plusAddr# buf 3#) of
178 |                         1# -> Fail# st
179 |                         _  -> case indexCharOffAddr# buf 3# of
180 |                           c4 ->
181 |                             let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#`
182 |                                                  ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#`
183 |                                                  ((ord# c3 -# 0x80#) `uncheckedIShiftL#`  6#) `orI#`
184 |                                                   (ord# c4 -# 0x80#)))
185 |                             in case f4 resc of
186 |                                  True -> OK#   st resc (plusAddr# buf 4#) n
187 |                                  _    -> Fail# st
188 | {-# inline fusedSatisfy #-}
189 | 
190 | -- | Skipping variant of `fusedSatisfy`.
191 | skipFusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st r e ()
192 | skipFusedSatisfy f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4
193 | {-# inline skipFusedSatisfy #-}
194 | 
195 | --------------------------------------------------------------------------------
196 | 
197 | -- | Parse a non-empty ASCII decimal digit sequence as a 'Word'.
198 | --   Fails on overflow.
199 | anyAsciiDecimalWord :: ParserT st r e Word
200 | anyAsciiDecimalWord = ParserT \fp !r eob s n st ->
201 |     case Common.anyAsciiDecimalWord# eob s of
202 |       (# | (# w, s' #) #) -> OK#   st (W# w) s' n
203 |       (# (##) | #)        -> Fail# st
204 | {-# inline anyAsciiDecimalWord #-}
205 | 
206 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'.
207 | --   Fails on overflow.
208 | anyAsciiDecimalInt :: ParserT st r e Int
209 | anyAsciiDecimalInt = ParserT \fp !r eob s n st ->
210 |     case Common.anyAsciiDecimalInt# eob s of
211 |       (# | (# i, s' #) #) -> OK#   st (I# i) s' n
212 |       (# (##) | #)        -> Fail# st
213 | {-# inline anyAsciiDecimalInt #-}
214 | 
215 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Integer'.
216 | anyAsciiDecimalInteger :: ParserT st r e Integer
217 | anyAsciiDecimalInteger = ParserT \fp !r eob s n st ->
218 |     case Common.anyAsciiDecimalInteger# fp eob s of
219 |       (# | (# i, s' #) #) -> OK#   st i s' n
220 |       (# (##) | #)        -> Fail# st
221 | {-# inline anyAsciiDecimalInteger #-}
222 | 
223 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
224 | --   'Word'.
225 | --   Fails on overflow.
226 | anyAsciiHexWord :: ParserT st r e Word
227 | anyAsciiHexWord = ParserT \fp !r eob s n st ->
228 |     case Common.anyAsciiHexWord# eob s of
229 |       (# | (# w, s' #) #) -> OK#   st (W# w) s' n
230 |       (# (##) | #)        -> Fail# st
231 | {-# inline anyAsciiHexWord #-}
232 | 
233 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a
234 | --   positive 'Int'.
235 | --   Fails on overflow.
236 | anyAsciiHexInt :: ParserT st r e Int
237 | anyAsciiHexInt = ParserT \fp !r eob s n st ->
238 |     case Common.anyAsciiHexInt# eob s of
239 |       (# | (# i, s' #) #) -> OK#   st (I# i) s' n
240 |       (# (##) | #)        -> Fail# st
241 | {-# inline anyAsciiHexInt #-}
242 | 
243 | --------------------------------------------------------------------------------
244 | 
245 | -- | Parse any single ASCII character (a single byte) as a 'Char'.
246 | --
247 | -- More efficient than 'anyChar' for ASCII-only input.
248 | anyAsciiChar :: ParserT st r e Char
249 | anyAsciiChar = withEnsure1 $ ParserT \fp !r eob buf n st ->
250 |     case Common.derefChar8# buf of
251 |       c1 -> case c1 `leChar#` '\x7F'# of
252 |               1# -> OK#   st (C# c1) (plusAddr# buf 1#) n
253 |               _  -> Fail# st
254 | {-# inline anyAsciiChar #-}
255 | 
256 | -- | Skip any single ASCII character (a single byte).
257 | --
258 | -- More efficient than 'anyChar_' for ASCII-only input.
259 | skipAnyAsciiChar :: ParserT st r e ()
260 | skipAnyAsciiChar = () <$ anyAsciiChar
261 | {-# inline skipAnyAsciiChar #-}
262 | 
263 | --------------------------------------------------------------------------------
264 | 
265 | -- | Parse a UTF-8 character literal. This is a template function, you can use it as
266 | --   @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@.
267 | char :: Char -> Q Exp
268 | char c = string [c]
269 | 
270 | -- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@,
271 | --   for example, and the splice has type @Parser e ()@.
272 | string :: String -> Q Exp
273 | string str = bytes (Common.strToBytes str)
274 | 
275 | --------------------------------------------------------------------------------
276 | 
277 | -- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding,
278 | --   throws an error if the encoding is invalid.
279 | takeLine :: ParserT st r e String
280 | takeLine = branch eof (pure "") do
281 |   c <- anyChar
282 |   case c of
283 |     '\n' -> pure ""
284 |     _    -> (c:) <$> takeLine
285 | 
286 | -- | Parse the rest of the current line as a `String`, but restore the parsing state.
287 | --   Assumes UTF-8 encoding. This can be used for debugging.
288 | traceLine :: ParserT st r e String
289 | traceLine = lookahead takeLine
290 | 
291 | -- | Take the rest of the input as a `String`. Assumes UTF-8 encoding.
292 | takeRestString :: ParserT st r e String
293 | takeRestString = branch eof (pure "") do
294 |   c <- anyChar
295 |   cs <- takeRestString
296 |   pure (c:cs)
297 | 
298 | -- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding.
299 | --   This can be used for debugging.
300 | traceRest :: ParserT st r e String
301 | traceRest = lookahead takeRestString
302 | 


--------------------------------------------------------------------------------
/src/FlatParse/Basic/Integers.hs:
--------------------------------------------------------------------------------
  1 | -- | Machine integer parsers.
  2 | 
  3 | module FlatParse.Basic.Integers
  4 |   (
  5 |   -- * Native byte order
  6 |     anyWord8, anyWord16, anyWord32, anyWord64
  7 |   , anyInt8,  anyInt16,  anyInt32,  anyInt64
  8 |   , anyWord, anyInt
  9 | 
 10 |   -- * Explicit endianness
 11 |   -- $explicit-endianness
 12 |   , anyWord16le, anyWord16be
 13 |   , anyWord32le, anyWord32be
 14 |   , anyWord64le, anyWord64be
 15 |   , anyInt16le,  anyInt16be
 16 |   , anyInt32le,  anyInt32be
 17 |   , anyInt64le,  anyInt64be
 18 | 
 19 |   -- * Value assertions
 20 |   , word8
 21 | 
 22 |   -- * CPS parsers
 23 |   , withAnyWord8, withAnyWord16, withAnyWord32, withAnyWord64
 24 |   , withAnyInt8,  withAnyInt16,  withAnyInt32,  withAnyInt64
 25 |   , withAnyWord, withAnyInt
 26 | 
 27 |   -- * Unsafe
 28 |   -- $unsafe
 29 |   , anyWord8Unsafe
 30 | 
 31 |   -- ** Value assertions
 32 |   , word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe
 33 | 
 34 |   -- * Helper definitions
 35 |   , withAnySized#, withAnySizedUnsafe#
 36 |   , sizedUnsafe#
 37 |   ) where
 38 | 
 39 | -- for WORDS_BIGENDIAN
 40 | #include "MachDeps.h"
 41 | 
 42 | import FlatParse.Basic.Parser
 43 | import FlatParse.Basic.Base ( withEnsure# )
 44 | 
 45 | import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 )
 46 | 
 47 | import FlatParse.Common.GHCExts
 48 | import GHC.Word
 49 | import GHC.Int
 50 | 
 51 | import Control.Applicative ( Alternative(empty) )
 52 | 
 53 | --------------------------------------------------------------------------------
 54 | 
 55 | -- | Helper for defining CPS parsers for types of a constant byte size (i.e.
 56 | --   machine integers).
 57 | --
 58 | -- Call this with an @indexXYZOffAddr@ primop (e.g.
 59 | -- 'GHC.Exts.indexWord8OffAddr') and the size in bytes of the type you're
 60 | -- parsing.
 61 | withAnySized#
 62 |     :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st e r) -> ParserT st e r
 63 | withAnySized# size# indexOffAddr p =
 64 |     withEnsure# size# (withAnySizedUnsafe# size# indexOffAddr p)
 65 | {-# inline withAnySized# #-}
 66 | 
 67 | -- | Unsafe helper for defining CPS parsers for types of a constant byte size
 68 | --   (i.e. machine integers).
 69 | --
 70 | -- Is really just syntactic sugar for applying the given parser and shifting the
 71 | -- buffer along.
 72 | --
 73 | -- The caller must guarantee that the input has enough bytes.
 74 | withAnySizedUnsafe#
 75 |     :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st e r) -> ParserT st e r
 76 | withAnySizedUnsafe# size# indexOffAddr p = ParserT \fp eob buf st ->
 77 |   let !a   = indexOffAddr buf 0#
 78 |       buf' = plusAddr# buf size#
 79 |   in  runParserT# (p a) fp eob buf' st
 80 | {-# inline withAnySizedUnsafe# #-}
 81 | 
 82 | -- | Parse any 'Word8' (CPS).
 83 | withAnyWord8 :: (Word8 -> ParserT st e r) -> ParserT st e r
 84 | withAnyWord8 p = ParserT \fp eob buf st -> case eqAddr# eob buf of
 85 |   1# -> Fail# st
 86 |   _  -> let w# = indexWord8OffAddr# buf 0#
 87 |         in  runParserT# (p (W8# w#)) fp eob (plusAddr# buf 1#) st
 88 | {-# inline withAnyWord8 #-}
 89 | 
 90 | -- | Parse any 'Word16' (native byte order) (CPS).
 91 | withAnyWord16 :: (Word16 -> ParserT st e r) -> ParserT st e r
 92 | withAnyWord16 = withAnySized# 2# (\a i -> W16# (indexWord16OffAddr# a i))
 93 | {-# inline withAnyWord16 #-}
 94 | 
 95 | -- | Parse any 'Word32' (native byte order) (CPS).
 96 | withAnyWord32 :: (Word32 -> ParserT st e r) -> ParserT st e r
 97 | withAnyWord32 = withAnySized# 4# (\a i -> W32# (indexWord32OffAddr# a i))
 98 | {-# inline withAnyWord32 #-}
 99 | 
100 | -- | Parse any 'Word64' (native byte order) (CPS).
101 | withAnyWord64 :: (Word64 -> ParserT st e r) -> ParserT st e r
102 | withAnyWord64 = withAnySized# 8# (\a i -> W64# (indexWord64OffAddr# a i))
103 | {-# inline withAnyWord64 #-}
104 | 
105 | -- | Parse any 'Int8' (CPS).
106 | withAnyInt8 :: (Int8 -> ParserT st e r) -> ParserT st e r
107 | withAnyInt8 p = ParserT \fp eob buf st -> case eqAddr# eob buf of
108 |   1# -> Fail# st
109 |   _  -> let i# = indexInt8OffAddr# buf 0#
110 |         in  runParserT# (p (I8# i#)) fp eob (plusAddr# buf 1#) st
111 | {-# inline withAnyInt8 #-}
112 | 
113 | -- | Parse any 'Int16' (native byte order) (CPS).
114 | withAnyInt16 :: (Int16 -> ParserT st e r) -> ParserT st e r
115 | withAnyInt16 = withAnySized# 2# (\a i -> I16# (indexInt16OffAddr# a i))
116 | {-# inline withAnyInt16 #-}
117 | 
118 | -- | Parse any 'Int32' (native byte order) (CPS).
119 | withAnyInt32 :: (Int32 -> ParserT st e r) -> ParserT st e r
120 | withAnyInt32 = withAnySized# 4# (\a i -> I32# (indexInt32OffAddr# a i))
121 | {-# inline withAnyInt32 #-}
122 | 
123 | -- | Parse any 'Int64' (native byte order) (CPS).
124 | withAnyInt64 :: (Int64 -> ParserT st e r) -> ParserT st e r
125 | withAnyInt64 = withAnySized# 8# (\a i -> I64# (indexInt64OffAddr# a i))
126 | {-# inline withAnyInt64 #-}
127 | 
128 | -- | Parse any 'Word' (native size) (CPS).
129 | withAnyWord :: (Word -> ParserT st e r) -> ParserT st e r
130 | withAnyWord p = ParserT \fp eob buf st -> case SIZEOF_HSWORD# <=# minusAddr# eob buf of
131 |   0# -> Fail# st
132 |   _  -> let w# = indexWordOffAddr# buf 0#
133 |         in  runParserT# (p (W# w#)) fp eob (plusAddr# buf SIZEOF_HSWORD#) st
134 | {-# inline withAnyWord #-}
135 | 
136 | -- | Parse any 'Int' (native size) (CPS).
137 | withAnyInt :: (Int -> ParserT st e r) -> ParserT st e r
138 | withAnyInt p = ParserT \fp eob buf st -> case SIZEOF_HSWORD# <=# minusAddr# eob buf of
139 |   0# -> Fail# st
140 |   _  -> let i# = indexIntOffAddr# buf 0#
141 |         in  runParserT# (p (I# i#)) fp eob (plusAddr# buf SIZEOF_HSWORD#) st
142 | {-# inline withAnyInt #-}
143 | 
144 | --------------------------------------------------------------------------------
145 | 
146 | -- | Parse any 'Word8'.
147 | anyWord8 :: ParserT st e Word8
148 | anyWord8 = withAnyWord8 pure
149 | {-# inline anyWord8 #-}
150 | 
151 | -- | Parse any 'Word16' (native byte order).
152 | anyWord16 :: ParserT st e Word16
153 | anyWord16 = withAnyWord16 pure
154 | {-# inline anyWord16 #-}
155 | 
156 | -- | Parse any 'Word32' (native byte order).
157 | anyWord32 :: ParserT st e Word32
158 | anyWord32 = withAnyWord32 pure
159 | {-# inline anyWord32 #-}
160 | 
161 | -- | Parse any 'Word64' (native byte order).
162 | anyWord64 :: ParserT st e Word64
163 | anyWord64 = withAnyWord64 pure
164 | {-# inline anyWord64 #-}
165 | 
166 | -- | Parse any 'Int8'.
167 | anyInt8 :: ParserT st e Int8
168 | anyInt8 = withAnyInt8 pure
169 | {-# inline anyInt8 #-}
170 | 
171 | -- | Parse any 'Int16' (native byte order).
172 | anyInt16 :: ParserT st e Int16
173 | anyInt16 = withAnyInt16 pure
174 | {-# inline anyInt16 #-}
175 | 
176 | -- | Parse any 'Int32' (native byte order).
177 | anyInt32 :: ParserT st e Int32
178 | anyInt32 = withAnyInt32 pure
179 | {-# inline anyInt32 #-}
180 | 
181 | -- | Parse any 'Int64' (native byte order).
182 | anyInt64 :: ParserT st e Int64
183 | anyInt64 = withAnyInt64 pure
184 | {-# inline anyInt64 #-}
185 | 
186 | -- | Parse any 'Word' (native size).
187 | anyWord :: ParserT st e Word
188 | anyWord = withAnyWord pure
189 | {-# inline anyWord #-}
190 | 
191 | -- | Parse any 'Int' (native size).
192 | anyInt :: ParserT st e Int
193 | anyInt = withAnyInt pure
194 | {-# inline anyInt #-}
195 | 
196 | --------------------------------------------------------------------------------
197 | 
198 | {- $explicit-endianness
199 | Native endianness parsers are used where possible. For non-native endianness
200 | parsers, we parse then use the corresponding @byteSwapX@ function. On x86, this
201 | is inlined as a single @BSWAP@ instruction.
202 | -}
203 | 
204 | -- | Parse any 'Word16' (little-endian).
205 | anyWord16le :: ParserT st e Word16
206 | #if defined(WORDS_BIGENDIAN)
207 | anyWord16le = withAnyWord16 (pure . byteSwap16)
208 | #else
209 | anyWord16le = anyWord16
210 | #endif
211 | {-# inline anyWord16le #-}
212 | 
213 | -- | Parse any 'Word16' (big-endian).
214 | anyWord16be :: ParserT st e Word16
215 | #if defined(WORDS_BIGENDIAN)
216 | anyWord16be = anyWord16
217 | #else
218 | anyWord16be = withAnyWord16 (pure . byteSwap16)
219 | #endif
220 | {-# inline anyWord16be #-}
221 | 
222 | -- | Parse any 'Word32' (little-endian).
223 | anyWord32le :: ParserT st e Word32
224 | #if defined(WORDS_BIGENDIAN)
225 | anyWord32le = withAnyWord32 (pure . byteSwap32)
226 | #else
227 | anyWord32le = anyWord32
228 | #endif
229 | {-# inline anyWord32le #-}
230 | 
231 | -- | Parse any 'Word32' (big-endian).
232 | anyWord32be :: ParserT st e Word32
233 | #if defined(WORDS_BIGENDIAN)
234 | anyWord32be = anyWord32
235 | #else
236 | anyWord32be = withAnyWord32 (pure . byteSwap32)
237 | #endif
238 | {-# inline anyWord32be #-}
239 | 
240 | -- | Parse any 'Word64' (little-endian).
241 | anyWord64le :: ParserT st e Word64
242 | #if defined(WORDS_BIGENDIAN)
243 | anyWord64le = withAnyWord64 (pure . byteSwap64)
244 | #else
245 | anyWord64le = anyWord64
246 | #endif
247 | {-# inline anyWord64le #-}
248 | 
249 | -- | Parse any 'Word64' (big-endian).
250 | anyWord64be :: ParserT st e Word64
251 | #if defined(WORDS_BIGENDIAN)
252 | anyWord64be = anyWord64
253 | #else
254 | anyWord64be = withAnyWord64 (pure . byteSwap64)
255 | #endif
256 | {-# inline anyWord64be #-}
257 | 
258 | -- | Parse any 'Int16' (little-endian).
259 | anyInt16le :: ParserT st e Int16
260 | #if defined(WORDS_BIGENDIAN)
261 | anyInt16le = withAnyWord16 (pure . word16ToInt16 . byteSwap16)
262 | #else
263 | anyInt16le = anyInt16
264 | #endif
265 | {-# inline anyInt16le #-}
266 | 
267 | -- | Parse any 'Int16' (big-endian).
268 | anyInt16be :: ParserT st e Int16
269 | #if defined(WORDS_BIGENDIAN)
270 | anyInt16be = anyInt16
271 | #else
272 | anyInt16be = withAnyWord16 (pure . word16ToInt16 . byteSwap16)
273 | #endif
274 | {-# inline anyInt16be #-}
275 | 
276 | -- | Parse any 'Int32' (little-endian).
277 | anyInt32le :: ParserT st e Int32
278 | #if defined(WORDS_BIGENDIAN)
279 | anyInt32le = withAnyWord32 (pure . word32ToInt32 . byteSwap32)
280 | #else
281 | anyInt32le = anyInt32
282 | #endif
283 | {-# inline anyInt32le #-}
284 | 
285 | -- | Parse any 'Int32' (big-endian).
286 | anyInt32be :: ParserT st e Int32
287 | #if defined(WORDS_BIGENDIAN)
288 | anyInt32be = anyInt32
289 | #else
290 | anyInt32be = withAnyWord32 (pure . word32ToInt32 . byteSwap32)
291 | #endif
292 | {-# inline anyInt32be #-}
293 | 
294 | -- | Parse any 'Int64' (little-endian).
295 | anyInt64le :: ParserT st e Int64
296 | #if defined(WORDS_BIGENDIAN)
297 | anyInt64le = withAnyWord64 (pure . word64ToInt64 . byteSwap64)
298 | #else
299 | anyInt64le = anyInt64
300 | #endif
301 | {-# inline anyInt64le #-}
302 | 
303 | -- | Parse any 'Int64' (big-endian).
304 | anyInt64be :: ParserT st e Int64
305 | #if defined(WORDS_BIGENDIAN)
306 | anyInt64be = anyInt64
307 | #else
308 | anyInt64be = withAnyWord64 (pure . word64ToInt64 . byteSwap64)
309 | #endif
310 | {-# inline anyInt64be #-}
311 | 
312 | --------------------------------------------------------------------------------
313 | 
314 | -- | Read the next 1 byte and assert its value as a 'Word8'.
315 | word8 :: Word8 -> ParserT st e ()
316 | word8 wExpected = ParserT \fp eob buf st -> case eqAddr# eob buf of
317 |   1# -> Fail# st
318 |   _  -> let w# = indexWord8OffAddr# buf 0#
319 |         in  if   W8# w# == wExpected
320 |             then OK# st () (plusAddr# buf 1#)
321 |             else Fail# st
322 | {-# inline word8 #-}
323 | 
324 | --------------------------------------------------------------------------------
325 | 
326 | {- $unsafe
327 | These unsafe parsers and helpers may be useful for efficient parsing in special
328 | situations e.g. you already know that the input has enough bytes. You should
329 | only use them if you can assert their necessary guarantees (see the individual
330 | function documentation).
331 | -}
332 | 
333 | -- | Unsafe helper for defining parsers for types of a constant byte size (i.e.
334 | --   machine integers) which assert the parsed value's... value.
335 | --
336 | -- Call this with an @indexXYZOffAddr@ primop (e.g.
337 | -- 'GHC.Exts.indexWord8OffAddr'), the size in bytes of the type you're parsing,
338 | -- and the expected value to test the parsed value against.
339 | --
340 | -- The caller must guarantee that the input has enough bytes.
341 | sizedUnsafe# :: Eq a => Int# -> (Addr# -> Int# -> a) -> a -> ParserT st e ()
342 | sizedUnsafe# size# indexOffAddr aExpected =
343 |     withAnySizedUnsafe# size# indexOffAddr go
344 |   where
345 |     go aParsed =
346 |         if   aParsed == aExpected
347 |         then pure ()
348 |         else empty
349 | {-# inline sizedUnsafe# #-}
350 | 
351 | -- | Unsafely read the next 1 byte and assert its value as a 'Word8'.
352 | --
353 | -- The caller must guarantee that the input has enough bytes.
354 | word8Unsafe :: Word8 -> ParserT st e ()
355 | word8Unsafe = sizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i))
356 | {-# inline word8Unsafe #-}
357 | 
358 | -- | Unsafely read the next 2 bytes and assert their value as a 'Word16'
359 | --   (native byte order).
360 | --
361 | -- The caller must guarantee that the input has enough bytes.
362 | word16Unsafe :: Word16 -> ParserT st e ()
363 | word16Unsafe = sizedUnsafe# 2# (\a i -> W16# (indexWord16OffAddr# a i))
364 | {-# inline word16Unsafe #-}
365 | 
366 | -- | Unsafely read the next 4 bytes and assert their value as a 'Word32'.
367 | --   (native byte order).
368 | --
369 | -- The caller must guarantee that the input has enough bytes.
370 | word32Unsafe :: Word32 -> ParserT st e ()
371 | word32Unsafe = sizedUnsafe# 4# (\a i -> W32# (indexWord32OffAddr# a i))
372 | {-# inline word32Unsafe #-}
373 | 
374 | -- | Unsafely read the next 8 bytes and assert their value as a 'Word64'.
375 | --   (native byte order).
376 | --
377 | -- The caller must guarantee that the input has enough bytes.
378 | word64Unsafe :: Word64 -> ParserT st e ()
379 | word64Unsafe = sizedUnsafe# 8# (\a i -> W64# (indexWord64OffAddr# a i))
380 | {-# inline word64Unsafe #-}
381 | 
382 | --------------------------------------------------------------------------------
383 | 
384 | -- | Unsafely parse any 'Word8', without asserting the input is non-empty.
385 | --
386 | -- The caller must guarantee that the input has enough bytes.
387 | anyWord8Unsafe :: ParserT st e Word8
388 | anyWord8Unsafe = withAnySizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) pure
389 | {-# inline anyWord8Unsafe #-}
390 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful/Integers.hs:
--------------------------------------------------------------------------------
  1 | -- | Machine integer parsers.
  2 | 
  3 | module FlatParse.Stateful.Integers
  4 |   (
  5 |   -- * Native byte order
  6 |     anyWord8, anyWord16, anyWord32, anyWord64
  7 |   , anyInt8,  anyInt16,  anyInt32,  anyInt64
  8 |   , anyWord, anyInt
  9 | 
 10 |   -- * Explicit endianness
 11 |   -- $explicit-endianness
 12 |   , anyWord16le, anyWord16be
 13 |   , anyWord32le, anyWord32be
 14 |   , anyWord64le, anyWord64be
 15 |   , anyInt16le,  anyInt16be
 16 |   , anyInt32le,  anyInt32be
 17 |   , anyInt64le,  anyInt64be
 18 | 
 19 |   -- * Value assertions
 20 |   , word8
 21 | 
 22 |   -- * CPS parsers
 23 |   , withAnyWord8, withAnyWord16, withAnyWord32, withAnyWord64
 24 |   , withAnyInt8,  withAnyInt16,  withAnyInt32,  withAnyInt64
 25 |   , withAnyWord, withAnyInt
 26 | 
 27 |   -- * Unsafe
 28 |   -- $unsafe
 29 |   , anyWord8Unsafe
 30 | 
 31 |   -- ** Value assertions
 32 |   , word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe
 33 | 
 34 |   -- * Helper definitions
 35 |   , withAnySized#, withAnySizedUnsafe#
 36 |   , sizedUnsafe#
 37 |   ) where
 38 | 
 39 | -- for WORDS_BIGENDIAN
 40 | #include "MachDeps.h"
 41 | 
 42 | import FlatParse.Stateful.Parser
 43 | import FlatParse.Stateful.Base ( withEnsure# )
 44 | 
 45 | import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 )
 46 | 
 47 | import FlatParse.Common.GHCExts
 48 | import GHC.Word
 49 | import GHC.Int
 50 | 
 51 | import Control.Applicative ( Alternative(empty) )
 52 | 
 53 | --------------------------------------------------------------------------------
 54 | 
 55 | -- | Helper for defining CPS parsers for types of a constant byte size (i.e.
 56 | --   machine integers).
 57 | --
 58 | -- Call this with an @indexXYZOffAddr@ primop (e.g.
 59 | -- 'GHC.Exts.indexWord8OffAddr') and the size in bytes of the type you're
 60 | -- parsing.
 61 | withAnySized#
 62 |     :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st r e ret)
 63 |     -> ParserT st r e ret
 64 | withAnySized# size# indexOffAddr p =
 65 |     withEnsure# size# (withAnySizedUnsafe# size# indexOffAddr p)
 66 | {-# inline withAnySized# #-}
 67 | 
 68 | -- | Unsafe helper for defining CPS parsers for types of a constant byte size
 69 | --   (i.e. machine integers).
 70 | --
 71 | -- Is really just syntactic sugar for applying the given parser and shifting the
 72 | -- buffer along.
 73 | --
 74 | -- The caller must guarantee that the input has enough bytes.
 75 | withAnySizedUnsafe#
 76 |     :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st r e ret)
 77 |     -> ParserT st r e ret
 78 | withAnySizedUnsafe# size# indexOffAddr p = ParserT \fp !r eob buf n st ->
 79 |   let a    = indexOffAddr buf 0#
 80 |       buf' = plusAddr# buf size#
 81 |   in  runParserT# (p a) fp r eob buf' n st
 82 | {-# inline withAnySizedUnsafe# #-}
 83 | 
 84 | -- | Parse any 'Word8' (CPS).
 85 | withAnyWord8 :: (Word8 -> ParserT st r e ret) -> ParserT st r e ret
 86 | withAnyWord8 p = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of
 87 |   1# -> Fail# st
 88 |   _  -> let w# = indexWord8OffAddr# buf 0#
 89 |         in  runParserT# (p (W8# w#)) fp r eob (plusAddr# buf 1#) n st
 90 | {-# inline withAnyWord8 #-}
 91 | 
 92 | -- | Parse any 'Word16' (native byte order) (CPS).
 93 | withAnyWord16 :: (Word16 -> ParserT st r e ret) -> ParserT st r e ret
 94 | withAnyWord16 = withAnySized# 2# (\a i -> W16# (indexWord16OffAddr# a i))
 95 | {-# inline withAnyWord16 #-}
 96 | 
 97 | -- | Parse any 'Word32' (native byte order) (CPS).
 98 | withAnyWord32 :: (Word32 -> ParserT st r e ret) -> ParserT st r e ret
 99 | withAnyWord32 = withAnySized# 4# (\a i -> W32# (indexWord32OffAddr# a i))
100 | {-# inline withAnyWord32 #-}
101 | 
102 | -- | Parse any 'Word64' (native byte order) (CPS).
103 | withAnyWord64 :: (Word64 -> ParserT st r e ret) -> ParserT st r e ret
104 | withAnyWord64 = withAnySized# 8# (\a i -> W64# (indexWord64OffAddr# a i))
105 | {-# inline withAnyWord64 #-}
106 | 
107 | -- | Parse any 'Int8' (CPS).
108 | withAnyInt8 :: (Int8 -> ParserT st r e ret) -> ParserT st r e ret
109 | withAnyInt8 p = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of
110 |   1# -> Fail# st
111 |   _  -> let i# = indexInt8OffAddr# buf 0#
112 |         in  runParserT# (p (I8# i#)) fp r eob (plusAddr# buf 1#) n st
113 | {-# inline withAnyInt8 #-}
114 | 
115 | -- | Parse any 'Int16' (native byte order) (CPS).
116 | withAnyInt16 :: (Int16 -> ParserT st r e ret) -> ParserT st r e ret
117 | withAnyInt16 = withAnySized# 2# (\a i -> I16# (indexInt16OffAddr# a i))
118 | {-# inline withAnyInt16 #-}
119 | 
120 | -- | Parse any 'Int32' (native byte order) (CPS).
121 | withAnyInt32 :: (Int32 -> ParserT st r e ret) -> ParserT st r e ret
122 | withAnyInt32 = withAnySized# 4# (\a i -> I32# (indexInt32OffAddr# a i))
123 | {-# inline withAnyInt32 #-}
124 | 
125 | -- | Parse any 'Int64' (native byte order) (CPS).
126 | withAnyInt64 :: (Int64 -> ParserT st r e ret) -> ParserT st r e ret
127 | withAnyInt64 = withAnySized# 8# (\a i -> I64# (indexInt64OffAddr# a i))
128 | {-# inline withAnyInt64 #-}
129 | 
130 | -- | Parse any 'Word' (native size) (CPS).
131 | withAnyWord :: (Word -> ParserT st r e ret) -> ParserT st r e ret
132 | withAnyWord p = ParserT \fp !r eob buf n st -> case 8# <=# minusAddr# eob buf of
133 |   0# -> Fail# st
134 |   _  -> let w# = indexWordOffAddr# buf 0#
135 |         in  runParserT# (p (W# w#)) fp r eob (plusAddr# buf SIZEOF_HSWORD#) n st
136 | {-# inline withAnyWord #-}
137 | 
138 | -- | Parse any 'Int' (native size) (CPS).
139 | withAnyInt :: (Int -> ParserT st r e ret) -> ParserT st r e ret
140 | withAnyInt p = ParserT \fp !r eob buf n st -> case 8# <=# minusAddr# eob buf of
141 |   0# -> Fail# st
142 |   _  -> let i# = indexIntOffAddr# buf 0#
143 |         in  runParserT# (p (I# i#)) fp r eob (plusAddr# buf SIZEOF_HSWORD#) n st
144 | {-# inline withAnyInt #-}
145 | 
146 | --------------------------------------------------------------------------------
147 | 
148 | -- | Parse any 'Word8'.
149 | anyWord8 :: ParserT st r e Word8
150 | anyWord8 = withAnyWord8 pure
151 | {-# inline anyWord8 #-}
152 | 
153 | -- | Parse any 'Word16' (native byte order).
154 | anyWord16 :: ParserT st r e Word16
155 | anyWord16 = withAnyWord16 pure
156 | {-# inline anyWord16 #-}
157 | 
158 | -- | Parse any 'Word32' (native byte order).
159 | anyWord32 :: ParserT st r e Word32
160 | anyWord32 = withAnyWord32 pure
161 | {-# inline anyWord32 #-}
162 | 
163 | -- | Parse any 'Word64' (native byte order).
164 | anyWord64 :: ParserT st r e Word64
165 | anyWord64 = withAnyWord64 pure
166 | {-# inline anyWord64 #-}
167 | 
168 | -- | Parse any 'Int8'.
169 | anyInt8 :: ParserT st r e Int8
170 | anyInt8 = withAnyInt8 pure
171 | {-# inline anyInt8 #-}
172 | 
173 | -- | Parse any 'Int16' (native byte order).
174 | anyInt16 :: ParserT st r e Int16
175 | anyInt16 = withAnyInt16 pure
176 | {-# inline anyInt16 #-}
177 | 
178 | -- | Parse any 'Int32' (native byte order).
179 | anyInt32 :: ParserT st r e Int32
180 | anyInt32 = withAnyInt32 pure
181 | {-# inline anyInt32 #-}
182 | 
183 | -- | Parse any 'Int64' (native byte order).
184 | anyInt64 :: ParserT st r e Int64
185 | anyInt64 = withAnyInt64 pure
186 | {-# inline anyInt64 #-}
187 | 
188 | -- | Parse any 'Word' (native size).
189 | anyWord :: ParserT st r e Word
190 | anyWord = withAnyWord pure
191 | {-# inline anyWord #-}
192 | 
193 | -- | Parse any 'Int' (native size).
194 | anyInt :: ParserT st r e Int
195 | anyInt = withAnyInt pure
196 | {-# inline anyInt #-}
197 | 
198 | --------------------------------------------------------------------------------
199 | 
200 | {- $explicit-endianness
201 | Native endianness parsers are used where possible. For non-native endianness
202 | parsers, we parse then use the corresponding @byteSwapX@ function. On x86, this
203 | is inlined as a single @BSWAP@ instruction.
204 | -}
205 | 
206 | -- | Parse any 'Word16' (little-endian).
207 | anyWord16le :: ParserT st r e Word16
208 | #if defined(WORDS_BIGENDIAN)
209 | anyWord16le = withAnyWord16 (pure . byteSwap16)
210 | #else
211 | anyWord16le = anyWord16
212 | #endif
213 | {-# inline anyWord16le #-}
214 | 
215 | -- | Parse any 'Word16' (big-endian).
216 | anyWord16be :: ParserT st r e Word16
217 | #if defined(WORDS_BIGENDIAN)
218 | anyWord16be = anyWord16
219 | #else
220 | anyWord16be = withAnyWord16 (pure . byteSwap16)
221 | #endif
222 | {-# inline anyWord16be #-}
223 | 
224 | -- | Parse any 'Word32' (little-endian).
225 | anyWord32le :: ParserT st r e Word32
226 | #if defined(WORDS_BIGENDIAN)
227 | anyWord32le = withAnyWord32 (pure . byteSwap32)
228 | #else
229 | anyWord32le = anyWord32
230 | #endif
231 | {-# inline anyWord32le #-}
232 | 
233 | -- | Parse any 'Word32' (big-endian).
234 | anyWord32be :: ParserT st r e Word32
235 | #if defined(WORDS_BIGENDIAN)
236 | anyWord32be = anyWord32
237 | #else
238 | anyWord32be = withAnyWord32 (pure . byteSwap32)
239 | #endif
240 | {-# inline anyWord32be #-}
241 | 
242 | -- | Parse any 'Word64' (little-endian).
243 | anyWord64le :: ParserT st r e Word64
244 | #if defined(WORDS_BIGENDIAN)
245 | anyWord64le = withAnyWord64 (pure . byteSwap64)
246 | #else
247 | anyWord64le = anyWord64
248 | #endif
249 | {-# inline anyWord64le #-}
250 | 
251 | -- | Parse any 'Word64' (big-endian).
252 | anyWord64be :: ParserT st r e Word64
253 | #if defined(WORDS_BIGENDIAN)
254 | anyWord64be = anyWord64
255 | #else
256 | anyWord64be = withAnyWord64 (pure . byteSwap64)
257 | #endif
258 | {-# inline anyWord64be #-}
259 | 
260 | -- | Parse any 'Int16' (little-endian).
261 | anyInt16le :: ParserT st r e Int16
262 | #if defined(WORDS_BIGENDIAN)
263 | anyInt16le = withAnyWord16 (pure . word16ToInt16 . byteSwap16)
264 | #else
265 | anyInt16le = anyInt16
266 | #endif
267 | {-# inline anyInt16le #-}
268 | 
269 | -- | Parse any 'Int16' (big-endian).
270 | anyInt16be :: ParserT st r e Int16
271 | #if defined(WORDS_BIGENDIAN)
272 | anyInt16be = anyInt16
273 | #else
274 | anyInt16be = withAnyWord16 (pure . word16ToInt16 . byteSwap16)
275 | #endif
276 | {-# inline anyInt16be #-}
277 | 
278 | -- | Parse any 'Int32' (little-endian).
279 | anyInt32le :: ParserT st r e Int32
280 | #if defined(WORDS_BIGENDIAN)
281 | anyInt32le = withAnyWord32 (pure . word32ToInt32 . byteSwap32)
282 | #else
283 | anyInt32le = anyInt32
284 | #endif
285 | {-# inline anyInt32le #-}
286 | 
287 | -- | Parse any 'Int32' (big-endian).
288 | anyInt32be :: ParserT st r e Int32
289 | #if defined(WORDS_BIGENDIAN)
290 | anyInt32be = anyInt32
291 | #else
292 | anyInt32be = withAnyWord32 (pure . word32ToInt32 . byteSwap32)
293 | #endif
294 | {-# inline anyInt32be #-}
295 | 
296 | -- | Parse any 'Int64' (little-endian).
297 | anyInt64le :: ParserT st r e Int64
298 | #if defined(WORDS_BIGENDIAN)
299 | anyInt64le = withAnyWord64 (pure . word64ToInt64 . byteSwap64)
300 | #else
301 | anyInt64le = anyInt64
302 | #endif
303 | {-# inline anyInt64le #-}
304 | 
305 | -- | Parse any 'Int64' (big-endian).
306 | anyInt64be :: ParserT st r e Int64
307 | #if defined(WORDS_BIGENDIAN)
308 | anyInt64be = anyInt64
309 | #else
310 | anyInt64be = withAnyWord64 (pure . word64ToInt64 . byteSwap64)
311 | #endif
312 | {-# inline anyInt64be #-}
313 | 
314 | --------------------------------------------------------------------------------
315 | 
316 | -- | Read the next 1 byte and assert its value as a 'Word8'.
317 | word8 :: Word8 -> ParserT st r e ()
318 | word8 wExpected = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of
319 |   1# -> Fail# st
320 |   _  -> let w# = indexWord8OffAddr# buf 0#
321 |         in  if   W8# w# == wExpected
322 |             then OK# st () (plusAddr# buf 1#) n
323 |             else Fail# st
324 | {-# inline word8 #-}
325 | 
326 | --------------------------------------------------------------------------------
327 | 
328 | {- $unsafe
329 | These unsafe parsers and helpers may be useful for efficient parsing in special
330 | situations e.g. you already know that the input has enough bytes. You should
331 | only use them if you can assert their necessary guarantees (see the individual
332 | function documentation).
333 | -}
334 | 
335 | -- | Unsafe helper for defining parsers for types of a constant byte size (i.e.
336 | --   machine integers) which assert the parsed value's... value.
337 | --
338 | -- Call this with an @indexXYZOffAddr@ primop (e.g.
339 | -- 'GHC.Exts.indexWord8OffAddr'), the size in bytes of the type you're parsing,
340 | -- and the expected value to test the parsed value against.
341 | --
342 | -- The caller must guarantee that the input has enough bytes.
343 | sizedUnsafe# :: Eq a => Int# -> (Addr# -> Int# -> a) -> a -> ParserT st r e ()
344 | sizedUnsafe# size# indexOffAddr aExpected =
345 |     withAnySizedUnsafe# size# indexOffAddr go
346 |   where
347 |     go aParsed =
348 |         if   aParsed == aExpected
349 |         then pure ()
350 |         else empty
351 | {-# inline sizedUnsafe# #-}
352 | 
353 | -- | Unsafely read the next 1 byte and assert its value as a 'Word8'.
354 | --
355 | -- The caller must guarantee that the input has enough bytes.
356 | word8Unsafe :: Word8 -> ParserT st r e ()
357 | word8Unsafe = sizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i))
358 | {-# inline word8Unsafe #-}
359 | 
360 | -- | Unsafely read the next 2 bytes and assert their value as a 'Word16'
361 | --   (native byte order).
362 | --
363 | -- The caller must guarantee that the input has enough bytes.
364 | word16Unsafe :: Word16 -> ParserT st r e ()
365 | word16Unsafe = sizedUnsafe# 2# (\a i -> W16# (indexWord16OffAddr# a i))
366 | {-# inline word16Unsafe #-}
367 | 
368 | -- | Unsafely read the next 4 bytes and assert their value as a 'Word32'.
369 | --   (native byte order).
370 | --
371 | -- The caller must guarantee that the input has enough bytes.
372 | word32Unsafe :: Word32 -> ParserT st r e ()
373 | word32Unsafe = sizedUnsafe# 4# (\a i -> W32# (indexWord32OffAddr# a i))
374 | {-# inline word32Unsafe #-}
375 | 
376 | -- | Unsafely read the next 8 bytes and assert their value as a 'Word64'.
377 | --   (native byte order).
378 | --
379 | -- The caller must guarantee that the input has enough bytes.
380 | word64Unsafe :: Word64 -> ParserT st r e ()
381 | word64Unsafe = sizedUnsafe# 8# (\a i -> W64# (indexWord64OffAddr# a i))
382 | {-# inline word64Unsafe #-}
383 | 
384 | --------------------------------------------------------------------------------
385 | 
386 | -- | Unsafely parse any 'Word8', without asserting the input is non-empty.
387 | --
388 | -- The caller must guarantee that the input has enough bytes.
389 | anyWord8Unsafe :: ParserT st r e Word8
390 | anyWord8Unsafe = withAnySizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) pure
391 | {-# inline anyWord8Unsafe #-}
392 | 


--------------------------------------------------------------------------------
/src/FlatParse/Basic/Base.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE UnboxedTuples #-}
  2 | 
  3 | -- | Basic parser building blocks.
  4 | 
  5 | module FlatParse.Basic.Base
  6 |   (
  7 |   -- * Bytewise
  8 |     eof
  9 |   , take
 10 |   , take#
 11 |   , takeUnsafe#
 12 |   , takeRest
 13 |   , skip
 14 |   , skip#
 15 |   , skipBack
 16 |   , skipBack#
 17 |   , atSkip#
 18 |   , atSkipUnsafe#
 19 | 
 20 |   -- * Combinators
 21 |   , branch
 22 |   , notFollowedBy
 23 |   , chainl
 24 |   , chainr
 25 |   , lookahead
 26 |   , ensure
 27 |   , ensure#
 28 |   , withEnsure
 29 |   , withEnsure1
 30 |   , withEnsure#
 31 |   , isolate
 32 |   , isolate#
 33 |   , isolateUnsafe#
 34 | 
 35 |   -- ** Non-specific (TODO)
 36 |   , skipMany
 37 |   , skipSome
 38 | 
 39 |   -- * Errors and failures
 40 |   , failed
 41 |   , try
 42 |   , err
 43 |   , withError
 44 |   , withAnyResult
 45 |   , fails
 46 |   , cut
 47 |   , cutting
 48 |   , optional
 49 |   , optional_
 50 |   , withOption
 51 |   ) where
 52 | 
 53 | import Prelude hiding ( take )
 54 | 
 55 | import FlatParse.Basic.Parser
 56 | import qualified FlatParse.Common.Assorted as Common
 57 | 
 58 | import GHC.Exts
 59 | import qualified Data.ByteString as B
 60 | import qualified Data.ByteString.Internal as B
 61 | import GHC.ForeignPtr ( ForeignPtr(..) )
 62 | import qualified Control.Applicative
 63 | 
 64 | -- | The failing parser. By default, parser choice `(<|>)` arbitrarily backtracks
 65 | --   on parser failure.
 66 | failed :: ParserT st e a
 67 | failed = Control.Applicative.empty
 68 | {-# inline failed #-}
 69 | 
 70 | -- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack
 71 | --   on parser error. Use `try` to convert an error to a recoverable failure.
 72 | err :: e -> ParserT st e a
 73 | err e = ParserT \_fp _eob _s st -> Err# st e
 74 | {-# inline err #-}
 75 | 
 76 | -- | Run the parser, if an error is thrown, handle it with the given function.
 77 | withError :: ParserT st e b -> (e -> ParserT st e b) -> ParserT st e b
 78 | withError (ParserT f) hdl = ParserT $ \fp eob s st -> case f fp eob s st of
 79 |   Err# st' e -> case hdl e of
 80 |     ParserT g -> g fp eob s st'
 81 |   x -> x
 82 | {-# inline withError #-}
 83 | 
 84 | -- | Run the parser, and handle each possible result.
 85 | withAnyResult
 86 |   :: ParserT st t a         -- ^ The parser to run.
 87 |   -> (a -> ParserT st e b)  -- ^ The parser to run in case of success.
 88 |   -> ParserT st e b         -- ^ The parser to run in case of failure.
 89 |   -> (t -> ParserT st e b)  -- ^ The parser to run in case of error.
 90 |   -> ParserT st e b
 91 | withAnyResult (ParserT first) whenSuccess (ParserT whenFailure) whenError =
 92 |   ParserT \fp eob n st ->
 93 |     case first fp eob n st of
 94 |       OK# st' a  n' -> runParserT# (whenSuccess a) fp eob n' st'
 95 |       Fail# st'     -> whenFailure fp eob n st'
 96 |       Err# st' e    -> runParserT# (whenError e) fp eob n st'
 97 | {-# INLINE withAnyResult #-}
 98 | 
 99 | -- | Convert a parsing error into failure.
100 | try :: ParserT st e a -> ParserT st e a
101 | try (ParserT f) = ParserT \fp eob s st -> case f fp eob s st of
102 |   Err# st' _ -> Fail# st'
103 |   x          -> x
104 | {-# inline try #-}
105 | 
106 | -- | Convert a parsing failure to a success.
107 | fails :: ParserT st e a -> ParserT st e ()
108 | fails (ParserT f) = ParserT \fp eob s st ->
109 |   case f fp eob s st of
110 |     OK#   st' _ _ -> Fail# st'
111 |     Fail# st'     -> OK#   st' () s
112 |     Err#  st' e   -> Err#  st' e
113 | {-# inline fails #-}
114 | 
115 | -- | Convert a parsing failure to an error.
116 | cut :: ParserT st e a -> e -> ParserT st e a
117 | cut (ParserT f) e = ParserT \fp eob s st -> case f fp eob s st of
118 |   Fail# st' -> Err# st' e
119 |   x         -> x
120 | {-# inline cut #-}
121 | 
122 | -- | Run the parser, if we get a failure, throw the given error, but if we get an error, merge the
123 | --   inner and the newly given errors using the @e -> e -> e@ function. This can be useful for
124 | --   implementing parsing errors which may propagate hints or accummulate contextual information.
125 | cutting :: ParserT st e a -> e -> (e -> e -> e) -> ParserT st e a
126 | cutting (ParserT f) e merge = ParserT \fp eob s st -> case f fp eob s st of
127 |   Fail# st'    -> Err# st' e
128 |   Err#  st' e' -> Err# st' $! merge e' e
129 |   x            -> x
130 | {-# inline cutting #-}
131 | 
132 | -- | Convert a parsing failure to a `Maybe`. If possible, use `withOption`
133 | --   instead.
134 | optional :: ParserT st e a -> ParserT st e (Maybe a)
135 | optional p = (Just <$> p) <|> pure Nothing
136 | {-# inline optional #-}
137 | 
138 | -- | Convert a parsing failure to a `()`.
139 | optional_ :: ParserT st e a -> ParserT st e ()
140 | optional_ p = (() <$ p) <|> pure ()
141 | {-# inline optional_ #-}
142 | 
143 | -- | CPS'd version of `optional`. This is usually more efficient, since it gets
144 | --   rid of the extra `Maybe` allocation.
145 | withOption :: ParserT st e a -> (a -> ParserT st e r) -> ParserT st e r -> ParserT st e r
146 | withOption (ParserT f) just (ParserT nothing) = ParserT \fp eob s st ->
147 |     case f fp eob s st of
148 |       OK#   st' a s -> runParserT# (just a) fp eob s st'
149 |       Fail# st'     -> nothing fp eob s st'
150 |       Err#  st' e   -> Err# st' e
151 | {-# inline withOption #-}
152 | 
153 | --------------------------------------------------------------------------------
154 | 
155 | -- | Succeed if the input is empty.
156 | eof :: ParserT st e ()
157 | eof = ParserT \fp eob s st -> case eqAddr# eob s of
158 |   1# -> OK#   st () s
159 |   _  -> Fail# st
160 | {-# inline eof #-}
161 | 
162 | -- | Save the parsing state, then run a parser, then restore the state.
163 | lookahead :: ParserT st e a -> ParserT st e a
164 | lookahead (ParserT f) = ParserT \fp eob s st ->
165 |   case f fp eob s st of
166 |     OK# st' a _ -> OK# st' a s
167 |     x           -> x
168 | {-# inline lookahead #-}
169 | 
170 | -- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes.
171 | --   All isolated bytes must be consumed.
172 | --
173 | -- Throws a runtime error if given a negative integer.
174 | isolate :: Int -> ParserT st e a -> ParserT st e a
175 | isolate = Common.withIntUnwrap# isolate#
176 | {-# inline isolate #-}
177 | 
178 | -- | @isolate# n# p@ runs the parser @p@ isolated to the next @n#@ bytes.
179 | --   All isolated bytes must be consumed.
180 | --
181 | -- Throws a runtime error if given a negative integer.
182 | isolate# :: Int# -> ParserT st e a -> ParserT st e a
183 | isolate# n# p = Common.withPosInt# n# (isolateUnsafe# n# p)
184 | {-# inline isolate# #-}
185 | 
186 | -- | @isolateUnsafe# n# p@ runs the parser @p@ isolated to the next @n#@ bytes.
187 | --   All isolated bytes must be consumed.
188 | --
189 | -- Undefined behaviour if given a negative integer.
190 | isolateUnsafe# :: Int# -> ParserT st e a -> ParserT st e a
191 | isolateUnsafe# n# (ParserT p) =
192 |     withEnsure# n# $ ParserT \fp eob s st ->
193 |         let s' = plusAddr# s n#
194 |         in  case p fp s' s st of
195 |               OK# st' a s'' ->
196 |                 case eqAddr# s' s'' of
197 |                   1# -> OK#   st' a s''
198 |                   _  -> Fail# st'
199 |               x -> x
200 | {-# inline isolateUnsafe# #-}
201 | 
202 | -- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s,
203 | --   and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not
204 | --   the usual `chainl` function from the parsec libraries!
205 | chainl :: (b -> a -> b) -> ParserT st e b -> ParserT st e a -> ParserT st e b
206 | chainl f start elem = start >>= go where
207 |   go b = withOption elem (\ !a -> go $! f b a) (pure b)
208 | {-# inline chainl #-}
209 | 
210 | -- | An analogue of the list `foldr` function: parse zero or more @a@-s, terminated by a @b@, and
211 | --   combine the results in a right-nested way using the @a -> b -> b@ function. Note: this is not
212 | --   the usual `chainr` function from the parsec libraries!
213 | chainr :: (a -> b -> b) -> ParserT st e a -> ParserT st e b -> ParserT st e b
214 | chainr f (ParserT elem) (ParserT end) = ParserT go where
215 |   go fp eob s st = case elem fp eob s st of
216 |     OK#   st' a s -> case go fp eob s st' of
217 |       OK# st'' b s -> let !b' = f a b in OK# st'' b' s
218 |       x            -> x
219 |     Fail# st'     -> end fp eob s st'
220 |     Err#  st' e   -> Err# st' e
221 | {-# inline chainr #-}
222 | 
223 | -- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third.
224 | --   This can produce slightly more efficient code than `(<|>)`. Moreover, `branch` does not
225 | --   backtrack from the true/false cases.
226 | branch :: ParserT st e a -> ParserT st e b -> ParserT st e b -> ParserT st e b
227 | branch pa pt pf = ParserT \fp eob s st -> case runParserT# pa fp eob s st of
228 |   OK#   st' _ s -> runParserT# pt fp eob s st'
229 |   Fail# st'     -> runParserT# pf fp eob s st'
230 |   Err#  st' e   -> Err# st' e
231 | {-# inline branch #-}
232 | 
233 | -- | Succeed if the first parser succeeds and the second one fails.
234 | notFollowedBy :: ParserT st e a -> ParserT st e b -> ParserT st e a
235 | notFollowedBy p1 p2 = p1 <* fails p2
236 | {-# inline notFollowedBy #-}
237 | 
238 | --------------------------------------------------------------------------------
239 | 
240 | -- | Assert that there are at least @n@ bytes remaining.
241 | --
242 | -- Undefined behaviour if given a negative integer.
243 | ensure :: Int -> ParserT st e ()
244 | ensure = Common.withIntUnwrap# ensure#
245 | {-# inline ensure #-}
246 | 
247 | -- | Assert that there are at least @n#@ bytes remaining.
248 | --
249 | -- Undefined behaviour if given a negative integer.
250 | ensure# :: Int# -> ParserT st e ()
251 | ensure# n# = withEnsure# n# (pure ())
252 | {-# inline ensure# #-}
253 | 
254 | -- TODO: András: why do we need withEnsure-s?
255 | -- There's no unboxing to be improved.
256 | 
257 | -- | Assert that there are at least @n#@ bytes remaining (CPS).
258 | --
259 | -- Undefined behaviour if given a negative integer.
260 | withEnsure :: Int -> ParserT st e r -> ParserT st e r
261 | withEnsure = Common.withIntUnwrap# withEnsure#
262 | {-# inline withEnsure #-}
263 | 
264 | -- | Assert that there is at least 1 byte remaining (CPS).
265 | --
266 | -- Undefined behaviour if given a negative integer.
267 | withEnsure1 :: ParserT st e r -> ParserT st e r
268 | withEnsure1 (ParserT p) = ParserT \fp eob s st ->
269 |     case eqAddr# eob s of
270 |       0# -> p fp eob s st
271 |       _  -> Fail# st
272 | {-# inline withEnsure1 #-}
273 | 
274 | -- | Assert that there are at least @n#@ bytes remaining (CPS).
275 | --
276 | -- Undefined behaviour if given a negative integer.
277 | withEnsure# :: Int# -> ParserT st e r -> ParserT st e r
278 | withEnsure# n# (ParserT p) = ParserT \fp eob s st ->
279 |     case n# <=# minusAddr# eob s of
280 |       1# -> p fp eob s st
281 |       _  -> Fail# st
282 | {-# inline withEnsure# #-}
283 | 
284 | --------------------------------------------------------------------------------
285 | 
286 | -- | Read the given number of bytes as a 'ByteString'.
287 | --
288 | -- Throws a runtime error if given a negative integer.
289 | --
290 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
291 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
292 | take :: Int -> ParserT st e B.ByteString
293 | take (I# n#) = take# n#
294 | {-# inline take #-}
295 | 
296 | -- | Read @n#@ bytes as a 'ByteString'. Fails if fewer than @n#@ bytes are
297 | --   available.
298 | --
299 | -- Throws a runtime error if given a negative integer.
300 | --
301 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
302 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
303 | take# :: Int# -> ParserT st e B.ByteString
304 | take# n# = Common.withPosInt# n# (takeUnsafe# n#)
305 | {-# inline take# #-}
306 | 
307 | -- | Read @n#@ bytes as a 'ByteString'. Fails if fewer than @n#@ bytes are
308 | --   available.
309 | --
310 | -- Undefined behaviour if given a negative integer.
311 | --
312 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
313 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
314 | takeUnsafe# :: Int# -> ParserT st e B.ByteString
315 | takeUnsafe# n# = ParserT \fp eob s st ->
316 |     case n# <=# minusAddr# eob s of
317 |       1# -> OK#   st (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#)
318 |       _  -> Fail# st
319 | {-# inline takeUnsafe# #-}
320 | 
321 | -- | Consume the rest of the input. May return the empty bytestring.
322 | --
323 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
324 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
325 | takeRest :: ParserT st e B.ByteString
326 | takeRest = ParserT \fp eob s st ->
327 |   let n# = minusAddr# eob s
328 |   in  OK# st (B.PS (ForeignPtr s fp) 0 (I# n#)) eob
329 | {-# inline takeRest #-}
330 | 
331 | -- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available.
332 | --
333 | -- Throws a runtime error if given a negative integer.
334 | skip :: Int -> ParserT st e ()
335 | skip (I# n#) = skip# n#
336 | {-# inline skip #-}
337 | 
338 | -- | Skip forward @n#@ bytes. Fails if fewer than @n#@ bytes are available.
339 | --
340 | -- Throws a runtime error if given a negative integer.
341 | skip# :: Int# -> ParserT st e ()
342 | skip# n# = atSkip# n# (pure ())
343 | {-# inline skip# #-}
344 | 
345 | -- | Go back @i@ bytes in the input. Takes a positive integer.
346 | --
347 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea.
348 | skipBack :: Int -> ParserT st e ()
349 | skipBack = Common.withIntUnwrap# skipBack#
350 | {-# inline skipBack #-}
351 | 
352 | -- | Go back @n#@ bytes. Takes a positive integer.
353 | --
354 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea.
355 | skipBack# :: Int# -> ParserT st e ()
356 | skipBack# n# = ParserT \fp eob s st ->
357 |     OK# st () (plusAddr# s (negateInt# n#))
358 | {-# inline skipBack# #-}
359 | 
360 | -- | Skip forward @n#@ bytes and run the given parser. Fails if fewer than @n#@
361 | --   bytes are available.
362 | --
363 | -- Throws a runtime error if given a negative integer.
364 | atSkip# :: Int# -> ParserT st e a -> ParserT st e a
365 | atSkip# n# p = Common.withPosInt# n# (atSkipUnsafe# n# p)
366 | {-# inline atSkip# #-}
367 | 
368 | -- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@
369 | --   bytes are available.
370 | --
371 | -- Undefined behaviour if given a negative integer.
372 | atSkipUnsafe# :: Int# -> ParserT st e r -> ParserT st e r
373 | atSkipUnsafe# n# (ParserT p) =
374 |     withEnsure# n# $ ParserT \fp eob s st ->
375 |         p fp eob (plusAddr# s n#) st
376 | {-# inline atSkipUnsafe# #-}
377 | 
378 | --------------------------------------------------------------------------------
379 | 
380 | -- | Skip a parser zero or more times.
381 | skipMany :: ParserT st e a -> ParserT st e ()
382 | skipMany (ParserT f) = ParserT go where
383 |   go fp eob s st = case f fp eob s st of
384 |     OK# st a s -> go fp eob s st
385 |     Fail# st   -> OK# st () s
386 |     Err# st e  -> Err# st e
387 | {-# inline skipMany #-}
388 | 
389 | -- TODO identical to one from parser-combinators
390 | -- | Skip a parser one or more times.
391 | skipSome :: ParserT st e a -> ParserT st e ()
392 | skipSome p = p *> skipMany p
393 | {-# inline skipSome #-}
394 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful/Base.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE UnboxedTuples #-}
  2 | 
  3 | -- | Basic parser building blocks.
  4 | 
  5 | module FlatParse.Stateful.Base
  6 |   (
  7 |   -- * Bytewise
  8 |     eof
  9 |   , take
 10 |   , take#
 11 |   , takeUnsafe#
 12 |   , takeRest
 13 |   , skip
 14 |   , skip#
 15 |   , skipBack
 16 |   , skipBack#
 17 |   , atSkip#
 18 |   , atSkipUnsafe#
 19 | 
 20 |   -- * Combinators
 21 |   , branch
 22 |   , notFollowedBy
 23 |   , chainl
 24 |   , chainr
 25 |   , lookahead
 26 |   , ensure
 27 |   , ensure#
 28 |   , withEnsure
 29 |   , withEnsure1
 30 |   , withEnsure#
 31 |   , isolate
 32 |   , isolate#
 33 |   , isolateUnsafe#
 34 | 
 35 |   -- ** Non-specific (TODO)
 36 |   , skipMany
 37 |   , skipSome
 38 | 
 39 |   -- * Errors and failures
 40 |   , failed
 41 |   , try
 42 |   , err
 43 |   , withError
 44 |   , withAnyResult
 45 |   , fails
 46 |   , cut
 47 |   , cutting
 48 |   , optional
 49 |   , optional_
 50 |   , withOption
 51 |   ) where
 52 | 
 53 | import Prelude hiding ( take )
 54 | 
 55 | import FlatParse.Stateful.Parser
 56 | import qualified FlatParse.Common.Assorted as Common
 57 | 
 58 | import GHC.Exts
 59 | import qualified Data.ByteString as B
 60 | import qualified Data.ByteString.Internal as B
 61 | import GHC.ForeignPtr ( ForeignPtr(..) )
 62 | import qualified Control.Applicative
 63 | 
 64 | -- | The failing parser. By default, parser choice `(<|>)` arbitrarily
 65 | --   backtracks on parser failure. This is a synonym for `Control.Applicative.empty`.
 66 | failed :: ParserT st r e a
 67 | failed = Control.Applicative.empty
 68 | {-# inline failed #-}
 69 | 
 70 | -- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack
 71 | --   on parser error. Use `try` to convert an error to a recoverable failure.
 72 | err :: e -> ParserT st r e a
 73 | err e = ParserT \_fp !_r _eob _s _n st -> Err# st e
 74 | {-# inline err #-}
 75 | 
 76 | -- | Run the parser, if an error is thrown, handle it with the given function.
 77 | withError :: ParserT st r e b -> (e -> ParserT st r e b) -> ParserT st r e b
 78 | withError (ParserT f) hdl = ParserT $ \fp !r eob s n st -> case f fp r eob s n st of
 79 |   Err# st' e -> case hdl e of
 80 |     ParserT g -> g fp r eob s n st'
 81 |   x -> x
 82 | {-# inline withError #-}
 83 | 
 84 | -- | Run the parser, and handle each possible result.
 85 | withAnyResult
 86 |   :: ParserT st r t a         -- ^ The parser to run.
 87 |   -> (a -> ParserT st r e b)  -- ^ The parser to run in case of success.
 88 |   -> ParserT st r e b         -- ^ The parser to run in case of failure.
 89 |   -> (t -> ParserT st r e b)  -- ^ The parser to run in case of error.
 90 |   -> ParserT st r e b
 91 | withAnyResult (ParserT first) whenSuccess (ParserT whenFailure) whenError =
 92 |   ParserT \fp !r eob s n st ->
 93 |     case first fp r eob s n st of
 94 |       OK# st' a s' n' -> runParserT# (whenSuccess a) fp r eob s' n' st'
 95 |       Fail# st'       -> whenFailure fp r eob s n st'
 96 |       Err# st' e      -> runParserT# (whenError e) fp r eob s n st'
 97 | {-# INLINE withAnyResult #-}
 98 | 
 99 | -- | Convert a parsing error into failure.
100 | try :: ParserT st r e a -> ParserT st r e a
101 | try (ParserT p) = ParserT \fp !r eob s n st -> case p fp r eob s n st of
102 |   Err# st' _ -> Fail# st'
103 |   x          -> x
104 | {-# inline try #-}
105 | 
106 | -- | Convert a parsing failure to a success.
107 | fails :: ParserT st r e a -> ParserT st r e ()
108 | fails (ParserT p) = ParserT \fp !r eob s n st ->
109 |   case p fp r eob s n st of
110 |     OK#   st' _ _ _ -> Fail# st'
111 |     Fail# st'       -> OK#   st' () s n
112 |     Err#  st' e     -> Err#  st' e
113 | {-# inline fails #-}
114 | 
115 | -- | Convert a parsing failure to an error.
116 | cut :: ParserT st r e a -> e -> ParserT st r e a
117 | cut (ParserT p) e = ParserT \fp !r eob s n st -> case p fp r eob s n st of
118 |   Fail# st' -> Err# st' e
119 |   x         -> x
120 | {-# inline cut #-}
121 | 
122 | -- | Run the parser, if we get a failure, throw the given error, but if we get an error, merge the
123 | --   inner and the newly given errors using the @e -> e -> e@ function. This can be useful for
124 | --   implementing parsing errors which may propagate hints or accummulate contextual information.
125 | cutting :: ParserT st r e a -> e -> (e -> e -> e) -> ParserT st r e a
126 | cutting (ParserT p) e merge = ParserT \fp !r eob s n st -> case p fp r eob s n st of
127 |   Fail# st'    -> Err# st' e
128 |   Err#  st' e' -> Err# st' $! merge e' e
129 |   x            -> x
130 | {-# inline cutting #-}
131 | 
132 | -- | Convert a parsing failure to a `Maybe`. If possible, use `withOption`
133 | --   instead.
134 | optional :: ParserT st r e a -> ParserT st r e (Maybe a)
135 | optional p = (Just <$> p) <|> pure Nothing
136 | {-# inline optional #-}
137 | 
138 | -- | Convert a parsing failure to a `()`.
139 | optional_ :: ParserT st r e a -> ParserT st r e ()
140 | optional_ p = (() <$ p) <|> pure ()
141 | {-# inline optional_ #-}
142 | 
143 | -- | CPS'd version of `optional`. This is usually more efficient, since it gets
144 | --   rid of the extra `Maybe` allocation.
145 | withOption
146 |     :: ParserT st r e a -> (a -> ParserT st r e ret)
147 |     -> ParserT st r e ret -> ParserT st r e ret
148 | withOption (ParserT p) just (ParserT nothing) = ParserT \fp !r eob s n st ->
149 |     case p fp r eob s n st of
150 |       OK#   st' a s n' -> runParserT# (just a) fp r eob s n' st'
151 |       Fail# st'        -> nothing fp r eob s n st'
152 |       Err#  st' e      -> Err# st' e
153 | {-# inline withOption #-}
154 | 
155 | --------------------------------------------------------------------------------
156 | 
157 | -- | Succeed if the input is empty.
158 | eof :: ParserT st r e ()
159 | eof = ParserT \fp !r eob s n st -> case eqAddr# eob s of
160 |   1# -> OK#   st () s n
161 |   _  -> Fail# st
162 | {-# inline eof #-}
163 | 
164 | -- | Save the parsing state, then run a parser, then restore the state.
165 | lookahead :: ParserT st r e a -> ParserT st r e a
166 | lookahead (ParserT p) = ParserT \fp !r eob s n st ->
167 |   case p fp r eob s n st of
168 |     OK# st' a _ n -> OK# st' a s n
169 |     x             -> x
170 | {-# inline lookahead #-}
171 | 
172 | -- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes.
173 | --   All isolated bytes must be consumed.
174 | --
175 | -- Throws a runtime error if given a negative integer.
176 | isolate :: Int -> ParserT st r e a -> ParserT st r e a
177 | isolate = Common.withIntUnwrap# isolate#
178 | {-# inline isolate #-}
179 | 
180 | -- | @isolate# n# p@ runs the parser @p@ isolated to the next @n#@ bytes.
181 | --   All isolated bytes must be consumed.
182 | --
183 | -- Throws a runtime error if given a negative integer.
184 | isolate# :: Int# -> ParserT st r e a -> ParserT st r e a
185 | isolate# n# p = Common.withPosInt# n# (isolateUnsafe# n# p)
186 | {-# inline isolate# #-}
187 | 
188 | -- | @isolateUnsafe# i# p@ runs the parser @p@ isolated to the next @i#@ bytes.
189 | --   All isolated bytes must be consumed.
190 | --
191 | -- Undefined behaviour if given a negative integer.
192 | isolateUnsafe# :: Int# -> ParserT st r e ret -> ParserT st r e ret
193 | isolateUnsafe# i# (ParserT p) =
194 |     withEnsure# i# $ ParserT \fp !r eob s n st ->
195 |         let s' = plusAddr# s i#
196 |         in  case p fp r s' s n st of
197 |               OK# st' a s'' n' ->
198 |                 case eqAddr# s' s'' of
199 |                   1# -> OK#   st' a s'' n'
200 |                   _  -> Fail# st'
201 |               x -> x
202 | {-# inline isolateUnsafe# #-}
203 | 
204 | -- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s,
205 | --   and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not
206 | --   the usual `chainl` function from the parsec libraries!
207 | chainl :: (b -> a -> b) -> ParserT st r e b -> ParserT st r e a -> ParserT st r e b
208 | chainl f start elem = start >>= go where
209 |   go b = withOption elem (\ !a -> go $! f b a) (pure b)
210 | {-# inline chainl #-}
211 | 
212 | -- | An analogue of the list `foldr` function: parse zero or more @a@-s, terminated by a @b@, and
213 | --   combine the results in a right-nested way using the @a -> b -> b@ function. Note: this is not
214 | --   the usual `chainr` function from the parsec libraries!
215 | chainr :: (a -> b -> b) -> ParserT st r e a -> ParserT st r e b -> ParserT st r e b
216 | chainr f (ParserT elem) (ParserT end) = ParserT go where
217 |   go fp !r eob s n st = case elem fp r eob s n st of
218 |     OK#   st' a s' n' -> case go fp r eob s' n' st' of
219 |       OK# st'' b s'' n'' -> let !b' = f a b in OK# st'' b' s'' n''
220 |       x                  -> x
221 |     Fail# st'        -> end fp r eob s n st'
222 |     Err#  st' e      -> Err# st' e
223 | {-# inline chainr #-}
224 | 
225 | -- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third.
226 | --   This can produce slightly more efficient code than `(<|>)`. Moreover, `ḃranch` does not
227 | --   backtrack from the true/false cases.
228 | branch :: ParserT st r e a -> ParserT st r e b -> ParserT st r e b -> ParserT st r e b
229 | branch pa pt pf = ParserT \fp !r eob s n st -> case runParserT# pa fp r eob s n st of
230 |   OK#   st' _ s n' -> runParserT# pt fp r eob s n' st'
231 |   Fail# st'        -> runParserT# pf fp r eob s n  st'
232 |   Err#  st' e      -> Err# st' e
233 | {-# inline branch #-}
234 | 
235 | -- | Succeed if the first parser succeeds and the second one fails.
236 | notFollowedBy :: ParserT st r e a -> ParserT st r e b -> ParserT st r e a
237 | notFollowedBy p1 p2 = p1 <* fails p2
238 | {-# inline notFollowedBy #-}
239 | 
240 | --------------------------------------------------------------------------------
241 | 
242 | -- | Assert that there are at least @n@ bytes remaining.
243 | --
244 | -- Undefined behaviour if given a negative integer.
245 | ensure :: Int -> ParserT st r e ()
246 | ensure = Common.withIntUnwrap# ensure#
247 | {-# inline ensure #-}
248 | 
249 | -- | Assert that there are at least @n#@ bytes remaining.
250 | --
251 | -- Undefined behaviour if given a negative integer.
252 | ensure# :: Int# -> ParserT st r e ()
253 | ensure# n# = withEnsure# n# (pure ())
254 | {-# inline ensure# #-}
255 | 
256 | -- TODO: András: withEnsure operations seem superfluous to me?
257 | -- There's no unboxing in vanilla ensure that could be broken.
258 | 
259 | -- | Assert that there are at least @n#@ bytes remaining (CPS).
260 | --
261 | -- Undefined behaviour if given a negative integer.
262 | withEnsure :: Int -> ParserT st r e ret -> ParserT st r e ret
263 | withEnsure = Common.withIntUnwrap# withEnsure#
264 | {-# inline withEnsure #-}
265 | 
266 | -- | Assert that there is at least 1 byte remaining (CPS).
267 | --
268 | -- Undefined behaviour if given a negative integer.
269 | withEnsure1 :: ParserT st r e ret -> ParserT st r e ret
270 | withEnsure1 (ParserT p) = ParserT \fp !r eob s n st ->
271 |     case eqAddr# eob s of
272 |       0# -> p fp r eob s n st
273 |       _  -> Fail# st
274 | {-# inline withEnsure1 #-}
275 | 
276 | -- | Assert that there are at least @n#@ bytes remaining (CPS).
277 | --
278 | -- Undefined behaviour if given a negative integer.
279 | withEnsure# :: Int# -> ParserT st r e ret -> ParserT st r e ret
280 | withEnsure# n# (ParserT p) = ParserT \fp !r eob s n st ->
281 |     case n# <=# minusAddr# eob s of
282 |       1# -> p fp r eob s n st
283 |       _  -> Fail# st
284 | {-# inline withEnsure# #-}
285 | 
286 | --------------------------------------------------------------------------------
287 | 
288 | -- | Read @n@ bytes as a 'ByteString'. Fails if fewer than @n@ bytes are
289 | --   available.
290 | --
291 | -- Throws a runtime error if given a negative integer.
292 | --
293 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
294 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
295 | take :: Int -> ParserT st r e B.ByteString
296 | take (I# n#) = take# n#
297 | {-# inline take #-}
298 | 
299 | -- | Read @n#@ bytes as a 'ByteString'. Fails if fewer than @n#@ bytes are
300 | --   available.
301 | --
302 | -- Throws a runtime error if given a negative integer.
303 | --
304 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
305 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
306 | take# :: Int# -> ParserT st r e B.ByteString
307 | take# n# = Common.withPosInt# n# (takeUnsafe# n#)
308 | {-# inline take# #-}
309 | 
310 | -- | Read @i#@ bytes as a 'ByteString'. Fails if newer than @i#@ bytes are
311 | --   available.
312 | --
313 | -- Undefined behaviour if given a negative integer.
314 | --
315 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
316 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
317 | takeUnsafe# :: Int# -> ParserT st r e B.ByteString
318 | takeUnsafe# i# = ParserT \fp !r eob s n st ->
319 |     case i# <=# minusAddr# eob s of
320 |       1# -> OK#   st (B.PS (ForeignPtr s fp) 0 (I# i#)) (plusAddr# s i#) n
321 |       _  -> Fail# st
322 | {-# inline takeUnsafe# #-}
323 | 
324 | -- | Consume the rest of the input. May return the empty bytestring.
325 | --
326 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input,
327 | -- and will keep it alive. To avoid this, use 'B.copy' on the output.
328 | takeRest :: ParserT st r e B.ByteString
329 | takeRest = ParserT \fp !r eob s n st ->
330 |   let i# = minusAddr# eob s
331 |   in  OK# st (B.PS (ForeignPtr s fp) 0 (I# i#)) eob n
332 | {-# inline takeRest #-}
333 | 
334 | -- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available.
335 | --
336 | -- Throws a runtime error if given a negative integer.
337 | skip :: Int -> ParserT st r e ()
338 | skip (I# n#) = skip# n#
339 | {-# inline skip #-}
340 | 
341 | -- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available.
342 | --
343 | -- Throws a runtime error if given a negative integer.
344 | skip# :: Int# -> ParserT st r e ()
345 | skip# n# = atSkip# n# (pure ())
346 | {-# inline skip# #-}
347 | 
348 | -- | Go back @i@ bytes in the input. Takes a positive integer.
349 | --
350 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea.
351 | skipBack :: Int -> ParserT st r e ()
352 | skipBack = Common.withIntUnwrap# skipBack#
353 | {-# inline skipBack #-}
354 | 
355 | -- | Go back @i#@ bytes in the input. Takes a positive integer.
356 | --
357 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea.
358 | skipBack# :: Int# -> ParserT st r e ()
359 | skipBack# i# = ParserT \fp !r eob s n st ->
360 |     OK# st () (plusAddr# s (negateInt# i#)) n
361 | {-# inline skipBack# #-}
362 | 
363 | -- | Skip forward @n#@ bytes and run the given parser. Fails if fewer than @n#@
364 | --   bytes are available.
365 | --
366 | -- Throws a runtime error if given a negative integer.
367 | atSkip# :: Int# -> ParserT st r e ret -> ParserT st r e ret
368 | atSkip# n# p = Common.withPosInt# n# (atSkipUnsafe# n# p)
369 | {-# inline atSkip# #-}
370 | 
371 | -- | Skip forward @i#@ bytes and run the given parser. Fails if fewer than @i@
372 | --   bytes are available.
373 | --
374 | -- Undefined behaviour if given a negative integer.
375 | atSkipUnsafe# :: Int# -> ParserT st r e ret -> ParserT st r e ret
376 | atSkipUnsafe# i# (ParserT p) =
377 |     withEnsure# i# $ ParserT \fp !r eob s n st ->
378 |         p fp r eob (plusAddr# s i#) n st
379 | {-# inline atSkipUnsafe# #-}
380 | 
381 | --------------------------------------------------------------------------------
382 | 
383 | -- | Skip a parser zero or more times.
384 | skipMany :: ParserT st r e a -> ParserT st r e ()
385 | skipMany (ParserT f) = ParserT go where
386 |   go fp !r eob s n st = case f fp r eob s n st of
387 |     OK# st a s n -> go fp r eob s n st
388 |     Fail# st     -> OK# st () s n
389 |     Err# st e    -> Err# st e
390 | {-# inline skipMany #-}
391 | 
392 | -- TODO identical to one from parser-combinators
393 | -- | Skip a parser one or more times.
394 | skipSome :: ParserT st r e a -> ParserT st r e ()
395 | skipSome p = p *> skipMany p
396 | {-# inline skipSome #-}
397 | 


--------------------------------------------------------------------------------
/src/FlatParse/Stateful.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE UnboxedTuples #-}
  2 | {-# LANGUAGE DataKinds #-}
  3 | {-# LANGUAGE KindSignatures #-}
  4 | {-# LANGUAGE RankNTypes #-}
  5 | {-# LANGUAGE FlexibleInstances #-}
  6 | {-# LANGUAGE ScopedTypeVariables #-}
  7 | 
  8 | {-|
  9 | Parser supporting a custom reader environment, custom error types and an 'Int'
 10 | state. A common use case of the `Int` state is to keep track of column numbers
 11 | to implement indentation-sensitive parsers.
 12 | -}
 13 | 
 14 | module FlatParse.Stateful (
 15 | 
 16 |   -- * Parser types
 17 |     FP.Parser.ParserT(..)
 18 |   , FP.Parser.Parser, FP.Parser.ParserIO, FP.Parser.ParserST
 19 | 
 20 |   -- * Running parsers
 21 |   , Result(..)
 22 |   , runParser
 23 |   , runParserUtf8
 24 |   , runParserIO
 25 |   , runParserST
 26 |   , embedParserST
 27 |   , embedParser
 28 | 
 29 |   -- ** Primitive result types
 30 |   , type FP.Parser.Res#
 31 |   , pattern FP.Parser.OK#, pattern FP.Parser.Err#, pattern FP.Parser.Fail#
 32 |   , type FP.Parser.ResI#
 33 | 
 34 |   -- * Embedding `ST` operations
 35 |   , liftST
 36 | 
 37 |   -- * Environment operations
 38 |   , ask
 39 |   , local
 40 | 
 41 |   -- * State operations
 42 |   , get
 43 |   , put
 44 |   , modify
 45 | 
 46 |   -- * UTF conversion
 47 |   , Common.strToUtf8
 48 |   , Common.utf8ToStr
 49 | 
 50 |   -- * Character predicates
 51 |   , Common.isDigit
 52 |   , Common.isLatinLetter
 53 |   , Common.isGreekLetter
 54 | 
 55 |   -- * Parsers
 56 |   -- ** Bytewise
 57 |   , FP.Base.eof
 58 |   , FP.Base.take
 59 |   , FP.Base.take#
 60 |   , FP.Base.takeUnsafe#
 61 |   , FP.Base.takeRest
 62 |   , FP.Base.skip
 63 |   , FP.Base.skip#
 64 |   , FP.Base.skipBack
 65 |   , FP.Base.skipBack#
 66 |   , FP.Base.atSkip#
 67 |   , FP.Base.atSkipUnsafe#
 68 | 
 69 |   , FP.Bytes.bytes
 70 |   , FP.Bytes.bytesUnsafe
 71 |   , byteString
 72 |   , anyCString
 73 |   , anyVarintProtobuf
 74 | 
 75 |   -- ** Combinators
 76 |   , (FP.Parser.<|>)
 77 |   , FP.Base.branch
 78 |   , FP.Base.notFollowedBy
 79 |   , FP.Base.chainl
 80 |   , FP.Base.chainr
 81 |   , FP.Base.lookahead
 82 |   , FP.Base.ensure
 83 |   , FP.Base.ensure#
 84 |   , FP.Base.withEnsure
 85 |   , FP.Base.withEnsure1
 86 |   , FP.Base.withEnsure#
 87 |   , FP.Base.isolate
 88 |   , isolateToNextNull
 89 |   , FP.Base.isolate#
 90 |   , FP.Base.isolateUnsafe#
 91 |   , FP.Switch.switch
 92 |   , FP.Switch.switchWithPost
 93 |   , FP.Switch.rawSwitchWithPost
 94 |   , Control.Applicative.many
 95 |   , FP.Base.skipMany
 96 |   , Control.Applicative.some
 97 |   , FP.Base.skipSome
 98 | 
 99 |   -- ** Errors and failures
100 |   , Control.Applicative.empty
101 |   , FP.Base.failed
102 |   , FP.Base.try
103 |   , FP.Base.err
104 |   , FP.Base.withError
105 |   , FP.Base.withAnyResult
106 |   , FP.Base.fails
107 |   , FP.Base.cut
108 |   , FP.Base.cutting
109 |   , FP.Base.optional
110 |   , FP.Base.optional_
111 |   , FP.Base.withOption
112 | 
113 |   -- ** Positions
114 |   , FlatParse.Common.Position.Pos(..)
115 |   , FlatParse.Common.Position.endPos
116 |   , FlatParse.Common.Position.addrToPos#
117 |   , FlatParse.Common.Position.posToAddr#
118 |   , FlatParse.Common.Position.Span(..)
119 |   , FlatParse.Common.Position.unsafeSlice
120 |   , getPos
121 |   , setPos
122 |   , spanOf
123 |   , withSpan
124 |   , byteStringOf
125 |   , withByteString
126 |   , inSpan
127 |   , Basic.validPos
128 |   , Basic.posLineCols
129 |   , Basic.mkPos
130 | 
131 |   -- ** Text
132 |   -- *** UTF-8
133 |   , FP.Text.char, FP.Text.string
134 |   , FP.Text.anyChar, FP.Text.skipAnyChar
135 |   , FP.Text.satisfy, FP.Text.skipSatisfy
136 |   , FP.Text.fusedSatisfy, FP.Text.skipFusedSatisfy
137 |   , FP.Text.takeLine
138 |   , FP.Text.takeRestString
139 |   , Basic.linesUtf8
140 | 
141 |   -- *** ASCII
142 |   , FP.Text.anyAsciiChar, FP.Text.skipAnyAsciiChar
143 |   , FP.Text.satisfyAscii, FP.Text.skipSatisfyAscii
144 | 
145 |   -- *** ASCII-encoded numbers
146 |   , FP.Text.anyAsciiDecimalWord
147 |   , FP.Text.anyAsciiDecimalInt
148 |   , FP.Text.anyAsciiDecimalInteger
149 |   , FP.Text.anyAsciiHexWord
150 |   , FP.Text.anyAsciiHexInt
151 | 
152 |   -- ** Machine integers
153 |   , module FP.Integers
154 | 
155 |   -- ** Debugging parsers
156 |   , FP.Text.traceLine
157 |   , FP.Text.traceRest
158 | 
159 |   -- * Unsafe
160 |   , unsafeSpanToByteString
161 | 
162 |   -- ** IO
163 |   , unsafeLiftIO
164 | 
165 |   -- ** Parsers
166 |   , module FP.Addr
167 |   , anyCStringUnsafe
168 | 
169 |   ) where
170 | 
171 | -- for WORDS_BIGENDIAN
172 | #include "MachDeps.h"
173 | 
174 | import qualified FlatParse.Basic as Basic
175 | import FlatParse.Stateful.Parser
176 | import FlatParse.Stateful.Base
177 | import FlatParse.Stateful.Integers
178 | import FlatParse.Stateful.Addr
179 | import FlatParse.Common.Position
180 | import qualified FlatParse.Common.Assorted as Common
181 | import qualified FlatParse.Common.Numbers  as Common
182 | 
183 | import qualified FlatParse.Stateful.Parser as FP.Parser
184 | import qualified FlatParse.Stateful.Base as FP.Base
185 | import qualified FlatParse.Stateful.Integers as FP.Integers
186 | import qualified FlatParse.Stateful.Bytes as FP.Bytes
187 | import qualified FlatParse.Stateful.Text as FP.Text
188 | import qualified FlatParse.Stateful.Switch as FP.Switch
189 | import qualified FlatParse.Stateful.Addr as FP.Addr
190 | 
191 | import qualified Control.Applicative
192 | import GHC.IO (IO(..), unsafeIOToST)
193 | import GHC.Int
194 | import GHC.Exts
195 | import GHC.ForeignPtr
196 | import GHC.ST (ST(..))
197 | import System.IO.Unsafe
198 | 
199 | import qualified Data.ByteString as B
200 | import qualified Data.ByteString.Internal as B
201 | import qualified Data.ByteString.Unsafe as B
202 | 
203 | --------------------------------------------------------------------------------
204 | 
205 | -- | Higher-level boxed data type for parsing results.
206 | data Result e a =
207 |     OK a Int !(B.ByteString)  -- ^ Contains return value, last `Int` state, unconsumed input.
208 |   | Fail                      -- ^ Recoverable-by-default failure.
209 |   | Err !e                    -- ^ Unrecoverble-by-default error.
210 |   deriving Show
211 | 
212 | instance Functor (Result e) where
213 |   fmap f (OK a s n) = let !b = f a in OK b s n
214 |   fmap f r          = unsafeCoerce# r
215 |   {-# inline fmap #-}
216 |   (<$) a (OK _ s n) = OK a s n
217 |   (<$) _ r          = unsafeCoerce# r
218 |   {-# inline (<$) #-}
219 | 
220 | -- | Embed an IO action in a 'ParserT'. This is slightly safer than 'unsafePerformIO' because
221 | -- it will sequenced correctly with respect to the surrounding actions, and its execution is guaranteed.
222 | unsafeLiftIO :: IO a -> ParserT st r e a
223 | unsafeLiftIO io = ParserT \fp !r eob s n st ->
224 |                    let !a = unsafePerformIO io
225 |                    in OK# st a s n
226 | {-# inline unsafeLiftIO #-}
227 | 
228 | --------------------------------------------------------------------------------
229 | 
230 | -- | Run a pure parser. The `Int` argument is the initial state.
231 | runParser :: Parser r e a -> r -> Int -> B.ByteString -> Result e a
232 | runParser (ParserT f) !r (I# n) b@(B.PS (ForeignPtr _ fp) _ (I# len)) = unsafeDupablePerformIO $
233 |   B.unsafeUseAsCString b \(Ptr buf) -> do
234 |     let end = plusAddr# buf len
235 |     pure case f fp r end buf n proxy# of
236 |       OK# _st a s n' -> let offset = minusAddr# s buf
237 |                         in OK a (I# n') (B.drop (I# offset) b)
238 | 
239 |       Err# _st e ->  Err e
240 |       Fail# _st  ->  Fail
241 | {-# noinline runParser #-}
242 | -- We mark this as noinline to allow power users to safely do unsafe state token coercions.
243 | -- Details are discussed in https://github.com/AndrasKovacs/flatparse/pull/34#issuecomment-1326999390
244 | 
245 | -- | Run a parser on a 'String', converting it to the corresponding UTF-8 bytes.
246 | --   The `Int` argument is the initial state.
247 | --
248 | -- Reminder: @OverloadedStrings@ for 'B.ByteString' does not yield a valid UTF-8
249 | -- encoding! For non-ASCII 'B.ByteString' literal input, use this wrapper or
250 | -- convert your input using `strToUtf8`.
251 | runParserUtf8 :: Parser r e a -> r -> Int -> String -> Result e a
252 | runParserUtf8 pa r !n s = runParser pa r n (Common.strToUtf8 s)
253 | 
254 | -- | Run an `ST`-based parser. The `Int` argument is the initial state.
255 | runParserST :: ParserST s r e a -> r -> Int -> B.ByteString -> ST s (Result e a)
256 | runParserST pst !r i buf = unsafeIOToST (runParserIO (unsafeCoerce# pst) r i buf)
257 | {-# inlinable runParserST #-}
258 | 
259 | -- | Run an `IO`-based parser. The `Int` argument is the initial state.
260 | runParserIO :: ParserIO r e a -> r -> Int -> B.ByteString -> IO (Result e a)
261 | runParserIO (ParserT f) !r (I# n) b@(B.PS (ForeignPtr _ fp) _ (I# len)) = do
262 |   B.unsafeUseAsCString b \(Ptr buf) -> do
263 |     let end = plusAddr# buf len
264 |     IO \st -> case f fp r end buf n st of
265 |       OK# rw' a s n' ->  let offset = minusAddr# s buf
266 |                          in (# rw', OK a (I# n') (B.drop (I# offset) b) #)
267 | 
268 |       Err# rw' e ->  (# rw', Err e #)
269 |       Fail# rw'  ->  (# rw', Fail #)
270 | {-# inlinable runParserIO #-}
271 | 
272 | -- | Run a `ParserST` inside any parser.
273 | embedParserST :: forall s r e a. (forall s. ParserST s r e a) -> ParserT s r e a
274 | embedParserST f = unsafeCoerce# (f :: ParserST RealWorld r e a)
275 | {-# inline embedParserST #-}
276 | 
277 | -- | Run a pure `Parser` inside any parser.
278 | embedParser :: forall s r e a. Parser r e a -> ParserT s r e a
279 | embedParser f = unsafeCoerce# f
280 | {-# inline embedParser #-}
281 | 
282 | --------------------------------------------------------------------------------
283 | 
284 | -- | Run an `ST` action in a `ParserST`.
285 | liftST :: ST s a -> ParserST s r e a
286 | liftST (ST f) = ParserT \fp !r eob s n st -> case f st of
287 |   (# st, a #) -> OK# st a s n
288 | {-# inline liftST #-}
289 | 
290 | --------------------------------------------------------------------------------
291 | 
292 | -- | Query the `Int` state.
293 | get :: ParserT st r e Int
294 | get = ParserT \fp !r eob s n st -> OK# st (I# n) s n
295 | {-# inline get #-}
296 | 
297 | -- | Write the `Int` state.
298 | put :: Int -> ParserT st r e ()
299 | put (I# n) = ParserT \fp !r eob s _ st -> OK# st () s n
300 | {-# inline put #-}
301 | 
302 | -- | Modify the `Int` state.
303 | modify :: (Int -> Int) -> ParserT st r e ()
304 | modify f = ParserT \fp !r eob s n st ->
305 |   case f (I# n) of
306 |     I# n -> OK# st () s n
307 | {-# inline modify #-}
308 | 
309 | -- | Query the environment.
310 | ask :: ParserT st r e r
311 | ask = ParserT \fp !r eob s n st -> OK# st r s n
312 | {-# inline ask #-}
313 | 
314 | -- | Run a parser in a modified environment.
315 | local :: (r -> r) -> ParserT st r e a -> ParserT st r e a
316 | local f (ParserT g) = ParserT \fp !r eob s n st -> let !r' = f r in g fp r' eob s n st
317 | {-# inline local #-}
318 | 
319 | --------------------------------------------------------------------------------
320 | 
321 | -- | Parse a given `B.ByteString`. If the bytestring is statically known, consider using 'bytes' instead.
322 | byteString :: B.ByteString -> ParserT st r e ()
323 | byteString (B.PS (ForeignPtr bs fcontent) _ (I# len)) =
324 | 
325 |   let go64 :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> Res# (State# RealWorld) e ()
326 |       go64 bs bsend s n rw =
327 |         let bs' = plusAddr# bs 8# in
328 |         case gtAddr# bs' bsend of
329 |           1# -> go8 bs bsend s n rw
330 | #if MIN_VERSION_base(4,17,0)
331 |           _  -> case eqWord64# (indexWord64OffAddr# bs 0#) (indexWord64OffAddr# s 0#) of
332 | #else
333 |           _  -> case eqWord# (indexWord64OffAddr# bs 0#) (indexWord64OffAddr# s 0#) of
334 | #endif
335 |             1# -> go64 bs' bsend (plusAddr# s 8#) n rw
336 |             _  -> Fail# rw
337 | 
338 |       go8 :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> Res# (State# RealWorld) e ()
339 |       go8 bs bsend s n rw = case ltAddr# bs bsend of
340 | #if MIN_VERSION_base(4,16,0)
341 |         1# -> case eqWord8# (indexWord8OffAddr# bs 0#) (indexWord8OffAddr# s 0#) of
342 | #else
343 |         1# -> case eqWord# (indexWord8OffAddr# bs 0#) (indexWord8OffAddr# s 0#) of
344 | #endif
345 |           1# -> go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) n rw
346 |           _  -> Fail# rw
347 |         _  -> OK# rw () s n
348 | 
349 |       go :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> Res# (State# RealWorld) e ()
350 |       go bs bsend s n rw = case go64 bs bsend s n rw of
351 |         (# rw', res #) -> case touch# fcontent rw' of
352 |           rw'' -> (# rw'', res #)
353 | 
354 |   in ParserT \fp !r eob s n st ->
355 |       case len <=# minusAddr# eob s of
356 |            1# -> case runRW# (go bs (plusAddr# bs len) s n) of
357 |              (# rw, a #) -> (# st, a #)
358 |            _  -> Fail# st
359 | {-# inline byteString #-}
360 | 
361 | --------------------------------------------------------------------------------
362 | 
363 | -- | Get the current position in the input.
364 | getPos :: ParserT st r e Pos
365 | getPos = ParserT \fp !r eob s n st -> OK# st (addrToPos# eob s) s n
366 | {-# inline getPos #-}
367 | 
368 | -- | Set the input position.
369 | --
370 | -- Warning: this can result in crashes if the position points outside the
371 | -- current buffer. It is always safe to 'setPos' values which came from 'getPos'
372 | -- with the current input.
373 | setPos :: Pos -> ParserT st r e ()
374 | setPos s = ParserT \fp !r eob _ n st -> OK# st () (posToAddr# eob s) n
375 | {-# inline setPos #-}
376 | 
377 | -- | Return the consumed span of a parser. Use `withSpan` if possible for better efficiency.
378 | spanOf :: ParserT st r e a -> ParserT st r e Span
379 | spanOf (ParserT f) = ParserT \fp !r eob s n st -> case f fp r eob s n st of
380 |   OK# st' a s' n -> OK# st' (Span (addrToPos# eob s) (addrToPos# eob s')) s' n
381 |   x              -> unsafeCoerce# x
382 | {-# inline spanOf #-}
383 | 
384 | -- | Bind the result together with the span of the result. CPS'd version of `spanOf`
385 | --   for better unboxing.
386 | withSpan :: ParserT st r e a -> (a -> Span -> ParserT st r e b) -> ParserT st r e b
387 | withSpan (ParserT f) g = ParserT \fp !r eob s n st -> case f fp r eob s n st of
388 |   OK# st' a s' n -> runParserT# (g a (Span (addrToPos# eob s) (addrToPos# eob s'))) fp r eob s' n st'
389 |   x              -> unsafeCoerce# x
390 | {-# inline withSpan #-}
391 | 
392 | -- | Return the `B.ByteString` consumed by a parser. Note: it's more efficient to use `spanOf` and
393 | --   `withSpan` instead.
394 | byteStringOf :: ParserT st r e a -> ParserT st r e B.ByteString
395 | byteStringOf (ParserT f) = ParserT \fp !r eob s n st -> case f fp r eob s n st of
396 |   OK# st' a s' n -> OK# st' (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) s' n
397 |   x              -> unsafeCoerce# x
398 | {-# inline byteStringOf #-}
399 | 
400 | -- | CPS'd version of `byteStringOf`. Can be more efficient, because the result is more eagerly unboxed
401 | --   by GHC. It's more efficient to use `spanOf` or `withSpan` instead.
402 | withByteString :: ParserT st r e a -> (a -> B.ByteString -> ParserT st r e b) -> ParserT st r e b
403 | withByteString (ParserT f) g = ParserT \fp !r eob s n st -> case f fp r eob s n st of
404 |   OK# st' a s' n -> runParserT# (g a (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s)))) fp r eob s' n st'
405 |   x              -> unsafeCoerce# x
406 | {-# inline withByteString #-}
407 | 
408 | -- | Run a parser in a given input 'Span'.
409 | --
410 | -- The input position and the parser state is restored after the parser is
411 | -- finished, so 'inSpan' does not consume input and has no side effect.
412 | --
413 | -- Warning: this operation may crash if the given span points outside the
414 | -- current parsing buffer. It's always safe to use 'inSpan' if the 'Span' comes
415 | -- from a previous 'withSpan' or 'spanOf' call on the current input.
416 | inSpan :: Span -> ParserT st r e a -> ParserT st r e a
417 | inSpan (Span s eob) (ParserT f) = ParserT \fp !r eob' s' n' st ->
418 |   case f fp r (posToAddr# eob' eob) (posToAddr# eob' s) n' st of
419 |     OK# st' a _ _ -> OK# st' a s' n'
420 |     x             -> unsafeCoerce# x
421 | {-# inline inSpan #-}
422 | 
423 | --------------------------------------------------------------------------------
424 | 
425 | -- | Create a 'B.ByteString' from a 'Span'.
426 | --
427 | -- The result is invalid if the 'Span' points outside the current buffer, or if
428 | -- the 'Span' start is greater than the end position.
429 | unsafeSpanToByteString :: Span -> ParserT st r e B.ByteString
430 | unsafeSpanToByteString (Span l r) =
431 |   lookahead (setPos l >> byteStringOf (setPos r))
432 | {-# inline unsafeSpanToByteString #-}
433 | 
434 | --------------------------------------------------------------------------------
435 | 
436 | -- | Isolate the given parser up to (excluding) the next null byte.
437 | --
438 | -- Like 'isolate', all isolated bytes must be consumed. The null byte is
439 | -- consumed afterwards.
440 | --
441 | -- Useful for defining parsers for null-terminated data.
442 | isolateToNextNull :: ParserT st r e a -> ParserT st r e a
443 | isolateToNextNull (ParserT p) = ParserT \fp !r eob s n st -> go fp r n eob s st s
444 |   where
445 |     goP fp r n sNull s0 st =
446 |         case p fp r sNull s0 n st of
447 |           OK# st' a s' n' ->
448 |             case eqAddr# s' sNull of
449 |               1# -> -- consumed up to null: skip null, return
450 |                     OK#   st' a (sNull `plusAddr#` 1#) n'
451 |               _  -> Fail# st' -- didn't consume fully up to null: fail
452 |           x -> x
453 | 
454 |     go8 fp r n eob s0 st s =
455 |         case eqAddr# eob s of
456 |           1# -> Fail# st -- end of input, no null: fail
457 |           _  ->
458 |             let s' = s `plusAddr#` 1# in
459 | #if MIN_VERSION_base(4,16,0)
460 |             -- below may be made clearer with ExtendedLiterals (GHC 9.8)
461 |             case eqWord8# (indexWord8OffAddr# s 0#) (wordToWord8# 0##) of
462 | #else
463 |             case eqWord# (indexWord8OffAddr# s 0#) 0## of
464 | #endif
465 |               1# -> goP fp r n s   s0 st    --     0x00: isolate, execute parser
466 |               _  -> go8 fp r n eob s0 st s' -- not 0x00: next please!
467 | 
468 |     {- The "find first null byte" algorithms used here are adapted from
469 |        Hacker's Delight (2012) ch.6.
470 | 
471 |     We read a word (8 bytes) at a time for efficiency. The internal algorithm
472 |     does byte indexing, thus endianness matters. We switch between indexing
473 |     algorithms depending on compile-time native endianness. (The code
474 |     surrounding the indexing is endian-independent, so we do this inline).
475 |     -}
476 |     go fp r n eob s0 st s =
477 |         let sWord = s `plusAddr#` 8# in
478 |         case gtAddr# sWord eob of
479 |           1# -> -- <  8 bytes of input: revert to scanning byte by byte
480 |             -- we _could_ operate on a word and simply ensure not to use the
481 |             -- out-of-bounds data, which would be faster, but the act of
482 |             -- reading could probably segfault
483 |             go8 fp r n eob s0 st s
484 |           _  -> -- >= 8 bytes of input: use efficient 8-byte scanning
485 | #if defined(WORDS_BIGENDIAN)
486 |             -- big-endian ("L->R"): find leftmost null byte
487 |             let !x@(I64# x#) = Common.zbytel'intermediate (I64# (indexInt64OffAddr# s 0#)) in
488 | #else
489 |             -- little-endian ("R->L"): find rightmost null byte
490 |             let !x@(I64# x#) = Common.zbyter'intermediate (I64# (indexInt64OffAddr# s 0#)) in
491 | #endif
492 | #if MIN_VERSION_base(4,17,0)
493 |             case eqInt64# x# (intToInt64# 0#) of
494 | #else
495 |             case x# ==# 0# of
496 | #endif
497 |               1# -> go fp r n eob s0 st sWord -- no 0x00 in next word
498 |               _  -> -- 0x00 somewhere in next word
499 | #if defined(WORDS_BIGENDIAN)
500 |                 let !(I# nullIdx#) = Common.zbytel'toIdx x in
501 | #else
502 |                 let !(I# nullIdx#) = Common.zbyter'toIdx x in
503 |                 -- TO TEST BE ON LE: change above CPP to zbytel, uncomment below
504 |                 -- let !(I# nullIdx#) = Common.zbytel'toIdx (I# (word2Int# (byteSwap# (int2Word# x#)))) in
505 | #endif
506 |                 let sNull = s `plusAddr#` nullIdx# in
507 |                 goP fp r n sNull s0 st
508 | {-# inline isolateToNextNull #-}
509 | 
510 | -- | Read a null-terminated bytestring (a C-style string).
511 | --
512 | -- Consumes the null terminator.
513 | anyCString :: ParserT st r e B.ByteString
514 | anyCString = isolateToNextNull takeRest
515 | {-# inline anyCString #-}
516 | 
517 | -- | Read a null-terminated bytestring (a C-style string), where the bytestring
518 | --   is known to be null-terminated somewhere in the input.
519 | --
520 | -- Highly unsafe. Unless you have a guarantee that the string will be null
521 | -- terminated before the input ends, use 'anyCString' instead. Honestly, I'm not
522 | -- sure if this is a good function to define. But here it is.
523 | --
524 | -- Fails on GHC versions older than 9.0, since we make use of the
525 | -- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful
526 | -- without it.
527 | --
528 | -- Consumes the null terminator.
529 | anyCStringUnsafe :: ParserT st r e B.ByteString
530 | {-# inline anyCStringUnsafe #-}
531 | #if MIN_VERSION_base(4,15,0)
532 | anyCStringUnsafe = ParserT \fp !r eob s n st ->
533 |   case eqAddr# eob s of
534 |     1# -> Fail# st
535 |     _  -> let n#  = cstringLength# s
536 |               s'# = plusAddr# s (n# +# 1#)
537 |            in OK# st (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# n
538 | #else
539 | anyCStringUnsafe = error "Flatparse.Stateful.anyCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler"
540 | #endif
541 | 
542 | -- | Read a protobuf-style varint into a positive 'Int'.
543 | --
544 | -- protobuf-style varints are byte-aligned. For each byte, the lower 7 bits are
545 | -- data and the MSB indicates if there are further bytes. Once fully parsed, the
546 | -- 7-bit payloads are concatenated and interpreted as a little-endian unsigned
547 | -- integer.
548 | --
549 | -- Fails if the varint exceeds the positive 'Int' range.
550 | --
551 | -- Really, these are varnats. They also match with the LEB128 varint encoding.
552 | --
553 | -- protobuf encodes negatives in unsigned integers using zigzag encoding. See
554 | -- the @fromZigzag@ family of functions for this functionality.
555 | --
556 | -- Further reading:
557 | -- https://developers.google.com/protocol-buffers/docs/encoding#varints
558 | anyVarintProtobuf :: ParserT st r e Int
559 | anyVarintProtobuf = ParserT \fp !r eob s n st ->
560 |     case Common.anyVarintProtobuf# eob s of
561 |       (# (##) | #) -> Fail# st
562 |       (# | (# w#, s#, bits# #) #) ->
563 |         case bits# ># (WORD_SIZE_IN_BITS# -# 1#) of
564 |           0# -> OK# st (I# w#) s# n
565 |           _  -> Fail# st -- overflow
566 | {-# inline anyVarintProtobuf #-}
567 | 


--------------------------------------------------------------------------------