├── stack-8.6.5.yaml ├── stack-8.8.4.yaml ├── stack-9.0.2.yaml ├── stack-9.2.8.yaml ├── stack-9.4.8.yaml ├── stack-8.10.7.yaml ├── .gitignore ├── stack.yaml ├── stack-9.6.6.yaml ├── stack-9.8.4.yaml ├── bench ├── ReadInteger.hs ├── Common.hs ├── FPBasic.hs ├── FPStateful.hs ├── Attoparsec.hs ├── Megaparsec.hs ├── Parsec.hs └── Bench.hs ├── src └── FlatParse │ ├── Common │ ├── Parser.hs │ ├── GHCExts.hs │ ├── Position.hs │ ├── Switch.hs │ ├── Assorted.hs │ └── Numbers.hs │ ├── Basic │ ├── Bytes.hs │ ├── Addr.hs │ ├── Switch.hs │ ├── Parser.hs │ ├── Text.hs │ ├── Integers.hs │ └── Base.hs │ ├── Stateful │ ├── Bytes.hs │ ├── Addr.hs │ ├── Switch.hs │ ├── Parser.hs │ ├── Text.hs │ ├── Integers.hs │ └── Base.hs │ ├── Examples │ └── BasicLambda │ │ ├── Parser.hs │ │ └── Lexer.hs │ └── Stateful.hs ├── LICENSE ├── flake.nix ├── .github └── workflows │ └── haskell.yml ├── flake.lock ├── package.yaml ├── flatparse.cabal └── README.md /stack-8.6.5.yaml: -------------------------------------------------------------------------------- 1 | 2 | resolver: lts-14.27 3 | 4 | packages: 5 | - . 6 | 7 | ghc-options: 8 | "$everything": -split-sections 9 | -------------------------------------------------------------------------------- /stack-8.8.4.yaml: -------------------------------------------------------------------------------- 1 | 2 | resolver: lts-16.31 3 | 4 | packages: 5 | - . 6 | 7 | ghc-options: 8 | "$everything": -split-sections 9 | -------------------------------------------------------------------------------- /stack-9.0.2.yaml: -------------------------------------------------------------------------------- 1 | 2 | resolver: lts-19.33 3 | 4 | packages: 5 | - . 6 | 7 | ghc-options: 8 | "$everything": -split-sections 9 | -------------------------------------------------------------------------------- /stack-9.2.8.yaml: -------------------------------------------------------------------------------- 1 | 2 | resolver: lts-20.26 3 | 4 | packages: 5 | - . 6 | 7 | ghc-options: 8 | "$everything": -split-sections 9 | -------------------------------------------------------------------------------- /stack-9.4.8.yaml: -------------------------------------------------------------------------------- 1 | 2 | resolver: lts-21.25 3 | 4 | packages: 5 | - . 6 | 7 | ghc-options: 8 | "$everything": -split-sections 9 | -------------------------------------------------------------------------------- /stack-8.10.7.yaml: -------------------------------------------------------------------------------- 1 | 2 | resolver: lts-18.28 3 | 4 | packages: 5 | - . 6 | 7 | ghc-options: 8 | "$everything": -split-sections 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hi 2 | *.o 3 | ./dist 4 | .stack-work/ 5 | *# 6 | TAGS 7 | #*.lock 8 | *.dump-simpl 9 | *.dump-cmm 10 | *.dump-stg 11 | dist-newstyle/ 12 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | 2 | # ghc 9.10.2 3 | 4 | resolver: lts-24.7 5 | 6 | packages: 7 | - . 8 | 9 | ghc-options: 10 | "$everything": -split-sections 11 | -------------------------------------------------------------------------------- /stack-9.6.6.yaml: -------------------------------------------------------------------------------- 1 | 2 | # ghc 9.6.6 3 | 4 | resolver: lts-22.41 5 | 6 | packages: 7 | - . 8 | 9 | ghc-options: 10 | "$everything": -split-sections 11 | -------------------------------------------------------------------------------- /stack-9.8.4.yaml: -------------------------------------------------------------------------------- 1 | 2 | # ghc 9.8.4 3 | 4 | resolver: lts-23.25 5 | 6 | packages: 7 | - . 8 | 9 | ghc-options: 10 | "$everything": -split-sections 11 | -------------------------------------------------------------------------------- /bench/ReadInteger.hs: -------------------------------------------------------------------------------- 1 | 2 | module ReadInteger where 3 | 4 | import FlatParse.Basic as FPBasic 5 | 6 | readInt = runParser FPBasic.anyAsciiDecimalInt 7 | readInteger = runParser FPBasic.anyAsciiDecimalInteger 8 | -------------------------------------------------------------------------------- /bench/Common.hs: -------------------------------------------------------------------------------- 1 | {-# language Strict #-} 2 | 3 | module Common where 4 | 5 | import qualified Data.ByteString as B 6 | 7 | type Name = B.ByteString 8 | data Tm = Var Name | App Tm Tm | Lam Name Tm | Let Name Tm Tm | Int Int | Add Tm Tm | Mul Tm Tm 9 | deriving Show 10 | -------------------------------------------------------------------------------- /src/FlatParse/Common/Parser.hs: -------------------------------------------------------------------------------- 1 | -- | Common low-level parser definitions. 2 | 3 | module FlatParse.Common.Parser 4 | ( 5 | -- * Parser state token types 6 | -- $parser-state-token-types 7 | type PureMode 8 | , type IOMode 9 | , type STMode 10 | ) where 11 | 12 | import GHC.Exts 13 | import Data.Void ( Void ) 14 | 15 | {- $parser-state-token-types 16 | These type synonyms are used as parameters to @ParserT@. Different state tokens 17 | support different embedded effects. 18 | -} 19 | 20 | -- TODO 2023-01-17 raehik: perhaps more/better explanation? 21 | 22 | type PureMode = Proxy# Void 23 | type IOMode = State# RealWorld 24 | type STMode s = State# s 25 | -------------------------------------------------------------------------------- /src/FlatParse/Common/GHCExts.hs: -------------------------------------------------------------------------------- 1 | -- | 'GHC.Exts' compatibility wrapper. 2 | 3 | {-# LANGUAGE KindSignatures #-} 4 | {-# LANGUAGE DataKinds #-} -- needed for manual ZeroBitType def (unsure why) 5 | {-# OPTIONS_GHC -Wno-dodgy-exports #-} 6 | 7 | module FlatParse.Common.GHCExts 8 | ( module FlatParse.Common.GHCExts 9 | , module GHC.Exts 10 | ) where 11 | 12 | import GHC.Exts 13 | 14 | #if !MIN_VERSION_base(4,17,0) 15 | {- 16 | GHC 9.4 clarified the story for types without runtime representations. These 17 | type synonyms are defined and used to simplify certain internal definitions 18 | (e.g. 'State#'). They are nicer than using the "expanded" type, so we define 19 | them here for older compilers. 20 | -} 21 | type ZeroBitRep = 'TupleRep ('[] :: [RuntimeRep]) 22 | type ZeroBitType = TYPE ZeroBitRep 23 | #endif 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 András Kovács 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | inputs = { 3 | nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable"; 4 | flake-parts.url = "github:hercules-ci/flake-parts"; 5 | haskell-flake.url = "github:srid/haskell-flake"; 6 | }; 7 | outputs = inputs: 8 | let 9 | # simple devshell for non-dev compilers: really just want `cabal repl` 10 | nondevDevShell = compiler: { 11 | mkShellArgs.name = "${compiler}-flatparse"; 12 | hoogle = false; 13 | tools = _: { 14 | hlint = null; 15 | haskell-language-server = null; 16 | ghcid = null; 17 | }; 18 | }; 19 | in 20 | inputs.flake-parts.lib.mkFlake { inherit inputs; } { 21 | systems = inputs.nixpkgs.lib.systems.flakeExposed; 22 | imports = [ inputs.haskell-flake.flakeModule ]; 23 | perSystem = { self', pkgs, config, ... }: { 24 | packages.default = self'.packages.ghc96-flatparse; 25 | devShells.default = self'.devShells.ghc96; 26 | haskellProjects.ghc98 = { 27 | # shouldn't work, pkgs aren't up to date and mine aren't 9.8 ready 28 | basePackages = pkgs.haskell.packages.ghc98; 29 | devShell = nondevDevShell "ghc98"; 30 | }; 31 | haskellProjects.ghc96 = { 32 | basePackages = pkgs.haskell.packages.ghc96; 33 | devShell.mkShellArgs.name = "ghc96-flatparse"; 34 | devShell.tools = _: { 35 | haskell-language-server = null; # 2024-03-06: broken 36 | }; 37 | }; 38 | haskellProjects.ghc94 = { 39 | basePackages = pkgs.haskell.packages.ghc94; 40 | devShell = nondevDevShell "ghc94"; 41 | }; 42 | haskellProjects.ghc92 = { 43 | basePackages = pkgs.haskell.packages.ghc92; 44 | devShell = nondevDevShell "ghc92"; 45 | }; 46 | }; 47 | }; 48 | } 49 | -------------------------------------------------------------------------------- /bench/FPBasic.hs: -------------------------------------------------------------------------------- 1 | 2 | module FPBasic ( 3 | runSexp 4 | , runLongws 5 | , runNumcsv 6 | , runTm) where 7 | 8 | import FlatParse.Basic 9 | import Common 10 | import qualified Data.ByteString as B 11 | 12 | ws, open, close, ident, sexp, src :: Parser () () 13 | ws = skipMany $(switch [| case _ of " " -> pure (); "\n" -> pure () |]) 14 | open = $(char '(') >> ws 15 | close = $(char ')') >> ws 16 | ident = skipSome (skipSatisfyAscii isLatinLetter) >> ws 17 | sexp = branch open (skipSome sexp >> close) ident 18 | src = sexp >> eof 19 | runSexp = runParser src 20 | 21 | longw, longws :: Parser () () 22 | longw = $(string "thisisalongkeyword") 23 | longws = skipSome (longw >> ws) >> eof 24 | runLongws = runParser longws 25 | 26 | numeral, comma, numcsv :: Parser () () 27 | numeral = skipSome (skipSatisfyAscii isDigit) >> ws 28 | comma = $(char ',') >> ws 29 | numcsv = numeral >> skipMany (comma >> numeral) >> eof 30 | runNumcsv = runParser numcsv 31 | 32 | 33 | -------------------------------------------------------------------------------- 34 | 35 | ident' :: Parser () B.ByteString 36 | ident' = byteStringOf (skipSome (skipSatisfyAscii \c -> isLatinLetter c || isDigit c)) <* ws 37 | 38 | equal = $(string "=") <* ws 39 | semi = $(string ";") <* ws 40 | dot = $(string ".") <* ws 41 | addOp = $(string "+") <* ws 42 | mulOp = $(string "*") <* ws 43 | parl = $(string "(") <* ws 44 | parr = $(string ")") <* ws 45 | 46 | add :: Parser () Tm 47 | add = chainl Add mul (addOp *> mul) 48 | 49 | mul :: Parser () Tm 50 | mul = chainl Mul spine (mulOp *> spine) 51 | 52 | spine :: Parser () Tm 53 | spine = chainl App atom atom 54 | 55 | atom :: Parser () Tm 56 | atom = 57 | (Int <$> (anyAsciiDecimalInt <* ws)) 58 | <|> (Var <$> ident') 59 | <|> (parl *> tm <* parr) 60 | 61 | tm :: Parser () Tm 62 | tm = $(switch [| case _ of 63 | "fun" -> do {ws; x <- ident'; dot; t <- tm; pure (Lam x t)} 64 | "let" -> do {ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)} 65 | _ -> add |]) 66 | 67 | runTm = runParser (ws *> tm <* eof) 68 | -------------------------------------------------------------------------------- /bench/FPStateful.hs: -------------------------------------------------------------------------------- 1 | 2 | module FPStateful ( 3 | runSexp 4 | , runLongws 5 | , runNumcsv 6 | , runTm) where 7 | 8 | import FlatParse.Stateful 9 | import Common 10 | import qualified Data.ByteString as B 11 | 12 | ws, open, close, ident, sexp, src :: Parser () () () 13 | ws = skipMany $(switch [| case _ of " " -> pure (); "\n" -> pure () |]) 14 | open = $(char '(') >> ws 15 | close = $(char ')') >> ws 16 | ident = skipSome (skipSatisfyAscii isLatinLetter) >> ws 17 | sexp = branch open (skipSome sexp >> close) ident 18 | src = sexp >> eof 19 | runSexp = runParser src () 0 20 | 21 | longw, longws :: Parser () () () 22 | longw = $(string "thisisalongkeyword") 23 | longws = skipSome (longw >> ws) >> eof 24 | runLongws = runParser longws () 0 25 | 26 | numeral, comma, numcsv :: Parser () () () 27 | numeral = skipSome (skipSatisfyAscii isDigit) >> ws 28 | comma = $(char ',') >> ws 29 | numcsv = numeral >> skipMany (comma >> numeral) >> eof 30 | runNumcsv = runParser numcsv () 0 31 | 32 | 33 | -------------------------------------------------------------------------------- 34 | 35 | ident' :: Parser () () B.ByteString 36 | ident' = byteStringOf (skipSome (skipSatisfyAscii \c -> isLatinLetter c || isDigit c)) <* ws 37 | 38 | equal = $(string "=") <* ws 39 | semi = $(string ";") <* ws 40 | dot = $(string ".") <* ws 41 | addOp = $(string "+") <* ws 42 | mulOp = $(string "*") <* ws 43 | parl = $(string "(") <* ws 44 | parr = $(string ")") <* ws 45 | 46 | add :: Parser () () Tm 47 | add = chainl Add mul (addOp *> mul) 48 | 49 | mul :: Parser () () Tm 50 | mul = chainl Mul spine (mulOp *> spine) 51 | 52 | spine :: Parser () () Tm 53 | spine = chainl App atom atom 54 | 55 | atom :: Parser () () Tm 56 | atom = 57 | (Int <$> (anyAsciiDecimalInt <* ws)) 58 | <|> (Var <$> ident') 59 | <|> (parl *> tm <* parr) 60 | 61 | tm :: Parser () () Tm 62 | tm = $(switch [| case _ of 63 | "fun" -> do {ws; x <- ident'; dot; t <- tm; pure (Lam x t)} 64 | "let" -> do {ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)} 65 | _ -> add |]) 66 | 67 | runTm = runParser (ws *> tm <* eof) () 0 68 | -------------------------------------------------------------------------------- /src/FlatParse/Common/Position.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DerivingStrategies, DerivingVia #-} 2 | 3 | module FlatParse.Common.Position 4 | ( Pos(..), endPos, addrToPos#, posToAddr# 5 | , Span(..), unsafeSlice, leftPos, rightPos 6 | ) where 7 | 8 | import qualified Data.ByteString as B 9 | import qualified Data.ByteString.Internal as B 10 | import GHC.Int 11 | import GHC.ForeignPtr ( ForeignPtr(..) ) 12 | import GHC.Exts 13 | 14 | -- | Byte offset counted backwards from the end of the buffer. 15 | -- Note: the `Ord` instance for `Pos` considers the earlier positions to be 16 | -- smaller. 17 | newtype Pos = Pos { unPos :: Int } 18 | deriving stock (Show) 19 | deriving Eq via Int 20 | 21 | instance Ord Pos where 22 | (<=) (Pos x) (Pos y) = y <= x 23 | {-# inline (<=) #-} 24 | compare (Pos x) (Pos y) = compare y x 25 | {-# inline compare #-} 26 | 27 | -- | A pair of positions. 28 | data Span = Span !Pos !Pos 29 | deriving stock (Eq, Show) 30 | 31 | -- | Very unsafe conversion between a primitive address and a position. The 32 | -- first argument points to the end of the buffer, the second argument is 33 | -- being converted. 34 | addrToPos# :: Addr# -> Addr# -> Pos 35 | addrToPos# eob s = Pos (I# (minusAddr# eob s)) 36 | {-# inline addrToPos# #-} 37 | 38 | -- | Very unsafe conversion between a primitive address and a position. The 39 | -- first argument points to the end of the buffer. 40 | posToAddr# :: Addr# -> Pos -> Addr# 41 | posToAddr# eob (Pos (I# n)) = unsafeCoerce# (minusAddr# eob (unsafeCoerce# n)) 42 | {-# inline posToAddr# #-} 43 | 44 | -- | Slice into a `B.ByteString` using a `Span`. The result is invalid if the `Span` 45 | -- is not a valid slice of the first argument. 46 | unsafeSlice :: B.ByteString -> Span -> B.ByteString 47 | unsafeSlice (B.PS (ForeignPtr addr fp) (I# start) (I# len)) 48 | (Span (Pos (I# o1)) (Pos (I# o2))) = 49 | let end = addr `plusAddr#` start `plusAddr#` len 50 | in B.PS (ForeignPtr (plusAddr# end (negateInt# o1)) fp) (I# 0#) (I# (o1 -# o2)) 51 | {-# inline unsafeSlice #-} 52 | 53 | -- | The end of the input. 54 | endPos :: Pos 55 | endPos = Pos 0 56 | {-# inline endPos #-} 57 | 58 | leftPos :: Span -> Pos 59 | leftPos (Span p _) = p 60 | {-# inline leftPos #-} 61 | 62 | rightPos :: Span -> Pos 63 | rightPos (Span _ p) = p 64 | {-# inline rightPos #-} 65 | -------------------------------------------------------------------------------- /bench/Attoparsec.hs: -------------------------------------------------------------------------------- 1 | 2 | module Attoparsec (runSexp, runLongws, runNumcsv, runTm) where 3 | 4 | import Control.Applicative 5 | import Data.Attoparsec.ByteString.Char8 6 | import Common 7 | import qualified Data.ByteString as B 8 | 9 | isLatinLetter :: Char -> Bool 10 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') 11 | 12 | skipWhile1 :: (Char -> Bool) -> Parser () 13 | skipWhile1 pred = satisfy pred *> skipWhile pred 14 | 15 | ws, open, close, ident, sexp :: Parser () 16 | ws = skipWhile (\c -> c == ' ' || c == '\n') 17 | open = char '(' >> ws 18 | close = char ')' >> ws 19 | ident = skipWhile1 isLatinLetter <* ws 20 | sexp = (open *> skipMany1 sexp <* close) <|> ident 21 | runSexp = parseOnly sexp 22 | 23 | longw, longws :: Parser () 24 | longw = () <$ string "thisisalongkeyword" 25 | longws = skipMany1 (longw *> ws) <* endOfInput 26 | runLongws = parseOnly longws 27 | 28 | numeral, comma, numcsv :: Parser () 29 | numeral = skipWhile1 (\c -> '0' <= c && c <= '9') >> ws 30 | comma = char ',' >> ws 31 | numcsv = numeral >> skipMany1 (comma >> numeral) >> endOfInput 32 | runNumcsv = parseOnly numcsv 33 | 34 | 35 | -------------------------------------------------------------------------------- 36 | 37 | ident' :: Parser B.ByteString 38 | ident' = takeWhile1 (\c -> isLatinLetter c || isDigit c) <* ws 39 | 40 | equal = char '=' >> ws 41 | semi = char ';' >> ws 42 | dot = char '.' >> ws 43 | addOp = char '+' >> ws 44 | mulOp = char '*' >> ws 45 | parl = char '(' >> ws 46 | parr = char ')' >> ws 47 | 48 | chainl :: (b -> a -> b) -> Parser b -> Parser a -> Parser b 49 | chainl f start elem = start >>= go where 50 | go b = do {!a <- elem; go $! f b a} <|> pure b 51 | {-# inline chainl #-} 52 | 53 | add :: Parser Tm 54 | add = chainl Add mul (addOp *> mul) 55 | 56 | mul :: Parser Tm 57 | mul = chainl Mul spine (mulOp *> spine) 58 | 59 | spine :: Parser Tm 60 | spine = chainl App atom atom 61 | 62 | atom :: Parser Tm 63 | atom = 64 | (Int <$> (decimal <* ws)) 65 | <|> (Var <$> ident') 66 | <|> (parl *> tm <* parr) 67 | 68 | tm :: Parser Tm 69 | tm = (do {_ <- string "fun"; ws; x <- ident'; dot; t <- tm; pure (Lam x t)}) 70 | <|> (do {_ <- string "let"; ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)}) 71 | <|> add 72 | 73 | runTm = parseOnly (ws *> tm <* endOfInput) 74 | -------------------------------------------------------------------------------- /.github/workflows/haskell.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | types: 8 | - opened 9 | - synchronize 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | ghc: ["8.6.5", "8.8.4", "8.10.7", "9.0.2", "9.2.8", "9.4.8", "9.6.7", "9.8.4", "9.10.2"] 16 | env: 17 | TESTCONFIG: "--enable-tests" 18 | BENCHCONFIG: "--enable-tests" 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: haskell-actions/setup@v2 22 | id: setup-haskell-cabal 23 | with: 24 | ghc-version: ${{ matrix.ghc }} 25 | - run: cabal freeze $CONFIG 26 | - uses: actions/cache@v4 27 | with: 28 | path: | 29 | ${{ steps.setup-haskell-cabal.outputs.cabal-store }} 30 | dist-newstyle 31 | key: ${{ runner.os }}-${{ matrix.ghc }}-${{ hashFiles('cabal.project.freeze') }} 32 | restore-keys: | 33 | ${{ runner.os }}-${{ matrix.ghc }}- 34 | - run: cabal build $CONFIG 35 | - run: cabal haddock $CONFIG 36 | - run: cabal sdist 37 | - run: cabal bench 38 | - run: cabal test 39 | 40 | wasm: 41 | runs-on: ubuntu-latest 42 | env: 43 | GHC_WASM_META_REV: 7927129e42bcd6a54b9e06e26455803fa4878261 44 | strategy: 45 | matrix: 46 | ghc: ['9.10', '9.12'] 47 | fail-fast: false 48 | steps: 49 | - name: Setup wasm32-wasi-ghc 50 | run: | 51 | cd $(mktemp -d) 52 | curl -L https://gitlab.haskell.org/ghc/ghc-wasm-meta/-/archive/$GHC_WASM_META_REV/ghc-wasm-meta.tar.gz | tar xz --strip-components=1 53 | ./setup.sh 54 | ~/.ghc-wasm/add_to_github_path.sh 55 | env: 56 | FLAVOUR: ${{ matrix.ghc }} 57 | - uses: actions/checkout@v4 58 | - uses: actions/cache@v4 59 | with: 60 | path: | 61 | ~/.ghc-wasm/.cabal/store 62 | key: wasi-${{ runner.os }}-${{ env.GHC_WASM_META_REV }}-flavour-${{ matrix.ghc }}-${{ github.sha }} 63 | restore-keys: | 64 | wasi-${{ runner.os }}-${{ env.GHC_WASM_META_REV }}-flavour-${{ matrix.ghc }}- 65 | - name: Build 66 | run: | 67 | wasm32-wasi-cabal build --enable-tests --enable-benchmarks 68 | - name: Test 69 | run: | 70 | wasm32-wasi-cabal test --test-wrapper=wasmtime 71 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "flake-parts": { 4 | "inputs": { 5 | "nixpkgs-lib": "nixpkgs-lib" 6 | }, 7 | "locked": { 8 | "lastModified": 1709336216, 9 | "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=", 10 | "owner": "hercules-ci", 11 | "repo": "flake-parts", 12 | "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2", 13 | "type": "github" 14 | }, 15 | "original": { 16 | "owner": "hercules-ci", 17 | "repo": "flake-parts", 18 | "type": "github" 19 | } 20 | }, 21 | "haskell-flake": { 22 | "locked": { 23 | "lastModified": 1709467224, 24 | "narHash": "sha256-xnPTkLMqq78BiTqt6WXj2TXLupclJi+NEH84HDbQSPc=", 25 | "owner": "srid", 26 | "repo": "haskell-flake", 27 | "rev": "9173cc45aeb72b7e7adfe0e5a53a425fe439e3ca", 28 | "type": "github" 29 | }, 30 | "original": { 31 | "owner": "srid", 32 | "repo": "haskell-flake", 33 | "type": "github" 34 | } 35 | }, 36 | "nixpkgs": { 37 | "locked": { 38 | "lastModified": 1709641832, 39 | "narHash": "sha256-bzzRc3DiV8Cm/67HDa39pyBymqF45ISgUbXqjrMk2UE=", 40 | "owner": "nixos", 41 | "repo": "nixpkgs", 42 | "rev": "bfa8b30043892dc2b660d403faa159bab7b65898", 43 | "type": "github" 44 | }, 45 | "original": { 46 | "owner": "nixos", 47 | "ref": "nixpkgs-unstable", 48 | "repo": "nixpkgs", 49 | "type": "github" 50 | } 51 | }, 52 | "nixpkgs-lib": { 53 | "locked": { 54 | "dir": "lib", 55 | "lastModified": 1709237383, 56 | "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", 57 | "owner": "NixOS", 58 | "repo": "nixpkgs", 59 | "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", 60 | "type": "github" 61 | }, 62 | "original": { 63 | "dir": "lib", 64 | "owner": "NixOS", 65 | "ref": "nixos-unstable", 66 | "repo": "nixpkgs", 67 | "type": "github" 68 | } 69 | }, 70 | "root": { 71 | "inputs": { 72 | "flake-parts": "flake-parts", 73 | "haskell-flake": "haskell-flake", 74 | "nixpkgs": "nixpkgs" 75 | } 76 | } 77 | }, 78 | "root": "root", 79 | "version": 7 80 | } 81 | -------------------------------------------------------------------------------- /bench/Megaparsec.hs: -------------------------------------------------------------------------------- 1 | 2 | module Megaparsec (runSexp, runLongws, runNumcsv, runTm) where 3 | 4 | import Control.Applicative 5 | import qualified Data.ByteString as B 6 | import Text.Megaparsec 7 | import Text.Megaparsec.Byte.Lexer 8 | import Data.Char 9 | import Common 10 | 11 | type Parser = Parsec () B.ByteString 12 | 13 | isLatinLetter :: Char -> Bool 14 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') 15 | 16 | char8 :: Char -> Parser () 17 | char8 c = () <$ single (fromIntegral (ord c)) 18 | 19 | {-# inline skipWhile #-} 20 | skipWhile :: (Char -> Bool) -> Parser () 21 | skipWhile f = () <$ takeWhileP Nothing (f . chr . fromIntegral) 22 | 23 | {-# inline skipWhile1 #-} 24 | skipWhile1 :: (Char -> Bool) -> Parser () 25 | skipWhile1 f = () <$ takeWhile1P Nothing (f . chr . fromIntegral) 26 | 27 | {-# inline takeWhile1 #-} 28 | takeWhile1 :: (Char -> Bool) -> Parser B.ByteString 29 | takeWhile1 f = takeWhile1P Nothing (f . chr . fromIntegral) 30 | 31 | ws, open, close, ident, sexp :: Parser () 32 | ws = skipWhile (\c -> c == ' ' || c == '\n') 33 | open = char8 '(' >> ws 34 | close = char8 ')' >> ws 35 | ident = skipWhile1 isLatinLetter <* ws 36 | sexp = (open *> skipSome sexp <* close) <|> ident 37 | runSexp = runParser sexp "" 38 | 39 | longw, longws :: Parser () 40 | longw = () <$ chunk "thisisalongkeyword" 41 | longws = skipSome (longw *> ws) <* eof 42 | runLongws = runParser longws "" 43 | 44 | numeral, comma, numcsv :: Parser () 45 | numeral = skipWhile1 (\c -> '0' <= c && c <= '9') >> ws 46 | comma = single (fromIntegral (ord ',')) >> ws 47 | numcsv = numeral >> skipSome (comma >> numeral) >> eof 48 | runNumcsv = runParser numcsv "" 49 | 50 | -------------------------------------------------------------------------------- 51 | 52 | ident' :: Parser B.ByteString 53 | ident' = takeWhile1 (\c -> isLatinLetter c || isDigit c) <* ws 54 | 55 | equal = char8 '=' >> ws 56 | semi = char8 ';' >> ws 57 | dot = char8 '.' >> ws 58 | addOp = char8 '+' >> ws 59 | mulOp = char8 '*' >> ws 60 | parl = char8 '(' >> ws 61 | parr = char8 ')' >> ws 62 | 63 | chainl :: (b -> a -> b) -> Parser b -> Parser a -> Parser b 64 | chainl f start elem = start >>= go where 65 | go b = do {!a <- elem; go $! f b a} <|> pure b 66 | {-# inline chainl #-} 67 | 68 | add :: Parser Tm 69 | add = chainl Add mul (addOp *> mul) 70 | 71 | mul :: Parser Tm 72 | mul = chainl Mul spine (mulOp *> spine) 73 | 74 | spine :: Parser Tm 75 | spine = chainl App atom atom 76 | 77 | atom :: Parser Tm 78 | atom = 79 | (Int <$> (decimal <* ws)) 80 | <|> (Var <$> ident') 81 | <|> (parl *> tm <* parr) 82 | 83 | tm :: Parser Tm 84 | tm = (do {_ <- chunk "fun"; ws; x <- ident'; dot; t <- tm; pure (Lam x t)}) 85 | <|> (do {_ <- chunk "let"; ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)}) 86 | <|> add 87 | 88 | runTm = runParser (ws *> tm <* eof) "" 89 | -------------------------------------------------------------------------------- /package.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: flatparse 3 | version: 0.5.3.1 4 | license: MIT 5 | category: Parsing 6 | synopsis: High-performance parsing from strict bytestrings 7 | author: András Kovács 8 | maintainer: puttamalac@gmail.com 9 | copyright: 2021 András Kovács 10 | bug-reports: https://github.com/AndrasKovacs/flatparse/issues 11 | github: AndrasKovacs/flatparse 12 | 13 | description: | 14 | @Flatparse@ is a high-performance parsing library for strict bytestring input. See the README for more information: 15 | . 16 | 17 | tested-with: 18 | - GHC == 8.6.5 19 | - GHC == 8.8.4 20 | - GHC == 8.10.7 21 | - GHC == 9.0.2 22 | - GHC == 9.2.8 23 | - GHC == 9.4.8 24 | - GHC == 9.6.7 25 | - GHC == 9.8.4 26 | 27 | flags: 28 | llvm: 29 | description: use llvm for building 30 | default: false 31 | manual : true 32 | dump: 33 | description: dump core, stg and cmm to files 34 | default: false 35 | manual: true 36 | 37 | dependencies: 38 | - base >= 4.7 && < 5 39 | - utf8-string ^>= 1.0.2 40 | 41 | extra-source-files: 42 | - README.md 43 | 44 | default-extensions: 45 | - BangPatterns 46 | - BlockArguments 47 | - CPP 48 | - ExplicitNamespaces 49 | - LambdaCase 50 | - MagicHash 51 | - OverloadedStrings 52 | - PatternSynonyms 53 | - TemplateHaskell 54 | - TupleSections 55 | 56 | ghc-options: 57 | - -Wall 58 | - -Wno-missing-signatures 59 | - -Wno-name-shadowing 60 | - -Wno-unused-binds 61 | - -Wno-unused-matches 62 | 63 | when: 64 | - condition: flag(dump) 65 | ghc-options: 66 | - -ddump-simpl 67 | - -ddump-stg-final 68 | - -ddump-cmm 69 | - -dsuppress-all 70 | - -dno-suppress-type-signatures 71 | - -ddump-to-file 72 | 73 | - condition: flag(llvm) 74 | ghc-options: 75 | - -fllvm 76 | 77 | library: 78 | source-dirs: 79 | - src 80 | dependencies: 81 | - bytestring 82 | - containers 83 | - integer-gmp 84 | - template-haskell 85 | ghc-options: 86 | - -O2 87 | 88 | tests: 89 | spec: 90 | main: Test.hs 91 | source-dirs: 92 | - test 93 | dependencies: 94 | - HUnit 95 | - bytestring 96 | - flatparse 97 | - hspec 98 | - QuickCheck 99 | - quickcheck-instances 100 | default-extensions: 101 | - ExtendedDefaultRules 102 | ghc-options: 103 | - -O2 104 | - -Wno-type-defaults 105 | 106 | benchmarks: 107 | bench: 108 | source-dirs: bench 109 | main: Bench.hs 110 | dependencies: 111 | - attoparsec 112 | - bytestring 113 | - flatparse 114 | - integer-gmp 115 | - megaparsec 116 | - parsec 117 | - primitive 118 | - tasty-bench 119 | ghc-options: 120 | - -O2 121 | -------------------------------------------------------------------------------- /bench/Parsec.hs: -------------------------------------------------------------------------------- 1 | 2 | module Parsec (runSexp, runLongws, runNumcsv, runTm) where 3 | 4 | import Control.Monad 5 | import Text.Parsec hiding (chainl, digit) 6 | import Text.Parsec.ByteString 7 | import qualified Data.ByteString as B 8 | import Data.Char hiding (isDigit) 9 | import Common 10 | 11 | isLatinLetter :: Char -> Bool 12 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') 13 | 14 | isDigit :: Char -> Bool 15 | isDigit c = '0' <= c && c <= '9' 16 | 17 | ws, open, close, ident, sexp :: Parser () 18 | ws = skipMany (satisfy \c -> c == ' ' || c == '\n') 19 | open = char '(' >> ws 20 | close = char ')' >> ws 21 | ident = skipMany1 (satisfy isLatinLetter) <* ws 22 | sexp = (open *> skipMany1 sexp <* close) <|> ident 23 | runSexp = parse sexp "" 24 | 25 | byteString :: B.ByteString -> Parser () 26 | byteString b = do 27 | i <- getInput 28 | guard $ B.isPrefixOf b i 29 | setInput $ B.drop (B.length b) i 30 | 31 | longw, longws :: Parser () 32 | longw = () <$ byteString "thisisalongkeyword" 33 | longws = skipMany1 (longw *> ws) <* eof 34 | runLongws = parse longws "" 35 | 36 | numeral, comma, numcsv :: Parser () 37 | numeral = skipMany1 (satisfy \c -> '0' <= c && c <= '9') >> ws 38 | comma = char ',' >> ws 39 | numcsv = numeral >> skipMany1 (comma >> numeral) >> eof 40 | runNumcsv = parse numcsv "" 41 | 42 | -------------------------------------------------------------------------------- 43 | 44 | {-# inline byteStringOf #-} 45 | byteStringOf :: Parser a -> Parser B.ByteString 46 | byteStringOf p = do 47 | i <- getInput 48 | _ <- p 49 | i' <- getInput 50 | pure $! B.take (B.length i - B.length i') i 51 | 52 | ident' :: Parser B.ByteString 53 | ident' = byteStringOf (skipMany1 (satisfy \c -> isLatinLetter c || isDigit c)) <* ws 54 | 55 | equal = char '=' >> ws 56 | semi = char ';' >> ws 57 | dot = char '.' >> ws 58 | addOp = char '+' >> ws 59 | mulOp = char '*' >> ws 60 | parl = char '(' >> ws 61 | parr = char ')' >> ws 62 | 63 | chainl :: (b -> a -> b) -> Parser b -> Parser a -> Parser b 64 | chainl f start elem = start >>= go where 65 | go b = do {!a <- elem; go $! f b a} <|> pure b 66 | {-# inline chainl #-} 67 | 68 | digit :: Parser Int 69 | digit = digitToInt <$> satisfy isDigit 70 | 71 | decimal :: Parser Int 72 | decimal = chainl (\x n -> 10*x + n) digit digit 73 | 74 | add :: Parser Tm 75 | add = chainl Add mul (addOp *> mul) 76 | 77 | mul :: Parser Tm 78 | mul = chainl Mul spine (mulOp *> spine) 79 | 80 | spine :: Parser Tm 81 | spine = chainl App atom atom 82 | 83 | atom :: Parser Tm 84 | atom = 85 | (Int <$> (decimal <* ws)) 86 | <|> (Var <$> ident') 87 | <|> (parl *> tm <* parr) 88 | 89 | tm :: Parser Tm 90 | tm = (do {byteString "fun"; ws; x <- ident'; dot; t <- tm; pure (Lam x t)}) 91 | <|> (do {byteString "let"; ws; x <- ident'; equal; t <- tm; semi; u <- tm; pure (Let x t u)}) 92 | <|> add 93 | 94 | runTm = parse (ws *> tm <* eof) "" 95 | -------------------------------------------------------------------------------- /src/FlatParse/Basic/Bytes.hs: -------------------------------------------------------------------------------- 1 | {- | Bytestring parsers. 2 | 3 | Module dependency complications prevent us from placing these in 4 | "FlatParse.Basic.Base". 5 | -} 6 | 7 | module FlatParse.Basic.Bytes 8 | ( bytes, bytesUnsafe 9 | ) where 10 | 11 | import FlatParse.Basic.Parser 12 | import FlatParse.Basic.Base ( withEnsure ) 13 | import FlatParse.Basic.Integers ( word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe ) 14 | import qualified FlatParse.Common.Assorted as Common 15 | import Language.Haskell.TH 16 | import GHC.Exts 17 | import GHC.Word 18 | 19 | -- | Read a sequence of bytes. This is a template function, you can use it as @$(bytes [3, 4, 5])@, 20 | -- for example, and the splice has type @Parser e ()@. For a non-TH variant see 'FlatParse.Basic.byteString'. 21 | bytes :: [Word] -> Q Exp 22 | bytes bs = do 23 | let !len = length bs 24 | [| withEnsure len $(bytesUnsafe bs) |] 25 | 26 | -- | Template function, creates a @Parser e ()@ which unsafely parses a given 27 | -- sequence of bytes. 28 | -- 29 | -- The caller must guarantee that the input has enough bytes. 30 | bytesUnsafe :: [Word] -> Q Exp 31 | bytesUnsafe bytes = do 32 | let !(leading, w8s) = Common.splitBytes $ (fromIntegral :: Word -> Word64) <$> bytes 33 | !scanw8s = go w8s where 34 | go (w8:[] ) = [| word64Unsafe w8 |] 35 | go (w8:w8s) = [| word64Unsafe w8 >> $(go w8s) |] 36 | go [] = [| pure () |] 37 | case w8s of 38 | [] -> go leading 39 | where 40 | go (a:b:c:d:[]) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w |] 41 | go (a:b:c:d:ws) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w >> $(go ws) |] 42 | go (a:b:[]) = let !w = Common.packBytes [a, b] in [| word16Unsafe w |] 43 | go (a:b:ws) = let !w = Common.packBytes [a, b] in [| word16Unsafe w >> $(go ws) |] 44 | go (a:[]) = [| word8Unsafe a |] 45 | go [] = [| pure () |] 46 | _ -> case leading of 47 | 48 | [] -> scanw8s 49 | [a] -> [| word8Unsafe a >> $scanw8s |] 50 | ws@[a, b] -> let !w = Common.packBytes ws in [| word16Unsafe w >> $scanw8s |] 51 | ws@[a, b, c, d] -> let !w = Common.packBytes ws in [| word32Unsafe w >> $scanw8s |] 52 | ws -> let !w = Common.packBytes ws 53 | !l = length ws 54 | in [| scanPartial64# l w >> $scanw8s |] 55 | 56 | scanPartial64# :: Int -> Word64 -> ParserT st e () 57 | scanPartial64# (I# len) (W64# w) = ParserT \fp eob s st -> 58 | case indexWord64OffAddr# s 0# of 59 | w' -> case uncheckedIShiftL# (8# -# len) 3# of 60 | sh -> case uncheckedShiftL64# w' sh of 61 | w' -> case uncheckedShiftRL64# w' sh of 62 | #if MIN_VERSION_base(4,17,0) 63 | w' -> case eqWord64# w w' of 64 | #else 65 | w' -> case eqWord# w w' of 66 | #endif 67 | 1# -> OK# st () (plusAddr# s len) 68 | _ -> Fail# st 69 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful/Bytes.hs: -------------------------------------------------------------------------------- 1 | {- | Bytestring parsers. 2 | 3 | Module dependency complications prevent us from placing these in 4 | "FlatParse.Stateful.Base". 5 | -} 6 | 7 | module FlatParse.Stateful.Bytes 8 | ( bytes, bytesUnsafe 9 | ) where 10 | 11 | import FlatParse.Stateful.Parser 12 | import FlatParse.Stateful.Base ( withEnsure ) 13 | import FlatParse.Stateful.Integers ( word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe ) 14 | import qualified FlatParse.Common.Assorted as Common 15 | import Language.Haskell.TH 16 | import GHC.Exts 17 | import GHC.Word 18 | 19 | -- | Read a sequence of bytes. This is a template function, you can use it as 20 | -- @$(bytes [3, 4, 5])@, for example, and the splice has type @Parser e 21 | -- ()@. For a non-TH variant see 'FlatParse.Stateful.byteString'. 22 | bytes :: [Word] -> Q Exp 23 | bytes bs = do 24 | let !len = length bs 25 | [| withEnsure len $(bytesUnsafe bs) |] 26 | 27 | -- | Template function, creates a @Parser e ()@ which unsafely parses a given 28 | -- sequence of bytes. 29 | -- 30 | -- The caller must guarantee that the input has enough bytes. 31 | bytesUnsafe :: [Word] -> Q Exp 32 | bytesUnsafe bytes = do 33 | let !(leading, w8s) = Common.splitBytes $ (fromIntegral :: Word -> Word64) <$> bytes 34 | !scanw8s = go w8s where 35 | go (w8:[] ) = [| word64Unsafe w8 |] 36 | go (w8:w8s) = [| word64Unsafe w8 >> $(go w8s) |] 37 | go [] = [| pure () |] 38 | case w8s of 39 | [] -> go leading 40 | where 41 | go (a:b:c:d:[]) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w |] 42 | go (a:b:c:d:ws) = let !w = Common.packBytes [a, b, c, d] in [| word32Unsafe w >> $(go ws) |] 43 | go (a:b:[]) = let !w = Common.packBytes [a, b] in [| word16Unsafe w |] 44 | go (a:b:ws) = let !w = Common.packBytes [a, b] in [| word16Unsafe w >> $(go ws) |] 45 | go (a:[]) = [| word8Unsafe a |] 46 | go [] = [| pure () |] 47 | _ -> case leading of 48 | 49 | [] -> scanw8s 50 | [a] -> [| word8Unsafe a >> $scanw8s |] 51 | ws@[a, b] -> let !w = Common.packBytes ws in [| word16Unsafe w >> $scanw8s |] 52 | ws@[a, b, c, d] -> let !w = Common.packBytes ws in [| word32Unsafe w >> $scanw8s |] 53 | ws -> let !w = Common.packBytes ws 54 | !l = length ws 55 | in [| scanPartial64# l w >> $scanw8s |] 56 | 57 | scanPartial64# :: Int -> Word64 -> ParserT st r e () 58 | scanPartial64# (I# len) (W64# w) = ParserT \fp !r eob s n st -> 59 | case indexWord64OffAddr# s 0# of 60 | w' -> case uncheckedIShiftL# (8# -# len) 3# of 61 | sh -> case uncheckedShiftL64# w' sh of 62 | w' -> case uncheckedShiftRL64# w' sh of 63 | #if MIN_VERSION_base(4,17,0) 64 | w' -> case eqWord64# w w' of 65 | #else 66 | w' -> case eqWord# w w' of 67 | #endif 68 | 1# -> OK# st () (plusAddr# s len) n 69 | _ -> Fail# st 70 | -------------------------------------------------------------------------------- /bench/Bench.hs: -------------------------------------------------------------------------------- 1 | {-# options_ghc -Wno-unused-imports #-} 2 | 3 | module Main where 4 | 5 | import Data.Primitive.ByteArray 6 | import Test.Tasty.Bench 7 | 8 | import qualified Data.ByteString.Char8 as B 9 | 10 | import qualified Attoparsec 11 | import qualified Megaparsec 12 | import qualified Parsec 13 | import qualified FPStateful 14 | import qualified FPBasic 15 | import qualified ReadInteger 16 | 17 | import qualified Data.ByteString.UTF8 18 | import FlatParse.Common.Assorted (strToUtf8) 19 | import qualified FlatParse.Basic 20 | 21 | import Common 22 | 23 | sexpInp :: B.ByteString 24 | sexpInp = 25 | B.concat $ "(" : replicate 33333 "(foo (foo (foo ((bar baza)))))" ++ [")"] 26 | 27 | longwsInp :: B.ByteString 28 | longwsInp = B.concat $ replicate 55555 "thisisalongkeyword " 29 | 30 | numcsvInp :: B.ByteString 31 | numcsvInp = B.concat ("0" : [B.pack (", " ++ show n) | n <- [1..100000::Int]]) 32 | 33 | readIntInp :: B.ByteString 34 | readIntInp = "12345678910" 35 | 36 | longString :: String 37 | longString = 38 | concat $ "(" : replicate 33333 "(foo (foo (foo ((bar baza)))))" ++ [")"] 39 | 40 | tmInp :: B.ByteString 41 | tmInp = B.pack (unlines (do 42 | x <- [0..3000::Int] 43 | pure ("let x" ++ show x ++ " = fun f. fun g. fun x. fun y. f (f (f ((g x y + g x y) * g x y * g x y * 13500)));") 44 | ++ ["x1000"])) 45 | 46 | 47 | main :: IO () 48 | main = defaultMain [ 49 | {- 50 | bgroup "String -> UTF-8 ByteString" [ 51 | bench "utf8-string" $ whnf Data.ByteString.UTF8.toString sexpInp, 52 | bench "fp" $ whnf FlatParse.Common.Assorted.utf8ToStr sexpInp 53 | ], 54 | bgroup "UTF-8 ByteString -> String" [ 55 | bench "utf8-string" $ whnf Data.ByteString.UTF8.fromString longString, 56 | bench "fp" $ whnf FlatParse.Common.Assorted.strToUtf8 longString 57 | ] 58 | ] 59 | -} 60 | bgroup "sexp" [ 61 | bench "fpbasic" $ whnf FPBasic.runSexp sexpInp, 62 | bench "fpstateful" $ whnf FPStateful.runSexp sexpInp, 63 | bench "attoparsec" $ whnf Attoparsec.runSexp sexpInp, 64 | bench "megaparsec" $ whnf Megaparsec.runSexp sexpInp, 65 | bench "parsec" $ whnf Parsec.runSexp sexpInp 66 | ], 67 | 68 | bgroup "long keyword" [ 69 | bench "fpbasic" $ whnf FPBasic.runLongws longwsInp, 70 | bench "fpstateful" $ whnf FPStateful.runLongws longwsInp, 71 | bench "attoparsec" $ whnf Attoparsec.runLongws longwsInp, 72 | bench "megaparsec" $ whnf Megaparsec.runLongws longwsInp, 73 | bench "parsec" $ whnf Parsec.runLongws longwsInp 74 | ], 75 | 76 | bgroup "numeral csv" [ 77 | bench "fpbasic" $ whnf FPBasic.runNumcsv numcsvInp, 78 | bench "fpstateful" $ whnf FPStateful.runNumcsv numcsvInp, 79 | bench "attoparsec" $ whnf Attoparsec.runNumcsv numcsvInp, 80 | bench "megaparsec" $ whnf Megaparsec.runNumcsv numcsvInp, 81 | bench "parsec" $ whnf Parsec.runNumcsv numcsvInp 82 | ], 83 | 84 | bgroup "lambda term" [ 85 | bench "fpbasic" $ whnf FPBasic.runTm tmInp, 86 | bench "fpstateful" $ whnf FPStateful.runTm tmInp, 87 | bench "attoparsec" $ whnf Attoparsec.runTm tmInp, 88 | bench "megaparsec" $ whnf Megaparsec.runTm tmInp, 89 | bench "parsec" $ whnf Parsec.runTm tmInp 90 | ], 91 | 92 | bgroup "readInt/readInteger" [ 93 | bench "readInt" $ whnf ReadInteger.readInt readIntInp, 94 | bench "readInteger" $ whnf ReadInteger.readInteger readIntInp 95 | ] 96 | ] 97 | -------------------------------------------------------------------------------- /src/FlatParse/Basic/Addr.hs: -------------------------------------------------------------------------------- 1 | {- | Unsafe, highly dangerous parsing primitives using 'Addr#'. 2 | 3 | Ensure to read the documentation before using any definitions from this module. 4 | 5 | This module exports primitives useful for efficiently parsing binary files that 6 | store data using an internal index. 7 | 8 | Often, such indices describes records using a starting offset and a length. 9 | Offsets are often relative to the file start, or some dynamic address in the 10 | file. This way, individual records can be read out efficiently (much faster than 11 | opening lots of small files!). 12 | 13 | We may parse these in-place efficiently by adding record offsets to a base 14 | memory address somewhere in the input. This is also extremely unsafe, and easy 15 | to get catastrophically wrong. Thus, we provide as much utility as reasonable to 16 | enable performing such parsing safely. (That means CPS functions.) 17 | 18 | Note that all definitions here should be considered unsafe. Any 'Int#' is not 19 | checked for positivity. You must perform any necessary checks when you obtain 20 | your offsets and lengths as 'Int#'. Failure to do so may result in undefined 21 | behaviour. 22 | -} 23 | 24 | module FlatParse.Basic.Addr where 25 | 26 | import FlatParse.Basic.Parser 27 | import FlatParse.Basic.Base ( takeUnsafe#, atSkipUnsafe#, lookahead ) 28 | 29 | import GHC.Exts 30 | 31 | import qualified Data.ByteString as B 32 | 33 | -- | Run a parser, passing it the current address the parser is at. 34 | -- 35 | -- Useful for parsing offset-based data tables. For example, you may use this to 36 | -- save the base address to use together with various relative offsets. 37 | withAddr# :: (Addr# -> ParserT st e a) -> ParserT st e a 38 | withAddr# p = ParserT \fp eob s st -> runParserT# (p s) fp eob s st 39 | {-# inline withAddr# #-} 40 | 41 | -- | @takeOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@ 42 | -- bytes, reads @len#@ bytes into a 'ByteString', and restores the original 43 | -- address. 44 | -- 45 | -- The 'Addr#' should be from 'withAddr#'. 46 | -- 47 | -- Useful for parsing offset-based data tables. Ex: Your file contains an index 48 | -- storing @(OFFSET, LENGTH)@ entries where the offset is the byte position in 49 | -- the file. Begin with @'withAddr#' $ \tableBase# -> ...@, then read each entry 50 | -- like @'takeOffAddr#' tableBase# OFFSET LENGTH@. 51 | -- 52 | -- Fails if you attempt to read outside the input. 53 | -- 54 | -- Undefined behaviour if @offset#@ or @len#@ is negative. 55 | -- 56 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops. 57 | takeOffAddr# :: Addr# -> Int# -> Int# -> ParserT st e B.ByteString 58 | takeOffAddr# addr# offset# len# = withOffAddr# addr# offset# (takeUnsafe# len#) 59 | {-# inline takeOffAddr# #-} 60 | 61 | -- | @withOffAddr# addr# offset# p@ moves to @addr#@, skips @offset#@ 62 | -- bytes, then runs the given parser @p@. 63 | -- 64 | -- The 'Addr#' should be from 'withAddr#'. 65 | -- 66 | -- Fails if you attempt to read outside the input. 67 | -- 68 | -- Undefined behaviour if @offset#@ is negative. 69 | -- 70 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops. 71 | withOffAddr# :: Addr# -> Int# -> ParserT st e a -> ParserT st e a 72 | withOffAddr# addr# offset# = 73 | lookaheadFromAddr# addr# . atSkipUnsafe# offset# 74 | {-# inline withOffAddr# #-} 75 | 76 | -- | 'lookahead', but specify the address to lookahead from. 77 | -- 78 | -- The 'Addr#' should be from 'withAddr#'. 79 | lookaheadFromAddr# :: Addr# -> ParserT st e a -> ParserT st e a 80 | lookaheadFromAddr# s = lookahead . atAddr# s 81 | {-# inline lookaheadFromAddr# #-} 82 | 83 | -- | Run a parser at the given address. 84 | -- 85 | -- The 'Addr#' should be from 'withAddr#'. 86 | -- 87 | -- This is a highly internal function -- you likely want 'lookaheadFromAddr#', 88 | -- which will reset the address after running the parser. 89 | atAddr# :: Addr# -> ParserT st e a -> ParserT st e a 90 | atAddr# s (ParserT p) = ParserT \fp eob _ st -> p fp eob s st 91 | {-# inline atAddr# #-} 92 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful/Addr.hs: -------------------------------------------------------------------------------- 1 | {- | Unsafe, highly dangerous parsing primitives using 'Addr#'. 2 | 3 | Ensure to read the documentation before using any definitions from this module. 4 | 5 | This module exports primitives useful for efficiently parsing binary files that 6 | store data using an internal index. 7 | 8 | Often, such indices describes records using a starting offset and a length. 9 | Offsets are often relative to the file start, or some dynamic address in the 10 | file. This way, individual records can be read out efficiently (much faster than 11 | opening lots of small files!). 12 | 13 | We may parse these in-place efficiently by adding record offsets to a base 14 | memory address somewhere in the input. This is also extremely unsafe, and easy 15 | to get catastrophically wrong. Thus, we provide as much utility as reasonable to 16 | enable performing such parsing safely. (That means CPS functions.) 17 | 18 | Note that all definitions here should be considered unsafe. Any 'Int#' is not 19 | checked for positivity. You must perform any necessary checks when you obtain 20 | your offsets and lengths as 'Int#'. Failure to do so may result in undefined 21 | behaviour. 22 | -} 23 | 24 | module FlatParse.Stateful.Addr where 25 | 26 | import FlatParse.Stateful.Parser 27 | import FlatParse.Stateful.Base ( takeUnsafe#, atSkipUnsafe#, lookahead ) 28 | 29 | import GHC.Exts 30 | 31 | import qualified Data.ByteString as B 32 | 33 | -- | Run a parser, passing it the current address the parser is at. 34 | -- 35 | -- Useful for parsing offset-based data tables. For example, you may use this to 36 | -- save the base address to use together with various relative offsets. 37 | withAddr# :: (Addr# -> ParserT st r e a) -> ParserT st r e a 38 | withAddr# p = ParserT \fp !r eob s n st -> runParserT# (p s) fp r eob s n st 39 | {-# inline withAddr# #-} 40 | 41 | -- | @takeOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@ 42 | -- bytes, reads @len#@ bytes into a 'ByteString', and restores the original 43 | -- address. 44 | -- 45 | -- The 'Addr#' should be from 'withAddr#'. 46 | -- 47 | -- Useful for parsing offset-based data tables. Ex: Your file contains an index 48 | -- storing @(OFFSET, LENGTH)@ entries where the offset is the byte position in 49 | -- the file. Begin with @'withAddr#' $ \tableBase# -> ...@, then read each entry 50 | -- like @'takeOffAddr#' tableBase# OFFSET LENGTH@. 51 | -- 52 | -- Fails if you attempt to read outside the input. 53 | -- 54 | -- Undefined behaviour if @offset#@ or @len#@ is negative. 55 | -- 56 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops. 57 | takeOffAddr# :: Addr# -> Int# -> Int# -> ParserT st r e B.ByteString 58 | takeOffAddr# addr# offset# len# = withOffAddr# addr# offset# (takeUnsafe# len#) 59 | {-# inline takeOffAddr# #-} 60 | 61 | -- | @withOffAddr# addr# offset# p@ moves to @addr#@, skips @offset#@ 62 | -- bytes, then runs the given parser @p@. 63 | -- 64 | -- The 'Addr#' should be from 'withAddr#'. 65 | -- 66 | -- Fails if you attempt to read outside the input. 67 | -- 68 | -- Undefined behaviour if @offset#@ is negative. 69 | -- 70 | -- Name adopted from the similar-ish @indexXOffAddr#@ primops. 71 | withOffAddr# :: Addr# -> Int# -> ParserT st r e a -> ParserT st r e a 72 | withOffAddr# addr# offset# = 73 | lookaheadFromAddr# addr# . atSkipUnsafe# offset# 74 | {-# inline withOffAddr# #-} 75 | 76 | -- | 'lookahead', but specify the address to lookahead from. 77 | -- 78 | -- The 'Addr#' should be from 'withAddr#'. 79 | lookaheadFromAddr# :: Addr# -> ParserT st r e a -> ParserT st r e a 80 | lookaheadFromAddr# s = lookahead . atAddr# s 81 | {-# inline lookaheadFromAddr# #-} 82 | 83 | -- | Run a parser at the given address. 84 | -- 85 | -- The 'Addr#' should be from 'withAddr#'. 86 | -- 87 | -- This is a highly internal function -- you likely want 'lookaheadFromAddr#', 88 | -- which will reset the address after running the parser. 89 | atAddr# :: Addr# -> ParserT st r e a -> ParserT st r e a 90 | atAddr# s (ParserT p) = ParserT \fp !r eob _ n st -> p fp r eob s n st 91 | {-# inline atAddr# #-} 92 | -------------------------------------------------------------------------------- /src/FlatParse/Examples/BasicLambda/Parser.hs: -------------------------------------------------------------------------------- 1 | {-| 2 | This module contains a simple lambda calculus parser. This parser is not optimized for maximum 3 | performance; instead it's written in a style which emulates the look and feel of conventional 4 | monadic parsers. An optimized implementation would use low-level `switch` expressions more often. 5 | -} 6 | 7 | {-# language StrictData #-} 8 | 9 | module FlatParse.Examples.BasicLambda.Parser where 10 | 11 | import Data.Char (ord) 12 | import qualified Data.ByteString as B 13 | 14 | import FlatParse.Basic hiding (Parser, runParser, string, char, cut) 15 | import FlatParse.Examples.BasicLambda.Lexer 16 | 17 | -------------------------------------------------------------------------------- 18 | 19 | type Name = B.ByteString 20 | 21 | {-| 22 | A term in the language. The precedences of different constructs are the following, in decreasing 23 | order of strength: 24 | 25 | * Identifiers, literals and parenthesized expressions 26 | * Function application (left assoc) 27 | * Multiplication (left assoc) 28 | * Addition (left assoc) 29 | * Equality, less-than (non-assoc) 30 | * @lam@, @let@, @if@ (right assoc) 31 | 32 | -} 33 | data Tm 34 | = Var Name -- ^ @x@ 35 | | App Tm Tm -- ^ @t u@ 36 | | Lam Name Tm -- ^ @lam x. t@ 37 | | Let Name Tm Tm -- ^ @let x = t in u@ 38 | | BoolLit Bool -- ^ @true@ or @false@. 39 | | IntLit Int -- ^ A positive `Int` literal. 40 | | If Tm Tm Tm -- ^ @if t then u else v@ 41 | | Add Tm Tm -- ^ @t + u@ 42 | | Mul Tm Tm -- ^ @t * u@ 43 | | Eq Tm Tm -- ^ @t == u@ 44 | | Lt Tm Tm -- ^ @t < u@ 45 | deriving Show 46 | 47 | 48 | -- | Parse an identifier. This parser uses `isKeyword` to check that an identifier is not a 49 | -- keyword. 50 | ident :: Parser Name 51 | ident = token $ byteStringOf $ 52 | withSpan (identStartChar *> skipMany identChar) (\_ span -> fails (isKeyword span)) 53 | 54 | -- | Parse an identifier, throw a precise error on failure. 55 | ident' :: Parser Name 56 | ident' = ident `cut'` (Msg "identifier") 57 | 58 | digit :: Parser Int 59 | digit = (\c -> ord c - ord '0') <$> satisfyAscii isDigit 60 | 61 | int :: Parser Int 62 | int = token do 63 | (place, n) <- chainr (\n (!place, !acc) -> (place*10,acc+place*n)) digit (pure (1, 0)) 64 | case place of 65 | 1 -> empty 66 | _ -> pure n 67 | 68 | -- | Parse a literal, identifier or parenthesized expression. 69 | atom :: Parser Tm 70 | atom = 71 | (Var <$> ident) 72 | <|> (BoolLit True <$ $(keyword "true")) 73 | <|> (BoolLit False <$ $(keyword "false")) 74 | <|> (IntLit <$> int) 75 | <|> ($(symbol "(") *> tm' <* $(symbol' ")")) 76 | 77 | atom' :: Parser Tm 78 | atom' = atom 79 | `cut` [Msg "identifier", "true", "false", Msg "parenthesized expression", Msg "integer literal"] 80 | 81 | -- | Parse an `App`-level expression. 82 | app' :: Parser Tm 83 | app' = chainl App atom' atom 84 | 85 | -- | Parse a `Mul`-level expression. 86 | mul' :: Parser Tm 87 | mul' = chainl Mul app' ($(symbol "*") *> app') 88 | 89 | -- | Parse an `Add`-level expression. 90 | add' :: Parser Tm 91 | add' = chainl Add mul' ($(symbol "+") *> mul') 92 | 93 | -- | Parse an `FlatParse.Examples.BasicLambda.Parser.Eq` or `Lt`-level expression. 94 | eqLt' :: Parser Tm 95 | eqLt' = 96 | add' >>= \e1 -> 97 | branch $(symbol "==") (Eq e1 <$> add') $ 98 | branch $(symbol "<") (Lt e1 <$> add') $ 99 | pure e1 100 | 101 | -- | Parse a `Let`. 102 | pLet :: Parser Tm 103 | pLet = do 104 | $(keyword "let") 105 | x <- ident' 106 | $(symbol' "=") 107 | t <- tm' 108 | $(keyword' "in") 109 | u <- tm' 110 | pure $ Let x t u 111 | 112 | -- | Parse a `Lam`. 113 | lam :: Parser Tm 114 | lam = do 115 | $(keyword "lam") 116 | x <- ident' 117 | $(symbol' ".") 118 | t <- tm' 119 | pure $ Lam x t 120 | 121 | -- | Parse an `If`. 122 | pIf :: Parser Tm 123 | pIf = do 124 | $(keyword "if") 125 | t <- tm' 126 | $(keyword' "then") 127 | u <- tm' 128 | $(keyword' "else") 129 | v <- tm' 130 | pure $ If t u v 131 | 132 | -- | Parse any `Tm`. 133 | tm' :: Parser Tm 134 | tm' = (pLet <|> lam <|> pIf <|> eqLt') `cut` ["let", "lam", "if"] 135 | 136 | -- | Parse a complete source file. 137 | src' :: Parser Tm 138 | src' = ws *> tm' <* eof `cut` [Msg "end of input (lexical error)"] 139 | 140 | 141 | -- Examples 142 | -------------------------------------------------------------------------------- 143 | 144 | -- testParser src' p1 145 | p1 = unlines [ 146 | "let f = lam x. lam y. x (x (x y)) in", 147 | "let g = if f true then false else true in", 148 | "let h = f x y + 200 in", 149 | "f g g h" 150 | ] 151 | -------------------------------------------------------------------------------- /src/FlatParse/Common/Switch.hs: -------------------------------------------------------------------------------- 1 | module FlatParse.Common.Switch where 2 | 3 | import Control.Monad (forM) 4 | import Data.Foldable (foldl') 5 | import Data.Map (Map) 6 | import Language.Haskell.TH 7 | 8 | import qualified Data.Map.Strict as M 9 | 10 | import FlatParse.Common.Assorted 11 | 12 | -- Switch trie compilation 13 | -------------------------------------------------------------------------------- 14 | 15 | data Trie a = Branch !a !(Map Word (Trie a)) 16 | deriving Show 17 | 18 | type Rule = Maybe Int 19 | 20 | nilTrie :: Trie Rule 21 | nilTrie = Branch Nothing mempty 22 | 23 | updRule :: Int -> Maybe Int -> Maybe Int 24 | updRule rule = Just . maybe rule (min rule) 25 | 26 | insert :: Int -> [Word] -> Trie Rule -> Trie Rule 27 | insert rule = go where 28 | go [] (Branch rule' ts) = 29 | Branch (updRule rule rule') ts 30 | go (c:cs) (Branch rule' ts) = 31 | Branch rule' (M.alter (Just . maybe (go cs nilTrie) (go cs)) c ts) 32 | 33 | listToTrie :: [(Int, String)] -> Trie Rule 34 | listToTrie = foldl' (\t (!r, !s) -> insert r (charToBytes =<< s) t) nilTrie 35 | 36 | -- | Decorate a trie with the minimum lengths of non-empty paths. This 37 | -- is used later to place `ensure`. 38 | mindepths :: Trie Rule -> Trie (Rule, Int) 39 | mindepths (Branch rule ts) = 40 | if M.null ts then 41 | Branch (rule, 0) mempty 42 | else 43 | let !ts' = M.map mindepths ts 44 | !min = minimum (M.map (\(Branch (rule,d) _) -> maybe (d + 1) (\_ -> 1) rule) ts') in 45 | Branch (rule, min) ts' 46 | 47 | data Trie' a 48 | = Branch' !a !(Map Word (Trie' a)) 49 | | Path !a ![Word] !(Trie' a) 50 | deriving Show 51 | 52 | -- | Compress linear paths. 53 | pathify :: Trie (Rule, Int) -> Trie' (Rule, Int) 54 | pathify (Branch a ts) = case M.toList ts of 55 | [] -> Branch' a mempty 56 | [(w, t)] -> case pathify t of 57 | Path (Nothing, _) ws t -> Path a (w:ws) t 58 | t -> Path a [w] t 59 | _ -> Branch' a (M.map pathify ts) 60 | 61 | -- | Compute where to fall back after we exhausted a branch. If the branch is 62 | -- empty, that means we've succeded at reading and we jump to the rhs rule. 63 | fallbacks :: Trie' (Rule, Int) -> Trie' (Rule, Int, Int) 64 | fallbacks = go Nothing 0 where 65 | go :: Rule -> Int -> Trie' (Rule, Int) -> Trie' (Rule, Int, Int) 66 | go !rule !n (Branch' (rule', d) ts) 67 | | M.null ts = Branch' (rule', 0, d) mempty 68 | | Nothing <- rule' = Branch' (rule, n, d) (go rule (n + 1) <$> ts) 69 | | otherwise = Branch' (rule', 0, d) (go rule' 1 <$> ts) 70 | go rule n (Path (rule', d) ws t) 71 | | Nothing <- rule' = Path (rule, n, d) ws (go rule (n + length ws) t) 72 | | otherwise = Path (rule', 0, d) ws (go rule' (length ws) t) 73 | 74 | -- | Decorate with `ensure` invocations, represented as 75 | -- `Maybe Int`. 76 | ensureBytes :: Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int) 77 | ensureBytes = go 0 where 78 | go :: Int -> Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int) 79 | go !res = \case 80 | Branch' (r, n, d) ts 81 | | M.null ts -> Branch' (r, n, Nothing) mempty 82 | | res < 1 -> Branch' (r, n, Just d ) (go (d - 1) <$> ts) 83 | | otherwise -> Branch' (r, n, Nothing) (go (res - 1) <$> ts) 84 | Path (r, n, d) ws t -> case length ws of 85 | l | res < l -> Path (r, n, Just d ) ws (go (d - l) t) 86 | | otherwise -> Path (r, n, Nothing) ws (go (res - l) t) 87 | 88 | compileTrie :: [(Int, String)] -> Trie' (Rule, Int, Maybe Int) 89 | compileTrie = ensureBytes . fallbacks . pathify . mindepths . listToTrie 90 | 91 | parseSwitch :: Q Exp -> Q ([(String, Exp)], Maybe Exp) 92 | parseSwitch exp = exp >>= \case 93 | CaseE (UnboundVarE _) [] -> error "switch: empty clause list" 94 | CaseE (UnboundVarE _) cases -> do 95 | (!cases, !last) <- pure (init cases, last cases) 96 | !cases <- forM cases \case 97 | Match (LitP (StringL str)) (NormalB rhs) [] -> pure (str, rhs) 98 | _ -> error "switch: expected a match clause on a string literal" 99 | (!cases, !last) <- case last of 100 | Match (LitP (StringL str)) (NormalB rhs) [] -> pure (cases ++ [(str, rhs)], Nothing) 101 | Match WildP (NormalB rhs) [] -> pure (cases, Just rhs) 102 | _ -> error "switch: expected a match clause on a string literal or a wildcard" 103 | pure (cases, last) 104 | _ -> error "switch: expected a \"case _ of\" expression" 105 | 106 | makeRawSwitch :: [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp 107 | makeRawSwitch branches deflt = do 108 | branches <- forM branches $ \(s, body) -> do 109 | body <- body 110 | pure $ Match (LitP (StringL s)) (NormalB body) [] 111 | branches <- case deflt of 112 | Nothing -> pure branches 113 | Just deflt -> do 114 | deflt <- deflt 115 | pure $ branches ++ [Match WildP (NormalB deflt) []] 116 | pure $ CaseE (UnboundVarE (mkName "_")) branches 117 | -------------------------------------------------------------------------------- /flatparse.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 1.12 2 | 3 | -- This file has been generated from package.yaml by hpack version 0.38.1. 4 | -- 5 | -- see: https://github.com/sol/hpack 6 | 7 | name: flatparse 8 | version: 0.5.3.1 9 | synopsis: High-performance parsing from strict bytestrings 10 | description: @Flatparse@ is a high-performance parsing library for strict bytestring input. See the README for more information: 11 | . 12 | category: Parsing 13 | homepage: https://github.com/AndrasKovacs/flatparse#readme 14 | bug-reports: https://github.com/AndrasKovacs/flatparse/issues 15 | author: András Kovács 16 | maintainer: puttamalac@gmail.com 17 | copyright: 2021 András Kovács 18 | license: MIT 19 | license-file: LICENSE 20 | build-type: Simple 21 | tested-with: 22 | GHC == 8.6.5 23 | , GHC == 8.8.4 24 | , GHC == 8.10.7 25 | , GHC == 9.0.2 26 | , GHC == 9.2.8 27 | , GHC == 9.4.8 28 | , GHC == 9.6.7 29 | , GHC == 9.8.4 30 | extra-source-files: 31 | README.md 32 | 33 | source-repository head 34 | type: git 35 | location: https://github.com/AndrasKovacs/flatparse 36 | 37 | flag dump 38 | description: dump core, stg and cmm to files 39 | manual: True 40 | default: False 41 | 42 | flag llvm 43 | description: use llvm for building 44 | manual: True 45 | default: False 46 | 47 | library 48 | exposed-modules: 49 | FlatParse.Basic 50 | FlatParse.Basic.Addr 51 | FlatParse.Basic.Base 52 | FlatParse.Basic.Bytes 53 | FlatParse.Basic.Integers 54 | FlatParse.Basic.Parser 55 | FlatParse.Basic.Switch 56 | FlatParse.Basic.Text 57 | FlatParse.Common.Assorted 58 | FlatParse.Common.GHCExts 59 | FlatParse.Common.Numbers 60 | FlatParse.Common.Parser 61 | FlatParse.Common.Position 62 | FlatParse.Common.Switch 63 | FlatParse.Examples.BasicLambda.Lexer 64 | FlatParse.Examples.BasicLambda.Parser 65 | FlatParse.Stateful 66 | FlatParse.Stateful.Addr 67 | FlatParse.Stateful.Base 68 | FlatParse.Stateful.Bytes 69 | FlatParse.Stateful.Integers 70 | FlatParse.Stateful.Parser 71 | FlatParse.Stateful.Switch 72 | FlatParse.Stateful.Text 73 | other-modules: 74 | Paths_flatparse 75 | hs-source-dirs: 76 | src 77 | default-extensions: 78 | BangPatterns 79 | BlockArguments 80 | CPP 81 | ExplicitNamespaces 82 | LambdaCase 83 | MagicHash 84 | OverloadedStrings 85 | PatternSynonyms 86 | TemplateHaskell 87 | TupleSections 88 | ghc-options: -Wall -Wno-missing-signatures -Wno-name-shadowing -Wno-unused-binds -Wno-unused-matches -O2 89 | build-depends: 90 | base >=4.7 && <5 91 | , bytestring 92 | , containers 93 | , integer-gmp 94 | , template-haskell 95 | , utf8-string >=1.0.2 && <1.1 96 | default-language: Haskell2010 97 | if flag(dump) 98 | ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file 99 | if flag(llvm) 100 | ghc-options: -fllvm 101 | 102 | test-suite spec 103 | type: exitcode-stdio-1.0 104 | main-is: Test.hs 105 | other-modules: 106 | Paths_flatparse 107 | hs-source-dirs: 108 | test 109 | default-extensions: 110 | BangPatterns 111 | BlockArguments 112 | CPP 113 | ExplicitNamespaces 114 | LambdaCase 115 | MagicHash 116 | OverloadedStrings 117 | PatternSynonyms 118 | TemplateHaskell 119 | TupleSections 120 | ExtendedDefaultRules 121 | ghc-options: -Wall -Wno-missing-signatures -Wno-name-shadowing -Wno-unused-binds -Wno-unused-matches -O2 -Wno-type-defaults 122 | build-depends: 123 | HUnit 124 | , QuickCheck 125 | , base >=4.7 && <5 126 | , bytestring 127 | , flatparse 128 | , hspec 129 | , quickcheck-instances 130 | , utf8-string >=1.0.2 && <1.1 131 | default-language: Haskell2010 132 | if flag(dump) 133 | ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file 134 | if flag(llvm) 135 | ghc-options: -fllvm 136 | 137 | benchmark bench 138 | type: exitcode-stdio-1.0 139 | main-is: Bench.hs 140 | other-modules: 141 | Attoparsec 142 | Common 143 | FPBasic 144 | FPStateful 145 | Megaparsec 146 | Parsec 147 | ReadInteger 148 | Paths_flatparse 149 | hs-source-dirs: 150 | bench 151 | default-extensions: 152 | BangPatterns 153 | BlockArguments 154 | CPP 155 | ExplicitNamespaces 156 | LambdaCase 157 | MagicHash 158 | OverloadedStrings 159 | PatternSynonyms 160 | TemplateHaskell 161 | TupleSections 162 | ghc-options: -Wall -Wno-missing-signatures -Wno-name-shadowing -Wno-unused-binds -Wno-unused-matches -O2 163 | build-depends: 164 | attoparsec 165 | , base >=4.7 && <5 166 | , bytestring 167 | , flatparse 168 | , integer-gmp 169 | , megaparsec 170 | , parsec 171 | , primitive 172 | , tasty-bench 173 | , utf8-string >=1.0.2 && <1.1 174 | default-language: Haskell2010 175 | if flag(dump) 176 | ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file 177 | if flag(llvm) 178 | ghc-options: -fllvm 179 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful/Switch.hs: -------------------------------------------------------------------------------- 1 | -- | Efficient literal branching using Template Haskell. 2 | 3 | module FlatParse.Stateful.Switch 4 | ( switch, switchWithPost, rawSwitchWithPost 5 | ) where 6 | 7 | import Control.Monad 8 | import Data.Foldable 9 | import Data.Map (Map) 10 | import Language.Haskell.TH 11 | 12 | import qualified Data.Map.Strict as M 13 | 14 | import FlatParse.Common.Switch 15 | import FlatParse.Stateful.Base ( ensure, skipBack, branch, failed ) 16 | import FlatParse.Stateful.Bytes ( bytesUnsafe ) 17 | import FlatParse.Stateful.Integers ( anyWord8Unsafe ) 18 | 19 | {-| 20 | This is a template function which makes it possible to branch on a collection of string literals in 21 | an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing 22 | operations, which has optimized control flow, vectorized reads and grouped checking for needed input 23 | bytes. 24 | 25 | The syntax is slightly magical, it overloads the usual @case@ expression. An example: 26 | 27 | @ 28 | $(switch [| case _ of 29 | "foo" -> pure True 30 | "bar" -> pure False |]) 31 | @ 32 | 33 | The underscore is mandatory in @case _ of@. Each branch must be a string literal, but optionally 34 | we may have a default case, like in 35 | 36 | @ 37 | $(switch [| case _ of 38 | "foo" -> pure 10 39 | "bar" -> pure 20 40 | _ -> pure 30 |]) 41 | @ 42 | 43 | All case right hand sides must be parsers with the same type. That type is also the type 44 | of the whole `switch` expression. 45 | 46 | A `switch` has longest match semantics, and the order of cases does not matter, except for 47 | the default case, which may only appear as the last case. 48 | 49 | If a `switch` does not have a default case, and no case matches the input, then it returns with 50 | failure, \without\ having consumed any input. A fallthrough to the default case also does not 51 | consume any input. 52 | -} 53 | switch :: Q Exp -> Q Exp 54 | switch = switchWithPost Nothing 55 | 56 | {-| 57 | Switch expression with an optional first argument for performing a post-processing action after 58 | every successful branch matching. For example, if we have @ws :: ParserT st r e ()@ for a 59 | whitespace parser, we might want to consume whitespace after matching on any of the switch 60 | cases. For that case, we can define a "lexeme" version of `switch` as follows. 61 | 62 | @ 63 | switch' :: Q Exp -> Q Exp 64 | switch' = switchWithPost (Just [| ws |]) 65 | @ 66 | 67 | Note that this @switch'@ function cannot be used in the same module it's defined in, because of the 68 | stage restriction of Template Haskell. 69 | -} 70 | switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp 71 | switchWithPost postAction exp = do 72 | !postAction <- sequence postAction 73 | (!cases, !fallback) <- parseSwitch exp 74 | genTrie $! genSwitchTrie' postAction cases fallback 75 | 76 | -- | Version of `switchWithPost` without syntactic sugar. The second argument is the 77 | -- list of cases, the third is the default case. 78 | rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp 79 | rawSwitchWithPost postAction cases fallback = do 80 | !postAction <- sequence postAction 81 | !cases <- forM cases \(str, rhs) -> (str,) <$> rhs 82 | !fallback <- sequence fallback 83 | genTrie $! genSwitchTrie' postAction cases fallback 84 | 85 | #if MIN_VERSION_base(4,15,0) 86 | mkDoE = DoE Nothing 87 | {-# inline mkDoE #-} 88 | #else 89 | mkDoE = DoE 90 | {-# inline mkDoE #-} 91 | #endif 92 | 93 | genTrie :: (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -> Q Exp 94 | genTrie (rules, t) = do 95 | branches <- traverse (\e -> (,) <$> (newName "rule") <*> pure e) rules 96 | 97 | let ix m k = case M.lookup k m of 98 | Nothing -> error ("key not in map: " ++ show k) 99 | Just a -> a 100 | 101 | let ensure' :: Maybe Int -> Maybe (Q Exp) 102 | ensure' = fmap (\n -> [| ensure n |]) 103 | 104 | fallback :: Rule -> Int -> Q Exp 105 | fallback rule 0 = pure $ VarE $ fst $ ix branches rule 106 | fallback rule n = [| skipBack n >> $(pure $ VarE $ fst $ ix branches rule) |] 107 | 108 | let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp 109 | go = \case 110 | Branch' (r, n, alloc) ts 111 | | M.null ts -> pure $ VarE $ fst $ branches M.! r 112 | | otherwise -> do 113 | !next <- (traverse . traverse) go (M.toList ts) 114 | !defaultCase <- fallback r (n + 1) 115 | 116 | let cases = mkDoE $ 117 | [BindS (VarP (mkName "c")) (VarE 'anyWord8Unsafe), 118 | NoBindS (CaseE (VarE (mkName "c")) 119 | (map (\(w, t) -> 120 | Match (LitP (IntegerL (fromIntegral w))) 121 | (NormalB t) 122 | []) 123 | next 124 | ++ [Match WildP (NormalB defaultCase) []]))] 125 | 126 | case ensure' alloc of 127 | Nothing -> pure cases 128 | Just alloc -> [| branch $alloc $(pure cases) $(fallback r n) |] 129 | 130 | Path (r, n, alloc) ws t -> 131 | case ensure' alloc of 132 | Nothing -> [| branch $(bytesUnsafe ws) $(go t) $(fallback r n)|] 133 | Just alloc -> [| branch ($alloc >> $(bytesUnsafe ws)) $(go t) $(fallback r n) |] 134 | 135 | letE 136 | (map (\(x, rhs) -> valD (varP x) (normalB (pure rhs)) []) (Data.Foldable.toList branches)) 137 | (go t) 138 | 139 | genSwitchTrie' :: Maybe Exp -> [(String, Exp)] -> Maybe Exp 140 | -> (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) 141 | genSwitchTrie' postAction cases fallback = 142 | 143 | let (!branches, !strings) = unzip do 144 | (!i, (!str, !rhs)) <- zip [0..] cases 145 | case postAction of 146 | Nothing -> pure ((Just i, rhs), (i, str)) 147 | Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str)) 148 | 149 | !m = M.fromList ((Nothing, maybe (VarE 'failed) id fallback) : branches) 150 | !trie = compileTrie strings 151 | in (m , trie) 152 | -------------------------------------------------------------------------------- /src/FlatParse/Basic/Switch.hs: -------------------------------------------------------------------------------- 1 | -- | Efficient literal branching using Template Haskell. 2 | 3 | module FlatParse.Basic.Switch 4 | ( switch, switchWithPost, rawSwitchWithPost 5 | ) where 6 | 7 | import Control.Monad 8 | import Data.Foldable 9 | import Data.Map (Map) 10 | import Language.Haskell.TH 11 | 12 | import qualified Data.Map.Strict as M 13 | 14 | import FlatParse.Common.Switch 15 | import FlatParse.Basic.Base ( ensure, skipBack, branch, failed ) 16 | import FlatParse.Basic.Bytes ( bytesUnsafe ) 17 | import FlatParse.Basic.Integers ( anyWord8Unsafe ) 18 | 19 | {-| 20 | This is a template function which makes it possible to branch on a collection of string literals in 21 | an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing 22 | operations, which has optimized control flow, vectorized reads and grouped checking for needed input 23 | bytes. 24 | 25 | The syntax is slightly magical, it overloads the usual @case@ expression. An example: 26 | 27 | @ 28 | $(switch [| case _ of 29 | "foo" -> pure True 30 | "bar" -> pure False |]) 31 | @ 32 | 33 | The underscore is mandatory in @case _ of@. Each branch must be a string literal, but optionally 34 | we may have a default case, like in 35 | 36 | @ 37 | $(switch [| case _ of 38 | "foo" -> pure 10 39 | "bar" -> pure 20 40 | _ -> pure 30 |]) 41 | @ 42 | 43 | All case right hand sides must be parsers with the same type. That type is also the type 44 | of the whole `switch` expression. 45 | 46 | A `switch` has longest match semantics, and the order of cases does not matter, except for 47 | the default case, which may only appear as the last case. 48 | 49 | If a `switch` does not have a default case, and no case matches the input, then it returns with 50 | failure, \without\ having consumed any input. A fallthrough to the default case also does not 51 | consume any input. 52 | -} 53 | switch :: Q Exp -> Q Exp 54 | switch = switchWithPost Nothing 55 | 56 | {-| 57 | Switch expression with an optional first argument for performing a post-processing action after 58 | every successful branch matching, not including the default branch. For example, if we have 59 | @ws :: ParserT st e ()@ for a whitespace parser, we might want to consume whitespace after matching 60 | on any of the switch cases. For that case, we can define a "lexeme" version of `switch` as 61 | follows. 62 | 63 | @ 64 | switch' :: Q Exp -> Q Exp 65 | switch' = switchWithPost (Just [| ws |]) 66 | @ 67 | 68 | Note that this @switch'@ function cannot be used in the same module it's defined in, because of the 69 | stage restriction of Template Haskell. 70 | -} 71 | switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp 72 | switchWithPost postAction exp = do 73 | !postAction <- sequence postAction 74 | (!cases, !fallback) <- parseSwitch exp 75 | genTrie $! genSwitchTrie' postAction cases fallback 76 | 77 | -- | Version of `switchWithPost` without syntactic sugar. The second argument is the 78 | -- list of cases, the third is the default case. 79 | rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp 80 | rawSwitchWithPost postAction cases fallback = do 81 | !postAction <- sequence postAction 82 | !cases <- forM cases \(str, rhs) -> (str,) <$> rhs 83 | !fallback <- sequence fallback 84 | genTrie $! genSwitchTrie' postAction cases fallback 85 | 86 | #if MIN_VERSION_base(4,15,0) 87 | mkDoE = DoE Nothing 88 | {-# inline mkDoE #-} 89 | #else 90 | mkDoE = DoE 91 | {-# inline mkDoE #-} 92 | #endif 93 | 94 | genTrie :: (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -> Q Exp 95 | genTrie (rules, t) = do 96 | branches <- traverse (\e -> (,) <$> (newName "rule") <*> pure e) rules 97 | 98 | let ix m k = case M.lookup k m of 99 | Nothing -> error ("key not in map: " ++ show k) 100 | Just a -> a 101 | 102 | let ensure' :: Maybe Int -> Maybe (Q Exp) 103 | ensure' = fmap (\n -> [| ensure n |]) 104 | 105 | fallback :: Rule -> Int -> Q Exp 106 | fallback rule 0 = pure $ VarE $ fst $ ix branches rule 107 | fallback rule n = [| skipBack n >> $(pure $ VarE $ fst $ ix branches rule) |] 108 | 109 | let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp 110 | go = \case 111 | Branch' (r, n, alloc) ts 112 | | M.null ts -> pure $ VarE $ fst $ branches M.! r 113 | | otherwise -> do 114 | !next <- (traverse . traverse) go (M.toList ts) 115 | !defaultCase <- fallback r (n + 1) 116 | 117 | let cases = mkDoE $ 118 | [BindS (VarP (mkName "c")) (VarE 'anyWord8Unsafe), 119 | NoBindS (CaseE (VarE (mkName "c")) 120 | (map (\(w, t) -> 121 | Match (LitP (IntegerL (fromIntegral w))) 122 | (NormalB t) 123 | []) 124 | next 125 | ++ [Match WildP (NormalB defaultCase) []]))] 126 | 127 | case ensure' alloc of 128 | Nothing -> pure cases 129 | Just alloc -> [| branch $alloc $(pure cases) $(fallback r n) |] 130 | 131 | Path (r, n, alloc) ws t -> 132 | case ensure' alloc of 133 | Nothing -> [| branch $(bytesUnsafe ws) $(go t) $(fallback r n)|] 134 | Just alloc -> [| branch ($alloc >> $(bytesUnsafe ws)) $(go t) $(fallback r n) |] 135 | 136 | letE 137 | (map (\(x, rhs) -> valD (varP x) (normalB (pure rhs)) []) (Data.Foldable.toList branches)) 138 | (go t) 139 | 140 | genSwitchTrie' :: Maybe Exp -> [(String, Exp)] -> Maybe Exp 141 | -> (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) 142 | genSwitchTrie' postAction cases fallback = 143 | 144 | let (!branches, !strings) = unzip do 145 | (!i, (!str, !rhs)) <- zip [0..] cases 146 | case postAction of 147 | Nothing -> pure ((Just i, rhs), (i, str)) 148 | Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str)) 149 | 150 | !m = M.fromList ((Nothing, maybe (VarE 'failed) id fallback) : branches) 151 | !trie = compileTrie strings 152 | in (m , trie) 153 | -------------------------------------------------------------------------------- /src/FlatParse/Basic/Parser.hs: -------------------------------------------------------------------------------- 1 | -- | Minimal parser definition. 2 | 3 | {-# LANGUAGE UnboxedTuples #-} 4 | {-# LANGUAGE KindSignatures #-} 5 | {-# LANGUAGE DataKinds #-} -- needed for manual ZeroBitType def (unsure why) 6 | {-# LANGUAGE FlexibleInstances #-} 7 | 8 | module FlatParse.Basic.Parser 9 | ( 10 | -- * Parser 11 | ParserT(..) 12 | , Parser, ParserIO, ParserST 13 | , pureLazy 14 | 15 | -- * Result 16 | , type Res# 17 | , pattern OK#, pattern Err#, pattern Fail# 18 | 19 | -- ** Internal 20 | , type ResI# 21 | 22 | -- * Choice operator (defined with right associativity) 23 | , (<|>) 24 | ) where 25 | 26 | import FlatParse.Common.GHCExts ( Addr#, unsafeCoerce#, ZeroBitType ) 27 | import FlatParse.Common.Parser 28 | 29 | import GHC.ForeignPtr ( ForeignPtrContents ) 30 | 31 | import qualified Control.Applicative 32 | import Control.Monad ( MonadPlus(..) ) 33 | import Control.Monad.IO.Class ( MonadIO(..) ) 34 | import GHC.IO ( IO(IO) ) 35 | 36 | -- | @ParserT st e a@ is a parser with a state token type @st@, an error type 37 | -- @e@ and a return type @a@. The different state token types support 38 | -- different embedded effects; see `Parser`, `ParserIO` and `ParserST` below. 39 | newtype ParserT (st :: ZeroBitType) e a = 40 | ParserT { runParserT# :: ForeignPtrContents -> Addr# -> Addr# -> st -> Res# st e a } 41 | 42 | -- | The type of pure parsers. 43 | type Parser = ParserT PureMode 44 | 45 | -- | The type of parsers which can embed `IO` actions. 46 | type ParserIO = ParserT IOMode 47 | 48 | -- | The type of parsers which can embed `ST` actions. 49 | type ParserST s = ParserT (STMode s) 50 | 51 | -- | You may lift IO actions into a 'ParserIO' using `liftIO`. 52 | instance MonadIO (ParserIO e) where 53 | liftIO (IO a) = ParserT \fp eob s rw -> 54 | case a rw of (# rw', a #) -> OK# rw' a s 55 | {-# inline liftIO #-} 56 | 57 | instance Functor (ParserT st e) where 58 | fmap f (ParserT g) = ParserT \fp eob s st -> case g fp eob s st of 59 | OK# st' a s -> let !b = f a in OK# st' b s 60 | x -> unsafeCoerce# x 61 | {-# inline fmap #-} 62 | 63 | (<$) a' (ParserT g) = ParserT \fp eob s st -> case g fp eob s st of 64 | OK# st' _a s -> OK# st' a' s 65 | x -> unsafeCoerce# x 66 | {-# inline (<$) #-} 67 | 68 | instance Applicative (ParserT st e) where 69 | pure !a = ParserT \fp eob s st -> OK# st a s 70 | {-# inline pure #-} 71 | ParserT ff <*> ParserT fa = ParserT \fp eob s st -> case ff fp eob s st of 72 | OK# st' f s -> case fa fp eob s st' of 73 | OK# st'' a s -> let !b = f a in OK# st'' b s 74 | x -> unsafeCoerce# x 75 | x -> unsafeCoerce# x 76 | {-# inline (<*>) #-} 77 | ParserT fa <* ParserT fb = ParserT \fp eob s st -> case fa fp eob s st of 78 | OK# st' a s -> case fb fp eob s st' of 79 | OK# st'' _b s -> OK# st'' a s 80 | x -> unsafeCoerce# x 81 | x -> unsafeCoerce# x 82 | {-# inline (<*) #-} 83 | ParserT fa *> ParserT fb = ParserT \fp eob s st -> case fa fp eob s st of 84 | OK# st' _a s -> fb fp eob s st' 85 | x -> unsafeCoerce# x 86 | {-# inline (*>) #-} 87 | 88 | -- | Same as `pure` for `ParserT` except that it does not force the returned value. 89 | pureLazy :: a -> ParserT st e a 90 | pureLazy a = ParserT \fp eob s st -> OK# st a s 91 | {-# inline pureLazy #-} 92 | 93 | instance Monad (ParserT st e) where 94 | return = pure 95 | {-# inline return #-} 96 | ParserT fa >>= f = ParserT \fp eob s st -> case fa fp eob s st of 97 | OK# st' a s -> runParserT# (f a) fp eob s st' 98 | x -> unsafeCoerce# x 99 | {-# inline (>>=) #-} 100 | (>>) = (*>) 101 | {-# inline (>>) #-} 102 | 103 | -- | By default, parser choice `(<|>)` arbitrarily backtracks on parser failure. 104 | instance Control.Applicative.Alternative (ParserT st e) where 105 | empty = ParserT \fp eob s st -> Fail# st 106 | {-# inline empty #-} 107 | 108 | (<|>) = (<|>) 109 | {-# inline (Control.Applicative.<|>) #-} 110 | 111 | many (ParserT f) = ParserT go where 112 | go fp eob s st = case f fp eob s st of 113 | OK# st a s -> case go fp eob s st of 114 | OK# st as s -> OK# st (a:as) s 115 | x -> x 116 | Fail# st -> OK# st [] s 117 | Err# st e -> Err# st e 118 | {-# inline many #-} 119 | 120 | some p = (:) <$> p <*> Control.Applicative.many p 121 | {-# inline some #-} 122 | 123 | infixr 6 <|> 124 | -- | Choose between two parsers. If the first parser fails, try the second one, 125 | -- but if the first one throws an error, propagate the error. This operation 126 | -- can arbitrarily backtrack. 127 | -- 128 | -- Note: this exported operator has different fixity than the same operator in 129 | -- `Control.Applicative`. Hide this operator if you want to use the 130 | -- `Alternative` version. 131 | (<|>) :: ParserT st e a -> ParserT st e a -> ParserT st e a 132 | (<|>) (ParserT f) (ParserT g) = ParserT \fp eob s st -> 133 | case f fp eob s st of 134 | Fail# st' -> g fp eob s st' 135 | x -> x 136 | {-# inline[1] (<|>) #-} 137 | 138 | {-# RULES 139 | 140 | "flatparse/reassoc-alt" forall l m r. (l <|> m) <|> r = l <|> (m <|> r) 141 | 142 | #-} 143 | 144 | instance MonadPlus (ParserT st e) where 145 | mzero = Control.Applicative.empty 146 | {-# inline mzero #-} 147 | mplus = (<|>) 148 | {-# inline mplus #-} 149 | 150 | -------------------------------------------------------------------------------- 151 | 152 | -- | Primitive parser result wrapped with a state token. 153 | -- 154 | -- You should rarely need to manipulate values of this type directly. Use the 155 | -- provided bidirectional pattern synonyms 'OK#', 'Fail#' and 'Err#'. 156 | type Res# (st :: ZeroBitType) e a = 157 | (# st, ResI# e a #) 158 | 159 | -- | Primitive parser result. 160 | type ResI# e a = 161 | (# 162 | (# a, Addr# #) 163 | | (# #) 164 | | (# e #) 165 | #) 166 | 167 | -- | 'Res#' constructor for a successful parse. 168 | -- Contains the return value and a pointer to the rest of the input buffer, 169 | -- plus a state token. 170 | pattern OK# :: (st :: ZeroBitType) -> a -> Addr# -> Res# st e a 171 | pattern OK# st a s = (# st, (# (# a, s #) | | #) #) 172 | 173 | -- | 'Res#' constructor for recoverable failure. 174 | -- Contains only a state token. 175 | pattern Fail# :: (st :: ZeroBitType) -> Res# st e a 176 | pattern Fail# st = (# st, (# | (# #) | #) #) 177 | 178 | -- | 'Res#' constructor for errors which are by default non-recoverable. 179 | -- Contains the error, plus a state token. 180 | pattern Err# :: (st :: ZeroBitType) -> e -> Res# st e a 181 | pattern Err# st e = (# st, (# | | (# e #) #) #) 182 | {-# complete OK#, Fail#, Err# #-} 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # flatparse 2 | 3 | [![Hackage](https://img.shields.io/hackage/v/flatparse.svg)](https://hackage.haskell.org/package/flatparse) 4 | ![CI](https://github.com/AndrasKovacs/flatparse/actions/workflows/haskell.yml/badge.svg) 5 | 6 | `flatparse` is a high-performance parsing library, supporting parsing for __programming languages__, __human-readable data__ and __machine-readable data__. The "flat" in the name refers to the `ByteString` parsing input, which has pinned contiguous data, and also to the library internals, which avoids indirections and heap allocations whenever possible. `flatparse` is generally __lower-level__ than `parsec`-style libraries, but it is possible to build higher-level features (such as source spans, hints, indentation parsing) on top of it, without making any compromises in performance. 7 | 8 | ### LLVM 9 | 10 | It is advised to build with [`-fllvm` 11 | option](https://downloads.haskell.org/~ghc/latest/docs/html/users_guide/codegens.html#llvm-code-generator-fllvm) 12 | when using this package, since that can result in significant speedups (20-40% 13 | from what I've seen). Additionally, you can enable `-fllvm` for `flatparse` 14 | specifically by enabling the `llvm` package flag. However, this has minor 15 | impact, since almost all parser code will be typically inlined into modules 16 | outside `flatparse`, and compiled there. 17 | 18 | ## Features and non-features 19 | 20 | * __Excellent performance__. On microbenchmarks, `flatparse` is 2-10 times faster than `attoparsec` or `megaparsec`. On examples with heavier use of source positions and spans and/or indentation parsing, you can expect a bigger gap between `megaparsec` and `flatparse`. Compile times and executable sizes are also significantly better with `flatparse` than with `megaparsec` or `attoparsec`. `flatparse` internals make liberal use of unboxed tuples and GHC primops. As a result, pure validators (parsers returning `()`) in `flatparse` are not difficult to implement with zero heap allocation. 21 | * __No incremental parsing__, and __only strict `ByteString`__ is supported as input. However, it can be still useful to convert from `Text`, `String` or other types to `ByteString`, and then use `flatparse` for parsing, since `flatparse` performance usually more than makes up for the conversion costs. 22 | * __Only little-endian systems are currently supported as the host machine__. This may change in the future. However, `flatparse` does include primitive integer parsers with specific endianness. 23 | * __Support for fast source location handling, indentation parsing and informative error messages__. `flatparse` provides a low-level interface to these. Batteries are _not included_, but it should be possible for users to build custom solutions, which are more sophisticated, but still as fast as possible. In my experience, the included batteries in other libraries often come with major unavoidable overheads, and often we still have to extend existing machinery in order to scale to production features. 24 | * The __backtracking model__ of `flatparse` is different to parsec libraries, and is more close to the [nom](https://github.com/Geal/nom) library in Rust. The idea is that _parser failure_ is distinguished from _parsing error_. The former is used for control flow, and we can backtrack from it. The latter is used for unrecoverable errors, and by default it's propagated to the top. `flatparse` does not track whether parsers have consumed inputs. In my experience, what we really care about is the failure/error distinction, and in `parsec` or `megaparsec` the consumed/non-consumed separation is often muddled and discarded in larger parser implementations. By default, basic `flatparse` parsers can fail but can not throw errors, with the exception of the specifically error-throwing operations. Hence, `flatparse` users have to be mindful about grammar, and explicitly insert errors where it is known that the input can't be valid. 25 | 26 | `flatparse` comes in two flavors: [`FlatParse.Basic`][basic] and [`FlatParse.Stateful`][stateful]. Both support a custom error type. Also, both come in three modes, where we can respectively run `IO` actions, `ST` actions, or no side effects. The modes are selected by a state token type parameter on the parser types. 27 | 28 | * [`FlatParse.Basic`][basic] only supports the above features. If you don't need 29 | indentation parsing, this is sufficient. 30 | * [`FlatParse.Stateful`][stateful] additionally supports a built-in `Int` worth 31 | of internal state and an additional custom reader environment. This can 32 | support a wide range of indentation parsing features. There is a moderate 33 | overhead in performance and code size compared to `Basic`. In microbenchmarks 34 | and small parsers, the performance difference between `Basic` and `Stateful` 35 | is more up to the whims of GHC and LLVM, and is a bit more "random". 36 | 37 | ## Tutorial 38 | 39 | Informative tutorials are work in progress. See [`src/FlatParse/Examples`](src/FlatParse/Examples) 40 | for a lexer/parser example with acceptably good error messages. 41 | 42 | ## Contribution 43 | 44 | Pull requests are welcome. I'm fairly quick to add PR authors as collaborators. 45 | 46 | ## Some benchmarks 47 | 48 | Execution times below. See source code in [bench](bench). Compiled with GHC 9.10.2 `-O2 -fllvm` with 49 | `flatparse-0.5.3.1`. Executed on AMD 9800X3D CPU. Uses `lts-24.7` Stackage snapshot for the involved 50 | packages. 51 | 52 | | benchmark | runtime | 53 | |-----------------------------|------------- 54 | |sexp/fpbasic | 1.80 ms| 55 | |sexp/fpstateful | 1.25 ms| 56 | |sexp/attoparsec | 10.2 ms| 57 | |sexp/megaparsec | 6.92 ms| 58 | |sexp/parsec | 39.9 ms| 59 | |long keyword/fpbasic | 0.054 ms | 60 | |long keyword/fpstateful | 0.062 ms | 61 | |long keyword/attoparsec | 0.308 ms | 62 | |long keyword/megaparsec | 0.687 ms | 63 | |long keyword/parsec | 3.50 ms | 64 | |numeral csv/fpbasic | 0.540 ms | 65 | |numeral csv/fpstateful | 0.504 ms | 66 | |numeral csv/attoparsec | 3.17 ms | 67 | |numeral csv/megaparsec | 1.09 ms | 68 | |numeral csv/parsec | 13.8 ms | 69 | |lambda term/fpbasic | 1.52 ms| 70 | |lambda term/fpstateful | 1.56 ms| 71 | |lambda term/attoparsec | 4.94 ms| 72 | |lambda term/megaparsec | 5.35 ms| 73 | |lambda term/parsec | 17.7 ms| 74 | 75 | Object file sizes for each module containing the `s-exp`, `long keyword`, `numeral csv` and `lambda term` benchmarks. 76 | 77 | | library | object file size (bytes) | 78 | | ------- | ------------------------ | 79 | | fpbasic | 71088 | 80 | | fpstateful | 73576 | 81 | | attoparsec | 242816 | 82 | | megaparsec | 402984 | 83 | | parsec | 329008 | 84 | 85 | [basic]: https://hackage.haskell.org/package/flatparse/docs/FlatParse-Basic.html 86 | [stateful]: https://hackage.haskell.org/package/flatparse/docs/FlatParse-Stateful.html 87 | -------------------------------------------------------------------------------- /src/FlatParse/Examples/BasicLambda/Lexer.hs: -------------------------------------------------------------------------------- 1 | 2 | {-| 3 | This module contains lexer and error message primitives for a simple lambda calculus parser. It 4 | demonstrates a simple but decently informative implementation of error message propagation. 5 | -} 6 | 7 | {-# language StrictData #-} 8 | 9 | module FlatParse.Examples.BasicLambda.Lexer where 10 | 11 | import FlatParse.Basic hiding (Parser, runParser, string, char, cut) 12 | 13 | import qualified FlatParse.Basic as FP 14 | import qualified Data.ByteString as B 15 | import Language.Haskell.TH 16 | 17 | import Data.String 18 | import qualified Data.Set as S 19 | 20 | import qualified Data.ByteString.UTF8 as UTF8 21 | 22 | -------------------------------------------------------------------------------- 23 | 24 | -- | An expected item which is displayed in error messages. 25 | data Expected 26 | = Msg String -- ^ An error message. 27 | | Lit String -- ^ A literal expected thing. 28 | deriving (Eq, Show, Ord) 29 | 30 | instance IsString Expected where fromString = Lit 31 | 32 | -- | A parsing error. 33 | data Error 34 | = Precise Pos Expected -- ^ A precisely known error, like leaving out "in" from "let". 35 | | Imprecise Pos [Expected] -- ^ An imprecise error, when we expect a number of different things, 36 | -- but parse something else. 37 | deriving Show 38 | 39 | errorPos :: Error -> Pos 40 | errorPos (Precise p _) = p 41 | errorPos (Imprecise p _) = p 42 | 43 | -- | Merge two errors. Inner errors (which were thrown at points with more consumed inputs) 44 | -- are preferred. If errors are thrown at identical input positions, we prefer precise errors 45 | -- to imprecise ones. 46 | -- 47 | -- The point of prioritizing inner and precise errors is to suppress the deluge of "expected" 48 | -- items, and instead try to point to a concrete issue to fix. 49 | merge :: Error -> Error -> Error 50 | merge e e' = case (errorPos e, errorPos e') of 51 | (p, p') | p < p' -> e' 52 | (p, p') | p > p' -> e 53 | (p, p') -> case (e, e') of 54 | (Precise{} , _ ) -> e 55 | (_ , Precise{} ) -> e' 56 | (Imprecise _ es , Imprecise _ es' ) -> Imprecise p (es ++ es') 57 | {-# noinline merge #-} -- merge is "cold" code, so we shouldn't inline it. 58 | 59 | type Parser = FP.Parser Error 60 | 61 | -- | Pretty print an error. The `B.ByteString` input is the source file. The offending line from the 62 | -- source is displayed in the output. 63 | prettyError :: B.ByteString -> Error -> String 64 | prettyError b e = 65 | 66 | let pos :: Pos 67 | pos = case e of Imprecise pos e -> pos 68 | Precise pos e -> pos 69 | ls = FP.linesUtf8 b 70 | (l, c) = case FP.posLineCols b [pos] of 71 | x: _ -> x 72 | _ -> error "impossible" 73 | line = if l < length ls then ls !! l else "" 74 | linum = show l 75 | lpad = map (const ' ') linum 76 | 77 | expected (Lit s) = show s 78 | expected (Msg s) = s 79 | 80 | err (Precise _ e) = expected e 81 | err (Imprecise _ es) = imprec $ S.toList $ S.fromList es 82 | 83 | imprec :: [Expected] -> String 84 | imprec [] = error "impossible" 85 | imprec [e] = expected e 86 | imprec (e:es) = expected e ++ go es where 87 | go [] = "" 88 | go [e] = " or " ++ expected e 89 | go (e:es) = ", " ++ expected e ++ go es 90 | 91 | in show l ++ ":" ++ show c ++ ":\n" ++ 92 | lpad ++ "|\n" ++ 93 | linum ++ "| " ++ line ++ "\n" ++ 94 | lpad ++ "| " ++ replicate c ' ' ++ "^\n" ++ 95 | "parse error: expected " ++ 96 | err e 97 | 98 | -- | Imprecise cut: we slap a list of items on inner errors. 99 | cut :: Parser a -> [Expected] -> Parser a 100 | cut p es = do 101 | pos <- getPos 102 | FP.cutting p (Imprecise pos es) merge 103 | 104 | -- | Precise cut: we propagate at most a single error. 105 | cut' :: Parser a -> Expected -> Parser a 106 | cut' p e = do 107 | pos <- getPos 108 | FP.cutting p (Precise pos e) merge 109 | 110 | runParser :: Parser a -> B.ByteString -> Result Error a 111 | runParser = FP.runParser 112 | 113 | -- | Run parser, print pretty error on failure. 114 | testParser :: Show a => Parser a -> String -> IO () 115 | testParser p str = case UTF8.fromString str of 116 | b -> case runParser p b of 117 | Err e -> putStrLn $ prettyError b e 118 | OK a _ -> print a 119 | Fail -> putStrLn "uncaught parse error" 120 | 121 | -- | Parse a line comment. 122 | lineComment :: Parser () 123 | lineComment = 124 | withOption anyWord8 125 | (\case 10 -> ws 126 | _ -> lineComment) 127 | (pure ()) 128 | 129 | -- | Parse a potentially nested multiline comment. 130 | multilineComment :: Parser () 131 | multilineComment = go (1 :: Int) where 132 | go 0 = ws 133 | go n = $(switch [| case _ of 134 | "-}" -> go (n - 1) 135 | "{-" -> go (n + 1) 136 | _ -> branch anyWord8 (go n) (pure ()) |]) 137 | 138 | -- | Consume whitespace. 139 | ws :: Parser () 140 | ws = $(switch [| case _ of 141 | " " -> ws 142 | "\n" -> ws 143 | "\t" -> ws 144 | "\r" -> ws 145 | "--" -> lineComment 146 | "{-" -> multilineComment 147 | _ -> pure () |]) 148 | 149 | -- | Consume whitespace after running a parser. 150 | token :: Parser a -> Parser a 151 | token p = p <* ws 152 | {-# inline token #-} 153 | 154 | -- | Read a starting character of an identifier. 155 | identStartChar :: Parser Char 156 | identStartChar = satisfyAscii isLatinLetter 157 | {-# inline identStartChar #-} 158 | 159 | -- | Read a non-starting character of an identifier. 160 | identChar :: Parser Char 161 | identChar = satisfyAscii (\c -> isLatinLetter c || isDigit c) 162 | {-# inline identChar #-} 163 | 164 | -- | Check whether a `Span` contains exactly a keyword. Does not change parsing state. 165 | isKeyword :: Span -> Parser () 166 | isKeyword span = inSpan span do 167 | $(FP.switch [| case _ of 168 | "lam" -> pure () 169 | "let" -> pure () 170 | "in" -> pure () 171 | "if" -> pure () 172 | "then" -> pure () 173 | "else" -> pure () 174 | "true" -> pure () 175 | "false" -> pure () |]) 176 | eof 177 | 178 | -- | Parse a non-keyword string. 179 | symbol :: String -> Q Exp 180 | symbol str = [| token $(FP.string str) |] 181 | 182 | -- | Parser a non-keyword string, throw precise error on failure. 183 | symbol' :: String -> Q Exp 184 | symbol' str = [| $(symbol str) `cut'` Lit str |] 185 | 186 | -- | Parse a keyword string. 187 | keyword :: String -> Q Exp 188 | keyword str = [| token ($(FP.string str) `notFollowedBy` identChar) |] 189 | 190 | -- | Parse a keyword string, throw precise error on failure. 191 | keyword' :: String -> Q Exp 192 | keyword' str = [| $(keyword str) `cut'` Lit str |] 193 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful/Parser.hs: -------------------------------------------------------------------------------- 1 | -- | Minimal parser definition. 2 | 3 | {-# LANGUAGE UnboxedTuples #-} 4 | {-# LANGUAGE KindSignatures #-} 5 | {-# LANGUAGE DataKinds #-} -- needed for manual ZeroBitType def (unsure why) 6 | {-# LANGUAGE FlexibleInstances #-} 7 | 8 | module FlatParse.Stateful.Parser 9 | ( 10 | -- * Parser 11 | ParserT(..) 12 | , Parser, ParserIO, ParserST 13 | , pureLazy 14 | 15 | -- ** Result 16 | , type Res# 17 | , pattern OK#, pattern Err#, pattern Fail# 18 | 19 | -- *** Internal 20 | , type ResI# 21 | 22 | -- * Choice operator (defined with right associativity) 23 | , (<|>) 24 | ) where 25 | 26 | import FlatParse.Common.GHCExts ( Addr#, unsafeCoerce#, ZeroBitType, Int# ) 27 | import FlatParse.Common.Parser 28 | 29 | import GHC.ForeignPtr ( ForeignPtrContents ) 30 | 31 | import qualified Control.Applicative 32 | import Control.Monad ( MonadPlus(..) ) 33 | import Control.Monad.IO.Class ( MonadIO(..) ) 34 | import GHC.IO ( IO(IO) ) 35 | 36 | -- | @ParserT st r e a@ is a parser with a state token type @st@, a reader 37 | -- environment @r@, an error type @e@ and a return type @a@. The different 38 | -- state token types support different embedded effects; see `Parser`, 39 | -- `ParserIO` and `ParserST` below. 40 | newtype ParserT (st :: ZeroBitType) r e a = 41 | ParserT { runParserT# :: ForeignPtrContents -> r -> Addr# -> Addr# -> Int# -> st -> Res# st e a } 42 | 43 | -- | The type of pure parsers. 44 | type Parser = ParserT PureMode 45 | 46 | -- | The type of parsers which can embed `IO` actions. 47 | type ParserIO = ParserT IOMode 48 | 49 | -- | The type of parsers which can embed `ST` actions. 50 | type ParserST s = ParserT (STMode s) 51 | 52 | -- | You may lift IO actions into a 'ParserIO' using `liftIO`. 53 | instance MonadIO (ParserT IOMode r e) where 54 | liftIO (IO a) = ParserT \fp !r eob s n rw -> 55 | case a rw of (# rw', a #) -> OK# rw' a s n 56 | {-# inline liftIO #-} 57 | 58 | instance Functor (ParserT st r e) where 59 | fmap f (ParserT g) = ParserT \fp !r eob s n st -> case g fp r eob s n st of 60 | OK# st' a s n -> let !b = f a in OK# st' b s n 61 | x -> unsafeCoerce# x 62 | {-# inline fmap #-} 63 | 64 | (<$) a' (ParserT g) = ParserT \fp !r eob s n st -> case g fp r eob s n st of 65 | OK# st' _a s n -> OK# st' a' s n 66 | x -> unsafeCoerce# x 67 | {-# inline (<$) #-} 68 | 69 | instance Applicative (ParserT st r e) where 70 | pure !a = ParserT \_fp !_r _eob s n st -> OK# st a s n 71 | {-# inline pure #-} 72 | ParserT ff <*> ParserT fa = ParserT \fp !r eob s n st -> case ff fp r eob s n st of 73 | OK# st' f s n -> case fa fp r eob s n st' of 74 | OK# st'' a s n -> let b = f a in OK# st'' b s n 75 | x -> unsafeCoerce# x 76 | x -> unsafeCoerce# x 77 | {-# inline (<*>) #-} 78 | ParserT fa <* ParserT fb = ParserT \fp !r eob s n st -> case fa fp r eob s n st of 79 | OK# st' a s n -> case fb fp r eob s n st' of 80 | OK# st'' _b s n -> OK# st'' a s n 81 | x -> unsafeCoerce# x 82 | x -> unsafeCoerce# x 83 | {-# inline (<*) #-} 84 | ParserT fa *> ParserT fb = ParserT \fp !r eob s n st -> case fa fp r eob s n st of 85 | OK# st' _a s n -> fb fp r eob s n st' 86 | x -> unsafeCoerce# x 87 | {-# inline (*>) #-} 88 | 89 | -- | Same as `pure` for `ParserT` except that it does not force the returned value. 90 | pureLazy :: a -> ParserT st r e a 91 | pureLazy a = ParserT \_fp !_r _eob s n st -> OK# st a s n 92 | {-# inline pureLazy #-} 93 | 94 | instance Monad (ParserT st r e) where 95 | return = pure 96 | {-# inline return #-} 97 | ParserT fa >>= f = ParserT \fp !r eob s n st -> case fa fp r eob s n st of 98 | OK# st' a s n -> runParserT# (f a) fp r eob s n st' 99 | x -> unsafeCoerce# x 100 | {-# inline (>>=) #-} 101 | (>>) = (*>) 102 | {-# inline (>>) #-} 103 | 104 | instance Control.Applicative.Alternative (ParserT st r e) where 105 | empty = ParserT \fp !r eob s n st -> Fail# st 106 | {-# inline empty #-} 107 | 108 | (<|>) = (<|>) 109 | {-# inline (Control.Applicative.<|>) #-} 110 | 111 | many (ParserT f) = ParserT go where 112 | go fp !r eob s n st = 113 | case f fp r eob s n st of 114 | OK# st a s n -> 115 | case go fp r eob s n st of 116 | OK# st as s n -> OK# st (a:as) s n 117 | x -> x 118 | Fail# st' -> OK# st [] s n 119 | Err# st' e -> Err# st e 120 | {-# inline many #-} 121 | 122 | some p = (:) <$> p <*> Control.Applicative.many p 123 | {-# inline some #-} 124 | 125 | infixr 6 <|> 126 | -- | Choose between two parsers. If the first parser fails, try the second one, 127 | -- but if the first one throws an error, propagate the error. This operation 128 | -- can arbitrarily backtrack. 129 | -- 130 | -- Note: this exported operator has different fixity than the same operator in 131 | -- `Control.Applicative`. Hide this operator if you want to use the 132 | -- `Alternative` version. 133 | (<|>) :: ParserT st r e a -> ParserT st r e a -> ParserT st r e a 134 | (<|>) (ParserT f) (ParserT g) = ParserT \fp !r eob s n st -> 135 | case f fp r eob s n st of 136 | Fail# st' -> g fp r eob s n st' 137 | x -> x 138 | {-# inline[1] (<|>) #-} 139 | 140 | {-# RULES 141 | 142 | "flatparse/reassoc-alt" forall l m r. (l <|> m) <|> r = l <|> (m <|> r) 143 | 144 | #-} 145 | 146 | instance MonadPlus (ParserT st r e) where 147 | mzero = Control.Applicative.empty 148 | {-# inline mzero #-} 149 | mplus = (<|>) 150 | {-# inline mplus #-} 151 | 152 | -------------------------------------------------------------------------------- 153 | 154 | -- | Primitive parser result wrapped with a state token. 155 | -- 156 | -- You should rarely need to manipulate values of this type directly. Use the 157 | -- provided bidirectional pattern synonyms 'OK#', 'Fail#' and 'Err#'. 158 | type Res# (st :: ZeroBitType) e a = 159 | (# st, ResI# e a #) 160 | 161 | -- | Primitive parser result. 162 | type ResI# e a = 163 | (# 164 | (# a, Addr#, Int# #) 165 | | (# #) 166 | | (# e #) 167 | #) 168 | 169 | -- | 'Res#' constructor for a successful parse. 170 | -- Contains the return value, a pointer to the rest of the input buffer, and 171 | -- the next 'Int' state, plus a state token. 172 | pattern OK# :: (st :: ZeroBitType) -> a -> Addr# -> Int# -> Res# st e a 173 | pattern OK# st a s n = (# st, (# (# a, s, n #) | | #) #) 174 | 175 | -- | 'Res#' constructor for recoverable failure. 176 | -- Contains only a state token. 177 | pattern Fail# :: (st :: ZeroBitType) -> Res# st e a 178 | pattern Fail# st = (# st, (# | (# #) | #) #) 179 | 180 | -- | 'Res#' constructor for errors which are by default non-recoverable. 181 | -- Contains the error, plus a state token. 182 | pattern Err# :: (st :: ZeroBitType) -> e -> Res# st e a 183 | pattern Err# st e = (# st, (# | | (# e #) #) #) 184 | {-# complete OK#, Fail#, Err# #-} 185 | -------------------------------------------------------------------------------- /src/FlatParse/Common/Assorted.hs: -------------------------------------------------------------------------------- 1 | module FlatParse.Common.Assorted 2 | ( 3 | -- * Compatibility 4 | shortInteger 5 | 6 | -- * 'Char' predicates 7 | , isDigit, isLatinLetter, isGreekLetter 8 | 9 | -- * Other 10 | , packBytes, splitBytes 11 | 12 | -- * UTF-8 conversions 13 | , charToBytes, strToBytes 14 | , strToUtf8, utf8ToStr 15 | 16 | -- * Shortcuts 17 | , derefChar8# 18 | 19 | -- * Boxed integer coercions 20 | -- $boxed-integer-coercion 21 | , word16ToInt16 22 | , word32ToInt32 23 | , word64ToInt64 24 | 25 | -- * Helpers 26 | , withPosInt#, withIntUnwrap# 27 | 28 | -- * Bit manipulation 29 | , zbytel, zbytel'intermediate, zbytel'toIdx 30 | , zbyter, zbyter'intermediate, zbyter'toIdx 31 | ) where 32 | 33 | import Data.Bits 34 | import Data.Char ( ord ) 35 | import Data.Foldable (foldl') 36 | import GHC.Exts 37 | 38 | import qualified Data.ByteString as B 39 | 40 | import Data.Word 41 | import Data.Int 42 | 43 | #if MIN_VERSION_base(4,15,0) 44 | import GHC.Num.Integer (Integer(..)) 45 | #else 46 | import GHC.Integer.GMP.Internals (Integer(..)) 47 | #endif 48 | 49 | import qualified Data.ByteString.UTF8 as UTF8 50 | 51 | 52 | -- Compatibility 53 | -------------------------------------------------------------------------------- 54 | 55 | shortInteger :: Int# -> Integer 56 | #if MIN_VERSION_base(4,15,0) 57 | shortInteger = IS 58 | #else 59 | shortInteger = S# 60 | #endif 61 | {-# inline shortInteger #-} 62 | 63 | 64 | -- Char predicates 65 | -------------------------------------------------------------------------------- 66 | 67 | -- | @isDigit c = \'0\' <= c && c <= \'9\'@ 68 | isDigit :: Char -> Bool 69 | isDigit c = '0' <= c && c <= '9' 70 | {-# inline isDigit #-} 71 | 72 | -- | @isLatinLetter c = (\'A\' <= c && c <= \'Z\') || (\'a\' <= c && c <= \'z\')@ 73 | isLatinLetter :: Char -> Bool 74 | isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') 75 | {-# inline isLatinLetter #-} 76 | 77 | -- | @isGreekLetter c = (\'Α\' <= c && c <= \'Ω\') || (\'α\' <= c && c <= \'ω\')@ 78 | isGreekLetter :: Char -> Bool 79 | isGreekLetter c = ('Α' <= c && c <= 'Ω') || ('α' <= c && c <= 'ω') 80 | {-# inline isGreekLetter #-} 81 | 82 | -- UTF conversions 83 | -------------------------------------------------------------------------------- 84 | 85 | packBytes :: [Word64] -> Word64 86 | packBytes = fst . foldl' go (0, 0) where 87 | go (acc, shift) w | shift == 64 = error "packBytes: too many bytes" 88 | go (acc, shift) w = (unsafeShiftL (fromIntegral w) shift .|. acc, shift+8) 89 | 90 | -- TODO chunks into 8-bytes for 64-bit performance 91 | splitBytes :: [Word64] -> ([Word64], [Word64]) 92 | splitBytes ws = case quotRem (length ws) 8 of 93 | (0, _) -> (ws, []) 94 | (_, r) -> (as, chunk8s bs) where 95 | (as, bs) = splitAt r ws 96 | chunk8s [] = [] 97 | chunk8s ws = let (as, bs) = splitAt 8 ws in 98 | packBytes as : chunk8s bs 99 | 100 | -- | Shortcut for 'indexCharOffAddr# addr# 0#'. 101 | derefChar8# :: Addr# -> Char# 102 | derefChar8# addr# = indexCharOffAddr# addr# 0# 103 | {-# inline derefChar8# #-} 104 | 105 | -------------------------------------------------------------------------------- 106 | 107 | {- $boxed-integer-coercion 108 | 109 | These functions should be no-ops. They correspond to the similarly-named GHC 9.4 110 | primops which work on unboxed integers. 111 | -} 112 | 113 | -- | Coerce a 'Word16' to 'Int16'. 114 | word16ToInt16 :: Word16 -> Int16 115 | word16ToInt16 = fromIntegral 116 | {-# inline word16ToInt16 #-} 117 | 118 | -- | Coerce a 'Word32' to 'Int32'. 119 | word32ToInt32 :: Word32 -> Int32 120 | word32ToInt32 = fromIntegral 121 | {-# inline word32ToInt32 #-} 122 | 123 | -- | Coerce a 'Word64' to 'Int64'. 124 | word64ToInt64 :: Word64 -> Int64 125 | word64ToInt64 = fromIntegral 126 | {-# inline word64ToInt64 #-} 127 | 128 | -------------------------------------------------------------------------------- 129 | 130 | -- | Assert for the given 'Int#' that @n >= 0@. 131 | -- 132 | -- Throws a runtime error if given a negative integer. 133 | withPosInt# :: Int# -> r -> r 134 | withPosInt# n# r = case n# >=# 0# of 135 | 1# -> r 136 | _ -> error "FlatParse.Basic.Base.withPosInt#: negative integer" 137 | {-# inline withPosInt# #-} 138 | 139 | -- | Unwrap the 'Int#' from an 'Int' and apply it to the given function. 140 | withIntUnwrap# :: (Int# -> r) -> Int -> r 141 | withIntUnwrap# f (I# i#) = f i# 142 | {-# inline withIntUnwrap# #-} 143 | 144 | -------------------------------------------------------------------------------- 145 | 146 | charToBytes :: Char -> [Word] 147 | charToBytes c' 148 | | c <= 0x7f = [fromIntegral c] 149 | | c <= 0x7ff = [0xc0 .|. y, 0x80 .|. z] 150 | | c <= 0xffff = [0xe0 .|. x, 0x80 .|. y, 0x80 .|. z] 151 | | c <= 0x10ffff = [0xf0 .|. w, 0x80 .|. x, 0x80 .|. y, 0x80 .|. z] 152 | | otherwise = error "Not a valid Unicode code point" 153 | where 154 | c = ord c' 155 | z = fromIntegral (c .&. 0x3f) 156 | y = fromIntegral (unsafeShiftR c 6 .&. 0x3f) 157 | x = fromIntegral (unsafeShiftR c 12 .&. 0x3f) 158 | w = fromIntegral (unsafeShiftR c 18 .&. 0x7) 159 | 160 | strToBytes :: String -> [Word] 161 | strToBytes = concatMap charToBytes 162 | {-# inline strToBytes #-} 163 | 164 | -- | Convert an UTF8-encoded `String` to a `B.ByteString`. 165 | strToUtf8 :: String -> B.ByteString 166 | strToUtf8 = UTF8.fromString 167 | {-# inline strToUtf8 #-} 168 | 169 | -- | Convert a `B.ByteString` to an UTF8-encoded `String`. 170 | utf8ToStr :: B.ByteString -> String 171 | utf8ToStr = UTF8.toString 172 | {-# inline utf8ToStr #-} 173 | 174 | -------------------------------------------------------------------------------- 175 | 176 | -- | Index of leftmost null byte, or (number of bytes in type) if not present. 177 | -- 178 | -- Adapted from Hacker's Delight 6-1. Useful in big-endian environments. 179 | zbytel :: (FiniteBits a, Num a) => a -> Int 180 | zbytel = zbytel'toIdx . zbytel'intermediate 181 | {-# inline zbytel #-} 182 | 183 | -- | bit mangling, returns 0 for inputs without a null byte 184 | -- 185 | -- Separating allows us to skip some index calculation if there was no null byte. 186 | zbytel'intermediate :: (FiniteBits a, Num a) => a -> a 187 | zbytel'intermediate a = 188 | let a' = (a .&. mask) + mask 189 | in complement (a' .|. a .|. mask) 190 | where 191 | mask = 0x7F7F7F7F7F7F7F7F 192 | {-# inline zbytel'intermediate #-} 193 | 194 | -- | bit mangling, turns intermediate value into an index 195 | -- 196 | -- Separating allows us to skip some index calculation if there was no null byte. 197 | zbytel'toIdx :: (FiniteBits a, Num a) => a -> Int 198 | zbytel'toIdx a = countLeadingZeros a `unsafeShiftR` 3 199 | {-# inline zbytel'toIdx #-} 200 | 201 | -- | Index of rightmost null byte, or (number of bytes in type) if not present 202 | -- 203 | -- Adapted from Hacker's Delight 6-1. Useful in little-endian environments. 204 | zbyter :: (FiniteBits a, Num a) => a -> Int 205 | zbyter = zbyter'toIdx . zbyter'intermediate 206 | {-# inline zbyter #-} 207 | 208 | -- | bit mangling, returns 0 for inputs without a null byte 209 | -- 210 | -- Separating allows us to skip some index calculation if there was no null byte. 211 | zbyter'intermediate :: (FiniteBits a, Num a) => a -> a 212 | zbyter'intermediate a = (a - 0x0101010101010101) .&. (complement a) .&. 0x8080808080808080 213 | {-# inline zbyter'intermediate #-} 214 | 215 | -- | bit mangling, turns intermediate value into an index 216 | -- 217 | -- Separating allows us to skip some index calculation if there was no null byte. 218 | zbyter'toIdx :: (FiniteBits a, Num a) => a -> Int 219 | zbyter'toIdx a = countTrailingZeros a `unsafeShiftR` 3 220 | {-# inline zbyter'toIdx #-} 221 | -------------------------------------------------------------------------------- /src/FlatParse/Common/Numbers.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE UnboxedTuples, BinaryLiterals #-} 2 | 3 | module FlatParse.Common.Numbers where 4 | 5 | import FlatParse.Common.Assorted ( shortInteger ) 6 | import Language.Haskell.TH.Syntax (lift) 7 | 8 | import GHC.Exts 9 | import GHC.ForeignPtr 10 | 11 | import qualified Data.ByteString.Char8 as BC8 12 | import qualified Data.ByteString.Internal as B 13 | 14 | -- | Parse a non-empty ASCII decimal digit sequence as a 'Word'. 15 | -- Fails on overflow. 16 | anyAsciiDecimalWord# :: Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #) 17 | anyAsciiDecimalWord# eob s = case anyAsciiDecimalWord_# 0## eob s of 18 | (# | (# n, s' #) #) | 0# <- eqAddr# s s' 19 | -> (# | (# n, s' #) #) 20 | _ -> (# (# #) | #) 21 | {-# inline anyAsciiDecimalWord# #-} 22 | 23 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'. 24 | -- Fails on overflow. 25 | anyAsciiDecimalInt# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) 26 | anyAsciiDecimalInt# eob s = case anyAsciiDecimalWord_# 0## eob s of 27 | (# | (# n, s' #) #) | 0# <- eqAddr# s s' 28 | , 1# <- leWord# n (int2Word# (unI# maxBound)) 29 | -> (# | (# word2Int# n, s' #) #) 30 | _ -> (# (##) | #) 31 | {-# inline anyAsciiDecimalInt# #-} 32 | 33 | anyAsciiDecimalWord_# :: Word# -> Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #) 34 | anyAsciiDecimalWord_# acc eob s = case eqAddr# s eob of 35 | 1# -> (# | (# acc, s #) #) 36 | _ -> case indexWord8OffAddr# s 0# of 37 | #if MIN_VERSION_base(4,16,0) 38 | w | 1# <- leWord8# (wordToWord8# 0x30##) w 39 | , 1# <- leWord8# w (wordToWord8# 0x39##) 40 | -> case timesWord2# acc 10## of 41 | (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x30##) of 42 | #else 43 | w | 1# <- leWord# 0x30## w 44 | , 1# <- leWord# w 0x39## 45 | -> case timesWord2# acc 10## of 46 | (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x30##) of 47 | #endif 48 | (# q, 0# #) -> anyAsciiDecimalWord_# q eob (s `plusAddr#` 1#) 49 | _ -> (# (##) | #) 50 | _ -> (# (##) | #) 51 | _ -> (# | (# acc, s #) #) 52 | 53 | -------------------------------------------------------------------------------- 54 | 55 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'. 56 | -- May overflow. 57 | anyAsciiDecimalIntOverflow# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) 58 | anyAsciiDecimalIntOverflow# eob s = case anyAsciiDecimalIntOverflow_# 0# eob s of 59 | (# n, s' #) | 0# <- eqAddr# s s' 60 | -> (# | (# n, s' #) #) 61 | 62 | | otherwise 63 | -> (# (##) | #) 64 | {-# inline anyAsciiDecimalIntOverflow# #-} 65 | 66 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Integer'. 67 | anyAsciiDecimalInteger# :: ForeignPtrContents -> Addr# -> Addr# -> (# (##) | (# Integer, Addr# #) #) 68 | anyAsciiDecimalInteger# fp eob s = case anyAsciiDecimalIntOverflow_# 0# eob s of 69 | (# n, s' #) 70 | | 1# <- eqAddr# s s' -> (# (##) | #) 71 | 72 | -- Simple heuristic, using the largest number of digits that can be in an 73 | -- 'Int#', such that we can use the 'IS' constructor. 74 | | 1# <- minusAddr# s' s <=# maxDigitsInt -> (# | (# shortInteger n, s' #) #) 75 | | otherwise -> case BC8.readInteger (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) of 76 | Nothing -> (# (##) | #) 77 | Just (i, _) -> (# | (# i, s' #) #) 78 | where 79 | maxDigitsInt :: Int# 80 | !(I# maxDigitsInt) = $(let p e = 10 ^ e - 1 <= toInteger (maxBound :: Int) 81 | in lift $ last $ takeWhile p [0 :: Int ..]) 82 | {-# inline anyAsciiDecimalInteger# #-} 83 | 84 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'. 85 | -- May overflow. 86 | anyAsciiDecimalIntOverflow_# :: Int# -> Addr# -> Addr# -> (# Int#, Addr# #) 87 | anyAsciiDecimalIntOverflow_# acc eob s = case eqAddr# s eob of 88 | 1# -> (# acc, s #) 89 | _ -> case indexWord8OffAddr# s 0# of 90 | #if MIN_VERSION_base(4,16,0) 91 | w | 1# <- leWord8# (wordToWord8# 0x30##) w, 1# <- leWord8# w (wordToWord8# 0x39##) -> 92 | anyAsciiDecimalIntOverflow_# (mul10# acc +# (word2Int# (word8ToWord# w) -# 0x30#)) eob (plusAddr# s 1#) 93 | #else 94 | w | 1# <- leWord# 0x30## w, 1# <- leWord# w 0x39## -> 95 | anyAsciiDecimalIntOverflow_# (mul10# acc +# (word2Int# w -# 0x30#)) eob (plusAddr# s 1#) 96 | #endif 97 | _ -> (# acc, s #) 98 | 99 | -------------------------------------------------------------------------------- 100 | 101 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a 102 | -- 'Word'. 103 | -- Fails on overflow. 104 | anyAsciiHexWord# :: Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #) 105 | anyAsciiHexWord# eob s = case anyAsciiHexWord_# 0## eob s of 106 | (# | (# n, s' #) #) | 0# <- eqAddr# s s' 107 | -> (# | (# n, s' #) #) 108 | _ -> (# (# #) | #) 109 | {-# inline anyAsciiHexWord# #-} 110 | 111 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a 112 | -- positive 'Int'. 113 | -- Fails on overflow. 114 | anyAsciiHexInt# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) 115 | anyAsciiHexInt# eob s = case anyAsciiHexWord_# 0## eob s of 116 | (# | (# n, s' #) #) | 0# <- eqAddr# s s' 117 | , 1# <- leWord# n (int2Word# (unI# maxBound)) 118 | -> (# | (# word2Int# n, s' #) #) 119 | 120 | | otherwise 121 | -> (# (##) | #) 122 | (# (##) | #) -> (# (##) | #) 123 | {-# inline anyAsciiHexInt# #-} 124 | 125 | anyAsciiHexWord_# :: Word# -> Addr# -> Addr# -> (# (##) | (# Word#, Addr# #) #) 126 | anyAsciiHexWord_# acc eob s = case eqAddr# s eob of 127 | 1# -> (# | (# acc, s #) #) 128 | _ -> case indexWord8OffAddr# s 0# of 129 | #if MIN_VERSION_base(4,16,0) 130 | w | 1# <- leWord8# (wordToWord8# 0x30##) w 131 | , 1# <- leWord8# w (wordToWord8# 0x39##) 132 | -> case timesWord2# acc 16## of 133 | (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x30##) of 134 | #else 135 | w | 1# <- leWord# 0x30## w 136 | , 1# <- leWord# w 0x39## 137 | -> case timesWord2# acc 16## of 138 | (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x30##) of 139 | #endif 140 | (# q, 0# #) -> anyAsciiHexWord_# q eob (s `plusAddr#` 1#) 141 | _ -> (# (##) | #) 142 | _ -> (# (##) | #) 143 | #if MIN_VERSION_base(4,16,0) 144 | | 1# <- leWord8# (wordToWord8# 0x41##) w 145 | , 1# <- leWord8# w (wordToWord8# 0x46##) 146 | -> case timesWord2# acc 16## of 147 | (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x37##) of 148 | #else 149 | | 1# <- leWord# 0x41## w 150 | , 1# <- leWord# w 0x46## 151 | -> case timesWord2# acc 16## of 152 | (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x37##) of 153 | #endif 154 | (# q, 0# #) -> anyAsciiHexWord_# q eob (s `plusAddr#` 1#) 155 | _ -> (# (##) | #) 156 | _ -> (# (##) | #) 157 | #if MIN_VERSION_base(4,16,0) 158 | | 1# <- leWord8# (wordToWord8# 0x61##) w 159 | , 1# <- leWord8# w (wordToWord8# 0x66##) 160 | -> case timesWord2# acc 16## of 161 | 162 | (# 0##, r #) -> case addWordC# r (word8ToWord# w `minusWord#` 0x57##) of 163 | #else 164 | | 1# <- leWord# 0x61## w 165 | , 1# <- leWord# w 0x66## 166 | -> case timesWord2# acc 16## of 167 | 168 | (# 0##, r #) -> case addWordC# r (w `minusWord#` 0x57##) of 169 | #endif 170 | (# q, 0# #) -> anyAsciiHexWord_# q eob (s `plusAddr#` 1#) 171 | _ -> (# (##) | #) 172 | _ -> (# (##) | #) 173 | _ -> (# | (# acc, s #) #) 174 | 175 | -------------------------------------------------------------------------------- 176 | -- Zigzag encoding 177 | -- See: https://hackage.haskell.org/package/zigzag-0.0.1.0/docs/src/Data.Word.Zigzag.html 178 | 179 | fromZigzagNative :: Word -> Int 180 | fromZigzagNative (W# w#) = I# (fromZigzagNative# w#) 181 | {-# inline fromZigzagNative #-} 182 | 183 | -- GHC should optimize to this, but to be sure, here it is 184 | fromZigzagNative# :: Word# -> Int# 185 | fromZigzagNative# w# = 186 | word2Int# ((w# `uncheckedShiftRL#` 1#) `xor#` (not# (w# `and#` 1##) `plusWord#` 1##)) 187 | {-# inline fromZigzagNative# #-} 188 | 189 | toZigzagNative :: Int -> Word 190 | toZigzagNative (I# i#) = W# (toZigzagNative# i#) 191 | {-# inline toZigzagNative #-} 192 | 193 | -- GHC should optimize to this, but to be sure, here it is 194 | toZigzagNative# :: Int# -> Word# 195 | toZigzagNative# i# = toZigzagNative'# (int2Word# i#) 196 | {-# inline toZigzagNative# #-} 197 | 198 | -- GHC should optimize to this, but to be sure, here it is 199 | toZigzagNative'# :: Word# -> Word# 200 | toZigzagNative'# w# = 201 | (w# `uncheckedShiftL#` 1#) `xor#` (w# `uncheckedShiftRL#` 63#) 202 | {-# inline toZigzagNative'# #-} 203 | 204 | -------------------------------------------------------------------------------- 205 | 206 | -- | protobuf style (LE, redundant, on continues) 207 | anyVarintProtobuf# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #) 208 | 209 | #if MIN_VERSION_base(4,16,0) 210 | 211 | anyVarintProtobuf# end# = go 0# 0# 212 | where 213 | word8ToInt# :: Word8# -> Int# 214 | word8ToInt# w8# = word2Int# (word8ToWord# w8#) 215 | {-# inline word8ToInt# #-} 216 | go :: Int# -> Int# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #) 217 | go i# n# s# = case eqAddr# s# end# of 218 | 1# -> (# (##) | #) 219 | _ -> 220 | let w8# = indexWord8OffAddr# s# 0# 221 | w8'# = word8ToInt# (w8# `andWord8#` (wordToWord8# 0b01111111##)) 222 | i'# = i# `orI#` (w8'# `uncheckedIShiftL#` n#) 223 | s'# = s# `plusAddr#` 1# 224 | n'# = n# +# 7# 225 | in case w8# `geWord8#` wordToWord8# 0b10000000## of 226 | 1# -> go i'# n'# s'# 227 | _ -> (# | (# i'#, s'#, n'# #) #) 228 | 229 | #else 230 | 231 | anyVarintProtobuf# end# = go 0# 0# 232 | where 233 | go :: Int# -> Int# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #) 234 | go i# n# s# = case eqAddr# s# end# of 235 | 1# -> (# (##) | #) 236 | _ -> 237 | let w8# = indexWord8OffAddr# s# 0# 238 | w8'# = word2Int# (w8# `and#` 0b01111111##) 239 | i'# = i# `orI#` (w8'# `uncheckedIShiftL#` n#) 240 | s'# = s# `plusAddr#` 1# 241 | n'# = n# +# 7# 242 | in case w8# `geWord#` 0b10000000## of 243 | 1# -> go i'# n'# s'# 244 | _ -> (# | (# i'#, s'#, n'# #) #) 245 | 246 | #endif 247 | 248 | -------------------------------------------------------------------------------- 249 | 250 | unI# :: Int -> Int# 251 | unI# (I# i) = i 252 | {-# inline unI# #-} 253 | 254 | mul10# :: Int# -> Int# 255 | mul10# n = uncheckedIShiftL# n 3# +# uncheckedIShiftL# n 1# 256 | {-# inline mul10# #-} 257 | -------------------------------------------------------------------------------- /src/FlatParse/Basic/Text.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE UnboxedTuples #-} 2 | 3 | -- | Parsers for textual data (UTF-8, ASCII). 4 | 5 | module FlatParse.Basic.Text 6 | ( 7 | -- * UTF-8 8 | char, string 9 | , anyChar, skipAnyChar 10 | , satisfy, skipSatisfy 11 | , fusedSatisfy, skipFusedSatisfy 12 | , takeLine 13 | , takeRestString 14 | 15 | -- * ASCII 16 | , anyAsciiChar, skipAnyAsciiChar 17 | , satisfyAscii, skipSatisfyAscii 18 | 19 | -- ** ASCII-encoded numbers 20 | , anyAsciiDecimalWord 21 | , anyAsciiDecimalInt 22 | , anyAsciiDecimalInteger 23 | , anyAsciiHexWord 24 | , anyAsciiHexInt 25 | 26 | -- * Debugging parsers 27 | , traceLine 28 | , traceRest 29 | ) where 30 | 31 | import FlatParse.Basic.Parser 32 | import FlatParse.Basic.Base ( withEnsure1, lookahead, eof, branch ) 33 | import FlatParse.Basic.Bytes ( bytes ) 34 | 35 | import FlatParse.Common.GHCExts 36 | 37 | import Language.Haskell.TH 38 | import qualified FlatParse.Common.Numbers as Common 39 | import qualified FlatParse.Common.Assorted as Common 40 | 41 | -- | Parse any single Unicode character encoded using UTF-8 as a 'Char'. 42 | anyChar :: ParserT st e Char 43 | anyChar = ParserT \fp eob buf st -> case eqAddr# eob buf of 44 | 1# -> Fail# st 45 | _ -> case Common.derefChar8# buf of 46 | c1 -> case c1 `leChar#` '\x7F'# of 47 | 1# -> OK# st (C# c1) (plusAddr# buf 1#) 48 | _ -> case eqAddr# eob (plusAddr# buf 1#) of 49 | 1# -> Fail# st 50 | _ -> case indexCharOffAddr# buf 1# of 51 | c2 -> case c1 `leChar#` '\xDF'# of 52 | 1# -> 53 | let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` 54 | (ord# c2 -# 0x80#) 55 | in OK# st (C# (chr# resc)) (plusAddr# buf 2#) 56 | _ -> case eqAddr# eob (plusAddr# buf 2#) of 57 | 1# -> Fail# st 58 | _ -> case indexCharOffAddr# buf 2# of 59 | c3 -> case c1 `leChar#` '\xEF'# of 60 | 1# -> 61 | let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` 62 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 63 | (ord# c3 -# 0x80#) 64 | in OK# st (C# (chr# resc)) (plusAddr# buf 3#) 65 | _ -> case eqAddr# eob (plusAddr# buf 3#) of 66 | 1# -> Fail# st 67 | _ -> case indexCharOffAddr# buf 3# of 68 | c4 -> 69 | let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` 70 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` 71 | ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 72 | (ord# c4 -# 0x80#) 73 | in OK# st (C# (chr# resc)) (plusAddr# buf 4#) 74 | {-# inline anyChar #-} 75 | 76 | -- | Skip any single Unicode character encoded using UTF-8. 77 | skipAnyChar :: ParserT st e () 78 | skipAnyChar = ParserT \fp eob buf st -> case eqAddr# eob buf of 79 | 1# -> Fail# st 80 | _ -> case Common.derefChar8# buf of 81 | c1 -> case c1 `leChar#` '\x7F'# of 82 | 1# -> OK# st () (plusAddr# buf 1#) 83 | _ -> 84 | let buf' = 85 | case c1 `leChar#` '\xDF'# of 86 | 1# -> plusAddr# buf 2# 87 | _ -> case c1 `leChar#` '\xEF'# of 88 | 1# -> plusAddr# buf 3# 89 | _ -> plusAddr# buf 4# 90 | in case leAddr# buf' eob of 91 | 1# -> OK# st () buf' 92 | _ -> Fail# st 93 | {-# inline skipAnyChar #-} 94 | 95 | withSatisfy 96 | :: (Char -> Bool) -> (Char -> ParserT st e r) -> ParserT st e r 97 | withSatisfy f p = ParserT \fp eob s st -> 98 | case runParserT# anyChar fp eob s st of 99 | -- This is OK# unfolded, to silence incomplete pattern warnings 100 | -- in GHC <= 8.8.4. 101 | (# st, (# (# c, s #) | | #) #) | f c -> runParserT# (p c) fp eob s st 102 | 103 | (# st, _ #) -> Fail# st 104 | {-# inline withSatisfy #-} 105 | 106 | -- | Parse a UTF-8 'Char' for which a predicate holds. 107 | satisfy :: (Char -> Bool) -> ParserT st e Char 108 | satisfy f = withSatisfy f pure 109 | {-# inline satisfy #-} 110 | 111 | -- | Skip a UTF-8 `Char` for which a predicate holds. 112 | skipSatisfy :: (Char -> Bool) -> ParserT st e () 113 | skipSatisfy f = withSatisfy f (\_ -> pure ()) 114 | {-# inline skipSatisfy #-} 115 | 116 | withSatisfyAscii :: (Char -> Bool) -> (Char -> ParserT st e r) -> ParserT st e r 117 | withSatisfyAscii f p = withEnsure1 $ ParserT \fp eob s st -> 118 | case Common.derefChar8# s of 119 | c1 | f (C# c1) -> runParserT# (p (C# c1)) fp eob (plusAddr# s 1#) st 120 | | otherwise -> Fail# st 121 | {-# inline withSatisfyAscii #-} 122 | 123 | -- | Parse an ASCII `Char` for which a predicate holds. 124 | -- 125 | -- Assumption: the predicate must only return 'True' for ASCII-range characters. 126 | -- Otherwise this function might read a 128-255 range byte, thereby breaking 127 | -- UTF-8 decoding. 128 | satisfyAscii :: (Char -> Bool) -> ParserT st e Char 129 | satisfyAscii f = withSatisfyAscii f pure 130 | {-# inline satisfyAscii #-} 131 | 132 | -- | Skip an ASCII `Char` for which a predicate holds. Assumption: the predicate 133 | -- must only return `True` for ASCII-range characters. 134 | skipSatisfyAscii :: (Char -> Bool) -> ParserT st e () 135 | skipSatisfyAscii f = withSatisfyAscii f (\_ -> pure ()) 136 | {-# inline skipSatisfyAscii #-} 137 | 138 | -- | This is a variant of `satisfy` which allows more optimization. We can pick four testing 139 | -- functions for the four cases for the possible number of bytes in the UTF-8 character. So in 140 | -- @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with 141 | -- @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups. 142 | -- 143 | -- For example, if we want to accept any letter, the naive solution would be to use 144 | -- `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We 145 | -- can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the 146 | -- `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of 147 | -- all cases without accessing the character table. 148 | fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st e Char 149 | fusedSatisfy f1 f2 f3 f4 = ParserT \fp eob buf st -> 150 | case eqAddr# eob buf of 151 | 1# -> Fail# st 152 | _ -> case Common.derefChar8# buf of 153 | c1 -> case c1 `leChar#` '\x7F'# of 154 | 1# | f1 (C# c1) -> OK# st (C# c1) (plusAddr# buf 1#) 155 | | otherwise -> Fail# st 156 | _ -> case eqAddr# eob (plusAddr# buf 1#) of 157 | 1# -> Fail# st 158 | _ -> case indexCharOffAddr# buf 1# of 159 | c2 -> case c1 `leChar#` '\xDF'# of 160 | 1# -> 161 | let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` 162 | (ord# c2 -# 0x80#))) 163 | in case f2 resc of 164 | True -> OK# st resc (plusAddr# buf 2#) 165 | _ -> Fail# st 166 | _ -> case eqAddr# eob (plusAddr# buf 2#) of 167 | 1# -> Fail# st 168 | _ -> case indexCharOffAddr# buf 2# of 169 | c3 -> case c1 `leChar#` '\xEF'# of 170 | 1# -> 171 | let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` 172 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 173 | (ord# c3 -# 0x80#))) 174 | in case f3 resc of 175 | True -> OK# st resc (plusAddr# buf 3#) 176 | _ -> Fail# st 177 | _ -> case eqAddr# eob (plusAddr# buf 3#) of 178 | 1# -> Fail# st 179 | _ -> case indexCharOffAddr# buf 3# of 180 | c4 -> 181 | let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` 182 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` 183 | ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 184 | (ord# c4 -# 0x80#))) 185 | in case f4 resc of 186 | True -> OK# st resc (plusAddr# buf 4#) 187 | _ -> Fail# st 188 | {-# inline fusedSatisfy #-} 189 | 190 | -- | Skipping variant of `fusedSatisfy`. 191 | skipFusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e () 192 | skipFusedSatisfy f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4 193 | {-# inline skipFusedSatisfy #-} 194 | 195 | -------------------------------------------------------------------------------- 196 | 197 | -- | Parse a non-empty ASCII decimal digit sequence as a 'Word'. 198 | -- Fails on overflow. 199 | anyAsciiDecimalWord :: ParserT st e Word 200 | anyAsciiDecimalWord = ParserT \fp eob s st -> 201 | case Common.anyAsciiDecimalWord# eob s of 202 | (# | (# w, s' #) #) -> OK# st (W# w) s' 203 | (# (##) | #) -> Fail# st 204 | {-# inline anyAsciiDecimalWord #-} 205 | 206 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'. 207 | -- Fails on overflow. 208 | anyAsciiDecimalInt :: ParserT st e Int 209 | anyAsciiDecimalInt = ParserT \fp eob s st -> 210 | case Common.anyAsciiDecimalInt# eob s of 211 | (# | (# n, s' #) #) -> OK# st (I# n) s' 212 | (# (##) | #) -> Fail# st 213 | {-# inline anyAsciiDecimalInt #-} 214 | 215 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Integer'. 216 | anyAsciiDecimalInteger :: ParserT st e Integer 217 | anyAsciiDecimalInteger = ParserT \fp eob s st -> 218 | case Common.anyAsciiDecimalInteger# fp eob s of 219 | (# | (# i, s' #) #) -> OK# st i s' 220 | (# (##) | #) -> Fail# st 221 | {-# inline anyAsciiDecimalInteger #-} 222 | 223 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a 224 | -- 'Word'. 225 | -- Fails on overflow. 226 | anyAsciiHexWord :: ParserT st e Word 227 | anyAsciiHexWord = ParserT \fp eob s st -> 228 | case Common.anyAsciiHexWord# eob s of 229 | (# | (# w, s' #) #) -> OK# st (W# w) s' 230 | (# (##) | #) -> Fail# st 231 | {-# inline anyAsciiHexWord #-} 232 | 233 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a 234 | -- positive 'Int'. 235 | -- Fails on overflow. 236 | anyAsciiHexInt :: ParserT st e Int 237 | anyAsciiHexInt = ParserT \fp eob s st -> 238 | case Common.anyAsciiHexInt# eob s of 239 | (# | (# n, s' #) #) -> OK# st (I# n) s' 240 | (# (##) | #) -> Fail# st 241 | {-# inline anyAsciiHexInt #-} 242 | 243 | -------------------------------------------------------------------------------- 244 | 245 | -- | Parse any single ASCII character (a single byte) as a 'Char'. 246 | -- 247 | -- More efficient than 'anyChar' for ASCII-only input. 248 | anyAsciiChar :: ParserT st e Char 249 | anyAsciiChar = withEnsure1 $ ParserT \fp eob buf st -> 250 | case Common.derefChar8# buf of 251 | c1 -> case c1 `leChar#` '\x7F'# of 252 | 1# -> OK# st (C# c1) (plusAddr# buf 1#) 253 | _ -> Fail# st 254 | {-# inline anyAsciiChar #-} 255 | 256 | -- | Skip any single ASCII character (a single byte). 257 | -- 258 | -- More efficient than 'skipAnyChar' for ASCII-only input. 259 | skipAnyAsciiChar :: ParserT st e () 260 | skipAnyAsciiChar = () <$ anyAsciiChar 261 | {-# inline skipAnyAsciiChar #-} 262 | 263 | -------------------------------------------------------------------------------- 264 | 265 | -- | Parse a UTF-8 character literal. This is a template function, you can use it as 266 | -- @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@. 267 | char :: Char -> Q Exp 268 | char c = string [c] 269 | 270 | -- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@, 271 | -- for example, and the splice has type @Parser e ()@. 272 | string :: String -> Q Exp 273 | string str = bytes (Common.strToBytes str) 274 | 275 | -------------------------------------------------------------------------------- 276 | 277 | -- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding, 278 | -- throws an error if the encoding is invalid. 279 | takeLine :: ParserT st e String 280 | takeLine = branch eof (pure "") do 281 | c <- anyChar 282 | case c of 283 | '\n' -> pure "" 284 | _ -> (c:) <$> takeLine 285 | 286 | -- | Parse the rest of the current line as a `String`, but restore the parsing state. 287 | -- Assumes UTF-8 encoding. This can be used for debugging. 288 | traceLine :: ParserT st e String 289 | traceLine = lookahead takeLine 290 | 291 | -- | Take the rest of the input as a `String`. Assumes UTF-8 encoding. 292 | takeRestString :: ParserT st e String 293 | takeRestString = branch eof (pure "") do 294 | c <- anyChar 295 | cs <- takeRestString 296 | pure (c:cs) 297 | 298 | -- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding. 299 | -- This can be used for debugging. 300 | traceRest :: ParserT st e String 301 | traceRest = lookahead takeRestString 302 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful/Text.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE UnboxedTuples #-} 2 | 3 | -- | Parsers and textual data (UTF-8, ASCII). 4 | 5 | module FlatParse.Stateful.Text 6 | ( 7 | -- * UTF-8 8 | char, string 9 | , anyChar, skipAnyChar 10 | , satisfy, skipSatisfy 11 | , fusedSatisfy, skipFusedSatisfy 12 | , takeLine 13 | , takeRestString 14 | 15 | -- * ASCII 16 | , anyAsciiChar, skipAnyAsciiChar 17 | , satisfyAscii, skipSatisfyAscii 18 | 19 | -- ** ASCII-encoded numbers 20 | , anyAsciiDecimalWord 21 | , anyAsciiDecimalInt 22 | , anyAsciiDecimalInteger 23 | , anyAsciiHexWord 24 | , anyAsciiHexInt 25 | 26 | -- * Debugging parsers 27 | , traceLine 28 | , traceRest 29 | ) where 30 | 31 | import FlatParse.Stateful.Parser 32 | import FlatParse.Stateful.Base ( withEnsure1, lookahead, eof, branch ) 33 | import FlatParse.Stateful.Bytes ( bytes ) 34 | 35 | import FlatParse.Common.GHCExts 36 | 37 | import Language.Haskell.TH 38 | import qualified FlatParse.Common.Numbers as Common 39 | import qualified FlatParse.Common.Assorted as Common 40 | 41 | -- | Parse any single Unicode character encoded using UTF-8 as a 'Char'. 42 | anyChar :: ParserT st r e Char 43 | anyChar = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of 44 | 1# -> Fail# st 45 | _ -> case Common.derefChar8# buf of 46 | c1 -> case c1 `leChar#` '\x7F'# of 47 | 1# -> OK# st (C# c1) (plusAddr# buf 1#) n 48 | _ -> case eqAddr# eob (plusAddr# buf 1#) of 49 | 1# -> Fail# st 50 | _ -> case indexCharOffAddr# buf 1# of 51 | c2 -> case c1 `leChar#` '\xDF'# of 52 | 1# -> 53 | let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` 54 | (ord# c2 -# 0x80#) 55 | in OK# st (C# (chr# resc)) (plusAddr# buf 2#) n 56 | _ -> case eqAddr# eob (plusAddr# buf 2#) of 57 | 1# -> Fail# st 58 | _ -> case indexCharOffAddr# buf 2# of 59 | c3 -> case c1 `leChar#` '\xEF'# of 60 | 1# -> 61 | let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` 62 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 63 | (ord# c3 -# 0x80#) 64 | in OK# st (C# (chr# resc)) (plusAddr# buf 3#) n 65 | _ -> case eqAddr# eob (plusAddr# buf 3#) of 66 | 1# -> Fail# st 67 | _ -> case indexCharOffAddr# buf 3# of 68 | c4 -> 69 | let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` 70 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` 71 | ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 72 | (ord# c4 -# 0x80#) 73 | in OK# st (C# (chr# resc)) (plusAddr# buf 4#) n 74 | {-# inline anyChar #-} 75 | 76 | -- | Skip any single Unicode character encoded using UTF-8. 77 | skipAnyChar :: ParserT st r e () 78 | skipAnyChar = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of 79 | 1# -> Fail# st 80 | _ -> case Common.derefChar8# buf of 81 | c1 -> case c1 `leChar#` '\x7F'# of 82 | 1# -> OK# st () (plusAddr# buf 1#) n 83 | _ -> 84 | let buf' = 85 | case c1 `leChar#` '\xDF'# of 86 | 1# -> plusAddr# buf 2# 87 | _ -> case c1 `leChar#` '\xEF'# of 88 | 1# -> plusAddr# buf 3# 89 | _ -> plusAddr# buf 4# 90 | in case leAddr# buf' eob of 91 | 1# -> OK# st () buf' n 92 | _ -> Fail# st 93 | {-# inline skipAnyChar #-} 94 | 95 | withSatisfy 96 | :: (Char -> Bool) -> (Char -> ParserT st r e ret) -> ParserT st r e ret 97 | withSatisfy f p = ParserT \fp !r eob s n st -> 98 | case runParserT# anyChar fp r eob s n st of 99 | -- This is OK# unfolded, to silence incomplete pattern warnings 100 | -- in GHC <= 8.8.4. 101 | (# st, (# (# c, s, n #) | | #) #) | f c -> runParserT# (p c) fp r eob s n st 102 | (# st, _ #) -> Fail# st 103 | {-# inline withSatisfy #-} 104 | 105 | -- | Parse a UTF-8 'Char' for which a predicate holds. 106 | satisfy :: (Char -> Bool) -> ParserT st r e Char 107 | satisfy f = withSatisfy f pure 108 | {-# inline satisfy #-} 109 | 110 | -- | Skip a UTF-8 `Char` for which a predicate holds. 111 | skipSatisfy :: (Char -> Bool) -> ParserT st r e () 112 | skipSatisfy f = withSatisfy f (\_ -> pure ()) 113 | {-# inline skipSatisfy #-} 114 | 115 | withSatisfyAscii 116 | :: (Char -> Bool) -> (Char -> ParserT st r e ret) -> ParserT st r e ret 117 | withSatisfyAscii f p = withEnsure1 $ ParserT \fp !r eob s n st -> 118 | case Common.derefChar8# s of 119 | c1 | f (C# c1) -> runParserT# (p (C# c1)) fp r eob (plusAddr# s 1#) n st 120 | | otherwise -> Fail# st 121 | {-# inline withSatisfyAscii #-} 122 | 123 | -- | Parse an ASCII 'Char' for which a predicate holds. 124 | -- 125 | -- Assumption: the predicate must only return 'True' for ASCII-range characters. 126 | -- Otherwise this function might read a 128-255 range byte, thereby breaking 127 | -- UTF-8 decoding. 128 | satisfyAscii :: (Char -> Bool) -> ParserT st r e Char 129 | satisfyAscii f = withSatisfyAscii f pure 130 | {-# inline satisfyAscii #-} 131 | 132 | -- | Skip an ASCII `Char` for which a predicate holds. Assumption: the predicate 133 | -- must only return `True` for ASCII-range characters. 134 | skipSatisfyAscii :: (Char -> Bool) -> ParserT st r e () 135 | skipSatisfyAscii f = withSatisfyAscii f (\_ -> pure ()) 136 | {-# inline skipSatisfyAscii #-} 137 | 138 | -- | This is a variant of `satisfy` which allows more optimization. We can pick four testing 139 | -- functions for the four cases for the possible number of bytes in the UTF-8 character. So in 140 | -- @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with 141 | -- @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups. 142 | -- 143 | -- For example, if we want to accept any letter, the naive solution would be to use 144 | -- `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We 145 | -- can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the 146 | -- `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of 147 | -- all cases without accessing the character table. 148 | fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st r e Char 149 | fusedSatisfy f1 f2 f3 f4 = ParserT \fp !r eob buf n st -> 150 | case eqAddr# eob buf of 151 | 1# -> Fail# st 152 | _ -> case Common.derefChar8# buf of 153 | c1 -> case c1 `leChar#` '\x7F'# of 154 | 1# | f1 (C# c1) -> OK# st (C# c1) (plusAddr# buf 1#) n 155 | | otherwise -> Fail# st 156 | _ -> case eqAddr# eob (plusAddr# buf 1#) of 157 | 1# -> Fail# st 158 | _ -> case indexCharOffAddr# buf 1# of 159 | c2 -> case c1 `leChar#` '\xDF'# of 160 | 1# -> 161 | let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` 162 | (ord# c2 -# 0x80#))) 163 | in case f2 resc of 164 | True -> OK# st resc (plusAddr# buf 2#) n 165 | _ -> Fail# st 166 | _ -> case eqAddr# eob (plusAddr# buf 2#) of 167 | 1# -> Fail# st 168 | _ -> case indexCharOffAddr# buf 2# of 169 | c3 -> case c1 `leChar#` '\xEF'# of 170 | 1# -> 171 | let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` 172 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 173 | (ord# c3 -# 0x80#))) 174 | in case f3 resc of 175 | True -> OK# st resc (plusAddr# buf 3#) n 176 | _ -> Fail# st 177 | _ -> case eqAddr# eob (plusAddr# buf 3#) of 178 | 1# -> Fail# st 179 | _ -> case indexCharOffAddr# buf 3# of 180 | c4 -> 181 | let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` 182 | ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` 183 | ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` 184 | (ord# c4 -# 0x80#))) 185 | in case f4 resc of 186 | True -> OK# st resc (plusAddr# buf 4#) n 187 | _ -> Fail# st 188 | {-# inline fusedSatisfy #-} 189 | 190 | -- | Skipping variant of `fusedSatisfy`. 191 | skipFusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> ParserT st r e () 192 | skipFusedSatisfy f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4 193 | {-# inline skipFusedSatisfy #-} 194 | 195 | -------------------------------------------------------------------------------- 196 | 197 | -- | Parse a non-empty ASCII decimal digit sequence as a 'Word'. 198 | -- Fails on overflow. 199 | anyAsciiDecimalWord :: ParserT st r e Word 200 | anyAsciiDecimalWord = ParserT \fp !r eob s n st -> 201 | case Common.anyAsciiDecimalWord# eob s of 202 | (# | (# w, s' #) #) -> OK# st (W# w) s' n 203 | (# (##) | #) -> Fail# st 204 | {-# inline anyAsciiDecimalWord #-} 205 | 206 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Int'. 207 | -- Fails on overflow. 208 | anyAsciiDecimalInt :: ParserT st r e Int 209 | anyAsciiDecimalInt = ParserT \fp !r eob s n st -> 210 | case Common.anyAsciiDecimalInt# eob s of 211 | (# | (# i, s' #) #) -> OK# st (I# i) s' n 212 | (# (##) | #) -> Fail# st 213 | {-# inline anyAsciiDecimalInt #-} 214 | 215 | -- | Parse a non-empty ASCII decimal digit sequence as a positive 'Integer'. 216 | anyAsciiDecimalInteger :: ParserT st r e Integer 217 | anyAsciiDecimalInteger = ParserT \fp !r eob s n st -> 218 | case Common.anyAsciiDecimalInteger# fp eob s of 219 | (# | (# i, s' #) #) -> OK# st i s' n 220 | (# (##) | #) -> Fail# st 221 | {-# inline anyAsciiDecimalInteger #-} 222 | 223 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a 224 | -- 'Word'. 225 | -- Fails on overflow. 226 | anyAsciiHexWord :: ParserT st r e Word 227 | anyAsciiHexWord = ParserT \fp !r eob s n st -> 228 | case Common.anyAsciiHexWord# eob s of 229 | (# | (# w, s' #) #) -> OK# st (W# w) s' n 230 | (# (##) | #) -> Fail# st 231 | {-# inline anyAsciiHexWord #-} 232 | 233 | -- | Parse a non-empty, case-insensitive ASCII hexadecimal digit sequence as a 234 | -- positive 'Int'. 235 | -- Fails on overflow. 236 | anyAsciiHexInt :: ParserT st r e Int 237 | anyAsciiHexInt = ParserT \fp !r eob s n st -> 238 | case Common.anyAsciiHexInt# eob s of 239 | (# | (# i, s' #) #) -> OK# st (I# i) s' n 240 | (# (##) | #) -> Fail# st 241 | {-# inline anyAsciiHexInt #-} 242 | 243 | -------------------------------------------------------------------------------- 244 | 245 | -- | Parse any single ASCII character (a single byte) as a 'Char'. 246 | -- 247 | -- More efficient than 'anyChar' for ASCII-only input. 248 | anyAsciiChar :: ParserT st r e Char 249 | anyAsciiChar = withEnsure1 $ ParserT \fp !r eob buf n st -> 250 | case Common.derefChar8# buf of 251 | c1 -> case c1 `leChar#` '\x7F'# of 252 | 1# -> OK# st (C# c1) (plusAddr# buf 1#) n 253 | _ -> Fail# st 254 | {-# inline anyAsciiChar #-} 255 | 256 | -- | Skip any single ASCII character (a single byte). 257 | -- 258 | -- More efficient than 'anyChar_' for ASCII-only input. 259 | skipAnyAsciiChar :: ParserT st r e () 260 | skipAnyAsciiChar = () <$ anyAsciiChar 261 | {-# inline skipAnyAsciiChar #-} 262 | 263 | -------------------------------------------------------------------------------- 264 | 265 | -- | Parse a UTF-8 character literal. This is a template function, you can use it as 266 | -- @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@. 267 | char :: Char -> Q Exp 268 | char c = string [c] 269 | 270 | -- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@, 271 | -- for example, and the splice has type @Parser e ()@. 272 | string :: String -> Q Exp 273 | string str = bytes (Common.strToBytes str) 274 | 275 | -------------------------------------------------------------------------------- 276 | 277 | -- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding, 278 | -- throws an error if the encoding is invalid. 279 | takeLine :: ParserT st r e String 280 | takeLine = branch eof (pure "") do 281 | c <- anyChar 282 | case c of 283 | '\n' -> pure "" 284 | _ -> (c:) <$> takeLine 285 | 286 | -- | Parse the rest of the current line as a `String`, but restore the parsing state. 287 | -- Assumes UTF-8 encoding. This can be used for debugging. 288 | traceLine :: ParserT st r e String 289 | traceLine = lookahead takeLine 290 | 291 | -- | Take the rest of the input as a `String`. Assumes UTF-8 encoding. 292 | takeRestString :: ParserT st r e String 293 | takeRestString = branch eof (pure "") do 294 | c <- anyChar 295 | cs <- takeRestString 296 | pure (c:cs) 297 | 298 | -- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding. 299 | -- This can be used for debugging. 300 | traceRest :: ParserT st r e String 301 | traceRest = lookahead takeRestString 302 | -------------------------------------------------------------------------------- /src/FlatParse/Basic/Integers.hs: -------------------------------------------------------------------------------- 1 | -- | Machine integer parsers. 2 | 3 | module FlatParse.Basic.Integers 4 | ( 5 | -- * Native byte order 6 | anyWord8, anyWord16, anyWord32, anyWord64 7 | , anyInt8, anyInt16, anyInt32, anyInt64 8 | , anyWord, anyInt 9 | 10 | -- * Explicit endianness 11 | -- $explicit-endianness 12 | , anyWord16le, anyWord16be 13 | , anyWord32le, anyWord32be 14 | , anyWord64le, anyWord64be 15 | , anyInt16le, anyInt16be 16 | , anyInt32le, anyInt32be 17 | , anyInt64le, anyInt64be 18 | 19 | -- * Value assertions 20 | , word8 21 | 22 | -- * CPS parsers 23 | , withAnyWord8, withAnyWord16, withAnyWord32, withAnyWord64 24 | , withAnyInt8, withAnyInt16, withAnyInt32, withAnyInt64 25 | , withAnyWord, withAnyInt 26 | 27 | -- * Unsafe 28 | -- $unsafe 29 | , anyWord8Unsafe 30 | 31 | -- ** Value assertions 32 | , word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe 33 | 34 | -- * Helper definitions 35 | , withAnySized#, withAnySizedUnsafe# 36 | , sizedUnsafe# 37 | ) where 38 | 39 | -- for WORDS_BIGENDIAN 40 | #include "MachDeps.h" 41 | 42 | import FlatParse.Basic.Parser 43 | import FlatParse.Basic.Base ( withEnsure# ) 44 | 45 | import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 ) 46 | 47 | import FlatParse.Common.GHCExts 48 | import GHC.Word 49 | import GHC.Int 50 | 51 | import Control.Applicative ( Alternative(empty) ) 52 | 53 | -------------------------------------------------------------------------------- 54 | 55 | -- | Helper for defining CPS parsers for types of a constant byte size (i.e. 56 | -- machine integers). 57 | -- 58 | -- Call this with an @indexXYZOffAddr@ primop (e.g. 59 | -- 'GHC.Exts.indexWord8OffAddr') and the size in bytes of the type you're 60 | -- parsing. 61 | withAnySized# 62 | :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st e r) -> ParserT st e r 63 | withAnySized# size# indexOffAddr p = 64 | withEnsure# size# (withAnySizedUnsafe# size# indexOffAddr p) 65 | {-# inline withAnySized# #-} 66 | 67 | -- | Unsafe helper for defining CPS parsers for types of a constant byte size 68 | -- (i.e. machine integers). 69 | -- 70 | -- Is really just syntactic sugar for applying the given parser and shifting the 71 | -- buffer along. 72 | -- 73 | -- The caller must guarantee that the input has enough bytes. 74 | withAnySizedUnsafe# 75 | :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st e r) -> ParserT st e r 76 | withAnySizedUnsafe# size# indexOffAddr p = ParserT \fp eob buf st -> 77 | let !a = indexOffAddr buf 0# 78 | buf' = plusAddr# buf size# 79 | in runParserT# (p a) fp eob buf' st 80 | {-# inline withAnySizedUnsafe# #-} 81 | 82 | -- | Parse any 'Word8' (CPS). 83 | withAnyWord8 :: (Word8 -> ParserT st e r) -> ParserT st e r 84 | withAnyWord8 p = ParserT \fp eob buf st -> case eqAddr# eob buf of 85 | 1# -> Fail# st 86 | _ -> let w# = indexWord8OffAddr# buf 0# 87 | in runParserT# (p (W8# w#)) fp eob (plusAddr# buf 1#) st 88 | {-# inline withAnyWord8 #-} 89 | 90 | -- | Parse any 'Word16' (native byte order) (CPS). 91 | withAnyWord16 :: (Word16 -> ParserT st e r) -> ParserT st e r 92 | withAnyWord16 = withAnySized# 2# (\a i -> W16# (indexWord16OffAddr# a i)) 93 | {-# inline withAnyWord16 #-} 94 | 95 | -- | Parse any 'Word32' (native byte order) (CPS). 96 | withAnyWord32 :: (Word32 -> ParserT st e r) -> ParserT st e r 97 | withAnyWord32 = withAnySized# 4# (\a i -> W32# (indexWord32OffAddr# a i)) 98 | {-# inline withAnyWord32 #-} 99 | 100 | -- | Parse any 'Word64' (native byte order) (CPS). 101 | withAnyWord64 :: (Word64 -> ParserT st e r) -> ParserT st e r 102 | withAnyWord64 = withAnySized# 8# (\a i -> W64# (indexWord64OffAddr# a i)) 103 | {-# inline withAnyWord64 #-} 104 | 105 | -- | Parse any 'Int8' (CPS). 106 | withAnyInt8 :: (Int8 -> ParserT st e r) -> ParserT st e r 107 | withAnyInt8 p = ParserT \fp eob buf st -> case eqAddr# eob buf of 108 | 1# -> Fail# st 109 | _ -> let i# = indexInt8OffAddr# buf 0# 110 | in runParserT# (p (I8# i#)) fp eob (plusAddr# buf 1#) st 111 | {-# inline withAnyInt8 #-} 112 | 113 | -- | Parse any 'Int16' (native byte order) (CPS). 114 | withAnyInt16 :: (Int16 -> ParserT st e r) -> ParserT st e r 115 | withAnyInt16 = withAnySized# 2# (\a i -> I16# (indexInt16OffAddr# a i)) 116 | {-# inline withAnyInt16 #-} 117 | 118 | -- | Parse any 'Int32' (native byte order) (CPS). 119 | withAnyInt32 :: (Int32 -> ParserT st e r) -> ParserT st e r 120 | withAnyInt32 = withAnySized# 4# (\a i -> I32# (indexInt32OffAddr# a i)) 121 | {-# inline withAnyInt32 #-} 122 | 123 | -- | Parse any 'Int64' (native byte order) (CPS). 124 | withAnyInt64 :: (Int64 -> ParserT st e r) -> ParserT st e r 125 | withAnyInt64 = withAnySized# 8# (\a i -> I64# (indexInt64OffAddr# a i)) 126 | {-# inline withAnyInt64 #-} 127 | 128 | -- | Parse any 'Word' (native size) (CPS). 129 | withAnyWord :: (Word -> ParserT st e r) -> ParserT st e r 130 | withAnyWord p = ParserT \fp eob buf st -> case SIZEOF_HSWORD# <=# minusAddr# eob buf of 131 | 0# -> Fail# st 132 | _ -> let w# = indexWordOffAddr# buf 0# 133 | in runParserT# (p (W# w#)) fp eob (plusAddr# buf SIZEOF_HSWORD#) st 134 | {-# inline withAnyWord #-} 135 | 136 | -- | Parse any 'Int' (native size) (CPS). 137 | withAnyInt :: (Int -> ParserT st e r) -> ParserT st e r 138 | withAnyInt p = ParserT \fp eob buf st -> case SIZEOF_HSWORD# <=# minusAddr# eob buf of 139 | 0# -> Fail# st 140 | _ -> let i# = indexIntOffAddr# buf 0# 141 | in runParserT# (p (I# i#)) fp eob (plusAddr# buf SIZEOF_HSWORD#) st 142 | {-# inline withAnyInt #-} 143 | 144 | -------------------------------------------------------------------------------- 145 | 146 | -- | Parse any 'Word8'. 147 | anyWord8 :: ParserT st e Word8 148 | anyWord8 = withAnyWord8 pure 149 | {-# inline anyWord8 #-} 150 | 151 | -- | Parse any 'Word16' (native byte order). 152 | anyWord16 :: ParserT st e Word16 153 | anyWord16 = withAnyWord16 pure 154 | {-# inline anyWord16 #-} 155 | 156 | -- | Parse any 'Word32' (native byte order). 157 | anyWord32 :: ParserT st e Word32 158 | anyWord32 = withAnyWord32 pure 159 | {-# inline anyWord32 #-} 160 | 161 | -- | Parse any 'Word64' (native byte order). 162 | anyWord64 :: ParserT st e Word64 163 | anyWord64 = withAnyWord64 pure 164 | {-# inline anyWord64 #-} 165 | 166 | -- | Parse any 'Int8'. 167 | anyInt8 :: ParserT st e Int8 168 | anyInt8 = withAnyInt8 pure 169 | {-# inline anyInt8 #-} 170 | 171 | -- | Parse any 'Int16' (native byte order). 172 | anyInt16 :: ParserT st e Int16 173 | anyInt16 = withAnyInt16 pure 174 | {-# inline anyInt16 #-} 175 | 176 | -- | Parse any 'Int32' (native byte order). 177 | anyInt32 :: ParserT st e Int32 178 | anyInt32 = withAnyInt32 pure 179 | {-# inline anyInt32 #-} 180 | 181 | -- | Parse any 'Int64' (native byte order). 182 | anyInt64 :: ParserT st e Int64 183 | anyInt64 = withAnyInt64 pure 184 | {-# inline anyInt64 #-} 185 | 186 | -- | Parse any 'Word' (native size). 187 | anyWord :: ParserT st e Word 188 | anyWord = withAnyWord pure 189 | {-# inline anyWord #-} 190 | 191 | -- | Parse any 'Int' (native size). 192 | anyInt :: ParserT st e Int 193 | anyInt = withAnyInt pure 194 | {-# inline anyInt #-} 195 | 196 | -------------------------------------------------------------------------------- 197 | 198 | {- $explicit-endianness 199 | Native endianness parsers are used where possible. For non-native endianness 200 | parsers, we parse then use the corresponding @byteSwapX@ function. On x86, this 201 | is inlined as a single @BSWAP@ instruction. 202 | -} 203 | 204 | -- | Parse any 'Word16' (little-endian). 205 | anyWord16le :: ParserT st e Word16 206 | #if defined(WORDS_BIGENDIAN) 207 | anyWord16le = withAnyWord16 (pure . byteSwap16) 208 | #else 209 | anyWord16le = anyWord16 210 | #endif 211 | {-# inline anyWord16le #-} 212 | 213 | -- | Parse any 'Word16' (big-endian). 214 | anyWord16be :: ParserT st e Word16 215 | #if defined(WORDS_BIGENDIAN) 216 | anyWord16be = anyWord16 217 | #else 218 | anyWord16be = withAnyWord16 (pure . byteSwap16) 219 | #endif 220 | {-# inline anyWord16be #-} 221 | 222 | -- | Parse any 'Word32' (little-endian). 223 | anyWord32le :: ParserT st e Word32 224 | #if defined(WORDS_BIGENDIAN) 225 | anyWord32le = withAnyWord32 (pure . byteSwap32) 226 | #else 227 | anyWord32le = anyWord32 228 | #endif 229 | {-# inline anyWord32le #-} 230 | 231 | -- | Parse any 'Word32' (big-endian). 232 | anyWord32be :: ParserT st e Word32 233 | #if defined(WORDS_BIGENDIAN) 234 | anyWord32be = anyWord32 235 | #else 236 | anyWord32be = withAnyWord32 (pure . byteSwap32) 237 | #endif 238 | {-# inline anyWord32be #-} 239 | 240 | -- | Parse any 'Word64' (little-endian). 241 | anyWord64le :: ParserT st e Word64 242 | #if defined(WORDS_BIGENDIAN) 243 | anyWord64le = withAnyWord64 (pure . byteSwap64) 244 | #else 245 | anyWord64le = anyWord64 246 | #endif 247 | {-# inline anyWord64le #-} 248 | 249 | -- | Parse any 'Word64' (big-endian). 250 | anyWord64be :: ParserT st e Word64 251 | #if defined(WORDS_BIGENDIAN) 252 | anyWord64be = anyWord64 253 | #else 254 | anyWord64be = withAnyWord64 (pure . byteSwap64) 255 | #endif 256 | {-# inline anyWord64be #-} 257 | 258 | -- | Parse any 'Int16' (little-endian). 259 | anyInt16le :: ParserT st e Int16 260 | #if defined(WORDS_BIGENDIAN) 261 | anyInt16le = withAnyWord16 (pure . word16ToInt16 . byteSwap16) 262 | #else 263 | anyInt16le = anyInt16 264 | #endif 265 | {-# inline anyInt16le #-} 266 | 267 | -- | Parse any 'Int16' (big-endian). 268 | anyInt16be :: ParserT st e Int16 269 | #if defined(WORDS_BIGENDIAN) 270 | anyInt16be = anyInt16 271 | #else 272 | anyInt16be = withAnyWord16 (pure . word16ToInt16 . byteSwap16) 273 | #endif 274 | {-# inline anyInt16be #-} 275 | 276 | -- | Parse any 'Int32' (little-endian). 277 | anyInt32le :: ParserT st e Int32 278 | #if defined(WORDS_BIGENDIAN) 279 | anyInt32le = withAnyWord32 (pure . word32ToInt32 . byteSwap32) 280 | #else 281 | anyInt32le = anyInt32 282 | #endif 283 | {-# inline anyInt32le #-} 284 | 285 | -- | Parse any 'Int32' (big-endian). 286 | anyInt32be :: ParserT st e Int32 287 | #if defined(WORDS_BIGENDIAN) 288 | anyInt32be = anyInt32 289 | #else 290 | anyInt32be = withAnyWord32 (pure . word32ToInt32 . byteSwap32) 291 | #endif 292 | {-# inline anyInt32be #-} 293 | 294 | -- | Parse any 'Int64' (little-endian). 295 | anyInt64le :: ParserT st e Int64 296 | #if defined(WORDS_BIGENDIAN) 297 | anyInt64le = withAnyWord64 (pure . word64ToInt64 . byteSwap64) 298 | #else 299 | anyInt64le = anyInt64 300 | #endif 301 | {-# inline anyInt64le #-} 302 | 303 | -- | Parse any 'Int64' (big-endian). 304 | anyInt64be :: ParserT st e Int64 305 | #if defined(WORDS_BIGENDIAN) 306 | anyInt64be = anyInt64 307 | #else 308 | anyInt64be = withAnyWord64 (pure . word64ToInt64 . byteSwap64) 309 | #endif 310 | {-# inline anyInt64be #-} 311 | 312 | -------------------------------------------------------------------------------- 313 | 314 | -- | Read the next 1 byte and assert its value as a 'Word8'. 315 | word8 :: Word8 -> ParserT st e () 316 | word8 wExpected = ParserT \fp eob buf st -> case eqAddr# eob buf of 317 | 1# -> Fail# st 318 | _ -> let w# = indexWord8OffAddr# buf 0# 319 | in if W8# w# == wExpected 320 | then OK# st () (plusAddr# buf 1#) 321 | else Fail# st 322 | {-# inline word8 #-} 323 | 324 | -------------------------------------------------------------------------------- 325 | 326 | {- $unsafe 327 | These unsafe parsers and helpers may be useful for efficient parsing in special 328 | situations e.g. you already know that the input has enough bytes. You should 329 | only use them if you can assert their necessary guarantees (see the individual 330 | function documentation). 331 | -} 332 | 333 | -- | Unsafe helper for defining parsers for types of a constant byte size (i.e. 334 | -- machine integers) which assert the parsed value's... value. 335 | -- 336 | -- Call this with an @indexXYZOffAddr@ primop (e.g. 337 | -- 'GHC.Exts.indexWord8OffAddr'), the size in bytes of the type you're parsing, 338 | -- and the expected value to test the parsed value against. 339 | -- 340 | -- The caller must guarantee that the input has enough bytes. 341 | sizedUnsafe# :: Eq a => Int# -> (Addr# -> Int# -> a) -> a -> ParserT st e () 342 | sizedUnsafe# size# indexOffAddr aExpected = 343 | withAnySizedUnsafe# size# indexOffAddr go 344 | where 345 | go aParsed = 346 | if aParsed == aExpected 347 | then pure () 348 | else empty 349 | {-# inline sizedUnsafe# #-} 350 | 351 | -- | Unsafely read the next 1 byte and assert its value as a 'Word8'. 352 | -- 353 | -- The caller must guarantee that the input has enough bytes. 354 | word8Unsafe :: Word8 -> ParserT st e () 355 | word8Unsafe = sizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) 356 | {-# inline word8Unsafe #-} 357 | 358 | -- | Unsafely read the next 2 bytes and assert their value as a 'Word16' 359 | -- (native byte order). 360 | -- 361 | -- The caller must guarantee that the input has enough bytes. 362 | word16Unsafe :: Word16 -> ParserT st e () 363 | word16Unsafe = sizedUnsafe# 2# (\a i -> W16# (indexWord16OffAddr# a i)) 364 | {-# inline word16Unsafe #-} 365 | 366 | -- | Unsafely read the next 4 bytes and assert their value as a 'Word32'. 367 | -- (native byte order). 368 | -- 369 | -- The caller must guarantee that the input has enough bytes. 370 | word32Unsafe :: Word32 -> ParserT st e () 371 | word32Unsafe = sizedUnsafe# 4# (\a i -> W32# (indexWord32OffAddr# a i)) 372 | {-# inline word32Unsafe #-} 373 | 374 | -- | Unsafely read the next 8 bytes and assert their value as a 'Word64'. 375 | -- (native byte order). 376 | -- 377 | -- The caller must guarantee that the input has enough bytes. 378 | word64Unsafe :: Word64 -> ParserT st e () 379 | word64Unsafe = sizedUnsafe# 8# (\a i -> W64# (indexWord64OffAddr# a i)) 380 | {-# inline word64Unsafe #-} 381 | 382 | -------------------------------------------------------------------------------- 383 | 384 | -- | Unsafely parse any 'Word8', without asserting the input is non-empty. 385 | -- 386 | -- The caller must guarantee that the input has enough bytes. 387 | anyWord8Unsafe :: ParserT st e Word8 388 | anyWord8Unsafe = withAnySizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) pure 389 | {-# inline anyWord8Unsafe #-} 390 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful/Integers.hs: -------------------------------------------------------------------------------- 1 | -- | Machine integer parsers. 2 | 3 | module FlatParse.Stateful.Integers 4 | ( 5 | -- * Native byte order 6 | anyWord8, anyWord16, anyWord32, anyWord64 7 | , anyInt8, anyInt16, anyInt32, anyInt64 8 | , anyWord, anyInt 9 | 10 | -- * Explicit endianness 11 | -- $explicit-endianness 12 | , anyWord16le, anyWord16be 13 | , anyWord32le, anyWord32be 14 | , anyWord64le, anyWord64be 15 | , anyInt16le, anyInt16be 16 | , anyInt32le, anyInt32be 17 | , anyInt64le, anyInt64be 18 | 19 | -- * Value assertions 20 | , word8 21 | 22 | -- * CPS parsers 23 | , withAnyWord8, withAnyWord16, withAnyWord32, withAnyWord64 24 | , withAnyInt8, withAnyInt16, withAnyInt32, withAnyInt64 25 | , withAnyWord, withAnyInt 26 | 27 | -- * Unsafe 28 | -- $unsafe 29 | , anyWord8Unsafe 30 | 31 | -- ** Value assertions 32 | , word8Unsafe, word16Unsafe, word32Unsafe, word64Unsafe 33 | 34 | -- * Helper definitions 35 | , withAnySized#, withAnySizedUnsafe# 36 | , sizedUnsafe# 37 | ) where 38 | 39 | -- for WORDS_BIGENDIAN 40 | #include "MachDeps.h" 41 | 42 | import FlatParse.Stateful.Parser 43 | import FlatParse.Stateful.Base ( withEnsure# ) 44 | 45 | import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 ) 46 | 47 | import FlatParse.Common.GHCExts 48 | import GHC.Word 49 | import GHC.Int 50 | 51 | import Control.Applicative ( Alternative(empty) ) 52 | 53 | -------------------------------------------------------------------------------- 54 | 55 | -- | Helper for defining CPS parsers for types of a constant byte size (i.e. 56 | -- machine integers). 57 | -- 58 | -- Call this with an @indexXYZOffAddr@ primop (e.g. 59 | -- 'GHC.Exts.indexWord8OffAddr') and the size in bytes of the type you're 60 | -- parsing. 61 | withAnySized# 62 | :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st r e ret) 63 | -> ParserT st r e ret 64 | withAnySized# size# indexOffAddr p = 65 | withEnsure# size# (withAnySizedUnsafe# size# indexOffAddr p) 66 | {-# inline withAnySized# #-} 67 | 68 | -- | Unsafe helper for defining CPS parsers for types of a constant byte size 69 | -- (i.e. machine integers). 70 | -- 71 | -- Is really just syntactic sugar for applying the given parser and shifting the 72 | -- buffer along. 73 | -- 74 | -- The caller must guarantee that the input has enough bytes. 75 | withAnySizedUnsafe# 76 | :: Int# -> (Addr# -> Int# -> a) -> (a -> ParserT st r e ret) 77 | -> ParserT st r e ret 78 | withAnySizedUnsafe# size# indexOffAddr p = ParserT \fp !r eob buf n st -> 79 | let a = indexOffAddr buf 0# 80 | buf' = plusAddr# buf size# 81 | in runParserT# (p a) fp r eob buf' n st 82 | {-# inline withAnySizedUnsafe# #-} 83 | 84 | -- | Parse any 'Word8' (CPS). 85 | withAnyWord8 :: (Word8 -> ParserT st r e ret) -> ParserT st r e ret 86 | withAnyWord8 p = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of 87 | 1# -> Fail# st 88 | _ -> let w# = indexWord8OffAddr# buf 0# 89 | in runParserT# (p (W8# w#)) fp r eob (plusAddr# buf 1#) n st 90 | {-# inline withAnyWord8 #-} 91 | 92 | -- | Parse any 'Word16' (native byte order) (CPS). 93 | withAnyWord16 :: (Word16 -> ParserT st r e ret) -> ParserT st r e ret 94 | withAnyWord16 = withAnySized# 2# (\a i -> W16# (indexWord16OffAddr# a i)) 95 | {-# inline withAnyWord16 #-} 96 | 97 | -- | Parse any 'Word32' (native byte order) (CPS). 98 | withAnyWord32 :: (Word32 -> ParserT st r e ret) -> ParserT st r e ret 99 | withAnyWord32 = withAnySized# 4# (\a i -> W32# (indexWord32OffAddr# a i)) 100 | {-# inline withAnyWord32 #-} 101 | 102 | -- | Parse any 'Word64' (native byte order) (CPS). 103 | withAnyWord64 :: (Word64 -> ParserT st r e ret) -> ParserT st r e ret 104 | withAnyWord64 = withAnySized# 8# (\a i -> W64# (indexWord64OffAddr# a i)) 105 | {-# inline withAnyWord64 #-} 106 | 107 | -- | Parse any 'Int8' (CPS). 108 | withAnyInt8 :: (Int8 -> ParserT st r e ret) -> ParserT st r e ret 109 | withAnyInt8 p = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of 110 | 1# -> Fail# st 111 | _ -> let i# = indexInt8OffAddr# buf 0# 112 | in runParserT# (p (I8# i#)) fp r eob (plusAddr# buf 1#) n st 113 | {-# inline withAnyInt8 #-} 114 | 115 | -- | Parse any 'Int16' (native byte order) (CPS). 116 | withAnyInt16 :: (Int16 -> ParserT st r e ret) -> ParserT st r e ret 117 | withAnyInt16 = withAnySized# 2# (\a i -> I16# (indexInt16OffAddr# a i)) 118 | {-# inline withAnyInt16 #-} 119 | 120 | -- | Parse any 'Int32' (native byte order) (CPS). 121 | withAnyInt32 :: (Int32 -> ParserT st r e ret) -> ParserT st r e ret 122 | withAnyInt32 = withAnySized# 4# (\a i -> I32# (indexInt32OffAddr# a i)) 123 | {-# inline withAnyInt32 #-} 124 | 125 | -- | Parse any 'Int64' (native byte order) (CPS). 126 | withAnyInt64 :: (Int64 -> ParserT st r e ret) -> ParserT st r e ret 127 | withAnyInt64 = withAnySized# 8# (\a i -> I64# (indexInt64OffAddr# a i)) 128 | {-# inline withAnyInt64 #-} 129 | 130 | -- | Parse any 'Word' (native size) (CPS). 131 | withAnyWord :: (Word -> ParserT st r e ret) -> ParserT st r e ret 132 | withAnyWord p = ParserT \fp !r eob buf n st -> case 8# <=# minusAddr# eob buf of 133 | 0# -> Fail# st 134 | _ -> let w# = indexWordOffAddr# buf 0# 135 | in runParserT# (p (W# w#)) fp r eob (plusAddr# buf SIZEOF_HSWORD#) n st 136 | {-# inline withAnyWord #-} 137 | 138 | -- | Parse any 'Int' (native size) (CPS). 139 | withAnyInt :: (Int -> ParserT st r e ret) -> ParserT st r e ret 140 | withAnyInt p = ParserT \fp !r eob buf n st -> case 8# <=# minusAddr# eob buf of 141 | 0# -> Fail# st 142 | _ -> let i# = indexIntOffAddr# buf 0# 143 | in runParserT# (p (I# i#)) fp r eob (plusAddr# buf SIZEOF_HSWORD#) n st 144 | {-# inline withAnyInt #-} 145 | 146 | -------------------------------------------------------------------------------- 147 | 148 | -- | Parse any 'Word8'. 149 | anyWord8 :: ParserT st r e Word8 150 | anyWord8 = withAnyWord8 pure 151 | {-# inline anyWord8 #-} 152 | 153 | -- | Parse any 'Word16' (native byte order). 154 | anyWord16 :: ParserT st r e Word16 155 | anyWord16 = withAnyWord16 pure 156 | {-# inline anyWord16 #-} 157 | 158 | -- | Parse any 'Word32' (native byte order). 159 | anyWord32 :: ParserT st r e Word32 160 | anyWord32 = withAnyWord32 pure 161 | {-# inline anyWord32 #-} 162 | 163 | -- | Parse any 'Word64' (native byte order). 164 | anyWord64 :: ParserT st r e Word64 165 | anyWord64 = withAnyWord64 pure 166 | {-# inline anyWord64 #-} 167 | 168 | -- | Parse any 'Int8'. 169 | anyInt8 :: ParserT st r e Int8 170 | anyInt8 = withAnyInt8 pure 171 | {-# inline anyInt8 #-} 172 | 173 | -- | Parse any 'Int16' (native byte order). 174 | anyInt16 :: ParserT st r e Int16 175 | anyInt16 = withAnyInt16 pure 176 | {-# inline anyInt16 #-} 177 | 178 | -- | Parse any 'Int32' (native byte order). 179 | anyInt32 :: ParserT st r e Int32 180 | anyInt32 = withAnyInt32 pure 181 | {-# inline anyInt32 #-} 182 | 183 | -- | Parse any 'Int64' (native byte order). 184 | anyInt64 :: ParserT st r e Int64 185 | anyInt64 = withAnyInt64 pure 186 | {-# inline anyInt64 #-} 187 | 188 | -- | Parse any 'Word' (native size). 189 | anyWord :: ParserT st r e Word 190 | anyWord = withAnyWord pure 191 | {-# inline anyWord #-} 192 | 193 | -- | Parse any 'Int' (native size). 194 | anyInt :: ParserT st r e Int 195 | anyInt = withAnyInt pure 196 | {-# inline anyInt #-} 197 | 198 | -------------------------------------------------------------------------------- 199 | 200 | {- $explicit-endianness 201 | Native endianness parsers are used where possible. For non-native endianness 202 | parsers, we parse then use the corresponding @byteSwapX@ function. On x86, this 203 | is inlined as a single @BSWAP@ instruction. 204 | -} 205 | 206 | -- | Parse any 'Word16' (little-endian). 207 | anyWord16le :: ParserT st r e Word16 208 | #if defined(WORDS_BIGENDIAN) 209 | anyWord16le = withAnyWord16 (pure . byteSwap16) 210 | #else 211 | anyWord16le = anyWord16 212 | #endif 213 | {-# inline anyWord16le #-} 214 | 215 | -- | Parse any 'Word16' (big-endian). 216 | anyWord16be :: ParserT st r e Word16 217 | #if defined(WORDS_BIGENDIAN) 218 | anyWord16be = anyWord16 219 | #else 220 | anyWord16be = withAnyWord16 (pure . byteSwap16) 221 | #endif 222 | {-# inline anyWord16be #-} 223 | 224 | -- | Parse any 'Word32' (little-endian). 225 | anyWord32le :: ParserT st r e Word32 226 | #if defined(WORDS_BIGENDIAN) 227 | anyWord32le = withAnyWord32 (pure . byteSwap32) 228 | #else 229 | anyWord32le = anyWord32 230 | #endif 231 | {-# inline anyWord32le #-} 232 | 233 | -- | Parse any 'Word32' (big-endian). 234 | anyWord32be :: ParserT st r e Word32 235 | #if defined(WORDS_BIGENDIAN) 236 | anyWord32be = anyWord32 237 | #else 238 | anyWord32be = withAnyWord32 (pure . byteSwap32) 239 | #endif 240 | {-# inline anyWord32be #-} 241 | 242 | -- | Parse any 'Word64' (little-endian). 243 | anyWord64le :: ParserT st r e Word64 244 | #if defined(WORDS_BIGENDIAN) 245 | anyWord64le = withAnyWord64 (pure . byteSwap64) 246 | #else 247 | anyWord64le = anyWord64 248 | #endif 249 | {-# inline anyWord64le #-} 250 | 251 | -- | Parse any 'Word64' (big-endian). 252 | anyWord64be :: ParserT st r e Word64 253 | #if defined(WORDS_BIGENDIAN) 254 | anyWord64be = anyWord64 255 | #else 256 | anyWord64be = withAnyWord64 (pure . byteSwap64) 257 | #endif 258 | {-# inline anyWord64be #-} 259 | 260 | -- | Parse any 'Int16' (little-endian). 261 | anyInt16le :: ParserT st r e Int16 262 | #if defined(WORDS_BIGENDIAN) 263 | anyInt16le = withAnyWord16 (pure . word16ToInt16 . byteSwap16) 264 | #else 265 | anyInt16le = anyInt16 266 | #endif 267 | {-# inline anyInt16le #-} 268 | 269 | -- | Parse any 'Int16' (big-endian). 270 | anyInt16be :: ParserT st r e Int16 271 | #if defined(WORDS_BIGENDIAN) 272 | anyInt16be = anyInt16 273 | #else 274 | anyInt16be = withAnyWord16 (pure . word16ToInt16 . byteSwap16) 275 | #endif 276 | {-# inline anyInt16be #-} 277 | 278 | -- | Parse any 'Int32' (little-endian). 279 | anyInt32le :: ParserT st r e Int32 280 | #if defined(WORDS_BIGENDIAN) 281 | anyInt32le = withAnyWord32 (pure . word32ToInt32 . byteSwap32) 282 | #else 283 | anyInt32le = anyInt32 284 | #endif 285 | {-# inline anyInt32le #-} 286 | 287 | -- | Parse any 'Int32' (big-endian). 288 | anyInt32be :: ParserT st r e Int32 289 | #if defined(WORDS_BIGENDIAN) 290 | anyInt32be = anyInt32 291 | #else 292 | anyInt32be = withAnyWord32 (pure . word32ToInt32 . byteSwap32) 293 | #endif 294 | {-# inline anyInt32be #-} 295 | 296 | -- | Parse any 'Int64' (little-endian). 297 | anyInt64le :: ParserT st r e Int64 298 | #if defined(WORDS_BIGENDIAN) 299 | anyInt64le = withAnyWord64 (pure . word64ToInt64 . byteSwap64) 300 | #else 301 | anyInt64le = anyInt64 302 | #endif 303 | {-# inline anyInt64le #-} 304 | 305 | -- | Parse any 'Int64' (big-endian). 306 | anyInt64be :: ParserT st r e Int64 307 | #if defined(WORDS_BIGENDIAN) 308 | anyInt64be = anyInt64 309 | #else 310 | anyInt64be = withAnyWord64 (pure . word64ToInt64 . byteSwap64) 311 | #endif 312 | {-# inline anyInt64be #-} 313 | 314 | -------------------------------------------------------------------------------- 315 | 316 | -- | Read the next 1 byte and assert its value as a 'Word8'. 317 | word8 :: Word8 -> ParserT st r e () 318 | word8 wExpected = ParserT \fp !r eob buf n st -> case eqAddr# eob buf of 319 | 1# -> Fail# st 320 | _ -> let w# = indexWord8OffAddr# buf 0# 321 | in if W8# w# == wExpected 322 | then OK# st () (plusAddr# buf 1#) n 323 | else Fail# st 324 | {-# inline word8 #-} 325 | 326 | -------------------------------------------------------------------------------- 327 | 328 | {- $unsafe 329 | These unsafe parsers and helpers may be useful for efficient parsing in special 330 | situations e.g. you already know that the input has enough bytes. You should 331 | only use them if you can assert their necessary guarantees (see the individual 332 | function documentation). 333 | -} 334 | 335 | -- | Unsafe helper for defining parsers for types of a constant byte size (i.e. 336 | -- machine integers) which assert the parsed value's... value. 337 | -- 338 | -- Call this with an @indexXYZOffAddr@ primop (e.g. 339 | -- 'GHC.Exts.indexWord8OffAddr'), the size in bytes of the type you're parsing, 340 | -- and the expected value to test the parsed value against. 341 | -- 342 | -- The caller must guarantee that the input has enough bytes. 343 | sizedUnsafe# :: Eq a => Int# -> (Addr# -> Int# -> a) -> a -> ParserT st r e () 344 | sizedUnsafe# size# indexOffAddr aExpected = 345 | withAnySizedUnsafe# size# indexOffAddr go 346 | where 347 | go aParsed = 348 | if aParsed == aExpected 349 | then pure () 350 | else empty 351 | {-# inline sizedUnsafe# #-} 352 | 353 | -- | Unsafely read the next 1 byte and assert its value as a 'Word8'. 354 | -- 355 | -- The caller must guarantee that the input has enough bytes. 356 | word8Unsafe :: Word8 -> ParserT st r e () 357 | word8Unsafe = sizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) 358 | {-# inline word8Unsafe #-} 359 | 360 | -- | Unsafely read the next 2 bytes and assert their value as a 'Word16' 361 | -- (native byte order). 362 | -- 363 | -- The caller must guarantee that the input has enough bytes. 364 | word16Unsafe :: Word16 -> ParserT st r e () 365 | word16Unsafe = sizedUnsafe# 2# (\a i -> W16# (indexWord16OffAddr# a i)) 366 | {-# inline word16Unsafe #-} 367 | 368 | -- | Unsafely read the next 4 bytes and assert their value as a 'Word32'. 369 | -- (native byte order). 370 | -- 371 | -- The caller must guarantee that the input has enough bytes. 372 | word32Unsafe :: Word32 -> ParserT st r e () 373 | word32Unsafe = sizedUnsafe# 4# (\a i -> W32# (indexWord32OffAddr# a i)) 374 | {-# inline word32Unsafe #-} 375 | 376 | -- | Unsafely read the next 8 bytes and assert their value as a 'Word64'. 377 | -- (native byte order). 378 | -- 379 | -- The caller must guarantee that the input has enough bytes. 380 | word64Unsafe :: Word64 -> ParserT st r e () 381 | word64Unsafe = sizedUnsafe# 8# (\a i -> W64# (indexWord64OffAddr# a i)) 382 | {-# inline word64Unsafe #-} 383 | 384 | -------------------------------------------------------------------------------- 385 | 386 | -- | Unsafely parse any 'Word8', without asserting the input is non-empty. 387 | -- 388 | -- The caller must guarantee that the input has enough bytes. 389 | anyWord8Unsafe :: ParserT st r e Word8 390 | anyWord8Unsafe = withAnySizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) pure 391 | {-# inline anyWord8Unsafe #-} 392 | -------------------------------------------------------------------------------- /src/FlatParse/Basic/Base.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE UnboxedTuples #-} 2 | 3 | -- | Basic parser building blocks. 4 | 5 | module FlatParse.Basic.Base 6 | ( 7 | -- * Bytewise 8 | eof 9 | , take 10 | , take# 11 | , takeUnsafe# 12 | , takeRest 13 | , skip 14 | , skip# 15 | , skipBack 16 | , skipBack# 17 | , atSkip# 18 | , atSkipUnsafe# 19 | 20 | -- * Combinators 21 | , branch 22 | , notFollowedBy 23 | , chainl 24 | , chainr 25 | , lookahead 26 | , ensure 27 | , ensure# 28 | , withEnsure 29 | , withEnsure1 30 | , withEnsure# 31 | , isolate 32 | , isolate# 33 | , isolateUnsafe# 34 | 35 | -- ** Non-specific (TODO) 36 | , skipMany 37 | , skipSome 38 | 39 | -- * Errors and failures 40 | , failed 41 | , try 42 | , err 43 | , withError 44 | , withAnyResult 45 | , fails 46 | , cut 47 | , cutting 48 | , optional 49 | , optional_ 50 | , withOption 51 | ) where 52 | 53 | import Prelude hiding ( take ) 54 | 55 | import FlatParse.Basic.Parser 56 | import qualified FlatParse.Common.Assorted as Common 57 | 58 | import GHC.Exts 59 | import qualified Data.ByteString as B 60 | import qualified Data.ByteString.Internal as B 61 | import GHC.ForeignPtr ( ForeignPtr(..) ) 62 | import qualified Control.Applicative 63 | 64 | -- | The failing parser. By default, parser choice `(<|>)` arbitrarily backtracks 65 | -- on parser failure. 66 | failed :: ParserT st e a 67 | failed = Control.Applicative.empty 68 | {-# inline failed #-} 69 | 70 | -- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack 71 | -- on parser error. Use `try` to convert an error to a recoverable failure. 72 | err :: e -> ParserT st e a 73 | err e = ParserT \_fp _eob _s st -> Err# st e 74 | {-# inline err #-} 75 | 76 | -- | Run the parser, if an error is thrown, handle it with the given function. 77 | withError :: ParserT st e b -> (e -> ParserT st e b) -> ParserT st e b 78 | withError (ParserT f) hdl = ParserT $ \fp eob s st -> case f fp eob s st of 79 | Err# st' e -> case hdl e of 80 | ParserT g -> g fp eob s st' 81 | x -> x 82 | {-# inline withError #-} 83 | 84 | -- | Run the parser, and handle each possible result. 85 | withAnyResult 86 | :: ParserT st t a -- ^ The parser to run. 87 | -> (a -> ParserT st e b) -- ^ The parser to run in case of success. 88 | -> ParserT st e b -- ^ The parser to run in case of failure. 89 | -> (t -> ParserT st e b) -- ^ The parser to run in case of error. 90 | -> ParserT st e b 91 | withAnyResult (ParserT first) whenSuccess (ParserT whenFailure) whenError = 92 | ParserT \fp eob n st -> 93 | case first fp eob n st of 94 | OK# st' a n' -> runParserT# (whenSuccess a) fp eob n' st' 95 | Fail# st' -> whenFailure fp eob n st' 96 | Err# st' e -> runParserT# (whenError e) fp eob n st' 97 | {-# INLINE withAnyResult #-} 98 | 99 | -- | Convert a parsing error into failure. 100 | try :: ParserT st e a -> ParserT st e a 101 | try (ParserT f) = ParserT \fp eob s st -> case f fp eob s st of 102 | Err# st' _ -> Fail# st' 103 | x -> x 104 | {-# inline try #-} 105 | 106 | -- | Convert a parsing failure to a success. 107 | fails :: ParserT st e a -> ParserT st e () 108 | fails (ParserT f) = ParserT \fp eob s st -> 109 | case f fp eob s st of 110 | OK# st' _ _ -> Fail# st' 111 | Fail# st' -> OK# st' () s 112 | Err# st' e -> Err# st' e 113 | {-# inline fails #-} 114 | 115 | -- | Convert a parsing failure to an error. 116 | cut :: ParserT st e a -> e -> ParserT st e a 117 | cut (ParserT f) e = ParserT \fp eob s st -> case f fp eob s st of 118 | Fail# st' -> Err# st' e 119 | x -> x 120 | {-# inline cut #-} 121 | 122 | -- | Run the parser, if we get a failure, throw the given error, but if we get an error, merge the 123 | -- inner and the newly given errors using the @e -> e -> e@ function. This can be useful for 124 | -- implementing parsing errors which may propagate hints or accummulate contextual information. 125 | cutting :: ParserT st e a -> e -> (e -> e -> e) -> ParserT st e a 126 | cutting (ParserT f) e merge = ParserT \fp eob s st -> case f fp eob s st of 127 | Fail# st' -> Err# st' e 128 | Err# st' e' -> Err# st' $! merge e' e 129 | x -> x 130 | {-# inline cutting #-} 131 | 132 | -- | Convert a parsing failure to a `Maybe`. If possible, use `withOption` 133 | -- instead. 134 | optional :: ParserT st e a -> ParserT st e (Maybe a) 135 | optional p = (Just <$> p) <|> pure Nothing 136 | {-# inline optional #-} 137 | 138 | -- | Convert a parsing failure to a `()`. 139 | optional_ :: ParserT st e a -> ParserT st e () 140 | optional_ p = (() <$ p) <|> pure () 141 | {-# inline optional_ #-} 142 | 143 | -- | CPS'd version of `optional`. This is usually more efficient, since it gets 144 | -- rid of the extra `Maybe` allocation. 145 | withOption :: ParserT st e a -> (a -> ParserT st e r) -> ParserT st e r -> ParserT st e r 146 | withOption (ParserT f) just (ParserT nothing) = ParserT \fp eob s st -> 147 | case f fp eob s st of 148 | OK# st' a s -> runParserT# (just a) fp eob s st' 149 | Fail# st' -> nothing fp eob s st' 150 | Err# st' e -> Err# st' e 151 | {-# inline withOption #-} 152 | 153 | -------------------------------------------------------------------------------- 154 | 155 | -- | Succeed if the input is empty. 156 | eof :: ParserT st e () 157 | eof = ParserT \fp eob s st -> case eqAddr# eob s of 158 | 1# -> OK# st () s 159 | _ -> Fail# st 160 | {-# inline eof #-} 161 | 162 | -- | Save the parsing state, then run a parser, then restore the state. 163 | lookahead :: ParserT st e a -> ParserT st e a 164 | lookahead (ParserT f) = ParserT \fp eob s st -> 165 | case f fp eob s st of 166 | OK# st' a _ -> OK# st' a s 167 | x -> x 168 | {-# inline lookahead #-} 169 | 170 | -- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. 171 | -- All isolated bytes must be consumed. 172 | -- 173 | -- Throws a runtime error if given a negative integer. 174 | isolate :: Int -> ParserT st e a -> ParserT st e a 175 | isolate = Common.withIntUnwrap# isolate# 176 | {-# inline isolate #-} 177 | 178 | -- | @isolate# n# p@ runs the parser @p@ isolated to the next @n#@ bytes. 179 | -- All isolated bytes must be consumed. 180 | -- 181 | -- Throws a runtime error if given a negative integer. 182 | isolate# :: Int# -> ParserT st e a -> ParserT st e a 183 | isolate# n# p = Common.withPosInt# n# (isolateUnsafe# n# p) 184 | {-# inline isolate# #-} 185 | 186 | -- | @isolateUnsafe# n# p@ runs the parser @p@ isolated to the next @n#@ bytes. 187 | -- All isolated bytes must be consumed. 188 | -- 189 | -- Undefined behaviour if given a negative integer. 190 | isolateUnsafe# :: Int# -> ParserT st e a -> ParserT st e a 191 | isolateUnsafe# n# (ParserT p) = 192 | withEnsure# n# $ ParserT \fp eob s st -> 193 | let s' = plusAddr# s n# 194 | in case p fp s' s st of 195 | OK# st' a s'' -> 196 | case eqAddr# s' s'' of 197 | 1# -> OK# st' a s'' 198 | _ -> Fail# st' 199 | x -> x 200 | {-# inline isolateUnsafe# #-} 201 | 202 | -- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s, 203 | -- and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not 204 | -- the usual `chainl` function from the parsec libraries! 205 | chainl :: (b -> a -> b) -> ParserT st e b -> ParserT st e a -> ParserT st e b 206 | chainl f start elem = start >>= go where 207 | go b = withOption elem (\ !a -> go $! f b a) (pure b) 208 | {-# inline chainl #-} 209 | 210 | -- | An analogue of the list `foldr` function: parse zero or more @a@-s, terminated by a @b@, and 211 | -- combine the results in a right-nested way using the @a -> b -> b@ function. Note: this is not 212 | -- the usual `chainr` function from the parsec libraries! 213 | chainr :: (a -> b -> b) -> ParserT st e a -> ParserT st e b -> ParserT st e b 214 | chainr f (ParserT elem) (ParserT end) = ParserT go where 215 | go fp eob s st = case elem fp eob s st of 216 | OK# st' a s -> case go fp eob s st' of 217 | OK# st'' b s -> let !b' = f a b in OK# st'' b' s 218 | x -> x 219 | Fail# st' -> end fp eob s st' 220 | Err# st' e -> Err# st' e 221 | {-# inline chainr #-} 222 | 223 | -- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third. 224 | -- This can produce slightly more efficient code than `(<|>)`. Moreover, `branch` does not 225 | -- backtrack from the true/false cases. 226 | branch :: ParserT st e a -> ParserT st e b -> ParserT st e b -> ParserT st e b 227 | branch pa pt pf = ParserT \fp eob s st -> case runParserT# pa fp eob s st of 228 | OK# st' _ s -> runParserT# pt fp eob s st' 229 | Fail# st' -> runParserT# pf fp eob s st' 230 | Err# st' e -> Err# st' e 231 | {-# inline branch #-} 232 | 233 | -- | Succeed if the first parser succeeds and the second one fails. 234 | notFollowedBy :: ParserT st e a -> ParserT st e b -> ParserT st e a 235 | notFollowedBy p1 p2 = p1 <* fails p2 236 | {-# inline notFollowedBy #-} 237 | 238 | -------------------------------------------------------------------------------- 239 | 240 | -- | Assert that there are at least @n@ bytes remaining. 241 | -- 242 | -- Undefined behaviour if given a negative integer. 243 | ensure :: Int -> ParserT st e () 244 | ensure = Common.withIntUnwrap# ensure# 245 | {-# inline ensure #-} 246 | 247 | -- | Assert that there are at least @n#@ bytes remaining. 248 | -- 249 | -- Undefined behaviour if given a negative integer. 250 | ensure# :: Int# -> ParserT st e () 251 | ensure# n# = withEnsure# n# (pure ()) 252 | {-# inline ensure# #-} 253 | 254 | -- TODO: András: why do we need withEnsure-s? 255 | -- There's no unboxing to be improved. 256 | 257 | -- | Assert that there are at least @n#@ bytes remaining (CPS). 258 | -- 259 | -- Undefined behaviour if given a negative integer. 260 | withEnsure :: Int -> ParserT st e r -> ParserT st e r 261 | withEnsure = Common.withIntUnwrap# withEnsure# 262 | {-# inline withEnsure #-} 263 | 264 | -- | Assert that there is at least 1 byte remaining (CPS). 265 | -- 266 | -- Undefined behaviour if given a negative integer. 267 | withEnsure1 :: ParserT st e r -> ParserT st e r 268 | withEnsure1 (ParserT p) = ParserT \fp eob s st -> 269 | case eqAddr# eob s of 270 | 0# -> p fp eob s st 271 | _ -> Fail# st 272 | {-# inline withEnsure1 #-} 273 | 274 | -- | Assert that there are at least @n#@ bytes remaining (CPS). 275 | -- 276 | -- Undefined behaviour if given a negative integer. 277 | withEnsure# :: Int# -> ParserT st e r -> ParserT st e r 278 | withEnsure# n# (ParserT p) = ParserT \fp eob s st -> 279 | case n# <=# minusAddr# eob s of 280 | 1# -> p fp eob s st 281 | _ -> Fail# st 282 | {-# inline withEnsure# #-} 283 | 284 | -------------------------------------------------------------------------------- 285 | 286 | -- | Read the given number of bytes as a 'ByteString'. 287 | -- 288 | -- Throws a runtime error if given a negative integer. 289 | -- 290 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 291 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 292 | take :: Int -> ParserT st e B.ByteString 293 | take (I# n#) = take# n# 294 | {-# inline take #-} 295 | 296 | -- | Read @n#@ bytes as a 'ByteString'. Fails if fewer than @n#@ bytes are 297 | -- available. 298 | -- 299 | -- Throws a runtime error if given a negative integer. 300 | -- 301 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 302 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 303 | take# :: Int# -> ParserT st e B.ByteString 304 | take# n# = Common.withPosInt# n# (takeUnsafe# n#) 305 | {-# inline take# #-} 306 | 307 | -- | Read @n#@ bytes as a 'ByteString'. Fails if fewer than @n#@ bytes are 308 | -- available. 309 | -- 310 | -- Undefined behaviour if given a negative integer. 311 | -- 312 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 313 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 314 | takeUnsafe# :: Int# -> ParserT st e B.ByteString 315 | takeUnsafe# n# = ParserT \fp eob s st -> 316 | case n# <=# minusAddr# eob s of 317 | 1# -> OK# st (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) 318 | _ -> Fail# st 319 | {-# inline takeUnsafe# #-} 320 | 321 | -- | Consume the rest of the input. May return the empty bytestring. 322 | -- 323 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 324 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 325 | takeRest :: ParserT st e B.ByteString 326 | takeRest = ParserT \fp eob s st -> 327 | let n# = minusAddr# eob s 328 | in OK# st (B.PS (ForeignPtr s fp) 0 (I# n#)) eob 329 | {-# inline takeRest #-} 330 | 331 | -- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. 332 | -- 333 | -- Throws a runtime error if given a negative integer. 334 | skip :: Int -> ParserT st e () 335 | skip (I# n#) = skip# n# 336 | {-# inline skip #-} 337 | 338 | -- | Skip forward @n#@ bytes. Fails if fewer than @n#@ bytes are available. 339 | -- 340 | -- Throws a runtime error if given a negative integer. 341 | skip# :: Int# -> ParserT st e () 342 | skip# n# = atSkip# n# (pure ()) 343 | {-# inline skip# #-} 344 | 345 | -- | Go back @i@ bytes in the input. Takes a positive integer. 346 | -- 347 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea. 348 | skipBack :: Int -> ParserT st e () 349 | skipBack = Common.withIntUnwrap# skipBack# 350 | {-# inline skipBack #-} 351 | 352 | -- | Go back @n#@ bytes. Takes a positive integer. 353 | -- 354 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea. 355 | skipBack# :: Int# -> ParserT st e () 356 | skipBack# n# = ParserT \fp eob s st -> 357 | OK# st () (plusAddr# s (negateInt# n#)) 358 | {-# inline skipBack# #-} 359 | 360 | -- | Skip forward @n#@ bytes and run the given parser. Fails if fewer than @n#@ 361 | -- bytes are available. 362 | -- 363 | -- Throws a runtime error if given a negative integer. 364 | atSkip# :: Int# -> ParserT st e a -> ParserT st e a 365 | atSkip# n# p = Common.withPosInt# n# (atSkipUnsafe# n# p) 366 | {-# inline atSkip# #-} 367 | 368 | -- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ 369 | -- bytes are available. 370 | -- 371 | -- Undefined behaviour if given a negative integer. 372 | atSkipUnsafe# :: Int# -> ParserT st e r -> ParserT st e r 373 | atSkipUnsafe# n# (ParserT p) = 374 | withEnsure# n# $ ParserT \fp eob s st -> 375 | p fp eob (plusAddr# s n#) st 376 | {-# inline atSkipUnsafe# #-} 377 | 378 | -------------------------------------------------------------------------------- 379 | 380 | -- | Skip a parser zero or more times. 381 | skipMany :: ParserT st e a -> ParserT st e () 382 | skipMany (ParserT f) = ParserT go where 383 | go fp eob s st = case f fp eob s st of 384 | OK# st a s -> go fp eob s st 385 | Fail# st -> OK# st () s 386 | Err# st e -> Err# st e 387 | {-# inline skipMany #-} 388 | 389 | -- TODO identical to one from parser-combinators 390 | -- | Skip a parser one or more times. 391 | skipSome :: ParserT st e a -> ParserT st e () 392 | skipSome p = p *> skipMany p 393 | {-# inline skipSome #-} 394 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful/Base.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE UnboxedTuples #-} 2 | 3 | -- | Basic parser building blocks. 4 | 5 | module FlatParse.Stateful.Base 6 | ( 7 | -- * Bytewise 8 | eof 9 | , take 10 | , take# 11 | , takeUnsafe# 12 | , takeRest 13 | , skip 14 | , skip# 15 | , skipBack 16 | , skipBack# 17 | , atSkip# 18 | , atSkipUnsafe# 19 | 20 | -- * Combinators 21 | , branch 22 | , notFollowedBy 23 | , chainl 24 | , chainr 25 | , lookahead 26 | , ensure 27 | , ensure# 28 | , withEnsure 29 | , withEnsure1 30 | , withEnsure# 31 | , isolate 32 | , isolate# 33 | , isolateUnsafe# 34 | 35 | -- ** Non-specific (TODO) 36 | , skipMany 37 | , skipSome 38 | 39 | -- * Errors and failures 40 | , failed 41 | , try 42 | , err 43 | , withError 44 | , withAnyResult 45 | , fails 46 | , cut 47 | , cutting 48 | , optional 49 | , optional_ 50 | , withOption 51 | ) where 52 | 53 | import Prelude hiding ( take ) 54 | 55 | import FlatParse.Stateful.Parser 56 | import qualified FlatParse.Common.Assorted as Common 57 | 58 | import GHC.Exts 59 | import qualified Data.ByteString as B 60 | import qualified Data.ByteString.Internal as B 61 | import GHC.ForeignPtr ( ForeignPtr(..) ) 62 | import qualified Control.Applicative 63 | 64 | -- | The failing parser. By default, parser choice `(<|>)` arbitrarily 65 | -- backtracks on parser failure. This is a synonym for `Control.Applicative.empty`. 66 | failed :: ParserT st r e a 67 | failed = Control.Applicative.empty 68 | {-# inline failed #-} 69 | 70 | -- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack 71 | -- on parser error. Use `try` to convert an error to a recoverable failure. 72 | err :: e -> ParserT st r e a 73 | err e = ParserT \_fp !_r _eob _s _n st -> Err# st e 74 | {-# inline err #-} 75 | 76 | -- | Run the parser, if an error is thrown, handle it with the given function. 77 | withError :: ParserT st r e b -> (e -> ParserT st r e b) -> ParserT st r e b 78 | withError (ParserT f) hdl = ParserT $ \fp !r eob s n st -> case f fp r eob s n st of 79 | Err# st' e -> case hdl e of 80 | ParserT g -> g fp r eob s n st' 81 | x -> x 82 | {-# inline withError #-} 83 | 84 | -- | Run the parser, and handle each possible result. 85 | withAnyResult 86 | :: ParserT st r t a -- ^ The parser to run. 87 | -> (a -> ParserT st r e b) -- ^ The parser to run in case of success. 88 | -> ParserT st r e b -- ^ The parser to run in case of failure. 89 | -> (t -> ParserT st r e b) -- ^ The parser to run in case of error. 90 | -> ParserT st r e b 91 | withAnyResult (ParserT first) whenSuccess (ParserT whenFailure) whenError = 92 | ParserT \fp !r eob s n st -> 93 | case first fp r eob s n st of 94 | OK# st' a s' n' -> runParserT# (whenSuccess a) fp r eob s' n' st' 95 | Fail# st' -> whenFailure fp r eob s n st' 96 | Err# st' e -> runParserT# (whenError e) fp r eob s n st' 97 | {-# INLINE withAnyResult #-} 98 | 99 | -- | Convert a parsing error into failure. 100 | try :: ParserT st r e a -> ParserT st r e a 101 | try (ParserT p) = ParserT \fp !r eob s n st -> case p fp r eob s n st of 102 | Err# st' _ -> Fail# st' 103 | x -> x 104 | {-# inline try #-} 105 | 106 | -- | Convert a parsing failure to a success. 107 | fails :: ParserT st r e a -> ParserT st r e () 108 | fails (ParserT p) = ParserT \fp !r eob s n st -> 109 | case p fp r eob s n st of 110 | OK# st' _ _ _ -> Fail# st' 111 | Fail# st' -> OK# st' () s n 112 | Err# st' e -> Err# st' e 113 | {-# inline fails #-} 114 | 115 | -- | Convert a parsing failure to an error. 116 | cut :: ParserT st r e a -> e -> ParserT st r e a 117 | cut (ParserT p) e = ParserT \fp !r eob s n st -> case p fp r eob s n st of 118 | Fail# st' -> Err# st' e 119 | x -> x 120 | {-# inline cut #-} 121 | 122 | -- | Run the parser, if we get a failure, throw the given error, but if we get an error, merge the 123 | -- inner and the newly given errors using the @e -> e -> e@ function. This can be useful for 124 | -- implementing parsing errors which may propagate hints or accummulate contextual information. 125 | cutting :: ParserT st r e a -> e -> (e -> e -> e) -> ParserT st r e a 126 | cutting (ParserT p) e merge = ParserT \fp !r eob s n st -> case p fp r eob s n st of 127 | Fail# st' -> Err# st' e 128 | Err# st' e' -> Err# st' $! merge e' e 129 | x -> x 130 | {-# inline cutting #-} 131 | 132 | -- | Convert a parsing failure to a `Maybe`. If possible, use `withOption` 133 | -- instead. 134 | optional :: ParserT st r e a -> ParserT st r e (Maybe a) 135 | optional p = (Just <$> p) <|> pure Nothing 136 | {-# inline optional #-} 137 | 138 | -- | Convert a parsing failure to a `()`. 139 | optional_ :: ParserT st r e a -> ParserT st r e () 140 | optional_ p = (() <$ p) <|> pure () 141 | {-# inline optional_ #-} 142 | 143 | -- | CPS'd version of `optional`. This is usually more efficient, since it gets 144 | -- rid of the extra `Maybe` allocation. 145 | withOption 146 | :: ParserT st r e a -> (a -> ParserT st r e ret) 147 | -> ParserT st r e ret -> ParserT st r e ret 148 | withOption (ParserT p) just (ParserT nothing) = ParserT \fp !r eob s n st -> 149 | case p fp r eob s n st of 150 | OK# st' a s n' -> runParserT# (just a) fp r eob s n' st' 151 | Fail# st' -> nothing fp r eob s n st' 152 | Err# st' e -> Err# st' e 153 | {-# inline withOption #-} 154 | 155 | -------------------------------------------------------------------------------- 156 | 157 | -- | Succeed if the input is empty. 158 | eof :: ParserT st r e () 159 | eof = ParserT \fp !r eob s n st -> case eqAddr# eob s of 160 | 1# -> OK# st () s n 161 | _ -> Fail# st 162 | {-# inline eof #-} 163 | 164 | -- | Save the parsing state, then run a parser, then restore the state. 165 | lookahead :: ParserT st r e a -> ParserT st r e a 166 | lookahead (ParserT p) = ParserT \fp !r eob s n st -> 167 | case p fp r eob s n st of 168 | OK# st' a _ n -> OK# st' a s n 169 | x -> x 170 | {-# inline lookahead #-} 171 | 172 | -- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. 173 | -- All isolated bytes must be consumed. 174 | -- 175 | -- Throws a runtime error if given a negative integer. 176 | isolate :: Int -> ParserT st r e a -> ParserT st r e a 177 | isolate = Common.withIntUnwrap# isolate# 178 | {-# inline isolate #-} 179 | 180 | -- | @isolate# n# p@ runs the parser @p@ isolated to the next @n#@ bytes. 181 | -- All isolated bytes must be consumed. 182 | -- 183 | -- Throws a runtime error if given a negative integer. 184 | isolate# :: Int# -> ParserT st r e a -> ParserT st r e a 185 | isolate# n# p = Common.withPosInt# n# (isolateUnsafe# n# p) 186 | {-# inline isolate# #-} 187 | 188 | -- | @isolateUnsafe# i# p@ runs the parser @p@ isolated to the next @i#@ bytes. 189 | -- All isolated bytes must be consumed. 190 | -- 191 | -- Undefined behaviour if given a negative integer. 192 | isolateUnsafe# :: Int# -> ParserT st r e ret -> ParserT st r e ret 193 | isolateUnsafe# i# (ParserT p) = 194 | withEnsure# i# $ ParserT \fp !r eob s n st -> 195 | let s' = plusAddr# s i# 196 | in case p fp r s' s n st of 197 | OK# st' a s'' n' -> 198 | case eqAddr# s' s'' of 199 | 1# -> OK# st' a s'' n' 200 | _ -> Fail# st' 201 | x -> x 202 | {-# inline isolateUnsafe# #-} 203 | 204 | -- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s, 205 | -- and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not 206 | -- the usual `chainl` function from the parsec libraries! 207 | chainl :: (b -> a -> b) -> ParserT st r e b -> ParserT st r e a -> ParserT st r e b 208 | chainl f start elem = start >>= go where 209 | go b = withOption elem (\ !a -> go $! f b a) (pure b) 210 | {-# inline chainl #-} 211 | 212 | -- | An analogue of the list `foldr` function: parse zero or more @a@-s, terminated by a @b@, and 213 | -- combine the results in a right-nested way using the @a -> b -> b@ function. Note: this is not 214 | -- the usual `chainr` function from the parsec libraries! 215 | chainr :: (a -> b -> b) -> ParserT st r e a -> ParserT st r e b -> ParserT st r e b 216 | chainr f (ParserT elem) (ParserT end) = ParserT go where 217 | go fp !r eob s n st = case elem fp r eob s n st of 218 | OK# st' a s' n' -> case go fp r eob s' n' st' of 219 | OK# st'' b s'' n'' -> let !b' = f a b in OK# st'' b' s'' n'' 220 | x -> x 221 | Fail# st' -> end fp r eob s n st' 222 | Err# st' e -> Err# st' e 223 | {-# inline chainr #-} 224 | 225 | -- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third. 226 | -- This can produce slightly more efficient code than `(<|>)`. Moreover, `ḃranch` does not 227 | -- backtrack from the true/false cases. 228 | branch :: ParserT st r e a -> ParserT st r e b -> ParserT st r e b -> ParserT st r e b 229 | branch pa pt pf = ParserT \fp !r eob s n st -> case runParserT# pa fp r eob s n st of 230 | OK# st' _ s n' -> runParserT# pt fp r eob s n' st' 231 | Fail# st' -> runParserT# pf fp r eob s n st' 232 | Err# st' e -> Err# st' e 233 | {-# inline branch #-} 234 | 235 | -- | Succeed if the first parser succeeds and the second one fails. 236 | notFollowedBy :: ParserT st r e a -> ParserT st r e b -> ParserT st r e a 237 | notFollowedBy p1 p2 = p1 <* fails p2 238 | {-# inline notFollowedBy #-} 239 | 240 | -------------------------------------------------------------------------------- 241 | 242 | -- | Assert that there are at least @n@ bytes remaining. 243 | -- 244 | -- Undefined behaviour if given a negative integer. 245 | ensure :: Int -> ParserT st r e () 246 | ensure = Common.withIntUnwrap# ensure# 247 | {-# inline ensure #-} 248 | 249 | -- | Assert that there are at least @n#@ bytes remaining. 250 | -- 251 | -- Undefined behaviour if given a negative integer. 252 | ensure# :: Int# -> ParserT st r e () 253 | ensure# n# = withEnsure# n# (pure ()) 254 | {-# inline ensure# #-} 255 | 256 | -- TODO: András: withEnsure operations seem superfluous to me? 257 | -- There's no unboxing in vanilla ensure that could be broken. 258 | 259 | -- | Assert that there are at least @n#@ bytes remaining (CPS). 260 | -- 261 | -- Undefined behaviour if given a negative integer. 262 | withEnsure :: Int -> ParserT st r e ret -> ParserT st r e ret 263 | withEnsure = Common.withIntUnwrap# withEnsure# 264 | {-# inline withEnsure #-} 265 | 266 | -- | Assert that there is at least 1 byte remaining (CPS). 267 | -- 268 | -- Undefined behaviour if given a negative integer. 269 | withEnsure1 :: ParserT st r e ret -> ParserT st r e ret 270 | withEnsure1 (ParserT p) = ParserT \fp !r eob s n st -> 271 | case eqAddr# eob s of 272 | 0# -> p fp r eob s n st 273 | _ -> Fail# st 274 | {-# inline withEnsure1 #-} 275 | 276 | -- | Assert that there are at least @n#@ bytes remaining (CPS). 277 | -- 278 | -- Undefined behaviour if given a negative integer. 279 | withEnsure# :: Int# -> ParserT st r e ret -> ParserT st r e ret 280 | withEnsure# n# (ParserT p) = ParserT \fp !r eob s n st -> 281 | case n# <=# minusAddr# eob s of 282 | 1# -> p fp r eob s n st 283 | _ -> Fail# st 284 | {-# inline withEnsure# #-} 285 | 286 | -------------------------------------------------------------------------------- 287 | 288 | -- | Read @n@ bytes as a 'ByteString'. Fails if fewer than @n@ bytes are 289 | -- available. 290 | -- 291 | -- Throws a runtime error if given a negative integer. 292 | -- 293 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 294 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 295 | take :: Int -> ParserT st r e B.ByteString 296 | take (I# n#) = take# n# 297 | {-# inline take #-} 298 | 299 | -- | Read @n#@ bytes as a 'ByteString'. Fails if fewer than @n#@ bytes are 300 | -- available. 301 | -- 302 | -- Throws a runtime error if given a negative integer. 303 | -- 304 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 305 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 306 | take# :: Int# -> ParserT st r e B.ByteString 307 | take# n# = Common.withPosInt# n# (takeUnsafe# n#) 308 | {-# inline take# #-} 309 | 310 | -- | Read @i#@ bytes as a 'ByteString'. Fails if newer than @i#@ bytes are 311 | -- available. 312 | -- 313 | -- Undefined behaviour if given a negative integer. 314 | -- 315 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 316 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 317 | takeUnsafe# :: Int# -> ParserT st r e B.ByteString 318 | takeUnsafe# i# = ParserT \fp !r eob s n st -> 319 | case i# <=# minusAddr# eob s of 320 | 1# -> OK# st (B.PS (ForeignPtr s fp) 0 (I# i#)) (plusAddr# s i#) n 321 | _ -> Fail# st 322 | {-# inline takeUnsafe# #-} 323 | 324 | -- | Consume the rest of the input. May return the empty bytestring. 325 | -- 326 | -- This does no copying. The 'B.ByteString' returned is a "slice" of the input, 327 | -- and will keep it alive. To avoid this, use 'B.copy' on the output. 328 | takeRest :: ParserT st r e B.ByteString 329 | takeRest = ParserT \fp !r eob s n st -> 330 | let i# = minusAddr# eob s 331 | in OK# st (B.PS (ForeignPtr s fp) 0 (I# i#)) eob n 332 | {-# inline takeRest #-} 333 | 334 | -- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. 335 | -- 336 | -- Throws a runtime error if given a negative integer. 337 | skip :: Int -> ParserT st r e () 338 | skip (I# n#) = skip# n# 339 | {-# inline skip #-} 340 | 341 | -- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. 342 | -- 343 | -- Throws a runtime error if given a negative integer. 344 | skip# :: Int# -> ParserT st r e () 345 | skip# n# = atSkip# n# (pure ()) 346 | {-# inline skip# #-} 347 | 348 | -- | Go back @i@ bytes in the input. Takes a positive integer. 349 | -- 350 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea. 351 | skipBack :: Int -> ParserT st r e () 352 | skipBack = Common.withIntUnwrap# skipBack# 353 | {-# inline skipBack #-} 354 | 355 | -- | Go back @i#@ bytes in the input. Takes a positive integer. 356 | -- 357 | -- Extremely unsafe. Makes no checks. Almost certainly a Bad Idea. 358 | skipBack# :: Int# -> ParserT st r e () 359 | skipBack# i# = ParserT \fp !r eob s n st -> 360 | OK# st () (plusAddr# s (negateInt# i#)) n 361 | {-# inline skipBack# #-} 362 | 363 | -- | Skip forward @n#@ bytes and run the given parser. Fails if fewer than @n#@ 364 | -- bytes are available. 365 | -- 366 | -- Throws a runtime error if given a negative integer. 367 | atSkip# :: Int# -> ParserT st r e ret -> ParserT st r e ret 368 | atSkip# n# p = Common.withPosInt# n# (atSkipUnsafe# n# p) 369 | {-# inline atSkip# #-} 370 | 371 | -- | Skip forward @i#@ bytes and run the given parser. Fails if fewer than @i@ 372 | -- bytes are available. 373 | -- 374 | -- Undefined behaviour if given a negative integer. 375 | atSkipUnsafe# :: Int# -> ParserT st r e ret -> ParserT st r e ret 376 | atSkipUnsafe# i# (ParserT p) = 377 | withEnsure# i# $ ParserT \fp !r eob s n st -> 378 | p fp r eob (plusAddr# s i#) n st 379 | {-# inline atSkipUnsafe# #-} 380 | 381 | -------------------------------------------------------------------------------- 382 | 383 | -- | Skip a parser zero or more times. 384 | skipMany :: ParserT st r e a -> ParserT st r e () 385 | skipMany (ParserT f) = ParserT go where 386 | go fp !r eob s n st = case f fp r eob s n st of 387 | OK# st a s n -> go fp r eob s n st 388 | Fail# st -> OK# st () s n 389 | Err# st e -> Err# st e 390 | {-# inline skipMany #-} 391 | 392 | -- TODO identical to one from parser-combinators 393 | -- | Skip a parser one or more times. 394 | skipSome :: ParserT st r e a -> ParserT st r e () 395 | skipSome p = p *> skipMany p 396 | {-# inline skipSome #-} 397 | -------------------------------------------------------------------------------- /src/FlatParse/Stateful.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE UnboxedTuples #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE KindSignatures #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE ScopedTypeVariables #-} 7 | 8 | {-| 9 | Parser supporting a custom reader environment, custom error types and an 'Int' 10 | state. A common use case of the `Int` state is to keep track of column numbers 11 | to implement indentation-sensitive parsers. 12 | -} 13 | 14 | module FlatParse.Stateful ( 15 | 16 | -- * Parser types 17 | FP.Parser.ParserT(..) 18 | , FP.Parser.Parser, FP.Parser.ParserIO, FP.Parser.ParserST 19 | 20 | -- * Running parsers 21 | , Result(..) 22 | , runParser 23 | , runParserUtf8 24 | , runParserIO 25 | , runParserST 26 | , embedParserST 27 | , embedParser 28 | 29 | -- ** Primitive result types 30 | , type FP.Parser.Res# 31 | , pattern FP.Parser.OK#, pattern FP.Parser.Err#, pattern FP.Parser.Fail# 32 | , type FP.Parser.ResI# 33 | 34 | -- * Embedding `ST` operations 35 | , liftST 36 | 37 | -- * Environment operations 38 | , ask 39 | , local 40 | 41 | -- * State operations 42 | , get 43 | , put 44 | , modify 45 | 46 | -- * UTF conversion 47 | , Common.strToUtf8 48 | , Common.utf8ToStr 49 | 50 | -- * Character predicates 51 | , Common.isDigit 52 | , Common.isLatinLetter 53 | , Common.isGreekLetter 54 | 55 | -- * Parsers 56 | -- ** Bytewise 57 | , FP.Base.eof 58 | , FP.Base.take 59 | , FP.Base.take# 60 | , FP.Base.takeUnsafe# 61 | , FP.Base.takeRest 62 | , FP.Base.skip 63 | , FP.Base.skip# 64 | , FP.Base.skipBack 65 | , FP.Base.skipBack# 66 | , FP.Base.atSkip# 67 | , FP.Base.atSkipUnsafe# 68 | 69 | , FP.Bytes.bytes 70 | , FP.Bytes.bytesUnsafe 71 | , byteString 72 | , anyCString 73 | , anyVarintProtobuf 74 | 75 | -- ** Combinators 76 | , (FP.Parser.<|>) 77 | , FP.Base.branch 78 | , FP.Base.notFollowedBy 79 | , FP.Base.chainl 80 | , FP.Base.chainr 81 | , FP.Base.lookahead 82 | , FP.Base.ensure 83 | , FP.Base.ensure# 84 | , FP.Base.withEnsure 85 | , FP.Base.withEnsure1 86 | , FP.Base.withEnsure# 87 | , FP.Base.isolate 88 | , isolateToNextNull 89 | , FP.Base.isolate# 90 | , FP.Base.isolateUnsafe# 91 | , FP.Switch.switch 92 | , FP.Switch.switchWithPost 93 | , FP.Switch.rawSwitchWithPost 94 | , Control.Applicative.many 95 | , FP.Base.skipMany 96 | , Control.Applicative.some 97 | , FP.Base.skipSome 98 | 99 | -- ** Errors and failures 100 | , Control.Applicative.empty 101 | , FP.Base.failed 102 | , FP.Base.try 103 | , FP.Base.err 104 | , FP.Base.withError 105 | , FP.Base.withAnyResult 106 | , FP.Base.fails 107 | , FP.Base.cut 108 | , FP.Base.cutting 109 | , FP.Base.optional 110 | , FP.Base.optional_ 111 | , FP.Base.withOption 112 | 113 | -- ** Positions 114 | , FlatParse.Common.Position.Pos(..) 115 | , FlatParse.Common.Position.endPos 116 | , FlatParse.Common.Position.addrToPos# 117 | , FlatParse.Common.Position.posToAddr# 118 | , FlatParse.Common.Position.Span(..) 119 | , FlatParse.Common.Position.unsafeSlice 120 | , getPos 121 | , setPos 122 | , spanOf 123 | , withSpan 124 | , byteStringOf 125 | , withByteString 126 | , inSpan 127 | , Basic.validPos 128 | , Basic.posLineCols 129 | , Basic.mkPos 130 | 131 | -- ** Text 132 | -- *** UTF-8 133 | , FP.Text.char, FP.Text.string 134 | , FP.Text.anyChar, FP.Text.skipAnyChar 135 | , FP.Text.satisfy, FP.Text.skipSatisfy 136 | , FP.Text.fusedSatisfy, FP.Text.skipFusedSatisfy 137 | , FP.Text.takeLine 138 | , FP.Text.takeRestString 139 | , Basic.linesUtf8 140 | 141 | -- *** ASCII 142 | , FP.Text.anyAsciiChar, FP.Text.skipAnyAsciiChar 143 | , FP.Text.satisfyAscii, FP.Text.skipSatisfyAscii 144 | 145 | -- *** ASCII-encoded numbers 146 | , FP.Text.anyAsciiDecimalWord 147 | , FP.Text.anyAsciiDecimalInt 148 | , FP.Text.anyAsciiDecimalInteger 149 | , FP.Text.anyAsciiHexWord 150 | , FP.Text.anyAsciiHexInt 151 | 152 | -- ** Machine integers 153 | , module FP.Integers 154 | 155 | -- ** Debugging parsers 156 | , FP.Text.traceLine 157 | , FP.Text.traceRest 158 | 159 | -- * Unsafe 160 | , unsafeSpanToByteString 161 | 162 | -- ** IO 163 | , unsafeLiftIO 164 | 165 | -- ** Parsers 166 | , module FP.Addr 167 | , anyCStringUnsafe 168 | 169 | ) where 170 | 171 | -- for WORDS_BIGENDIAN 172 | #include "MachDeps.h" 173 | 174 | import qualified FlatParse.Basic as Basic 175 | import FlatParse.Stateful.Parser 176 | import FlatParse.Stateful.Base 177 | import FlatParse.Stateful.Integers 178 | import FlatParse.Stateful.Addr 179 | import FlatParse.Common.Position 180 | import qualified FlatParse.Common.Assorted as Common 181 | import qualified FlatParse.Common.Numbers as Common 182 | 183 | import qualified FlatParse.Stateful.Parser as FP.Parser 184 | import qualified FlatParse.Stateful.Base as FP.Base 185 | import qualified FlatParse.Stateful.Integers as FP.Integers 186 | import qualified FlatParse.Stateful.Bytes as FP.Bytes 187 | import qualified FlatParse.Stateful.Text as FP.Text 188 | import qualified FlatParse.Stateful.Switch as FP.Switch 189 | import qualified FlatParse.Stateful.Addr as FP.Addr 190 | 191 | import qualified Control.Applicative 192 | import GHC.IO (IO(..), unsafeIOToST) 193 | import GHC.Int 194 | import GHC.Exts 195 | import GHC.ForeignPtr 196 | import GHC.ST (ST(..)) 197 | import System.IO.Unsafe 198 | 199 | import qualified Data.ByteString as B 200 | import qualified Data.ByteString.Internal as B 201 | import qualified Data.ByteString.Unsafe as B 202 | 203 | -------------------------------------------------------------------------------- 204 | 205 | -- | Higher-level boxed data type for parsing results. 206 | data Result e a = 207 | OK a Int !(B.ByteString) -- ^ Contains return value, last `Int` state, unconsumed input. 208 | | Fail -- ^ Recoverable-by-default failure. 209 | | Err !e -- ^ Unrecoverble-by-default error. 210 | deriving Show 211 | 212 | instance Functor (Result e) where 213 | fmap f (OK a s n) = let !b = f a in OK b s n 214 | fmap f r = unsafeCoerce# r 215 | {-# inline fmap #-} 216 | (<$) a (OK _ s n) = OK a s n 217 | (<$) _ r = unsafeCoerce# r 218 | {-# inline (<$) #-} 219 | 220 | -- | Embed an IO action in a 'ParserT'. This is slightly safer than 'unsafePerformIO' because 221 | -- it will sequenced correctly with respect to the surrounding actions, and its execution is guaranteed. 222 | unsafeLiftIO :: IO a -> ParserT st r e a 223 | unsafeLiftIO io = ParserT \fp !r eob s n st -> 224 | let !a = unsafePerformIO io 225 | in OK# st a s n 226 | {-# inline unsafeLiftIO #-} 227 | 228 | -------------------------------------------------------------------------------- 229 | 230 | -- | Run a pure parser. The `Int` argument is the initial state. 231 | runParser :: Parser r e a -> r -> Int -> B.ByteString -> Result e a 232 | runParser (ParserT f) !r (I# n) b@(B.PS (ForeignPtr _ fp) _ (I# len)) = unsafeDupablePerformIO $ 233 | B.unsafeUseAsCString b \(Ptr buf) -> do 234 | let end = plusAddr# buf len 235 | pure case f fp r end buf n proxy# of 236 | OK# _st a s n' -> let offset = minusAddr# s buf 237 | in OK a (I# n') (B.drop (I# offset) b) 238 | 239 | Err# _st e -> Err e 240 | Fail# _st -> Fail 241 | {-# noinline runParser #-} 242 | -- We mark this as noinline to allow power users to safely do unsafe state token coercions. 243 | -- Details are discussed in https://github.com/AndrasKovacs/flatparse/pull/34#issuecomment-1326999390 244 | 245 | -- | Run a parser on a 'String', converting it to the corresponding UTF-8 bytes. 246 | -- The `Int` argument is the initial state. 247 | -- 248 | -- Reminder: @OverloadedStrings@ for 'B.ByteString' does not yield a valid UTF-8 249 | -- encoding! For non-ASCII 'B.ByteString' literal input, use this wrapper or 250 | -- convert your input using `strToUtf8`. 251 | runParserUtf8 :: Parser r e a -> r -> Int -> String -> Result e a 252 | runParserUtf8 pa r !n s = runParser pa r n (Common.strToUtf8 s) 253 | 254 | -- | Run an `ST`-based parser. The `Int` argument is the initial state. 255 | runParserST :: ParserST s r e a -> r -> Int -> B.ByteString -> ST s (Result e a) 256 | runParserST pst !r i buf = unsafeIOToST (runParserIO (unsafeCoerce# pst) r i buf) 257 | {-# inlinable runParserST #-} 258 | 259 | -- | Run an `IO`-based parser. The `Int` argument is the initial state. 260 | runParserIO :: ParserIO r e a -> r -> Int -> B.ByteString -> IO (Result e a) 261 | runParserIO (ParserT f) !r (I# n) b@(B.PS (ForeignPtr _ fp) _ (I# len)) = do 262 | B.unsafeUseAsCString b \(Ptr buf) -> do 263 | let end = plusAddr# buf len 264 | IO \st -> case f fp r end buf n st of 265 | OK# rw' a s n' -> let offset = minusAddr# s buf 266 | in (# rw', OK a (I# n') (B.drop (I# offset) b) #) 267 | 268 | Err# rw' e -> (# rw', Err e #) 269 | Fail# rw' -> (# rw', Fail #) 270 | {-# inlinable runParserIO #-} 271 | 272 | -- | Run a `ParserST` inside any parser. 273 | embedParserST :: forall s r e a. (forall s. ParserST s r e a) -> ParserT s r e a 274 | embedParserST f = unsafeCoerce# (f :: ParserST RealWorld r e a) 275 | {-# inline embedParserST #-} 276 | 277 | -- | Run a pure `Parser` inside any parser. 278 | embedParser :: forall s r e a. Parser r e a -> ParserT s r e a 279 | embedParser f = unsafeCoerce# f 280 | {-# inline embedParser #-} 281 | 282 | -------------------------------------------------------------------------------- 283 | 284 | -- | Run an `ST` action in a `ParserST`. 285 | liftST :: ST s a -> ParserST s r e a 286 | liftST (ST f) = ParserT \fp !r eob s n st -> case f st of 287 | (# st, a #) -> OK# st a s n 288 | {-# inline liftST #-} 289 | 290 | -------------------------------------------------------------------------------- 291 | 292 | -- | Query the `Int` state. 293 | get :: ParserT st r e Int 294 | get = ParserT \fp !r eob s n st -> OK# st (I# n) s n 295 | {-# inline get #-} 296 | 297 | -- | Write the `Int` state. 298 | put :: Int -> ParserT st r e () 299 | put (I# n) = ParserT \fp !r eob s _ st -> OK# st () s n 300 | {-# inline put #-} 301 | 302 | -- | Modify the `Int` state. 303 | modify :: (Int -> Int) -> ParserT st r e () 304 | modify f = ParserT \fp !r eob s n st -> 305 | case f (I# n) of 306 | I# n -> OK# st () s n 307 | {-# inline modify #-} 308 | 309 | -- | Query the environment. 310 | ask :: ParserT st r e r 311 | ask = ParserT \fp !r eob s n st -> OK# st r s n 312 | {-# inline ask #-} 313 | 314 | -- | Run a parser in a modified environment. 315 | local :: (r -> r) -> ParserT st r e a -> ParserT st r e a 316 | local f (ParserT g) = ParserT \fp !r eob s n st -> let !r' = f r in g fp r' eob s n st 317 | {-# inline local #-} 318 | 319 | -------------------------------------------------------------------------------- 320 | 321 | -- | Parse a given `B.ByteString`. If the bytestring is statically known, consider using 'bytes' instead. 322 | byteString :: B.ByteString -> ParserT st r e () 323 | byteString (B.PS (ForeignPtr bs fcontent) _ (I# len)) = 324 | 325 | let go64 :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> Res# (State# RealWorld) e () 326 | go64 bs bsend s n rw = 327 | let bs' = plusAddr# bs 8# in 328 | case gtAddr# bs' bsend of 329 | 1# -> go8 bs bsend s n rw 330 | #if MIN_VERSION_base(4,17,0) 331 | _ -> case eqWord64# (indexWord64OffAddr# bs 0#) (indexWord64OffAddr# s 0#) of 332 | #else 333 | _ -> case eqWord# (indexWord64OffAddr# bs 0#) (indexWord64OffAddr# s 0#) of 334 | #endif 335 | 1# -> go64 bs' bsend (plusAddr# s 8#) n rw 336 | _ -> Fail# rw 337 | 338 | go8 :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> Res# (State# RealWorld) e () 339 | go8 bs bsend s n rw = case ltAddr# bs bsend of 340 | #if MIN_VERSION_base(4,16,0) 341 | 1# -> case eqWord8# (indexWord8OffAddr# bs 0#) (indexWord8OffAddr# s 0#) of 342 | #else 343 | 1# -> case eqWord# (indexWord8OffAddr# bs 0#) (indexWord8OffAddr# s 0#) of 344 | #endif 345 | 1# -> go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) n rw 346 | _ -> Fail# rw 347 | _ -> OK# rw () s n 348 | 349 | go :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> Res# (State# RealWorld) e () 350 | go bs bsend s n rw = case go64 bs bsend s n rw of 351 | (# rw', res #) -> case touch# fcontent rw' of 352 | rw'' -> (# rw'', res #) 353 | 354 | in ParserT \fp !r eob s n st -> 355 | case len <=# minusAddr# eob s of 356 | 1# -> case runRW# (go bs (plusAddr# bs len) s n) of 357 | (# rw, a #) -> (# st, a #) 358 | _ -> Fail# st 359 | {-# inline byteString #-} 360 | 361 | -------------------------------------------------------------------------------- 362 | 363 | -- | Get the current position in the input. 364 | getPos :: ParserT st r e Pos 365 | getPos = ParserT \fp !r eob s n st -> OK# st (addrToPos# eob s) s n 366 | {-# inline getPos #-} 367 | 368 | -- | Set the input position. 369 | -- 370 | -- Warning: this can result in crashes if the position points outside the 371 | -- current buffer. It is always safe to 'setPos' values which came from 'getPos' 372 | -- with the current input. 373 | setPos :: Pos -> ParserT st r e () 374 | setPos s = ParserT \fp !r eob _ n st -> OK# st () (posToAddr# eob s) n 375 | {-# inline setPos #-} 376 | 377 | -- | Return the consumed span of a parser. Use `withSpan` if possible for better efficiency. 378 | spanOf :: ParserT st r e a -> ParserT st r e Span 379 | spanOf (ParserT f) = ParserT \fp !r eob s n st -> case f fp r eob s n st of 380 | OK# st' a s' n -> OK# st' (Span (addrToPos# eob s) (addrToPos# eob s')) s' n 381 | x -> unsafeCoerce# x 382 | {-# inline spanOf #-} 383 | 384 | -- | Bind the result together with the span of the result. CPS'd version of `spanOf` 385 | -- for better unboxing. 386 | withSpan :: ParserT st r e a -> (a -> Span -> ParserT st r e b) -> ParserT st r e b 387 | withSpan (ParserT f) g = ParserT \fp !r eob s n st -> case f fp r eob s n st of 388 | OK# st' a s' n -> runParserT# (g a (Span (addrToPos# eob s) (addrToPos# eob s'))) fp r eob s' n st' 389 | x -> unsafeCoerce# x 390 | {-# inline withSpan #-} 391 | 392 | -- | Return the `B.ByteString` consumed by a parser. Note: it's more efficient to use `spanOf` and 393 | -- `withSpan` instead. 394 | byteStringOf :: ParserT st r e a -> ParserT st r e B.ByteString 395 | byteStringOf (ParserT f) = ParserT \fp !r eob s n st -> case f fp r eob s n st of 396 | OK# st' a s' n -> OK# st' (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) s' n 397 | x -> unsafeCoerce# x 398 | {-# inline byteStringOf #-} 399 | 400 | -- | CPS'd version of `byteStringOf`. Can be more efficient, because the result is more eagerly unboxed 401 | -- by GHC. It's more efficient to use `spanOf` or `withSpan` instead. 402 | withByteString :: ParserT st r e a -> (a -> B.ByteString -> ParserT st r e b) -> ParserT st r e b 403 | withByteString (ParserT f) g = ParserT \fp !r eob s n st -> case f fp r eob s n st of 404 | OK# st' a s' n -> runParserT# (g a (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s)))) fp r eob s' n st' 405 | x -> unsafeCoerce# x 406 | {-# inline withByteString #-} 407 | 408 | -- | Run a parser in a given input 'Span'. 409 | -- 410 | -- The input position and the parser state is restored after the parser is 411 | -- finished, so 'inSpan' does not consume input and has no side effect. 412 | -- 413 | -- Warning: this operation may crash if the given span points outside the 414 | -- current parsing buffer. It's always safe to use 'inSpan' if the 'Span' comes 415 | -- from a previous 'withSpan' or 'spanOf' call on the current input. 416 | inSpan :: Span -> ParserT st r e a -> ParserT st r e a 417 | inSpan (Span s eob) (ParserT f) = ParserT \fp !r eob' s' n' st -> 418 | case f fp r (posToAddr# eob' eob) (posToAddr# eob' s) n' st of 419 | OK# st' a _ _ -> OK# st' a s' n' 420 | x -> unsafeCoerce# x 421 | {-# inline inSpan #-} 422 | 423 | -------------------------------------------------------------------------------- 424 | 425 | -- | Create a 'B.ByteString' from a 'Span'. 426 | -- 427 | -- The result is invalid if the 'Span' points outside the current buffer, or if 428 | -- the 'Span' start is greater than the end position. 429 | unsafeSpanToByteString :: Span -> ParserT st r e B.ByteString 430 | unsafeSpanToByteString (Span l r) = 431 | lookahead (setPos l >> byteStringOf (setPos r)) 432 | {-# inline unsafeSpanToByteString #-} 433 | 434 | -------------------------------------------------------------------------------- 435 | 436 | -- | Isolate the given parser up to (excluding) the next null byte. 437 | -- 438 | -- Like 'isolate', all isolated bytes must be consumed. The null byte is 439 | -- consumed afterwards. 440 | -- 441 | -- Useful for defining parsers for null-terminated data. 442 | isolateToNextNull :: ParserT st r e a -> ParserT st r e a 443 | isolateToNextNull (ParserT p) = ParserT \fp !r eob s n st -> go fp r n eob s st s 444 | where 445 | goP fp r n sNull s0 st = 446 | case p fp r sNull s0 n st of 447 | OK# st' a s' n' -> 448 | case eqAddr# s' sNull of 449 | 1# -> -- consumed up to null: skip null, return 450 | OK# st' a (sNull `plusAddr#` 1#) n' 451 | _ -> Fail# st' -- didn't consume fully up to null: fail 452 | x -> x 453 | 454 | go8 fp r n eob s0 st s = 455 | case eqAddr# eob s of 456 | 1# -> Fail# st -- end of input, no null: fail 457 | _ -> 458 | let s' = s `plusAddr#` 1# in 459 | #if MIN_VERSION_base(4,16,0) 460 | -- below may be made clearer with ExtendedLiterals (GHC 9.8) 461 | case eqWord8# (indexWord8OffAddr# s 0#) (wordToWord8# 0##) of 462 | #else 463 | case eqWord# (indexWord8OffAddr# s 0#) 0## of 464 | #endif 465 | 1# -> goP fp r n s s0 st -- 0x00: isolate, execute parser 466 | _ -> go8 fp r n eob s0 st s' -- not 0x00: next please! 467 | 468 | {- The "find first null byte" algorithms used here are adapted from 469 | Hacker's Delight (2012) ch.6. 470 | 471 | We read a word (8 bytes) at a time for efficiency. The internal algorithm 472 | does byte indexing, thus endianness matters. We switch between indexing 473 | algorithms depending on compile-time native endianness. (The code 474 | surrounding the indexing is endian-independent, so we do this inline). 475 | -} 476 | go fp r n eob s0 st s = 477 | let sWord = s `plusAddr#` 8# in 478 | case gtAddr# sWord eob of 479 | 1# -> -- < 8 bytes of input: revert to scanning byte by byte 480 | -- we _could_ operate on a word and simply ensure not to use the 481 | -- out-of-bounds data, which would be faster, but the act of 482 | -- reading could probably segfault 483 | go8 fp r n eob s0 st s 484 | _ -> -- >= 8 bytes of input: use efficient 8-byte scanning 485 | #if defined(WORDS_BIGENDIAN) 486 | -- big-endian ("L->R"): find leftmost null byte 487 | let !x@(I64# x#) = Common.zbytel'intermediate (I64# (indexInt64OffAddr# s 0#)) in 488 | #else 489 | -- little-endian ("R->L"): find rightmost null byte 490 | let !x@(I64# x#) = Common.zbyter'intermediate (I64# (indexInt64OffAddr# s 0#)) in 491 | #endif 492 | #if MIN_VERSION_base(4,17,0) 493 | case eqInt64# x# (intToInt64# 0#) of 494 | #else 495 | case x# ==# 0# of 496 | #endif 497 | 1# -> go fp r n eob s0 st sWord -- no 0x00 in next word 498 | _ -> -- 0x00 somewhere in next word 499 | #if defined(WORDS_BIGENDIAN) 500 | let !(I# nullIdx#) = Common.zbytel'toIdx x in 501 | #else 502 | let !(I# nullIdx#) = Common.zbyter'toIdx x in 503 | -- TO TEST BE ON LE: change above CPP to zbytel, uncomment below 504 | -- let !(I# nullIdx#) = Common.zbytel'toIdx (I# (word2Int# (byteSwap# (int2Word# x#)))) in 505 | #endif 506 | let sNull = s `plusAddr#` nullIdx# in 507 | goP fp r n sNull s0 st 508 | {-# inline isolateToNextNull #-} 509 | 510 | -- | Read a null-terminated bytestring (a C-style string). 511 | -- 512 | -- Consumes the null terminator. 513 | anyCString :: ParserT st r e B.ByteString 514 | anyCString = isolateToNextNull takeRest 515 | {-# inline anyCString #-} 516 | 517 | -- | Read a null-terminated bytestring (a C-style string), where the bytestring 518 | -- is known to be null-terminated somewhere in the input. 519 | -- 520 | -- Highly unsafe. Unless you have a guarantee that the string will be null 521 | -- terminated before the input ends, use 'anyCString' instead. Honestly, I'm not 522 | -- sure if this is a good function to define. But here it is. 523 | -- 524 | -- Fails on GHC versions older than 9.0, since we make use of the 525 | -- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful 526 | -- without it. 527 | -- 528 | -- Consumes the null terminator. 529 | anyCStringUnsafe :: ParserT st r e B.ByteString 530 | {-# inline anyCStringUnsafe #-} 531 | #if MIN_VERSION_base(4,15,0) 532 | anyCStringUnsafe = ParserT \fp !r eob s n st -> 533 | case eqAddr# eob s of 534 | 1# -> Fail# st 535 | _ -> let n# = cstringLength# s 536 | s'# = plusAddr# s (n# +# 1#) 537 | in OK# st (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# n 538 | #else 539 | anyCStringUnsafe = error "Flatparse.Stateful.anyCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" 540 | #endif 541 | 542 | -- | Read a protobuf-style varint into a positive 'Int'. 543 | -- 544 | -- protobuf-style varints are byte-aligned. For each byte, the lower 7 bits are 545 | -- data and the MSB indicates if there are further bytes. Once fully parsed, the 546 | -- 7-bit payloads are concatenated and interpreted as a little-endian unsigned 547 | -- integer. 548 | -- 549 | -- Fails if the varint exceeds the positive 'Int' range. 550 | -- 551 | -- Really, these are varnats. They also match with the LEB128 varint encoding. 552 | -- 553 | -- protobuf encodes negatives in unsigned integers using zigzag encoding. See 554 | -- the @fromZigzag@ family of functions for this functionality. 555 | -- 556 | -- Further reading: 557 | -- https://developers.google.com/protocol-buffers/docs/encoding#varints 558 | anyVarintProtobuf :: ParserT st r e Int 559 | anyVarintProtobuf = ParserT \fp !r eob s n st -> 560 | case Common.anyVarintProtobuf# eob s of 561 | (# (##) | #) -> Fail# st 562 | (# | (# w#, s#, bits# #) #) -> 563 | case bits# ># (WORD_SIZE_IN_BITS# -# 1#) of 564 | 0# -> OK# st (I# w#) s# n 565 | _ -> Fail# st -- overflow 566 | {-# inline anyVarintProtobuf #-} 567 | --------------------------------------------------------------------------------