├── Setup.hs ├── .gitignore ├── Text ├── Earley │ ├── Generator.hs │ ├── Parser.hs │ ├── Derived.hs │ ├── Mixfix.hs │ ├── Grammar.hs │ ├── Generator │ │ └── Internal.hs │ └── Parser │ │ └── Internal.hs └── Earley.hs ├── tests ├── Arbitrary.hs ├── Generator.hs ├── Main.hs ├── InlineAlts.hs ├── ReversedWords.hs ├── VeryAmbiguous.hs ├── Empty.hs ├── Constraint.hs ├── Issue14.hs ├── Mixfix.hs ├── UnbalancedPars.hs ├── Optional.hs ├── Issue11.hs ├── Lambda.hs └── Expr.hs ├── examples ├── VeryAmbiguous.hs ├── Infinite.hs ├── Words.hs ├── Expr.hs ├── Expr2.hs ├── RomanNumerals.hs ├── English.hs └── Mixfix.hs ├── experiments ├── LICENSE ├── CHANGELOG.md ├── bench └── BenchAll.hs ├── Earley.cabal ├── README.md └── docs └── implementation.md /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hi 2 | *.o 3 | *.aux 4 | *.hp 5 | *.prof 6 | dist/ 7 | .cabal-sandbox 8 | cabal.sandbox.config 9 | dist-newstyle/ 10 | .stack-work 11 | -------------------------------------------------------------------------------- /Text/Earley/Generator.hs: -------------------------------------------------------------------------------- 1 | module Text.Earley.Generator 2 | ( Result(..) 3 | , Generator 4 | , generator 5 | , language 6 | , upTo 7 | , exactly 8 | ) where 9 | import Text.Earley.Generator.Internal 10 | 11 | -------------------------------------------------------------------------------- /Text/Earley/Parser.hs: -------------------------------------------------------------------------------- 1 | -- | Parsing. 2 | module Text.Earley.Parser 3 | ( Report(..) 4 | , Result(..) 5 | , Parser 6 | , parser 7 | , allParses 8 | , fullParses 9 | , report 10 | ) where 11 | import Text.Earley.Parser.Internal 12 | -------------------------------------------------------------------------------- /tests/Arbitrary.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes #-} 2 | module Arbitrary where 3 | 4 | import qualified Test.QuickCheck as QC 5 | 6 | import Text.Earley.Generator 7 | 8 | -- | Generate an arbitrary member generated by a 'Generator'. 9 | arbitrary :: Generator t a -> QC.Gen (a, [t]) 10 | arbitrary gen = QC.sized $ \n -> QC.elements (take (1 `max` n) xs) 11 | where 12 | xs = language gen 13 | 14 | -------------------------------------------------------------------------------- /examples/VeryAmbiguous.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | import Control.Applicative 3 | import System.Environment 4 | import Text.Earley 5 | 6 | g :: Grammar r (Prod r Char Char ()) 7 | g = mdo 8 | s <- rule $ () <$ token 'b' 9 | <|> () <$ s <* s 10 | <|> () <$ s <* s <* s 11 | 's' 12 | return s 13 | 14 | main :: IO () 15 | main = do 16 | xs:_ <- getArgs 17 | print $ report (parser g) xs 18 | -------------------------------------------------------------------------------- /experiments: -------------------------------------------------------------------------------- 1 | 2x slower with HashTable instead of Map for conts and next 2 | 3x faster without ReaderT for ParserState 3 | 6% faster without lens dependency 4 | 10% faster with inlining 5 | 45% faster with homemade, specialised, pipes 6 | 55% faster with CPS transform 7 | 16% faster without using Map of positions (only list) 8 | slower but uses less memory with vector 9 | 23% faster after refactoring 10 | 18% faster without strict fmap f Pure 11 | 30% faster with simpler driver 12 | 13% faster without parser monad 13 | -------------------------------------------------------------------------------- /tests/Generator.hs: -------------------------------------------------------------------------------- 1 | module Generator where 2 | import Control.Applicative 3 | import Test.Tasty 4 | import Test.Tasty.HUnit as HU 5 | 6 | import Text.Earley 7 | 8 | tests :: TestTree 9 | tests = testGroup "Lambda" 10 | [ HU.testCase "Generate exactly 0" $ 11 | exactly 0 (generator (pure $ pure ()) "") @?= [((), [])] 12 | , HU.testCase "Generate upTo 0" $ 13 | upTo 0 (generator (pure $ pure ()) "") @?= [((), [])] 14 | , HU.testCase "Generate exactly 1" $ 15 | exactly 1 (generator (pure $ pure ()) "") @?= [] 16 | , HU.testCase "Generate upTo 1" $ 17 | upTo 1 (generator (pure $ pure ()) "") @?= [((), [])] 18 | ] 19 | -------------------------------------------------------------------------------- /Text/Earley.hs: -------------------------------------------------------------------------------- 1 | -- | Parsing all context-free grammars using Earley's algorithm. 2 | module Text.Earley 3 | ( -- * Context-free grammars 4 | Prod, terminal, (), constraint, Grammar, rule 5 | , -- * Derived operators 6 | satisfy, token, namedToken, anyToken, list, listLike, matches 7 | , -- * Parsing 8 | Report(..), Parser.Result(..), Parser, parser, allParses, fullParses 9 | , -- * Recognition 10 | report 11 | , -- * Language generation 12 | Generator, generator, language, upTo, exactly 13 | ) 14 | where 15 | import Text.Earley.Derived 16 | import Text.Earley.Generator 17 | import Text.Earley.Grammar 18 | import Text.Earley.Parser as Parser 19 | -------------------------------------------------------------------------------- /examples/Infinite.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | import Control.Applicative 3 | import Text.Earley 4 | 5 | grammar :: Grammar r (Prod r () Char [Maybe Char]) 6 | grammar = mdo 7 | as <- rule $ pure [] 8 | <|> (:) <$> optional (token 'a') <*> as 9 | return as 10 | 11 | -- This grammar has an infinite number of results. We can still recognise the 12 | -- language, i.e. get a report, but we can't get the results, because in doing 13 | -- so the library will try to force a circular value. 14 | main :: IO () 15 | main = do 16 | let input = "aaa" 17 | print $ report (parser grammar) input -- Works 18 | print $ fullParses (parser grammar) input -- Hangs 19 | -------------------------------------------------------------------------------- /examples/Words.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | import Data.Char 3 | import Control.Applicative 4 | import System.Environment 5 | 6 | import Text.Earley 7 | 8 | grammar :: Grammar r (Prod r String Char [String]) 9 | grammar = mdo 10 | whitespace <- rule $ () <$ many (satisfy isSpace) 11 | whitespace1 <- rule $ () <$ satisfy isSpace <* whitespace "whitespace" 12 | 13 | ident <- rule 14 | $ (:) <$> satisfy isAlpha <*> many (satisfy isAlphaNum) 15 | "identifier" 16 | 17 | expr <- rule 18 | $ (:) <$> ident <* whitespace1 <*> expr 19 | <|> (:[]) <$> ident <* whitespace 20 | 21 | return expr 22 | 23 | main :: IO () 24 | main = do 25 | x:_ <- getArgs 26 | print $ fullParses (parser grammar) x 27 | -------------------------------------------------------------------------------- /tests/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | import Test.Tasty 3 | 4 | import qualified Constraint 5 | import qualified Empty 6 | import qualified Expr 7 | import qualified Generator 8 | import qualified InlineAlts 9 | import qualified Issue11 10 | import qualified Issue14 11 | import qualified Lambda 12 | import qualified Mixfix 13 | import qualified Optional 14 | import qualified ReversedWords 15 | import qualified UnbalancedPars 16 | import qualified VeryAmbiguous 17 | 18 | main :: IO () 19 | main = defaultMain $ testGroup "Tests" 20 | [ Constraint.tests 21 | , Empty.tests 22 | , Expr.tests 23 | , Generator.tests 24 | , InlineAlts.tests 25 | , Issue11.tests 26 | , Issue14.tests 27 | , Lambda.tests 28 | , Mixfix.tests 29 | , Optional.tests 30 | , ReversedWords.tests 31 | , UnbalancedPars.tests 32 | , VeryAmbiguous.tests 33 | ] 34 | -------------------------------------------------------------------------------- /examples/Expr.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | import Control.Applicative 3 | import Data.Char 4 | import System.Environment 5 | import Text.Earley 6 | 7 | data Expr 8 | = Add Expr Expr 9 | | Mul Expr Expr 10 | | Var String 11 | deriving (Eq, Ord, Show) 12 | 13 | expr :: Grammar r (Prod r String String Expr) 14 | expr = mdo 15 | x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2 16 | <|> x2 17 | "sum" 18 | x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3 19 | <|> x3 20 | "product" 21 | x3 <- rule $ Var <$> (satisfy ident "identifier") 22 | <|> namedToken "(" *> x1 <* namedToken ")" 23 | return x1 24 | where 25 | ident (x:_) = isAlpha x 26 | ident _ = False 27 | 28 | main :: IO () 29 | main = do 30 | x:_ <- getArgs 31 | print $ fullParses (parser expr) $ words x 32 | -------------------------------------------------------------------------------- /tests/InlineAlts.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo, ScopedTypeVariables #-} 2 | module InlineAlts where 3 | import Control.Applicative 4 | import Test.Tasty 5 | import Test.Tasty.HUnit as HU 6 | 7 | import Text.Earley 8 | 9 | tests :: TestTree 10 | tests = testGroup "Inline alternatives" 11 | [ HU.testCase "They work when parsed" $ 12 | let input = "ababbbaaabaa" in 13 | allParses (parser inlineAlts) input @?= allParses (parser nonInlineAlts) input 14 | , HU.testCase "They work when generated" $ 15 | take 1000 (language $ generator inlineAlts "ab") @?= 16 | take 1000 (language $ generator nonInlineAlts "ab") 17 | ] 18 | 19 | inlineAlts :: Grammar r (Prod r Char Char String) 20 | inlineAlts = mdo 21 | p <- rule $ pure [] 22 | <|> (:) <$> (namedToken 'a' <|> namedToken 'b') <*> p 23 | return p 24 | 25 | nonInlineAlts :: Grammar r (Prod r Char Char String) 26 | nonInlineAlts = mdo 27 | ab <- rule $ namedToken 'a' <|> namedToken 'b' 28 | p <- rule $ pure [] <|> (:) <$> ab <*> p 29 | return p 30 | -------------------------------------------------------------------------------- /tests/ReversedWords.hs: -------------------------------------------------------------------------------- 1 | module ReversedWords where 2 | import Control.Applicative 3 | import Test.Tasty 4 | import Test.Tasty.HUnit as HU 5 | 6 | import Text.Earley 7 | 8 | someWords :: Grammar r (Prod r () Char [String]) 9 | someWords = return $ flip (:) <$> (map reverse <$> some (list "word")) <*> list "stop" 10 | 11 | tests :: TestTree 12 | tests = testGroup "Reversed words" 13 | [ HU.testCase "Parse" $ 14 | let input = "wordwordstop" 15 | l = length input in 16 | allParses (parser someWords) input 17 | @?= (,) [(["stop", "drow", "drow"], l)] Report { position = l 18 | , expected = [] 19 | , unconsumed = [] 20 | } 21 | , HU.testCase "Generate" $ 22 | upTo 16 (generator someWords "stopwrd") 23 | @?= 24 | [ (["stop", "drow"], "wordstop") 25 | , (["stop", "drow", "drow"], "wordwordstop") 26 | , (["stop","drow","drow","drow"],"wordwordwordstop") 27 | ] 28 | ] 29 | -------------------------------------------------------------------------------- /tests/VeryAmbiguous.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo, ScopedTypeVariables #-} 2 | module VeryAmbiguous where 3 | import Control.Applicative 4 | import Test.Tasty 5 | import Test.Tasty.HUnit as HU 6 | 7 | import Text.Earley 8 | 9 | tests :: TestTree 10 | tests = testGroup "Very ambiguous" 11 | [ HU.testCase "Gives the right number of results" $ 12 | length (fst $ fullParses (parser veryAmbiguous) $ replicate 8 'b') @?= 2871 13 | , HU.testCase "Gives the correct report" $ 14 | report (parser veryAmbiguous) (replicate 3 'b') @?= 15 | Report {position = 3, expected = "s", unconsumed = ""} 16 | , HU.testCase "Parser agrees with generator" $ and (do 17 | n <- [0..8] 18 | let str = replicate n 'b' 19 | numParses = length (fst $ fullParses (parser veryAmbiguous) str) 20 | numGens = length $ exactly n $ generator veryAmbiguous "b" 21 | return $ numParses == numGens) 22 | @? "Parser agrees with generator" 23 | ] 24 | 25 | veryAmbiguous :: Grammar r (Prod r Char Char ()) 26 | veryAmbiguous = mdo 27 | s <- rule $ () <$ token 'b' 28 | <|> () <$ s <* s 29 | <|> () <$ s <* s <* s 30 | 's' 31 | return s 32 | -------------------------------------------------------------------------------- /examples/Expr2.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables, RecursiveDo #-} 2 | import Data.Char 3 | import System.Environment 4 | import Control.Applicative 5 | import Text.Earley 6 | 7 | data Expr 8 | = Expr :+: Expr 9 | | Expr :*: Expr 10 | | Var String 11 | | Lit Int 12 | deriving (Show) 13 | 14 | grammar :: forall r. Grammar r (Prod r String Char Expr) 15 | grammar = mdo 16 | 17 | whitespace <- rule $ many $ satisfy isSpace 18 | 19 | let tok :: Prod r String Char a -> Prod r String Char a 20 | tok p = whitespace *> p 21 | 22 | sym x = tok $ token x [x] 23 | 24 | ident = tok $ (:) <$> satisfy isAlpha <*> many (satisfy isAlphaNum) "identifier" 25 | num = tok $ some (satisfy isDigit) "number" 26 | 27 | expr0 <- rule 28 | $ (Lit . read) <$> num 29 | <|> Var <$> ident 30 | <|> sym '(' *> expr2 <* sym ')' 31 | 32 | expr1 <- rule 33 | $ (:*:) <$> expr1 <* sym '*' <*> expr0 34 | <|> expr0 35 | 36 | expr2 <- rule 37 | $ (:+:) <$> expr2 <* sym '+' <*> expr1 38 | <|> expr1 39 | 40 | return $ expr2 <* whitespace 41 | 42 | main :: IO () 43 | main = do 44 | x:_ <- getArgs 45 | print $ fullParses (parser grammar) x 46 | -------------------------------------------------------------------------------- /tests/Empty.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | module Empty where 3 | import Control.Applicative 4 | import Test.Tasty 5 | import Test.Tasty.HUnit as HU 6 | import Test.Tasty.QuickCheck as QC 7 | 8 | import Text.Earley 9 | 10 | tests :: TestTree 11 | tests = testGroup "Empty productions" 12 | [ QC.testProperty "The empty production doesn't parse anything" $ 13 | \(input :: String) -> 14 | allParses (parser emptyGrammar) input 15 | == (,) [] Report { position = 0 16 | , expected = [] 17 | , unconsumed = input 18 | } 19 | , HU.testCase "The empty production doesn't generate anything" $ 20 | language (generator emptyGrammar "abc") @?= [] 21 | , QC.testProperty "Many empty productions parse very little" $ 22 | \(input :: String) -> 23 | allParses (parser manyEmpty) input 24 | == (,) [([], 0)] Report { position = 0 25 | , expected = [] 26 | , unconsumed = input 27 | } 28 | , HU.testCase "Many empty productions generate very little" $ 29 | language (generator manyEmpty "blahc") @?= [([], "")] 30 | ] 31 | 32 | emptyGrammar :: Grammar r (Prod r () Char ()) 33 | emptyGrammar = return empty 34 | 35 | manyEmpty :: Grammar r (Prod r () Char [()]) 36 | manyEmpty = return $ many empty <* pure "blah" 37 | -------------------------------------------------------------------------------- /examples/RomanNumerals.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | module Main where 3 | 4 | import Control.Applicative ((<|>), (<**>)) 5 | import System.Environment (getArgs) 6 | import Text.Earley 7 | 8 | numeral :: String -> Int -> Prod r String Char Int 9 | numeral str n = n <$ list str 10 | 11 | romanNumeralsGrammar :: Grammar r (Prod r String Char Int) 12 | romanNumeralsGrammar = mdo 13 | 14 | thousands <- rule 15 | $ pure 0 16 | <|> numeral "M" 1000 <**> fmap (+) thousands 17 | 18 | le300 <- rule 19 | $ pure 0 20 | <|> numeral "C" 100 21 | <|> numeral "CC" 200 22 | <|> numeral "CCC" 300 23 | 24 | hundreds <- rule 25 | $ le300 26 | <|> numeral "CD" 400 27 | <|> numeral "D" 500 <**> fmap (+) le300 28 | <|> numeral "CM" 900 29 | 30 | le30 <- rule 31 | $ pure 0 32 | <|> numeral "X" 10 33 | <|> numeral "XX" 20 34 | <|> numeral "XXX" 30 35 | 36 | tens <- rule 37 | $ le30 38 | <|> numeral "XL" 40 39 | <|> numeral "L" 50 <**> fmap (+) le30 40 | <|> numeral "XC" 90 41 | 42 | le3 <- rule 43 | $ pure 0 44 | <|> numeral "I" 1 45 | <|> numeral "II" 2 46 | <|> numeral "III" 3 47 | 48 | units <- rule 49 | $ le3 50 | <|> numeral "IV" 4 51 | <|> numeral "V" 5 <**> fmap (+) le3 52 | <|> numeral "IX" 9 53 | 54 | return 55 | $ thousands 56 | <**> fmap (+) hundreds 57 | <**> fmap (+) tens 58 | <**> fmap (+) units 59 | 60 | 61 | main :: IO () 62 | main = do 63 | x:_ <- getArgs 64 | print $ fullParses (parser romanNumeralsGrammar) x 65 | -------------------------------------------------------------------------------- /Text/Earley/Derived.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE Rank2Types #-} 2 | -- | Derived operators. 3 | module Text.Earley.Derived where 4 | import Control.Applicative hiding (many) 5 | import Control.Monad (guard) 6 | import Data.ListLike(ListLike) 7 | import qualified Data.ListLike as ListLike 8 | 9 | import Text.Earley.Grammar 10 | import Text.Earley.Parser 11 | 12 | -- | Match a token that satisfies the given predicate. Returns the matched 13 | -- token. See also 'terminal'. 14 | {-# INLINE satisfy #-} 15 | satisfy :: (t -> Bool) -> Prod r e t t 16 | satisfy p = terminal ((<$) <*> guard . p) 17 | 18 | -- | Match a single token. 19 | token :: Eq t => t -> Prod r e t t 20 | token x = satisfy (== x) 21 | 22 | -- | Match a single token and give it the name of the token. 23 | namedToken :: Eq t => t -> Prod r t t t 24 | namedToken x = token x x 25 | 26 | -- | Match a single token with any value 27 | anyToken :: Prod r e t t 28 | anyToken = terminal Just 29 | 30 | -- | Match a list of tokens in sequence. 31 | {-# INLINE list #-} 32 | list :: Eq t => [t] -> Prod r e t [t] 33 | list = listLike 34 | 35 | -- | Match a 'ListLike' of tokens in sequence. 36 | {-# INLINE listLike #-} 37 | listLike :: (Eq t, ListLike i t) => i -> Prod r e t i 38 | listLike = ListLike.foldr (liftA2 ListLike.cons . satisfy . (==)) (pure ListLike.empty) 39 | 40 | -- | Whether or not the grammar matches the input string. Equivalently, 41 | -- whether the given input is in the language described by the grammars. 42 | matches :: ListLike i t => (forall r. Grammar r (Prod r e t a)) -> i -> Bool 43 | matches grammar = not . null . fst . fullParses (parser grammar) 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2019, Olle Fredriksson 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Olle Fredriksson nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /examples/English.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | import Control.Applicative 3 | import Text.Earley 4 | import qualified Data.HashSet as HS 5 | 6 | type Noun = String 7 | type Verb = String 8 | type Adjective = String 9 | 10 | nouns, verbs, adjectives :: HS.HashSet String 11 | nouns = HS.fromList ["parsers", "sentences", "grammars"] 12 | verbs = HS.fromList ["parse", "munch", "annihilate", "confuse", "use"] 13 | adjectives = HS.fromList ["many", "great", "long", "confusing"] 14 | 15 | 16 | data Sentence = Sentence NounPhrase VerbPhrase 17 | deriving Show 18 | data NounPhrase = NounPhrase Adjective NounPhrase 19 | | Noun Noun 20 | deriving Show 21 | data VerbPhrase = VerbPhrase Verb NounPhrase 22 | | Verb Verb 23 | deriving Show 24 | 25 | sentence :: Grammar r (Prod r String String Sentence) 26 | sentence = mdo 27 | noun <- rule $ satisfy (`HS.member` nouns) "noun" 28 | verb <- rule $ satisfy (`HS.member` verbs) "verb" 29 | adjective <- rule $ satisfy (`HS.member` adjectives) "adjective" 30 | nounPhrase <- rule $ NounPhrase <$> adjective <*> nounPhrase 31 | <|> Noun <$> noun 32 | "noun phrase" 33 | verbPhrase <- rule $ VerbPhrase <$> verb <*> nounPhrase 34 | <|> Verb <$> verb 35 | "verb phrase" 36 | return $ Sentence <$> nounPhrase <*> verbPhrase "sentence" 37 | 38 | main :: IO () 39 | main = do 40 | let p = fullParses (parser sentence) . words 41 | print $ p "parsers use grammars" 42 | print $ p "parsers munch long sentences" 43 | print $ p "many great sentences confuse parsers" 44 | print $ p "parsers use use" 45 | print $ p "grammars many great confusing" 46 | -------------------------------------------------------------------------------- /tests/Constraint.hs: -------------------------------------------------------------------------------- 1 | module Constraint where 2 | import Control.Applicative 3 | import Data.List 4 | import Data.Set(fromList) 5 | 6 | import Test.Tasty 7 | import Test.Tasty.HUnit as HU 8 | 9 | import Text.Earley 10 | 11 | oneToken :: Grammar r (Prod r () t t) 12 | oneToken = rule anyToken 13 | 14 | someTokens :: Grammar r (Prod r () t [t]) 15 | someTokens = rule (some anyToken) 16 | 17 | tests :: TestTree 18 | tests = testGroup "New features" 19 | [ HU.testCase "anyToken1" $ 20 | let input = "hello" 21 | l = length input in 22 | allParses (parser oneToken) input 23 | @?= (,) [('h', 1)] Report { position = 1 24 | , expected = [] 25 | , unconsumed = drop 1 input 26 | } 27 | , HU.testCase "anyToken2" $ 28 | allParses (parser oneToken) "" 29 | @?= (,) [] Report { position = 0 30 | , expected = [] 31 | , unconsumed = "" 32 | } 33 | , HU.testCase "anyToken3" $ 34 | let input = "hello" 35 | l = length input in 36 | allParses (parser someTokens) input 37 | @?= (,) [(init, length init) | init <- inits input, not (null init) ] 38 | Report { position = l 39 | , expected = [] 40 | , unconsumed = [] 41 | } 42 | , HU.testCase "constraint" $ 43 | matches noRepeats "salut" 44 | @?= True 45 | , HU.testCase "constraint2" $ 46 | matches noRepeats "hello" 47 | @?= False 48 | , HU.testCase "constraint3" $ 49 | fromList (map fst $ exactly 2 $ generator noRepeats "abcd") 50 | @?= fromList [[x, y] | x <- "abcd", y <- "abcd", x /= y] 51 | ] 52 | 53 | noRepeats = rule $ 54 | constraint (\x -> length x == length (fromList x)) $ 55 | many anyToken 56 | -------------------------------------------------------------------------------- /tests/Issue14.hs: -------------------------------------------------------------------------------- 1 | module Issue14 where 2 | import Control.Applicative 3 | import Test.Tasty 4 | import Test.Tasty.QuickCheck as QC 5 | 6 | import Text.Earley 7 | 8 | tests :: TestTree 9 | tests = testGroup "Issue 14" 10 | [ QC.testProperty "The same rule in alternatives gives many results" $ 11 | \x -> fullParses (parser (issue14 x)) "" 12 | == (,) (replicate (issue14Length x) ()) 13 | Report { position = 0, expected = [], unconsumed = [] } 14 | , QC.testProperty "The same rule in alternatives generates many results" $ 15 | \x -> language (generator (issue14 x) "") 16 | == replicate (issue14Length x) ((), "") 17 | ] 18 | 19 | data Issue14 a 20 | = Pure a 21 | | Alt (Issue14 a) (Issue14 a) 22 | | Ap (Issue14 a) (Issue14 a) 23 | deriving (Eq, Ord, Show) 24 | 25 | instance Arbitrary a => Arbitrary (Issue14 a) where 26 | arbitrary = sized arbTree 27 | where arbTree n | n > 0 = oneof [ Pure <$> arbitrary 28 | , Alt <$> arbTree1 <*> arbTree1 29 | , Ap <$> arbTree1 <*> arbTree1 30 | ] 31 | where arbTree1 = arbTree (n `div` 2) 32 | arbTree _ = Pure <$> arbitrary 33 | 34 | shrink (Pure a) = Pure <$> shrink a 35 | shrink (Alt a b) = a : b : [Alt a' b | a' <- shrink a] ++ [Alt a b' | b' <- shrink b] 36 | shrink (Ap a b) = a : b : [Ap a' b | a' <- shrink a] ++ [Ap a b' | b' <- shrink b] 37 | 38 | issue14Length :: Issue14 () -> Int 39 | issue14Length (Pure ()) = 1 40 | issue14Length (Alt a b) = issue14Length a + issue14Length b 41 | issue14Length (Ap a b) = issue14Length a * issue14Length b 42 | 43 | issue14 :: Issue14 () -> Grammar r (Prod r () Char ()) 44 | issue14 tree = do 45 | emptyRule <- rule $ pure () 46 | let x = go emptyRule tree 47 | return x 48 | where 49 | go x (Pure ()) = x 50 | go x (Alt b1 b2) = go x b1 <|> go x b2 51 | go x (Ap b1 b2) = go x b1 <* go x b2 52 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Unreleased 2 | 3 | - Add `satisfyMaybe` for matching tokens with a predicate that returns a `Maybe` result 4 | 5 | # 0.13.0.1 6 | 7 | - Add a missing test module to the Cabal file 8 | 9 | # 0.13.0.0 10 | 11 | - Remove the previously deprecated operators `symbol`, `namedSymbol`, and `word` 12 | - Change `Prod`'s `Monoid` and `Semigroup` instances to lift their element instances instead of being the same as the `Alternative` instance 13 | - Add unbalanced parentheses/EOF test 14 | 15 | # 0.12.1.0 16 | 17 | - GHC 8.4.1 support 18 | - Update 'base' dependency bounds 19 | - Add `Semigroup` instance to the `Prod` type 20 | 21 | # 0.12.0.1 22 | 23 | - Update 'base' dependency bounds 24 | 25 | # 0.12.0.0 26 | 27 | - Add the `Generator` module for generating grammar members 28 | - Change (simplify) the type returned by `parser`, introducing a `Parser` type synonym for it, and change the signature of `allParses`, `fullParses`, and `report` to accept a `Parser` 29 | - The `Text.Earley.Internal` module is now `Text.Earley.Parser.Internal` 30 | 31 | # 0.11.0.1 32 | 33 | - Add missing modules to Cabal file 34 | 35 | # 0.11.0.0 36 | 37 | - Add `IsString Prod` instance 38 | - Change the signature of `Terminal` to take a function `a -> Maybe b`, and add a new operator `terminal` 39 | - Move `satisfy` to the `Derived` module 40 | - Add the `token`, `namedToken`, and `list` operators 41 | - Deprecate the `symbol`, `namedSymbol`, and `word` operators (use the above instead) 42 | - Add the `listLike` operator 43 | 44 | # 0.10.1.0 45 | 46 | - Fix bug concerning nullable rules (#14) 47 | - Add `runGrammar` 48 | 49 | # 0.10.0.1 50 | 51 | - Add changelog 52 | 53 | # 0.10 54 | 55 | - Remove `Args`, and use `Results` instead 56 | - Make `parser` function not take input directly 57 | - Remove redundant type parameter to `Grammar` 58 | 59 | # 0.9 60 | 61 | - Optimise handling of nullable non-terminals 62 | - Pass a record of arguments in the parse routine 63 | - Add support for consecutive mixfix holes 64 | -------------------------------------------------------------------------------- /tests/Mixfix.hs: -------------------------------------------------------------------------------- 1 | module Mixfix where 2 | import Control.Applicative 3 | import Test.Tasty 4 | import Test.Tasty.HUnit as HU 5 | 6 | import Text.Earley 7 | import Text.Earley.Mixfix 8 | 9 | tests :: TestTree 10 | tests = testGroup "Mixfix" 11 | [ HU.testCase "1" $ 12 | let x = Ident [Just "x"] in 13 | fullParses (parser mixfixGrammar) (words "if x then x else x") 14 | @?= (,) [App ifthenelse [x, x, x]] Report {position = 6, expected = [], unconsumed = []} 15 | , HU.testCase "2" $ 16 | let x = Ident [Just "x"] in 17 | fullParses (parser mixfixGrammar) (words "prefix x postfix") 18 | @?= (,) [App prefix [App postfix [x]]] Report {position = 3, expected = [], unconsumed = []} 19 | , HU.testCase "3" $ 20 | let x = Ident [Just "x"] in 21 | fullParses (parser mixfixGrammar) (words "x infix1 x infix2 x") 22 | @?= (,) [App infix1 [x, App infix2 [x, x]]] Report {position = 5, expected = [], unconsumed = []} 23 | , HU.testCase "4" $ 24 | let x = Ident [Just "x"] in 25 | fullParses (parser mixfixGrammar) (words "[ x ]") 26 | @?= (,) [App closed [x]] Report {position = 3, expected = [], unconsumed = []} 27 | ] 28 | 29 | data MixfixExpr = Ident (Holey String) | App (Holey String) [MixfixExpr] 30 | deriving (Eq, Show) 31 | 32 | mixfixGrammar :: Grammar r (Prod r String String MixfixExpr) 33 | mixfixGrammar = mixfixExpression table 34 | (Ident . pure . Just <$> namedToken "x") 35 | App 36 | where 37 | hident = map (fmap token) 38 | table = 39 | [ [(hident ifthenelse, RightAssoc)] 40 | , [(hident prefix, RightAssoc)] 41 | , [(hident postfix, LeftAssoc)] 42 | , [(hident infix1, LeftAssoc)] 43 | , [(hident infix2, RightAssoc)] 44 | , [(hident closed, NonAssoc)] 45 | ] 46 | 47 | ifthenelse, prefix, postfix, infix1, infix2, closed :: Holey String 48 | ifthenelse = [Just "if", Nothing, Just "then", Nothing, Just "else", Nothing] 49 | prefix = [Just "prefix", Nothing] 50 | postfix = [Nothing, Just "postfix"] 51 | infix1 = [Nothing, Just "infix1", Nothing] 52 | infix2 = [Nothing, Just "infix2", Nothing] 53 | closed = [Just "[", Nothing, Just "]"] 54 | -------------------------------------------------------------------------------- /tests/UnbalancedPars.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts, RankNTypes, RecursiveDo, ScopedTypeVariables #-} 2 | module UnbalancedPars where 3 | 4 | import Data.Char (isAlpha) 5 | 6 | import Control.Applicative 7 | import Test.Tasty 8 | import Test.Tasty.HUnit as HU 9 | 10 | import Text.Earley 11 | 12 | tests :: TestTree 13 | tests = testGroup "Unbalanced parentheses" 14 | [ HU.testCase "Parses balanced" $ 15 | fst (fullParses' unbalancedPars 16 | "((x))") @?= [(b . b) x] 17 | , HU.testCase "Parses one unbalanced" $ 18 | fst (fullParses' unbalancedPars 19 | "((x)") @?= [(u . b) x] 20 | , HU.testCase "Parses two unbalanced" $ 21 | fst (fullParses' unbalancedPars 22 | "((x") @?= [(u . u) x] 23 | ] 24 | where 25 | -- [b]alanced 26 | b :: Expr -> Expr 27 | b e = ExprInBrackets "(" e ")" 28 | 29 | -- [u]nbalanced 30 | u :: Expr -> Expr 31 | u e = ExprInBrackets "(" e "" 32 | 33 | -- [x] variable 34 | x :: Expr 35 | x = Var 'x' 36 | 37 | data Token = EOF | Char !Char 38 | deriving (Eq, Ord, Show) 39 | 40 | fullParses' 41 | :: (forall r. Grammar r (Prod r e Token a)) 42 | -> String 43 | -> ([a], Report e String) 44 | fullParses' g s = 45 | let (res, rep) = allParses (parser $ (<* eof) <$> g) $ fmap Char s ++ repeat EOF 46 | in 47 | ( fst <$> res 48 | , rep { unconsumed = go $ unconsumed rep } 49 | ) 50 | where 51 | go (Char c:xs) = c : go xs 52 | go _ = [] 53 | 54 | data Expr = 55 | Var Char | ExprInBrackets String Expr String 56 | deriving (Eq, Ord, Show) 57 | 58 | eof :: Prod r e Token Token 59 | eof = token EOF 60 | 61 | leftPar :: Prod r e Token String 62 | leftPar = "(" <$ token (Char '(') 63 | 64 | rightPar :: Prod r e Token String 65 | rightPar = ")" <$ token (Char ')') 66 | 67 | var :: Prod r e Token Expr 68 | var = terminal $ \t -> case t of 69 | Char c | isAlpha c -> Just $ Var c 70 | _ -> Nothing 71 | 72 | unbalancedPars :: Grammar r (Prod r String Token Expr) 73 | unbalancedPars = mdo 74 | expr <- rule $ var <|> exprInBrackets 75 | exprInBrackets <- rule $ 76 | ExprInBrackets 77 | <$> leftPar 78 | <*> expr 79 | <*> (rightPar <|> ("" <$ eof)) 80 | "parenthesized expression" 81 | return expr 82 | -------------------------------------------------------------------------------- /examples/Mixfix.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, RecursiveDo #-} 2 | import Control.Applicative 3 | import Control.Arrow(first) 4 | import Data.Maybe 5 | #if !MIN_VERSION_base(4,8,0) 6 | import Data.Monoid 7 | #endif 8 | import System.Environment 9 | import Text.Earley 10 | import Text.Earley.Mixfix 11 | import qualified Data.HashSet as HS 12 | 13 | holey :: String -> Holey String 14 | holey "" = [] 15 | holey ('_':xs) = Nothing : holey xs 16 | holey xs = Just i : holey rest 17 | where (i, rest) = span (/= '_') xs 18 | 19 | data Expr = V (Holey String) | App Expr [Expr] 20 | deriving Show 21 | 22 | identTable :: [[(Holey String, Associativity)]] 23 | identTable = (map . map) (first holey) 24 | [ [("_->_", RightAssoc)] 25 | , [("_,_", NonAssoc)] 26 | , [("if_then_else_", RightAssoc)] 27 | , [("_|-_:_", NonAssoc)] 28 | , [("_+_", LeftAssoc)] 29 | , [("_*_", LeftAssoc)] 30 | ] 31 | 32 | grammar :: Grammar r (Prod r String String Expr) 33 | grammar = mdo 34 | ident <- rule $ (V . pure . Just) <$> satisfy (not . (`HS.member` mixfixParts)) 35 | "identifier" 36 | atom <- rule $ ident 37 | <|> namedToken "(" *> expr <* namedToken ")" 38 | normalApp <- rule $ atom 39 | <|> App <$> atom <*> some atom 40 | expr <- mixfixExpression table normalApp (App . V) 41 | return expr 42 | where 43 | table = map (map $ first $ map $ fmap namedToken) identTable 44 | mixfixParts = HS.fromList [s | xs <- identTable , (ys, _) <- xs 45 | , Just s <- ys] 46 | `mappend` HS.fromList ["(", ")"] 47 | 48 | pretty :: Expr -> String 49 | pretty (V ps) = concatMap (fromMaybe "_") ps 50 | pretty (App e es) = "(" ++ pretty e ++ " " ++ unwords (map pretty es) ++ ")" 51 | 52 | tokenize :: String -> [String] 53 | tokenize "" = [] 54 | tokenize (' ':xs) = tokenize xs 55 | tokenize ('\n':xs) = tokenize xs 56 | tokenize (x:xs) 57 | | x `HS.member` special = [x] : tokenize xs 58 | | otherwise = (x:as) : tokenize bs 59 | where 60 | (as, bs) = break (`HS.member` special) xs 61 | special = HS.fromList "(), \n" 62 | 63 | main :: IO () 64 | main = do 65 | x:_ <- getArgs 66 | print $ first (map pretty) $ fullParses (parser grammar) $ tokenize x 67 | -------------------------------------------------------------------------------- /tests/Optional.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo, ScopedTypeVariables #-} 2 | module Optional where 3 | import Control.Applicative 4 | import Test.Tasty 5 | import Test.Tasty.HUnit as HU 6 | 7 | import Text.Earley 8 | 9 | tests :: TestTree 10 | tests = testGroup "Optional" 11 | [ HU.testCase "Nothing" $ 12 | fullParses (parser $ return optional_) "b" 13 | @?= (,) [(Nothing, 'b')] Report {position = 1, expected = "", unconsumed = ""} 14 | , HU.testCase "Just" $ 15 | fullParses (parser $ return optional_) "ab" 16 | @?= (,) [(Just 'a', 'b')] Report {position = 2, expected = "", unconsumed = ""} 17 | , HU.testCase "Using rules Nothing" $ 18 | fullParses (parser optionalRule) "b" 19 | @?= (,) [(Nothing, 'b')] Report {position = 1, expected = "", unconsumed = ""} 20 | , HU.testCase "Using rules Just" $ 21 | fullParses (parser optionalRule) "ab" 22 | @?= (,) [(Just 'a', 'b')] Report {position = 2, expected = "", unconsumed = ""} 23 | , HU.testCase "Without continuation Nothing" $ 24 | fullParses (parser $ return $ optional $ namedToken 'a') "" 25 | @?= (,) [Nothing] Report {position = 0, expected = "a", unconsumed = ""} 26 | , HU.testCase "Without continuation Just" $ 27 | fullParses (parser $ return $ optional $ namedToken 'a') "a" 28 | @?= (,) [Just 'a'] Report {position = 1, expected = "", unconsumed = ""} 29 | , HU.testCase "Using rules without continuation Nothing" $ 30 | fullParses (parser $ rule $ optional $ namedToken 'a') "" 31 | @?= (,) [Nothing] Report {position = 0, expected = "a", unconsumed = ""} 32 | , HU.testCase "Using rules without continuation Just" $ 33 | fullParses (parser $ rule $ optional $ namedToken 'a') "a" 34 | @?= (,) [Just 'a'] Report {position = 1, expected = "", unconsumed = ""} 35 | , HU.testCase "Generate optional" $ 36 | language (generator (return optional_) "ab") 37 | @?= [((Nothing, 'b'), "b"), ((Just 'a', 'b'), "ab")] 38 | , HU.testCase "Generate optional using rules" $ 39 | language (generator optionalRule "ab") 40 | @?= [((Nothing, 'b'), "b"), ((Just 'a', 'b'), "ab")] 41 | , HU.testCase "Generate optional without continuation" $ 42 | language (generator (return $ optional $ namedToken 'a') "ab") 43 | @?= [(Nothing, ""), (Just 'a', "a")] 44 | , HU.testCase "Generate optional using rules without continuation" $ 45 | language (generator (rule $ optional $ namedToken 'a') "ab") 46 | @?= [(Nothing, ""), (Just 'a', "a")] 47 | ] 48 | 49 | optional_ :: Prod r Char Char (Maybe Char, Char) 50 | optional_ = (,) <$> optional (namedToken 'a') <*> namedToken 'b' 51 | 52 | optionalRule :: Grammar r (Prod r Char Char (Maybe Char, Char)) 53 | optionalRule = mdo 54 | test <- rule $ (,) <$> optional (namedToken 'a') <*> namedToken 'b' 55 | return test 56 | -------------------------------------------------------------------------------- /tests/Issue11.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | module Issue11 where 3 | import Control.Applicative 4 | import Test.Tasty 5 | import Test.Tasty.HUnit as HU 6 | 7 | import Text.Earley 8 | import Text.Earley.Mixfix 9 | 10 | tests :: TestTree 11 | tests = testGroup "Issue 11" 12 | [ let x = words "+ + 5 6 7" in 13 | HU.testCase "1" $ 14 | fullParses (parser $ grammar LeftAssoc) x 15 | @?= (,) [] Report {position = 1, expected = [], unconsumed = drop 1 x} 16 | , let x = words "+ 5 + 6 7" in 17 | HU.testCase "2" $ 18 | fullParses (parser $ grammar LeftAssoc) x 19 | @?= (,) [] Report {position = 2, expected = [], unconsumed = drop 2 x} 20 | , let x = words "+ 5 6" in 21 | HU.testCase "3" $ 22 | fullParses (parser $ grammar LeftAssoc) x 23 | @?= (,) [Plus (Var "5") (Var "6")] 24 | Report {position = 3, expected = [], unconsumed = []} 25 | , let x = words "+ + 5 6 7" in 26 | HU.testCase "4" $ 27 | fullParses (parser $ grammar RightAssoc) x 28 | @?= (,) [Plus (Plus (Var "5") (Var "6")) (Var "7")] 29 | Report {position = 5, expected = [], unconsumed = []} 30 | , let x = words "+ 5 + 6 7" in 31 | HU.testCase "5" $ 32 | fullParses (parser $ grammar RightAssoc) x 33 | @?= (,) [Plus (Var "5") (Plus (Var "6") (Var "7"))] 34 | Report {position = 5, expected = [], unconsumed = []} 35 | , let x = words "+ 5 6" in 36 | HU.testCase "6" $ 37 | fullParses (parser $ grammar RightAssoc) x 38 | @?= (,) [Plus (Var "5") (Var "6")] 39 | Report {position = 3, expected = [], unconsumed = []} 40 | , let x = words "+ + 5 6 7" in 41 | HU.testCase "7" $ 42 | fullParses (parser $ grammar NonAssoc) x 43 | @?= (,) [Plus (Plus (Var "5") (Var "6")) (Var "7")] 44 | Report {position = 5, expected = [], unconsumed = []} 45 | , let x = words "+ 5 + 6 7" in 46 | HU.testCase "8" $ 47 | fullParses (parser $ grammar NonAssoc) x 48 | @?= (,) [Plus (Var "5") (Plus (Var "6") (Var "7"))] 49 | Report {position = 5, expected = [], unconsumed = []} 50 | , let x = words "+ 5 6" in 51 | HU.testCase "9" $ 52 | fullParses (parser $ grammar NonAssoc) x 53 | @?= (,) [Plus (Var "5") (Var "6")] 54 | Report {position = 3, expected = [], unconsumed = []} 55 | ] 56 | 57 | data AST 58 | = Var String 59 | | Plus AST AST 60 | deriving (Eq, Ord, Show) 61 | 62 | grammar :: Associativity -> Grammar r (Prod r String String AST) 63 | grammar a = mdo 64 | atomicExpr <- rule $ Var <$> satisfy (/= "+") 65 | 66 | expr <- mixfixExpression 67 | [[([Just (token "+"), Nothing, Nothing], a)]] 68 | atomicExpr 69 | (\x y -> case (x,y) of 70 | ([Just "+", Nothing, Nothing], [e1,e2]) -> Plus e1 e2 71 | _ -> undefined) 72 | 73 | return expr 74 | -------------------------------------------------------------------------------- /tests/Lambda.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | module Lambda where 3 | import Control.Applicative 4 | import Data.List as List 5 | import Data.Foldable 6 | import Test.Tasty 7 | import Test.Tasty.HUnit as HU 8 | import Test.Tasty.QuickCheck as QC 9 | 10 | import Text.Earley 11 | 12 | import qualified Arbitrary 13 | 14 | tests :: TestTree 15 | tests = testGroup "Lambda" 16 | [ HU.testCase "Generate exactly 0" $ 17 | exactly 0 gen @?= [] 18 | , HU.testCase "Generate upTo 0" $ 19 | upTo 0 gen @?= [] 20 | , HU.testCase "Generate exactly 4" $ 21 | sort (snd <$> exactly 4 gen) 22 | @?= 23 | ["(a)a","(a)b","(aa)","(ab)","(b)a","(b)b","(ba)","(bb)" 24 | ,"\\a.a","\\a.b","\\b.a","\\b.b","a(a)","a(b)","a+aa","a+ab" 25 | ,"a+ba","a+bb","aa+a","aa+b","aaaa","aaab","aaba","aabb" 26 | ,"ab+a","ab+b","abaa","abab","abba","abbb","b(a)","b(b)" 27 | ,"b+aa","b+ab","b+ba","b+bb","ba+a","ba+b","baaa","baab" 28 | ,"baba","babb","bb+a","bb+b","bbaa","bbab","bbba","bbbb" 29 | ] 30 | , HU.testCase "upTo contains exactly" $ List.and (do 31 | m <- [0..5] 32 | let ys = snd <$> upTo m gen 33 | n <- [0..m] 34 | (_, x) <- upTo n gen 35 | return $ x `List.elem` ys) 36 | @? "exactly contains upTo" 37 | , HU.testCase "language contains upTo" $ do 38 | let ys = snd <$> language gen 39 | List.and (do 40 | n <- [0..5] 41 | (_, x) <- upTo n gen 42 | return $ x `List.elem` ys) 43 | @? "exactly contains upTo" 44 | , QC.testProperty "Arbitrary" $ do 45 | let p = parser grammar 46 | (e, s) <- Arbitrary.arbitrary $ generator grammar tokens 47 | return 48 | $ [e] === fst (fullParses p $ prettyExpr 0 e) 49 | .&&. [e] === fst (fullParses p s) 50 | ] 51 | where 52 | gen = generator grammar tokens 53 | 54 | data Expr 55 | = Var Char 56 | | Lam String Expr 57 | | App Expr Expr 58 | | Add Expr Expr 59 | deriving (Eq, Ord, Show) 60 | 61 | prettyExpr :: Int -> Expr -> String 62 | prettyExpr _ (Var c) = [c] 63 | prettyExpr d (Lam xs e) = prettyParens (d > 0) $ "\\" ++ xs ++ "." ++ prettyExpr d e 64 | prettyExpr d (Add a b) = prettyParens (d > 1) $ prettyExpr 2 a ++ "+" ++ prettyExpr 1 b 65 | prettyExpr d (App a b) = prettyParens (d > 3) $ prettyExpr 3 a ++ prettyExpr 4 b 66 | 67 | prettyParens :: Bool -> String -> String 68 | prettyParens True s = "(" ++ s ++ ")" 69 | prettyParens False s = s 70 | 71 | tokens :: String 72 | tokens = "(\\ab.+*)" 73 | 74 | instance Arbitrary Expr where 75 | arbitrary = sized go 76 | where 77 | var = elements "ab" 78 | go 0 = Var <$> var 79 | go n = oneof 80 | [ Var <$> var 81 | , Lam <$> (take 2 <$> listOf1 var) <*> go' 82 | , App <$> go' <*> go' 83 | , Add <$> go' <*> go' 84 | ] 85 | where 86 | go' = go (n `div` 10) 87 | 88 | shrink (Var _) = [] 89 | shrink (Lam xs e) = e : [Lam xs' e' | xs' <- shrink xs, not (null xs), e' <- shrink e] 90 | shrink (App a b) = a : b : [App a' b' | a' <- shrink a, b' <- shrink b] 91 | shrink (Add a b) = a : b : [Add a' b' | a' <- shrink a, b' <- shrink b] 92 | 93 | grammar :: Grammar r (Prod r String Char Expr) 94 | grammar = mdo 95 | let v = asum (token <$> "ab") 96 | "variable" 97 | x1 <- rule 98 | $ Lam <$ token '\\' <*> some v <* token '.' <*> x1 99 | <|> x2 100 | "lambda" 101 | x2 <- rule 102 | $ Add <$> x3 <* token '+' <*> x2 103 | <|> x3 104 | "sum" 105 | x3 <- rule 106 | $ App <$> x3 <*> x4 107 | <|> x4 108 | "application" 109 | let x4 = Var <$> v 110 | <|> token '(' *> x1 <* token ')' 111 | return x1 112 | -------------------------------------------------------------------------------- /bench/BenchAll.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo, FlexibleContexts #-} 2 | module Main where 3 | 4 | import Control.Applicative 5 | import Control.Exception 6 | import Control.DeepSeq 7 | import Criterion.Main 8 | import Data.Char 9 | import Data.Maybe 10 | import Text.Earley 11 | import qualified Text.Parsec as Parsec 12 | import qualified Text.Parsec.Pos as Parsec 13 | 14 | data Expr 15 | = Add Expr Expr 16 | | Mul Expr Expr 17 | | Var String 18 | deriving (Eq, Ord, Show) 19 | 20 | instance NFData Expr where 21 | rnf (Add a b) = rnf a `seq` rnf b 22 | rnf (Mul a b) = rnf a `seq` rnf b 23 | rnf (Var s) = rnf s 24 | 25 | type Token = String 26 | 27 | tokenParens :: Bool -> [Token] -> [Token] 28 | tokenParens True s = ["("] ++ s ++ [")"] 29 | tokenParens False s = s 30 | 31 | tokenExpr :: Int -> Expr -> [Token] 32 | tokenExpr _ (Var s) = [s] 33 | tokenExpr d (Add a b) = tokenParens (d > 0) $ tokenExpr 0 a ++ ["+"] ++ tokenExpr 1 b 34 | tokenExpr d (Mul a b) = tokenParens (d > 1) $ tokenExpr 1 a ++ ["*"] ++ tokenExpr 2 b 35 | 36 | linearSum :: Int -> Expr 37 | linearSum 1 = Var "x" 38 | linearSum n = Add (linearSum $ n - 1) (Var "x") 39 | 40 | treeSum :: Int -> Expr 41 | treeSum 1 = Var "x" 42 | treeSum n = let a = n `div` 2 -- will be at least 1 43 | b = n - a 44 | in Add (treeSum a) (treeSum b) 45 | 46 | -- Earley parser 47 | 48 | expr :: Grammar r (Prod r String Token Expr) 49 | expr = mdo 50 | x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2 51 | <|> x2 52 | "sum" 53 | x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3 54 | <|> x3 55 | "product" 56 | x3 <- rule $ Var <$> (satisfy isIdent "identifier") 57 | <|> namedToken "(" *> x1 <* namedToken ")" 58 | return x1 59 | 60 | isIdent :: String -> Bool 61 | isIdent (x:_) = isAlpha x 62 | isIdent _ = False 63 | 64 | sepBy1 :: Prod r e t a -> Prod r e t op -> Grammar r (Prod r e t [a]) 65 | sepBy1 p op = mdo 66 | ops <- rule $ pure [] <|> (:) <$ op <*> p <*> ops 67 | rule $ (:) <$> p <*> ops 68 | 69 | expr' :: Grammar r (Prod r String Token Expr) 70 | expr' = mdo 71 | let var = Var <$> satisfy isIdent <|> token "(" *> mul <* token ")" 72 | mul <- fmap (foldl1 Mul) <$> add `sepBy1` token "*" 73 | add <- fmap (foldl1 Add) <$> var `sepBy1` token "+" 74 | return mul 75 | 76 | parseEarley :: [Token] -> Maybe Expr 77 | parseEarley input = listToMaybe (fst (fullParses (parser expr) input)) 78 | 79 | parseEarley' :: [Token] -> Maybe Expr 80 | parseEarley' input = listToMaybe (fst (fullParses (parser expr') input)) 81 | 82 | -- Parsec parsec 83 | 84 | type Parsec = Parsec.Parsec [Token] () 85 | 86 | parsecExpr :: Parsec Expr 87 | parsecExpr = mul 88 | where mul = foldl1 Mul <$> add `Parsec.sepBy1` t "*" 89 | add = foldl1 Add <$> var `Parsec.sepBy1` t "+" 90 | ident = Parsec.token id pos $ \y -> if isIdent y then Just (Var y) else Nothing 91 | var = ident <|> (t "(" *> mul <* t ")") 92 | t x = Parsec.token id pos $ \y -> if x == y then Just x else Nothing 93 | pos = const (Parsec.initialPos "") 94 | 95 | parseParsec :: [Token] -> Maybe Expr 96 | parseParsec = either (const Nothing) Just . Parsec.parse parsecExpr "" 97 | 98 | -- Our benchmark harness. 99 | 100 | linearInput :: Int -> (String, [Token]) 101 | linearInput size = (show size, tokenExpr 0 $ linearSum size) 102 | 103 | treeInput :: Int -> (String, [Token]) 104 | treeInput size = (show size, tokenExpr 0 $ treeSum size) 105 | 106 | inputBench :: (String, [Token]) -> Benchmark 107 | inputBench (name, input) = bench name $ nf id input 108 | 109 | earleyBench :: (String, [Token]) -> Benchmark 110 | earleyBench (name, input) = bench name $ nf parseEarley input 111 | 112 | earley'Bench :: (String, [Token]) -> Benchmark 113 | earley'Bench (name, input) = bench name $ nf parseEarley' input 114 | 115 | parsecBench :: (String, [Token]) -> Benchmark 116 | parsecBench (name, input) = bench name $ nf parseParsec input 117 | 118 | benchSizes :: [Int] 119 | benchSizes = [100, 200] -- [51, 101, 151, 201] 120 | 121 | linearInputs :: [(String, [Token])] 122 | linearInputs = map linearInput benchSizes 123 | 124 | treeInputs :: [(String, [Token])] 125 | treeInputs = map treeInput benchSizes 126 | 127 | main :: IO () 128 | main = do 129 | evaluate (rnf linearInputs) 130 | evaluate (rnf treeInputs) 131 | defaultMain 132 | [ -- bgroup "inputs" $ map inputBench linearInputs 133 | bgroup "earley" $ map earleyBench linearInputs 134 | , bgroup "earley'" $ map earley'Bench linearInputs 135 | , bgroup "parsec" $ map parsecBench linearInputs 136 | -- , bgroup "inputsTree" $ map inputBench treeInputs 137 | , bgroup "earleyTree" $ map earleyBench treeInputs 138 | , bgroup "earley'Tree" $ map earley'Bench treeInputs 139 | , bgroup "parsecTree" $ map parsecBench treeInputs 140 | ] 141 | 142 | -------------------------------------------------------------------------------- /Text/Earley/Mixfix.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, RecursiveDo #-} 2 | module Text.Earley.Mixfix 3 | ( Associativity(..) 4 | , Holey 5 | , mixfixExpression 6 | , mixfixExpressionSeparate 7 | ) where 8 | 9 | #if !MIN_VERSION_base(4,8,0) 10 | import Control.Applicative 11 | import Data.Traversable(sequenceA) 12 | #endif 13 | import Data.Either 14 | import Data.Foldable(asum, foldrM) 15 | import Text.Earley 16 | 17 | replicateA :: Applicative f => Int -> f a -> f [a] 18 | replicateA n = sequenceA . replicate n 19 | 20 | consA :: Applicative f => f a -> f [a] -> f [a] 21 | consA p q = (:) <$> p <*> q 22 | 23 | data Associativity 24 | = LeftAssoc 25 | | NonAssoc 26 | | RightAssoc 27 | deriving (Eq, Show) 28 | 29 | -- | An identifier with identifier parts ('Just's), and holes ('Nothing's) 30 | -- representing the positions of its arguments. 31 | -- 32 | -- Example (commonly written "if_then_else_"): 33 | -- @['Just' "if", 'Nothing', 'Just' "then", 'Nothing', 'Just' "else", 'Nothing'] :: 'Holey' 'String'@ 34 | type Holey a = [Maybe a] 35 | 36 | -- | Create a grammar for parsing mixfix expressions. 37 | mixfixExpression 38 | :: [[(Holey (Prod r e t ident), Associativity)]] 39 | -- ^ A table of holey identifier parsers, with associativity information. 40 | -- The identifiers should be in groups of precedence levels listed from 41 | -- binding the least to the most tightly. 42 | -- 43 | -- The associativity is taken into account when an identifier starts or ends 44 | -- with holes, or both. Internal holes (e.g. after "if" in "if_then_else_") 45 | -- start from the beginning of the table. 46 | -- 47 | -- Note that this rule also applies to identifiers with multiple consecutive 48 | -- holes, e.g. "if__" --- the associativity then applies to both holes. 49 | -> Prod r e t expr 50 | -- ^ An atom, i.e. what is parsed at the lowest level. This will 51 | -- commonly be a (non-mixfix) identifier or a parenthesised expression. 52 | -> (Holey ident -> [expr] -> expr) 53 | -- ^ How to combine the successful application of a holey identifier to its 54 | -- arguments into an expression. 55 | -> Grammar r (Prod r e t expr) 56 | mixfixExpression table atom app = mixfixExpressionSeparate table' atom 57 | where 58 | table' = [[(holey, assoc, app) | (holey, assoc) <- row] | row <- table] 59 | 60 | -- | A version of 'mixfixExpression' with a separate semantic action for each 61 | -- individual 'Holey' identifier. 62 | mixfixExpressionSeparate 63 | :: [[(Holey (Prod r e t ident), Associativity, Holey ident -> [expr] -> expr)]] 64 | -- ^ A table of holey identifier parsers, with associativity information and 65 | -- semantic actions. The identifiers should be in groups of precedence 66 | -- levels listed from binding the least to the most tightly. 67 | -- 68 | -- The associativity is taken into account when an identifier starts or ends 69 | -- with holes, or both. Internal holes (e.g. after "if" in "if_then_else_") 70 | -- start from the beginning of the table. 71 | -- 72 | -- Note that this rule also applies to identifiers with multiple consecutive 73 | -- holes, e.g. "if__" --- the associativity then applies to both holes. 74 | -> Prod r e t expr 75 | -- ^ An atom, i.e. what is parsed at the lowest level. This will 76 | -- commonly be a (non-mixfix) identifier or a parenthesised expression. 77 | -> Grammar r (Prod r e t expr) 78 | mixfixExpressionSeparate table atom = mdo 79 | expr <- foldrM ($) atom $ map (level expr) table 80 | return expr 81 | where 82 | level expr idents next = mdo 83 | same <- rule $ asum $ next : map (mixfixIdent same) idents 84 | return same 85 | where 86 | -- Group consecutive holes and ident parts. 87 | grp [] = [] 88 | grp (Nothing:ps) = case grp ps of 89 | Left n:rest -> (Left $! (n + 1)) : rest 90 | rest -> Left 1 : rest 91 | grp (Just p:ps) = case grp ps of 92 | Right ps':rest -> Right (consA p ps') : rest 93 | rest -> Right (consA p $ pure []) : rest 94 | 95 | mixfixIdent same (ps, a, f) = f' <$> go (grp ps) 96 | where 97 | f' xs = f (concatMap (either (map $ const Nothing) $ map Just) xs) 98 | $ concat $ lefts xs 99 | go ps' = case ps' of 100 | [] -> pure [] 101 | [Right p] -> pure . Right <$> p 102 | Left n:rest -> consA 103 | (Left <$> replicateA n (if a == RightAssoc then next 104 | else same)) 105 | $ go rest 106 | [Right p, Left n] -> consA 107 | (Right <$> p) 108 | $ pure . Left <$> replicateA n (if a == LeftAssoc then next 109 | else same) 110 | Right p:Left n:rest -> consA (Right <$> p) 111 | $ consA (Left <$> replicateA n expr) 112 | $ go rest 113 | Right _:Right _:_ -> error 114 | $ "Earley.mixfixExpression: The impossible happened. " 115 | ++ "Please report this as a bug." 116 | -------------------------------------------------------------------------------- /Earley.cabal: -------------------------------------------------------------------------------- 1 | name: Earley 2 | version: 0.13.0.1 3 | synopsis: Parsing all context-free grammars using Earley's algorithm. 4 | description: See for more 5 | information and 6 | for 7 | examples. 8 | license: BSD3 9 | license-file: LICENSE 10 | author: Olle Fredriksson 11 | maintainer: fredriksson.olle@gmail.com 12 | copyright: (c) 2014-2019 Olle Fredriksson 13 | category: Parsing 14 | build-type: Simple 15 | cabal-version: >=1.10 16 | tested-with: GHC==7.6.3, GHC==7.8.4, GHC==7.10.3, GHC==8.0.2, GHC==8.2.1, GHC==8.4.1 17 | 18 | extra-source-files: 19 | README.md 20 | CHANGELOG.md 21 | 22 | Flag Examples 23 | Description: "Build examples" 24 | Default: False 25 | Manual: True 26 | 27 | source-repository head 28 | type: git 29 | location: https://github.com/ollef/Earley.git 30 | 31 | library 32 | exposed-modules: 33 | Text.Earley, 34 | Text.Earley.Derived, 35 | Text.Earley.Generator, 36 | Text.Earley.Generator.Internal, 37 | Text.Earley.Grammar, 38 | Text.Earley.Mixfix, 39 | Text.Earley.Parser, 40 | Text.Earley.Parser.Internal 41 | build-depends: base >=4.6 && <5, ListLike >=4.1 42 | if impl(ghc < 8.0) 43 | build-depends: semigroups >=0.18 44 | default-language: Haskell2010 45 | ghc-options: -Wall 46 | -funbox-strict-fields 47 | 48 | executable earley-english 49 | if !flag(examples) 50 | buildable: False 51 | main-is: English.hs 52 | ghc-options: -Wall 53 | hs-source-dirs: examples 54 | default-language: Haskell2010 55 | build-depends: base, Earley, unordered-containers >=0.2 56 | 57 | executable earley-expr 58 | if !flag(examples) 59 | buildable: False 60 | main-is: Expr.hs 61 | ghc-options: -Wall 62 | hs-source-dirs: examples 63 | default-language: Haskell2010 64 | build-depends: base, Earley 65 | 66 | executable earley-expr2 67 | if !flag(examples) 68 | buildable: False 69 | main-is: Expr2.hs 70 | ghc-options: -Wall 71 | hs-source-dirs: examples 72 | default-language: Haskell2010 73 | build-depends: base, Earley 74 | 75 | executable earley-mixfix 76 | if !flag(examples) 77 | buildable: False 78 | main-is: Mixfix.hs 79 | ghc-options: -Wall 80 | hs-source-dirs: examples 81 | default-language: Haskell2010 82 | build-depends: base, Earley, unordered-containers 83 | 84 | executable earley-roman-numerals 85 | if !flag(examples) 86 | buildable: False 87 | main-is: RomanNumerals.hs 88 | ghc-options: -Wall 89 | hs-source-dirs: examples 90 | default-language: Haskell2010 91 | build-depends: base, Earley 92 | 93 | executable earley-very-ambiguous 94 | if !flag(examples) 95 | buildable: False 96 | main-is: VeryAmbiguous.hs 97 | ghc-options: -Wall 98 | hs-source-dirs: examples 99 | default-language: Haskell2010 100 | build-depends: base, Earley 101 | 102 | executable earley-words 103 | if !flag(examples) 104 | buildable: False 105 | main-is: Words.hs 106 | ghc-options: -Wall 107 | hs-source-dirs: examples 108 | default-language: Haskell2010 109 | build-depends: base, Earley 110 | 111 | executable earley-infinite 112 | if !flag(examples) 113 | buildable: False 114 | main-is: Infinite.hs 115 | ghc-options: -Wall 116 | hs-source-dirs: examples 117 | default-language: Haskell2010 118 | build-depends: base, Earley 119 | 120 | benchmark bench 121 | type: exitcode-stdio-1.0 122 | hs-source-dirs: . bench 123 | main-is: BenchAll.hs 124 | build-depends: base, Earley, ListLike, deepseq, criterion >=1.1, parsec >=3.1 125 | if impl(ghc < 8.0) 126 | build-depends: semigroups >=0.18 127 | default-language: Haskell2010 128 | ghc-options: -Wall 129 | 130 | test-suite tests 131 | type: exitcode-stdio-1.0 132 | main-is: Main.hs 133 | ghc-options: -Wall 134 | hs-source-dirs: tests 135 | default-language: Haskell2010 136 | build-depends: base, Earley, tasty >=0.10, tasty-quickcheck >=0.8, tasty-hunit >= 0.9, QuickCheck >= 2.8, containers >= 0.6 137 | other-modules: 138 | Arbitrary, 139 | Constraint, 140 | Empty, 141 | Expr, 142 | Generator, 143 | InlineAlts, 144 | Issue11, 145 | Issue14, 146 | Lambda, 147 | Mixfix, 148 | Optional, 149 | ReversedWords, 150 | UnbalancedPars, 151 | VeryAmbiguous 152 | -------------------------------------------------------------------------------- /tests/Expr.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecursiveDo #-} 2 | module Expr where 3 | import Control.Applicative 4 | import Data.Char 5 | import Test.Tasty 6 | import Test.Tasty.QuickCheck as QC 7 | 8 | import Text.Earley 9 | 10 | import qualified Arbitrary 11 | 12 | tests :: TestTree 13 | tests = testGroup "Expr" 14 | [ QC.testProperty "Left-recursive: parse . pretty = id" $ 15 | \e -> [e] === parseLeftExpr (prettyLeftExpr 0 e) 16 | , QC.testProperty "Left-recursive: parse . pretty = id (generator)" $ do 17 | (e, s) <- Arbitrary.arbitrary $ generator leftExpr tokens 18 | return 19 | $ [e] === parseLeftExpr (prettyLeftExpr 0 e) 20 | .&&. [e] === parseLeftExpr (unwords s) 21 | , QC.testProperty "Right-recursive: parse . pretty = id" $ 22 | \e -> [e] === parseRightExpr (prettyRightExpr 0 e) 23 | , QC.testProperty "Right-recursive: parse . pretty = id (generator)" $ do 24 | (e, s) <- Arbitrary.arbitrary $ generator rightExpr tokens 25 | return 26 | $ [e] === parseRightExpr (prettyRightExpr 0 e) 27 | .&&. [e] === parseRightExpr (unwords s) 28 | , QC.testProperty "Ambiguous: parse . pretty ≈ id" $ 29 | \e -> e `elem` parseAmbiguousExpr (prettyLeftExpr 0 e) 30 | .&&. e `elem` parseAmbiguousExpr (prettyRightExpr 0 e) 31 | .&&. [e] == parseAmbiguousExpr (prettyAmbiguousExpr e) 32 | , QC.testProperty "Ambiguous: parse . pretty ≈ id (generator)" $ do 33 | (e, s) <- Arbitrary.arbitrary $ generator ambiguousExpr tokens 34 | return $ e `elem` parseAmbiguousExpr (prettyLeftExpr 0 e) 35 | .&&. e `elem` parseAmbiguousExpr (prettyRightExpr 0 e) 36 | .&&. [e] == parseAmbiguousExpr (prettyAmbiguousExpr e) 37 | .&&. e `elem` parseAmbiguousExpr (unwords s) 38 | ] 39 | 40 | tokens :: [String] 41 | tokens = pure <$> "abcxyz+*()" 42 | 43 | parseLeftExpr :: String -> [Expr] 44 | parseLeftExpr input = fst (fullParses (parser leftExpr) (lexExpr input)) 45 | 46 | parseRightExpr :: String -> [Expr] 47 | parseRightExpr input = fst (fullParses (parser rightExpr) (lexExpr input)) 48 | 49 | parseAmbiguousExpr :: String -> [Expr] 50 | parseAmbiguousExpr input = fst (fullParses (parser ambiguousExpr) (lexExpr input)) 51 | 52 | data Expr 53 | = Add Expr Expr 54 | | Mul Expr Expr 55 | | Var String 56 | deriving (Eq, Ord, Show) 57 | 58 | instance Arbitrary Expr where 59 | arbitrary = sized arbExpr 60 | where arbIdent = Var <$> elements ["a", "b", "c", "x", "y", "z"] 61 | arbExpr n | n > 0 = oneof [ arbIdent 62 | , Add <$> arbExpr1 <*> arbExpr1 63 | , Mul <$> arbExpr1 <*> arbExpr1 64 | ] 65 | where arbExpr1 = arbExpr (n `div` 3) 66 | arbExpr _ = arbIdent 67 | 68 | shrink (Var _) = [] 69 | shrink (Add a b) = a : b : [ Add a' b | a' <- shrink a ] ++ [ Add a b' | b' <- shrink b ] 70 | shrink (Mul a b) = a : b : [ Mul a' b | a' <- shrink a ] ++ [ Mul a b' | b' <- shrink b ] 71 | 72 | leftExpr :: Grammar r (Prod r String String Expr) 73 | leftExpr = mdo 74 | x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2 75 | <|> x2 76 | "sum" 77 | x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3 78 | <|> x3 79 | "product" 80 | x3 <- rule $ Var <$> (satisfy ident "identifier") 81 | <|> namedToken "(" *> x1 <* namedToken ")" 82 | return x1 83 | where 84 | ident (x:_) = isAlpha x 85 | ident _ = False 86 | 87 | rightExpr :: Grammar r (Prod r String String Expr) 88 | rightExpr = mdo 89 | x1 <- rule $ Add <$> x2 <* namedToken "+" <*> x1 90 | <|> x2 91 | "sum" 92 | x2 <- rule $ Mul <$> x3 <* namedToken "*" <*> x2 93 | <|> x3 94 | "product" 95 | x3 <- rule $ Var <$> (satisfy ident "identifier") 96 | <|> namedToken "(" *> x1 <* namedToken ")" 97 | return x1 98 | where 99 | ident (x:_) = isAlpha x 100 | ident _ = False 101 | 102 | ambiguousExpr :: Grammar r (Prod r String String Expr) 103 | ambiguousExpr = mdo 104 | x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x1 105 | <|> x2 106 | "sum" 107 | x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x2 108 | <|> x3 109 | "product" 110 | x3 <- rule $ Var <$> (satisfy ident "identifier") 111 | <|> namedToken "(" *> x1 <* namedToken ")" 112 | return x1 113 | where 114 | ident (x:_) = isAlpha x 115 | ident _ = False 116 | 117 | prettyParens :: Bool -> String -> String 118 | prettyParens True s = "(" ++ s ++ ")" 119 | prettyParens False s = s 120 | 121 | prettyLeftExpr :: Int -> Expr -> String 122 | prettyLeftExpr _ (Var s) = s 123 | prettyLeftExpr d (Add a b) = prettyParens (d > 0) $ prettyLeftExpr 0 a ++ " + " ++ prettyLeftExpr 1 b 124 | prettyLeftExpr d (Mul a b) = prettyParens (d > 1) $ prettyLeftExpr 1 a ++ " * " ++ prettyLeftExpr 2 b 125 | 126 | prettyRightExpr :: Int -> Expr -> String 127 | prettyRightExpr _ (Var s) = s 128 | prettyRightExpr d (Add a b) = prettyParens (d > 0) $ prettyRightExpr 1 a ++ " + " ++ prettyRightExpr 0 b 129 | prettyRightExpr d (Mul a b) = prettyParens (d > 1) $ prettyRightExpr 2 a ++ " * " ++ prettyRightExpr 1 b 130 | 131 | prettyAmbiguousExpr :: Expr -> String 132 | prettyAmbiguousExpr (Var s) = s 133 | prettyAmbiguousExpr (Add a b) = prettyParens True $ prettyAmbiguousExpr a ++ " + " ++ prettyAmbiguousExpr b 134 | prettyAmbiguousExpr (Mul a b) = prettyParens True $ prettyAmbiguousExpr a ++ " * " ++ prettyAmbiguousExpr b 135 | 136 | -- @words@ like lexer, but consider parentheses as separate tokens 137 | lexExpr :: String -> [String] 138 | lexExpr "" = [] 139 | lexExpr ('(' : s) = "(" : lexExpr s 140 | lexExpr (')' : s) = ")" : lexExpr s 141 | lexExpr (c : s) 142 | | isSpace c = lexExpr s 143 | | otherwise = let (tok, rest) = span p (c : s) 144 | in tok : lexExpr rest 145 | where p x = not (x == '(' || x == ')' || isSpace x) 146 | -------------------------------------------------------------------------------- /Text/Earley/Grammar.hs: -------------------------------------------------------------------------------- 1 | -- | Context-free grammars. 2 | {-# LANGUAGE CPP, GADTs, RankNTypes #-} 3 | module Text.Earley.Grammar 4 | ( Prod(..) 5 | , terminal 6 | , () 7 | , constraint 8 | , alts 9 | , Grammar(..) 10 | , rule 11 | , runGrammar 12 | ) where 13 | import Control.Applicative 14 | import Control.Monad 15 | import Control.Monad.Fix 16 | import Data.String (IsString(..)) 17 | #if !MIN_VERSION_base(4,8,0) 18 | import Data.Monoid 19 | #endif 20 | import Data.Semigroup 21 | 22 | infixr 0 23 | 24 | -- | A production. 25 | -- 26 | -- The type parameters are: 27 | -- 28 | -- @a@: The return type of the production. 29 | -- 30 | -- @t@ for terminal: The type of the terminals that the production operates 31 | -- on. 32 | -- 33 | -- @e@ for expected: The type of names, used for example to report expected 34 | -- tokens. 35 | -- 36 | -- @r@ for rule: The type of a non-terminal. This plays a role similar to the 37 | -- @s@ in the type @ST s a@. Since the 'parser' function expects the @r@ to be 38 | -- universally quantified, there is not much to do with this parameter other 39 | -- than leaving it universally quantified. 40 | -- 41 | -- As an example, @'Prod' r 'String' 'Char' 'Int'@ is the type of a production that 42 | -- returns an 'Int', operates on (lists of) characters and reports 'String' 43 | -- names. 44 | -- 45 | -- Most of the functionality of 'Prod's is obtained through its instances, e.g. 46 | -- 'Functor', 'Applicative', and 'Alternative'. 47 | data Prod r e t a where 48 | -- Applicative. 49 | Terminal :: !(t -> Maybe a) -> !(Prod r e t (a -> b)) -> Prod r e t b 50 | NonTerminal :: !(r e t a) -> !(Prod r e t (a -> b)) -> Prod r e t b 51 | Pure :: a -> Prod r e t a 52 | -- Monoid/Alternative. We have to special-case 'many' (though it can be done 53 | -- with rules) to be able to satisfy the Alternative interface. 54 | Alts :: ![Prod r e t a] -> !(Prod r e t (a -> b)) -> Prod r e t b 55 | Many :: !(Prod r e t a) -> !(Prod r e t ([a] -> b)) -> Prod r e t b 56 | -- Error reporting. 57 | Named :: !(Prod r e t a) -> e -> Prod r e t a 58 | -- Non-context-free extension: conditioning on the parsed output. 59 | Constraint :: !(Prod r e t a) -> (a -> Bool) -> Prod r e t a 60 | 61 | -- | Match a token for which the given predicate returns @Just a@, 62 | -- and return the @a@. 63 | terminal :: (t -> Maybe a) -> Prod r e t a 64 | terminal p = Terminal p $ Pure id 65 | 66 | -- | A named production (used for reporting expected things). 67 | () :: Prod r e t a -> e -> Prod r e t a 68 | () = Named 69 | 70 | -- | A parser that filters results, post-parsing 71 | constraint :: (a -> Bool) -> Prod r e t a -> Prod r e t a 72 | constraint = flip Constraint 73 | 74 | -- | Lifted instance: @(<>) = 'liftA2' ('<>')@ 75 | instance Semigroup a => Semigroup (Prod r e t a) where 76 | (<>) = liftA2 (Data.Semigroup.<>) 77 | 78 | -- | Lifted instance: @mempty = 'pure' 'mempty'@ 79 | instance Monoid a => Monoid (Prod r e t a) where 80 | mempty = pure mempty 81 | mappend = (<>) 82 | 83 | instance Functor (Prod r e t) where 84 | {-# INLINE fmap #-} 85 | fmap f (Terminal b p) = Terminal b $ fmap (f .) p 86 | fmap f (NonTerminal r p) = NonTerminal r $ fmap (f .) p 87 | fmap f (Pure x) = Pure $ f x 88 | fmap f (Alts as p) = Alts as $ fmap (f .) p 89 | fmap f (Many p q) = Many p $ fmap (f .) q 90 | fmap f (Named p n) = Named (fmap f p) n 91 | 92 | -- | Smart constructor for alternatives. 93 | alts :: [Prod r e t a] -> Prod r e t (a -> b) -> Prod r e t b 94 | alts as p = case as >>= go of 95 | [] -> empty 96 | [a] -> a <**> p 97 | as' -> Alts as' p 98 | where 99 | go (Alts [] _) = [] 100 | go (Alts as' (Pure f)) = fmap f <$> as' 101 | go (Named p' n) = map ( n) $ go p' 102 | go a = [a] 103 | 104 | instance Applicative (Prod r e t) where 105 | pure = Pure 106 | {-# INLINE (<*>) #-} 107 | Terminal b p <*> q = Terminal b $ flip <$> p <*> q 108 | NonTerminal r p <*> q = NonTerminal r $ flip <$> p <*> q 109 | Pure f <*> q = fmap f q 110 | Alts as p <*> q = alts as $ flip <$> p <*> q 111 | Many a p <*> q = Many a $ flip <$> p <*> q 112 | Named p n <*> q = Named (p <*> q) n 113 | 114 | instance Alternative (Prod r e t) where 115 | empty = Alts [] $ pure id 116 | Named p m <|> q = Named (p <|> q) m 117 | p <|> Named q n = Named (p <|> q) n 118 | p <|> q = alts [p, q] $ pure id 119 | many (Alts [] _) = pure [] 120 | many p = Many p $ Pure id 121 | some p = (:) <$> p <*> many p 122 | 123 | -- | String literals can be interpreted as 'Terminal's 124 | -- that match that string. 125 | -- 126 | -- >>> :set -XOverloadedStrings 127 | -- >>> import Data.Text (Text) 128 | -- >>> let determiner = "the" <|> "a" <|> "an" :: Prod r e Text Text 129 | -- 130 | instance (IsString t, Eq t, a ~ t) => IsString (Prod r e t a) where 131 | fromString s = Terminal f $ Pure id 132 | where 133 | fs = fromString s 134 | f t | t == fs = Just fs 135 | f _ = Nothing 136 | {-# INLINE fromString #-} 137 | 138 | -- | A context-free grammar. 139 | -- 140 | -- The type parameters are: 141 | -- 142 | -- @a@: The return type of the grammar (often a 'Prod'). 143 | -- 144 | -- @r@ for rule: The type of a non-terminal. This plays a role similar to the 145 | -- @s@ in the type @ST s a@. Since the 'parser' function expects the @r@ to be 146 | -- universally quantified, there is not much to do with this parameter other 147 | -- than leaving it universally quantified. 148 | -- 149 | -- Most of the functionality of 'Grammar's is obtained through its instances, 150 | -- e.g. 'Monad' and 'MonadFix'. Note that GHC has syntactic sugar for 151 | -- 'MonadFix': use @{-\# LANGUAGE RecursiveDo \#-}@ and @mdo@ instead of 152 | -- @do@. 153 | data Grammar r a where 154 | RuleBind :: Prod r e t a -> (Prod r e t a -> Grammar r b) -> Grammar r b 155 | FixBind :: (a -> Grammar r a) -> (a -> Grammar r b) -> Grammar r b 156 | Return :: a -> Grammar r a 157 | 158 | instance Functor (Grammar r) where 159 | fmap f (RuleBind ps h) = RuleBind ps (fmap f . h) 160 | fmap f (FixBind g h) = FixBind g (fmap f . h) 161 | fmap f (Return x) = Return $ f x 162 | 163 | instance Applicative (Grammar r) where 164 | pure = return 165 | (<*>) = ap 166 | 167 | instance Monad (Grammar r) where 168 | return = Return 169 | RuleBind ps f >>= k = RuleBind ps (f >=> k) 170 | FixBind f g >>= k = FixBind f (g >=> k) 171 | Return x >>= k = k x 172 | 173 | instance MonadFix (Grammar r) where 174 | mfix f = FixBind f return 175 | 176 | -- | Create a new non-terminal by giving its production. 177 | rule :: Prod r e t a -> Grammar r (Prod r e t a) 178 | rule p = RuleBind p return 179 | 180 | -- | Run a grammar, given an action to perform on productions to be turned into 181 | -- non-terminals. 182 | runGrammar :: MonadFix m 183 | => (forall e t a. Prod r e t a -> m (Prod r e t a)) 184 | -> Grammar r b -> m b 185 | runGrammar r grammar = case grammar of 186 | RuleBind p k -> do 187 | nt <- r p 188 | runGrammar r $ k nt 189 | Return a -> return a 190 | FixBind f k -> do 191 | a <- mfix $ runGrammar r <$> f 192 | runGrammar r $ k a 193 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Earley [![Hackage](https://img.shields.io/hackage/v/Earley.svg)](https://hackage.haskell.org/package/Earley) 2 | ====== 3 | 4 | [Go to the API documentation on Hackage.](https://hackage.haskell.org/package/Earley) 5 | 6 | This ([Text.Earley](https://hackage.haskell.org/package/Earley/docs/Text-Earley.html)) is a library consisting of a few main parts: 7 | 8 | ### [Text.Earley.Grammar](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Grammar.html) 9 | 10 | An embedded context-free grammar (CFG) domain-specific language (DSL) with 11 | semantic action specification in applicative style. 12 | 13 | An example of a typical expression grammar working on an input tokenised 14 | into strings is the following: 15 | 16 | ```haskell 17 | expr :: Grammar r (Prod r String String Expr) 18 | expr = mdo 19 | x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2 20 | <|> x2 21 | "sum" 22 | x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3 23 | <|> x3 24 | "product" 25 | x3 <- rule $ Var <$> (satisfy ident "identifier") 26 | <|> namedToken "(" *> x1 <* namedToken ")" 27 | return x1 28 | where 29 | ident (x:_) = isAlpha x 30 | ident _ = False 31 | ``` 32 | 33 | ### [Text.Earley.Parser](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Parser.html) 34 | 35 | An implementation of (a modification of) the Earley parsing algorithm. 36 | 37 | To invoke the parser on the above grammar, run e.g. (here using `words` as a 38 | stupid tokeniser): 39 | 40 | ```haskell 41 | fullParses (parser expr) $ words "a + b * ( c + d )" 42 | = ( [Add (Var "a") (Mul (Var "b") (Add (Var "c") (Var "d")))] 43 | , Report {...} 44 | ) 45 | ``` 46 | 47 | Note that we get a list of all the possible parses (though in this case 48 | there is only one). 49 | 50 | Another invocation, which shows the error reporting capabilities (giving the 51 | last position that the parser reached and what it expected at that point), 52 | is the following: 53 | 54 | ```haskell 55 | fullParses (parser expr) $ words "a +" 56 | = ( [] 57 | , Report { position = 2 58 | , expected = ["(","identifier","product"] 59 | , unconsumed = [] 60 | } 61 | ) 62 | ``` 63 | 64 | ### [Text.Earley.Generator](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Generator.html) 65 | 66 | Functionality to generate the members of the language that a grammar generates. 67 | 68 | To get the language of a grammar given a list of allowed tokens, run e.g.: 69 | 70 | ```haskell 71 | language (generator romanNumeralsGrammar "VIX") 72 | = [(0,""),(1,"I"),(5,"V"),(10,"X"),(20,"XX"),(11,"XI"),(15,"XV"),(6,"VI"),(9,"IX"),(4,"IV"),(2,"II"),(3,"III"),(19,"XIX"),(16,"XVI"),(14,"XIV"),(12,"XII"),(7,"VII"),(21,"XXI"),(25,"XXV"),(30,"XXX"),(31,"XXXI"),(35,"XXXV"),(8,"VIII"),(13,"XIII"),(17,"XVII"),(26,"XXVI"),(29,"XXIX"),(24,"XXIV"),(22,"XXII"),(18,"XVIII"),(36,"XXXVI"),(39,"XXXIX"),(34,"XXXIV"),(32,"XXXII"),(23,"XXIII"),(27,"XXVII"),(33,"XXXIII"),(28,"XXVIII"),(37,"XXXVII"),(38,"XXXVIII")] 73 | ``` 74 | 75 | The above example shows the language generated by a [Roman numerals 76 | grammar](examples/RomanNumerals.hs) limited to the tokens `'V'`, `'I'`, and 77 | `'X'`. 78 | 79 | ### [Text.Earley.Mixfix](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Mixfix.html) 80 | 81 | Helper functionality for creating parsers for expressions with mixfix 82 | identifiers in the style of Agda. 83 | 84 | How do I write grammars? 85 | ------------------------ 86 | 87 | As hinted at above, the grammars are written inside `Grammar`, which is a 88 | `Monad` and `MonadFix`. For the library to be able to tame the recursion in 89 | the grammars, we have to use the `rule` function whenever a production is 90 | recursive. 91 | 92 | Whenever you would write e.g. 93 | ```haskell 94 | ... 95 | p = foo <|> bar <*> p 96 | ... 97 | ``` 98 | in a conventional combinator parser library, you instead write the following: 99 | ```haskell 100 | grammar = mdo 101 | ... 102 | p <- rule $ foo <|> bar <*> p 103 | ... 104 | ``` 105 | 106 | Apart from making it possible to do recursion (even left-recursion), `rule`s 107 | have an additional benefit: they control where work is shared, by the rule that 108 | any `rule` is only ever expanded once per position in the input string. If a 109 | `rule` is encountered more than once at a position, the work is shared. 110 | 111 | Compared to parser generators and combinator libraries 112 | ------------------------------------------------------ 113 | 114 | This library differs from the main methods that are used to write parsers in 115 | the Haskell ecosystem: 116 | 117 | * Compared to parser generators (YACC, Happy, etc.) it requires very little 118 | pre-processing of the grammar. It also allows you to stay in the host 119 | language for both grammar and parser, i.e. there is no use of a separate 120 | tool. This also means that you are free to use the abstraction facilities of 121 | Haskell when writing a grammar. Currently the library requires a linear 122 | traversal of the grammar's rules before use, which is usually fast enough to 123 | do at run time, but precludes infinite grammars. 124 | 125 | * The grammar language is similar to that of many parser combinators (Parsec, 126 | Attoparsec, parallel parsing processes, etc.), providing an applicative 127 | interface, but the parser gracefully handles all finite CFGs, including those 128 | with left-recursion. On the other hand, its productions are not monadic 129 | meaning that it does not support context-sensitive or infinite grammars, 130 | which are supported by many parser combinator libraries. 131 | 132 | Note: The `Grammar` type is a `Monad` (used to provide observable sharing) 133 | but it lives a layer above productions. It cannot be used to decide what 134 | production to use depending on the result of a previous production, i.e. it 135 | does not give us monadic parsing. 136 | 137 | The parsing algorithm 138 | --------------------- 139 | 140 | The parsing algorithm that this library uses is based on [Earley's parsing 141 | algorithm](https://en.wikipedia.org/wiki/Earley_parser). The algorithm has 142 | been modified to produce online parse results, to give good error messages, and 143 | to allow garbage collection of the item sets. Essentially, instead of storing a 144 | sequence of sets of items like in the original algorithm, the modified 145 | algorithm just stores pointers back to sets of reachable items. 146 | 147 | The worst-case run time performance of the Earley parsing algorithm is cubic in 148 | the length of the input, but for large classes of grammars it is linear. It 149 | should however be noted that this library will likely be slower than most 150 | parser generators and parser combinator libraries. 151 | 152 | The parser implements an optimisation similar to that presented in Joop M.I.M 153 | Leo's paper *A general context-free parsing algorithm running in linear time on 154 | every LR(k) grammar without using lookahead*, which removes indirections in 155 | sequences of non-ambiguous backpointers between item sets. 156 | 157 | For more in-depth information about the internals of the library, there are 158 | [implementation notes](docs/implementation.md) currently being written. 159 | 160 | Contact 161 | ------- 162 | 163 | Olle Fredriksson - https://github.com/ollef 164 | -------------------------------------------------------------------------------- /Text/Earley/Generator/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, BangPatterns, DeriveFunctor, GADTs, Rank2Types, RecursiveDo #-} 2 | -- | This module exposes the internals of the package: its API may change 3 | -- independently of the PVP-compliant version number. 4 | module Text.Earley.Generator.Internal where 5 | import Control.Applicative 6 | import Control.Monad 7 | import Control.Monad.ST.Lazy 8 | import Data.Maybe(mapMaybe) 9 | import Data.STRef.Lazy 10 | import Text.Earley.Grammar 11 | #if !MIN_VERSION_base(4,8,0) 12 | import Data.Monoid 13 | #endif 14 | import Data.Semigroup 15 | 16 | ------------------------------------------------------------------------------- 17 | -- * Concrete rules and productions 18 | ------------------------------------------------------------------------------- 19 | -- | The concrete rule type that the generator uses 20 | data Rule s r e t a = Rule 21 | { ruleProd :: ProdR s r e t a 22 | , ruleConts :: !(STRef s (STRef s [Cont s r e t a r])) 23 | , ruleNulls :: !(Results s t a) 24 | } 25 | 26 | mkRule :: ProdR s r e t a -> ST s (Rule s r e t a) 27 | mkRule p = mdo 28 | c <- newSTRef =<< newSTRef mempty 29 | computeNullsRef <- newSTRef $ do 30 | writeSTRef computeNullsRef $ return [] 31 | ns <- unResults $ prodNulls p 32 | writeSTRef computeNullsRef $ return ns 33 | return ns 34 | return $ Rule (removeNulls p) c (Results $ join $ readSTRef computeNullsRef) 35 | 36 | prodNulls :: ProdR s r e t a -> Results s t a 37 | prodNulls prod = case prod of 38 | Terminal {} -> empty 39 | NonTerminal r p -> ruleNulls r <**> prodNulls p 40 | Pure a -> pure a 41 | Alts as p -> mconcat (map prodNulls as) <**> prodNulls p 42 | Many a p -> prodNulls (pure [] <|> pure <$> a) <**> prodNulls p 43 | Named p _ -> prodNulls p 44 | Constraint p _ -> prodNulls p 45 | 46 | -- | Remove (some) nulls from a production 47 | removeNulls :: ProdR s r e t a -> ProdR s r e t a 48 | removeNulls prod = case prod of 49 | Terminal {} -> prod 50 | NonTerminal {} -> prod 51 | Pure _ -> empty 52 | Alts as (Pure f) -> alts (map removeNulls as) $ Pure f 53 | Alts {} -> prod 54 | Many {} -> prod 55 | Named p n -> Named (removeNulls p) n 56 | Constraint p n -> Constraint (removeNulls p) n 57 | 58 | type ProdR s r e t a = Prod (Rule s r) e t a 59 | 60 | resetConts :: Rule s r e t a -> ST s () 61 | resetConts r = writeSTRef (ruleConts r) =<< newSTRef mempty 62 | 63 | ------------------------------------------------------------------------------- 64 | -- * Delayed results 65 | ------------------------------------------------------------------------------- 66 | newtype Results s t a = Results { unResults :: ST s [(a, [t])] } 67 | deriving Functor 68 | 69 | lazyResults :: ST s [(a, [t])] -> ST s (Results s t a) 70 | lazyResults stas = mdo 71 | resultsRef <- newSTRef $ do 72 | as <- stas 73 | writeSTRef resultsRef $ return as 74 | return as 75 | return $ Results $ join $ readSTRef resultsRef 76 | 77 | instance Applicative (Results s t) where 78 | pure x = Results $ pure [(x, mempty)] 79 | (<*>) = ap 80 | 81 | instance Alternative (Results t s) where 82 | empty = Results $ pure [] 83 | Results sxs <|> Results sys = Results $ (<|>) <$> sxs <*> sys 84 | 85 | instance Monad (Results t s) where 86 | return = pure 87 | Results stxs >>= f = Results $ do 88 | xs <- stxs 89 | concat <$> mapM (\(x, ts) -> fmap (\(y, ts') -> (y, ts' ++ ts)) <$> unResults (f x)) xs 90 | 91 | instance Semigroup (Results s t a) where 92 | (<>) = (<|>) 93 | 94 | instance Monoid (Results s t a) where 95 | mempty = empty 96 | mappend = (<>) 97 | 98 | ------------------------------------------------------------------------------- 99 | -- * States and continuations 100 | ------------------------------------------------------------------------------- 101 | data BirthPos 102 | = Previous 103 | | Current 104 | deriving Eq 105 | 106 | -- | An Earley state with result type @a@. 107 | data State s r e t a where 108 | State :: !(ProdR s r e t a) 109 | -> !(a -> Results s t b) 110 | -> !BirthPos 111 | -> !(Conts s r e t b c) 112 | -> State s r e t c 113 | Final :: !(Results s t a) -> State s r e t a 114 | 115 | -- | A continuation accepting an @a@ and producing a @b@. 116 | data Cont s r e t a b where 117 | Cont :: !(a -> Results s t b) 118 | -> !(ProdR s r e t (b -> c)) 119 | -> !(c -> Results s t d) 120 | -> !(Conts s r e t d e') 121 | -> Cont s r e t a e' 122 | FinalCont :: (a -> Results s t c) -> Cont s r e t a c 123 | 124 | data Conts s r e t a c = Conts 125 | { conts :: !(STRef s [Cont s r e t a c]) 126 | , contsArgs :: !(STRef s (Maybe (STRef s (Results s t a)))) 127 | } 128 | 129 | newConts :: STRef s [Cont s r e t a c] -> ST s (Conts s r e t a c) 130 | newConts r = Conts r <$> newSTRef Nothing 131 | 132 | contraMapCont :: (b -> Results s t a) -> Cont s r e t a c -> Cont s r e t b c 133 | contraMapCont f (Cont g p args cs) = Cont (f >=> g) p args cs 134 | contraMapCont f (FinalCont args) = FinalCont (f >=> args) 135 | 136 | contToState :: BirthPos -> Results s t a -> Cont s r e t a c -> State s r e t c 137 | contToState pos r (Cont g p args cs) = State p (\f -> r >>= g >>= args . f) pos cs 138 | contToState _ r (FinalCont args) = Final $ r >>= args 139 | 140 | -- | Strings of non-ambiguous continuations can be optimised by removing 141 | -- indirections. 142 | simplifyCont :: Conts s r e t b a -> ST s [Cont s r e t b a] 143 | simplifyCont Conts {conts = cont} = readSTRef cont >>= go False 144 | where 145 | go !_ [Cont g (Pure f) args cont'] = do 146 | ks' <- simplifyCont cont' 147 | go True $ map (contraMapCont $ g >=> args . f) ks' 148 | go True ks = do 149 | writeSTRef cont ks 150 | return ks 151 | go False ks = return ks 152 | 153 | ------------------------------------------------------------------------------- 154 | -- * Grammars 155 | ------------------------------------------------------------------------------- 156 | -- | Given a grammar, construct an initial state. 157 | initialState :: ProdR s a e t a -> ST s (State s a e t a) 158 | initialState p = State p pure Previous <$> (newConts =<< newSTRef [FinalCont pure]) 159 | 160 | ------------------------------------------------------------------------------- 161 | -- * Generation 162 | ------------------------------------------------------------------------------- 163 | -- | The result of a generator. 164 | data Result s t a 165 | = Ended (ST s [(a, [t])]) 166 | -- ^ The generator ended. 167 | | Generated (ST s [(a, [t])]) (ST s (Result s t a)) 168 | -- ^ The generator produced a number of @a@s. These are given as a 169 | -- computation, @'ST' s [a]@ that constructs the 'a's when run. The 'Int' is 170 | -- the position in the input where these results were obtained, and the last 171 | -- component is the continuation. 172 | deriving Functor 173 | 174 | data GenerationEnv s e t a = GenerationEnv 175 | { results :: ![ST s [(a, [t])]] 176 | -- ^ Results ready to be reported (when this position has been processed) 177 | , next :: ![State s a e t a] 178 | -- ^ States to process at the next position 179 | , reset :: !(ST s ()) 180 | -- ^ Computation that resets the continuation refs of productions 181 | , tokens :: ![t] 182 | -- ^ The possible tokens 183 | } 184 | 185 | {-# INLINE emptyGenerationEnv #-} 186 | emptyGenerationEnv :: [t] -> GenerationEnv s e t a 187 | emptyGenerationEnv ts = GenerationEnv 188 | { results = mempty 189 | , next = mempty 190 | , reset = return () 191 | , tokens = ts 192 | } 193 | 194 | -- | The internal generation routine 195 | generate :: [State s a e t a] -- ^ States to process at this position 196 | -> GenerationEnv s e t a 197 | -> ST s (Result s t a) 198 | generate [] env@GenerationEnv {next = []} = do 199 | reset env 200 | return $ Ended $ concat <$> sequence (results env) 201 | generate [] env = do 202 | reset env 203 | return $ Generated (concat <$> sequence (results env)) 204 | $ generate (next env) $ emptyGenerationEnv $ tokens env 205 | generate (st:ss) env = case st of 206 | Final res -> generate ss env {results = unResults res : results env} 207 | State pr args pos scont -> case pr of 208 | Terminal f p -> generate ss env 209 | { next = [State p (\g -> Results (pure $ map (\(t, a) -> (g a, [t])) xs) >>= args) Previous scont | xs <- [mapMaybe (\t -> (,) t <$> f t) $ tokens env], not $ null xs] 210 | ++ next env 211 | } 212 | NonTerminal r p -> do 213 | rkref <- readSTRef $ ruleConts r 214 | ks <- readSTRef rkref 215 | writeSTRef rkref (Cont pure p args scont : ks) 216 | ns <- unResults $ ruleNulls r 217 | let addNullState 218 | | null ns = id 219 | | otherwise = (:) 220 | $ State p (\f -> Results (pure ns) >>= args . f) pos scont 221 | if null ks then do -- The rule has not been expanded at this position. 222 | st' <- State (ruleProd r) pure Current <$> newConts rkref 223 | generate (addNullState $ st' : ss) 224 | env {reset = resetConts r >> reset env} 225 | else -- The rule has already been expanded at this position. 226 | generate (addNullState ss) env 227 | Pure a 228 | -- Skip following continuations that stem from the current position; such 229 | -- continuations are handled separately. 230 | | pos == Current -> generate ss env 231 | | otherwise -> do 232 | let argsRef = contsArgs scont 233 | masref <- readSTRef argsRef 234 | case masref of 235 | Just asref -> do -- The continuation has already been followed at this position. 236 | modifySTRef asref $ mappend $ args a 237 | generate ss env 238 | Nothing -> do -- It hasn't. 239 | asref <- newSTRef $ args a 240 | writeSTRef argsRef $ Just asref 241 | ks <- simplifyCont scont 242 | res <- lazyResults $ unResults =<< readSTRef asref 243 | let kstates = map (contToState pos res) ks 244 | generate (kstates ++ ss) 245 | env {reset = writeSTRef argsRef Nothing >> reset env} 246 | Alts as (Pure f) -> do 247 | let args' = args . f 248 | sts = [State a args' pos scont | a <- as] 249 | generate (sts ++ ss) env 250 | Alts as p -> do 251 | scont' <- newConts =<< newSTRef [Cont pure p args scont] 252 | let sts = [State a pure Previous scont' | a <- as] 253 | generate (sts ++ ss) env 254 | Many p q -> mdo 255 | r <- mkRule $ pure [] <|> (:) <$> p <*> NonTerminal r (Pure id) 256 | generate (State (NonTerminal r q) args pos scont : ss) env 257 | Named pr' _ -> generate (State pr' args pos scont : ss) env 258 | Constraint pr' c -> generate (State pr' (test >=> args) pos scont : ss) env 259 | where test x = if c x then return x else empty 260 | 261 | type Generator t a = forall s. ST s (Result s t a) 262 | 263 | -- | Create a language generator for given grammar and list of allowed tokens. 264 | generator 265 | :: (forall r. Grammar r (Prod r e t a)) 266 | -> [t] 267 | -> Generator t a 268 | generator g ts = do 269 | let nt x = NonTerminal x $ pure id 270 | s <- initialState =<< runGrammar (fmap nt . mkRule) g 271 | generate [s] $ emptyGenerationEnv ts 272 | 273 | -- | Run a generator, returning all members of the language. 274 | -- 275 | -- The members are returned as parse results paired with the list of tokens 276 | -- used to produce the result. 277 | -- The elements of the returned list of results are sorted by their length in 278 | -- ascending order. If there are multiple results of the same length they are 279 | -- returned in an unspecified order. 280 | language 281 | :: Generator t a 282 | -> [(a, [t])] 283 | language gen = runST $ gen >>= go 284 | where 285 | go :: Result s t a -> ST s [(a, [t])] 286 | go r = case r of 287 | Ended mas -> mas 288 | Generated mas k -> do 289 | as <- mas 290 | (as ++) <$> (go =<< k) 291 | 292 | -- | @upTo n gen@ runs the generator @gen@, returning all members of the 293 | -- language that are of length less than or equal to @n@. 294 | -- 295 | -- The members are returned as parse results paired with the list of tokens 296 | -- used to produce the result. 297 | -- The elements of the returned list of results are sorted by their length in 298 | -- ascending order. If there are multiple results of the same length they are 299 | -- returned in an unspecified order. 300 | upTo 301 | :: Int 302 | -> Generator t a 303 | -> [(a, [t])] 304 | upTo len gen = runST $ gen >>= go 0 305 | where 306 | go :: Int -> Result s t a -> ST s [(a, [t])] 307 | go curLen r | curLen <= len = case r of 308 | Ended mas -> mas 309 | Generated mas k -> do 310 | as <- mas 311 | (as ++) <$> (go (curLen + 1) =<< k) 312 | go _ _ = return [] 313 | 314 | -- | @exactly n gen@ runs the generator @gen@, returning all members of the 315 | -- language that are of length equal to @n@. 316 | -- 317 | -- The members are returned as parse results paired with the list of tokens 318 | -- used to produce the result. 319 | -- If there are multiple results they are returned in an unspecified order. 320 | exactly 321 | :: Int 322 | -> Generator t a 323 | -> [(a, [t])] 324 | exactly len _ | len < 0 = [] 325 | exactly len gen = runST $ gen >>= go 0 326 | where 327 | go :: Int -> Result s t a -> ST s [(a, [t])] 328 | go !curLen r = case r of 329 | Ended mas 330 | | curLen == len -> mas 331 | | otherwise -> return [] 332 | Generated mas k 333 | | curLen == len -> mas 334 | | otherwise -> go (curLen + 1) =<< k 335 | -------------------------------------------------------------------------------- /Text/Earley/Parser/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, BangPatterns, DeriveFunctor, GADTs, TupleSections, Rank2Types, RecursiveDo #-} 2 | -- | This module exposes the internals of the package: its API may change 3 | -- independently of the PVP-compliant version number. 4 | module Text.Earley.Parser.Internal where 5 | import Control.Applicative 6 | import Control.Arrow 7 | import Control.Monad 8 | import Control.Monad.ST 9 | import Data.ListLike(ListLike) 10 | import qualified Data.ListLike as ListLike 11 | import Data.STRef 12 | import Text.Earley.Grammar 13 | #if !MIN_VERSION_base(4,8,0) 14 | import Data.Monoid 15 | #endif 16 | import Data.Semigroup 17 | 18 | ------------------------------------------------------------------------------- 19 | -- * Concrete rules and productions 20 | ------------------------------------------------------------------------------- 21 | -- | The concrete rule type that the parser uses 22 | data Rule s r e t a = Rule 23 | { ruleProd :: ProdR s r e t a 24 | , ruleConts :: !(STRef s (STRef s [Cont s r e t a r])) 25 | , ruleNulls :: !(Results s a) 26 | } 27 | 28 | mkRule :: ProdR s r e t a -> ST s (Rule s r e t a) 29 | mkRule p = mdo 30 | c <- newSTRef =<< newSTRef mempty 31 | computeNullsRef <- newSTRef $ do 32 | writeSTRef computeNullsRef $ return [] 33 | ns <- unResults $ prodNulls p 34 | writeSTRef computeNullsRef $ return ns 35 | return ns 36 | return $ Rule (removeNulls p) c (Results $ join $ readSTRef computeNullsRef) 37 | 38 | prodNulls :: ProdR s r e t a -> Results s a 39 | prodNulls prod = case prod of 40 | Terminal {} -> empty 41 | NonTerminal r p -> ruleNulls r <**> prodNulls p 42 | Pure a -> pure a 43 | Alts as p -> mconcat (map prodNulls as) <**> prodNulls p 44 | Many a p -> prodNulls (pure [] <|> pure <$> a) <**> prodNulls p 45 | Named p _ -> prodNulls p 46 | Constraint p _ -> prodNulls p 47 | 48 | -- | Remove (some) nulls from a production 49 | removeNulls :: ProdR s r e t a -> ProdR s r e t a 50 | removeNulls prod = case prod of 51 | Terminal {} -> prod 52 | NonTerminal {} -> prod 53 | Pure _ -> empty 54 | Alts as (Pure f) -> alts (map removeNulls as) $ Pure f 55 | Alts {} -> prod 56 | Many {} -> prod 57 | Named p n -> Named (removeNulls p) n 58 | Constraint p n -> Constraint (removeNulls p) n 59 | 60 | type ProdR s r e t a = Prod (Rule s r) e t a 61 | 62 | resetConts :: Rule s r e t a -> ST s () 63 | resetConts r = writeSTRef (ruleConts r) =<< newSTRef mempty 64 | 65 | ------------------------------------------------------------------------------- 66 | -- * Delayed results 67 | ------------------------------------------------------------------------------- 68 | newtype Results s a = Results { unResults :: ST s [a] } 69 | deriving Functor 70 | 71 | lazyResults :: ST s [a] -> ST s (Results s a) 72 | lazyResults stas = mdo 73 | resultsRef <- newSTRef $ do 74 | as <- stas 75 | writeSTRef resultsRef $ return as 76 | return as 77 | return $ Results $ join $ readSTRef resultsRef 78 | 79 | instance Applicative (Results s) where 80 | pure = Results . pure . pure 81 | (<*>) = ap 82 | 83 | instance Alternative (Results s) where 84 | empty = Results $ pure [] 85 | Results sxs <|> Results sys = Results $ (<|>) <$> sxs <*> sys 86 | 87 | instance Monad (Results s) where 88 | return = pure 89 | Results stxs >>= f = Results $ do 90 | xs <- stxs 91 | concat <$> mapM (unResults . f) xs 92 | 93 | instance Semigroup (Results s a) where 94 | (<>) = (<|>) 95 | 96 | instance Monoid (Results s a) where 97 | mempty = empty 98 | mappend = (<>) 99 | 100 | ------------------------------------------------------------------------------- 101 | -- * States and continuations 102 | ------------------------------------------------------------------------------- 103 | data BirthPos 104 | = Previous 105 | | Current 106 | deriving Eq 107 | 108 | -- | An Earley state with result type @a@. 109 | data State s r e t a where 110 | State :: !(ProdR s r e t a) 111 | -> !(a -> Results s b) 112 | -> !BirthPos 113 | -> !(Conts s r e t b c) 114 | -> State s r e t c 115 | Final :: !(Results s a) -> State s r e t a 116 | 117 | -- | A continuation accepting an @a@ and producing a @b@. 118 | data Cont s r e t a b where 119 | Cont :: !(a -> Results s b) 120 | -> !(ProdR s r e t (b -> c)) 121 | -> !(c -> Results s d) 122 | -> !(Conts s r e t d e') 123 | -> Cont s r e t a e' 124 | FinalCont :: (a -> Results s c) -> Cont s r e t a c 125 | 126 | data Conts s r e t a c = Conts 127 | { conts :: !(STRef s [Cont s r e t a c]) 128 | , contsArgs :: !(STRef s (Maybe (STRef s (Results s a)))) 129 | } 130 | 131 | newConts :: STRef s [Cont s r e t a c] -> ST s (Conts s r e t a c) 132 | newConts r = Conts r <$> newSTRef Nothing 133 | 134 | contraMapCont :: (b -> Results s a) -> Cont s r e t a c -> Cont s r e t b c 135 | contraMapCont f (Cont g p args cs) = Cont (f >=> g) p args cs 136 | contraMapCont f (FinalCont args) = FinalCont (f >=> args) 137 | 138 | contToState :: BirthPos -> Results s a -> Cont s r e t a c -> State s r e t c 139 | contToState pos r (Cont g p args cs) = State p (\f -> r >>= g >>= args . f) pos cs 140 | contToState _ r (FinalCont args) = Final $ r >>= args 141 | 142 | -- | Strings of non-ambiguous continuations can be optimised by removing 143 | -- indirections. 144 | simplifyCont :: Conts s r e t b a -> ST s [Cont s r e t b a] 145 | simplifyCont Conts {conts = cont} = readSTRef cont >>= go False 146 | where 147 | go !_ [Cont g (Pure f) args cont'] = do 148 | ks' <- simplifyCont cont' 149 | go True $ map (contraMapCont $ g >=> args . f) ks' 150 | go True ks = do 151 | writeSTRef cont ks 152 | return ks 153 | go False ks = return ks 154 | 155 | ------------------------------------------------------------------------------- 156 | -- * Grammars 157 | ------------------------------------------------------------------------------- 158 | -- | Given a grammar, construct an initial state. 159 | initialState :: ProdR s a e t a -> ST s (State s a e t a) 160 | initialState p = State p pure Previous <$> (newConts =<< newSTRef [FinalCont pure]) 161 | 162 | ------------------------------------------------------------------------------- 163 | -- * Parsing 164 | ------------------------------------------------------------------------------- 165 | -- | A parsing report, which contains fields that are useful for presenting 166 | -- errors to the user if a parse is deemed a failure. Note however that we get 167 | -- a report even when we successfully parse something. 168 | data Report e i = Report 169 | { position :: Int -- ^ The final position in the input (0-based) that the 170 | -- parser reached. 171 | , expected :: [e] -- ^ The named productions processed at the final 172 | -- position. 173 | , unconsumed :: i -- ^ The part of the input string that was not consumed, 174 | -- which may be empty. 175 | } deriving (Eq, Ord, Read, Show) 176 | 177 | -- | The result of a parse. 178 | data Result s e i a 179 | = Ended (Report e i) 180 | -- ^ The parser ended. 181 | | Parsed (ST s [a]) Int i (ST s (Result s e i a)) 182 | -- ^ The parser parsed a number of @a@s. These are given as a computation, 183 | -- @'ST' s [a]@ that constructs the 'a's when run. We can thus save some 184 | -- work by ignoring this computation if we do not care about the results. 185 | -- The 'Int' is the position in the input where these results were 186 | -- obtained, the @i@ the rest of the input, and the last component is the 187 | -- continuation. 188 | deriving Functor 189 | 190 | data ParseEnv s e i t a = ParseEnv 191 | { results :: ![ST s [a]] 192 | -- ^ Results ready to be reported (when this position has been processed) 193 | , next :: ![State s a e t a] 194 | -- ^ States to process at the next position 195 | , reset :: !(ST s ()) 196 | -- ^ Computation that resets the continuation refs of productions 197 | , names :: ![e] 198 | -- ^ Named productions encountered at this position 199 | , curPos :: !Int 200 | -- ^ The current position in the input string 201 | , input :: !i 202 | -- ^ The input string 203 | } 204 | 205 | {-# INLINE emptyParseEnv #-} 206 | emptyParseEnv :: i -> ParseEnv s e i t a 207 | emptyParseEnv i = ParseEnv 208 | { results = mempty 209 | , next = mempty 210 | , reset = return () 211 | , names = mempty 212 | , curPos = 0 213 | , input = i 214 | } 215 | 216 | {-# SPECIALISE parse :: [State s a e t a] 217 | -> ParseEnv s e [t] t a 218 | -> ST s (Result s e [t] a) #-} 219 | -- | The internal parsing routine 220 | parse :: ListLike i t 221 | => [State s a e t a] -- ^ States to process at this position 222 | -> ParseEnv s e i t a 223 | -> ST s (Result s e i a) 224 | parse [] env@ParseEnv {results = [], next = []} = do 225 | reset env 226 | return $ Ended Report 227 | { position = curPos env 228 | , expected = names env 229 | , unconsumed = input env 230 | } 231 | parse [] env@ParseEnv {results = []} = do 232 | reset env 233 | parse (next env) 234 | (emptyParseEnv $ ListLike.drop 1 $ input env) {curPos = curPos env + 1} 235 | parse [] env = do 236 | reset env 237 | return $ Parsed (concat <$> sequence (results env)) (curPos env) (input env) 238 | $ parse [] env {results = [], reset = return ()} 239 | parse (st:ss) env = case st of 240 | Final res -> parse ss env {results = unResults res : results env} 241 | State pr args pos scont -> case pr of 242 | Terminal f p -> case ListLike.uncons (input env) >>= f . fst of 243 | Just a -> parse ss env {next = State p (args . ($ a)) Previous scont 244 | : next env} 245 | Nothing -> parse ss env 246 | NonTerminal r p -> do 247 | rkref <- readSTRef $ ruleConts r 248 | ks <- readSTRef rkref 249 | writeSTRef rkref (Cont pure p args scont : ks) 250 | ns <- unResults $ ruleNulls r 251 | let addNullState 252 | | null ns = id 253 | | otherwise = (:) 254 | $ State p (\f -> Results (pure $ map f ns) >>= args) pos scont 255 | if null ks then do -- The rule has not been expanded at this position. 256 | st' <- State (ruleProd r) pure Current <$> newConts rkref 257 | parse (addNullState $ st' : ss) 258 | env {reset = resetConts r >> reset env} 259 | else -- The rule has already been expanded at this position. 260 | parse (addNullState ss) env 261 | Pure a 262 | -- Skip following continuations that stem from the current position; such 263 | -- continuations are handled separately. 264 | | pos == Current -> parse ss env 265 | | otherwise -> do 266 | let argsRef = contsArgs scont 267 | masref <- readSTRef argsRef 268 | case masref of 269 | Just asref -> do -- The continuation has already been followed at this position. 270 | modifySTRef asref $ mappend $ args a 271 | parse ss env 272 | Nothing -> do -- It hasn't. 273 | asref <- newSTRef $ args a 274 | writeSTRef argsRef $ Just asref 275 | ks <- simplifyCont scont 276 | res <- lazyResults $ unResults =<< readSTRef asref 277 | let kstates = map (contToState pos res) ks 278 | parse (kstates ++ ss) 279 | env {reset = writeSTRef argsRef Nothing >> reset env} 280 | Alts as (Pure f) -> do 281 | let args' = args . f 282 | sts = [State a args' pos scont | a <- as] 283 | parse (sts ++ ss) env 284 | Alts as p -> do 285 | scont' <- newConts =<< newSTRef [Cont pure p args scont] 286 | let sts = [State a pure Previous scont' | a <- as] 287 | parse (sts ++ ss) env 288 | Many p q -> mdo 289 | r <- mkRule $ pure [] <|> (:) <$> p <*> NonTerminal r (Pure id) 290 | parse (State (NonTerminal r q) args pos scont : ss) env 291 | Named pr' n -> parse (State pr' args pos scont : ss) 292 | env {names = n : names env} 293 | Constraint pr' c -> parse (State pr' (test >=> args) pos scont : ss) env 294 | where test x = if c x then return x else empty 295 | 296 | type Parser e i a = forall s. i -> ST s (Result s e i a) 297 | 298 | {-# INLINE parser #-} 299 | -- | Create a parser from the given grammar. 300 | parser 301 | :: ListLike i t 302 | => (forall r. Grammar r (Prod r e t a)) 303 | -> Parser e i a 304 | parser g i = do 305 | let nt x = NonTerminal x $ pure id 306 | s <- initialState =<< runGrammar (fmap nt . mkRule) g 307 | parse [s] $ emptyParseEnv i 308 | 309 | -- | Return all parses from the result of a given parser. The result may 310 | -- contain partial parses. The 'Int's are the position at which a result was 311 | -- produced. 312 | -- 313 | -- The elements of the returned list of results are sorted by their position in 314 | -- ascending order. If there are multiple results at the same position they 315 | -- are returned in an unspecified order. 316 | allParses 317 | :: Parser e i a 318 | -> i 319 | -> ([(a, Int)], Report e i) 320 | allParses p i = runST $ p i >>= go 321 | where 322 | go :: Result s e i a -> ST s ([(a, Int)], Report e i) 323 | go r = case r of 324 | Ended rep -> return ([], rep) 325 | Parsed mas cpos _ k -> do 326 | as <- mas 327 | fmap (first (map (, cpos) as ++)) $ go =<< k 328 | 329 | {-# INLINE fullParses #-} 330 | -- | Return all parses that reached the end of the input from the result of a 331 | -- given parser. 332 | -- 333 | -- If there are multiple results they are returned in an unspecified order. 334 | fullParses 335 | :: ListLike i t 336 | => Parser e i a 337 | -> i 338 | -> ([a], Report e i) 339 | fullParses p i = runST $ p i >>= go 340 | where 341 | go :: ListLike i t => Result s e i a -> ST s ([a], Report e i) 342 | go r = case r of 343 | Ended rep -> return ([], rep) 344 | Parsed mas _ i' k 345 | | ListLike.null i' -> do 346 | as <- mas 347 | fmap (first (as ++)) $ go =<< k 348 | | otherwise -> go =<< k 349 | 350 | {-# INLINE report #-} 351 | -- | See e.g. how far the parser is able to parse the input string before it 352 | -- fails. This can be much faster than getting the parse results for highly 353 | -- ambiguous grammars. 354 | report 355 | :: Parser e i a 356 | -> i 357 | -> Report e i 358 | report p i = runST $ p i >>= go 359 | where 360 | go :: Result s e i a -> ST s (Report e i) 361 | go r = case r of 362 | Ended rep -> return rep 363 | Parsed _ _ _ k -> go =<< k 364 | -------------------------------------------------------------------------------- /docs/implementation.md: -------------------------------------------------------------------------------- 1 | % Earley's internals 2 | % Olle Fredriksson 3 | % Late 2015 4 | 5 | This is an attempt to document the internals of the Earley library. The 6 | writing of this document was spurred both by a request to provide this 7 | documentation to help contributors and by some recent discussion regarding the 8 | readability of the implementation. 9 | 10 | I'll first start with some background that many of you probably already know 11 | about. Feel free to skip ahead or skim. 12 | 13 | Background 14 | === 15 | 16 | Let's start by having a look at how parser combinator libraries usually work. 17 | I'm aware that this doesn't give the full picture in terms of available 18 | libraries for parsing --- it ignores e.g. parser generators --- but I believe 19 | this way actually gives the best intuition for how the Earley library works. 20 | On the surface I think that the Earley library is more similar to combinator 21 | libraries than to parser generators because of the (Applicative, embedded in a 22 | powerful host language) interface that it provides. However, the 23 | *functionality* that it provides may more accurately be compared to parser 24 | generator libraries. 25 | 26 | Parser combinators 27 | ------- 28 | 29 | I know of three ways that parser combinator libraries handle alternatives: 30 | 31 | * *Greedy*, or LL(1): Commit to an alternative the instant it has consumed a 32 | symbol. Always start with the leftmost alternative. This is classical 33 | recursive descent. 34 | * *Backtracking*: Go back and try the other alternatives if an 35 | alternative fails (i.e. depth-first). 36 | * *Parallel*: Try all alternatives in parallel (i.e. breadth-first) 37 | 38 | Some examples of libraries in these categories are 39 | 40 | * [Parsec](https://hackage.haskell.org/package/parsec), which is greedy by default, but backtracks from wherever the `try` 41 | combinator is used, 42 | * [Attoparsec](https://hackage.haskell.org/package/attoparsec), which is backtracking, and 43 | * [Parsek](https://hackage.haskell.org/package/parsek) (a.k.a. parallel parsing processes), which is parallel. 44 | 45 | The following is a small expression language given in BNF: 46 | 47 | ``` 48 | ident ::= 'x' | 'y' 49 | expr ::= ident | ident '+' expr 50 | ``` 51 | 52 | An example string in this language is `"x+y"`. Let's investigate how the three 53 | different kinds of alternative handling cope with this input string. 54 | 55 | * Greedy: 56 | 57 | This starts by choosing the first `expr` branch and parsing an `'x'`, 58 | i.e. the derivation 59 | ``` 60 | expr ---->{1} ident 61 | ---->{1} 'x' 62 | -->{'x'} EMPTY 63 | -------> FAIL 64 | ``` 65 | I use the following kinds of transitions in the derivation: 66 | - ` a rest ---->{i} a_i rest`: expand the leftmost non-terminal's `i`th production. 67 | - `'x' rest -->{'x'} rest`: consume token 'x' from the input. 68 | 69 | Since the parser has managed to parse a symbol at this point, it has 70 | committed to the first `expr` branch and makes no further progress. Here I 71 | use `EMPTY` to denote the empty production (sometimes also called epsilon). 72 | * Backtracking: 73 | 74 | We take the transition `FAIL ->{back} y` to mean backtracking to some 75 | previous state. I'll use indentation to indicate what the state is; this is 76 | slightly informal (in an implementation we would have to save some piece of 77 | state), but hopefully clear enough. 78 | ``` 79 | expr ---->{1} ident 80 | ---->{1} 'x' 81 | ------> FAIL 82 | ->{back} 83 | expr ---->{2} ident '+' expr 84 | ---->{1} 'x' '+' expr 85 | -->{'x'} '+' expr 86 | -->{'+'} expr 87 | ---->{1} ident ---->{1} 'x' 88 | -------> FAIL 89 | ->{back} 90 | ident ---->{2} 'y' 91 | -->{'y'} EMPTY 92 | -->{EOF} SUCCESS 93 | ``` 94 | * Parallel: 95 | 96 | Here we use the syntax `a rest ---->{*} (a_1 rest | ... | a_k rest)` 97 | for expanding all the productions of a non-terminal. 98 | ``` 99 | expr ---->{*} ident | ident '+' expr 100 | ---->{*} 'x' | 'y' | 'x' '+' expr | 'y' '+' expr 101 | -->{'x'} EMPTY | '+' expr 102 | -->{'+'} expr 103 | ---->{*} ident | ident '+' expr 104 | ---->{*} 'x' | 'y' | 'x' '+' expr | 'y' '+' expr 105 | -->{'y'} EMPTY | '+' expr 106 | -->{EOF} SUCCESS 107 | ``` 108 | 109 | We saw that greedy parsing did not succeed like the others. So what is it good 110 | for? First of all, we can rewrite the grammar and make it work. For example, as 111 | follows: 112 | 113 | ``` 114 | ident ::= 'x' | 'y' 115 | tail ::= '+' ident tail | EMPTY 116 | expr ::= ident tail 117 | ``` 118 | 119 | For the example string, `"x+y"`, we get the following derivation: 120 | 121 | ``` 122 | expr ---->{1} ident tail 123 | ---->{1} 'x' tail 124 | -->{'x'} tail 125 | ---->{1} '+' ident tail 126 | -->{'+'} ident tail ---->{1} 'x' tail 127 | -------> FAIL 128 | ->{back} 129 | ident tail ---->{2} 'y' tail 130 | -->{'y'} tail ---->{1} '+' ident tail 131 | -------> FAIL 132 | ->{back} 133 | tail ---->{2} EMPTY 134 | -------> SUCCESS 135 | ``` 136 | 137 | Note that this parser also backtracks, but only if it hasn't consumed 138 | any input in the branch it backtracks from. The nice property of a greedy 139 | parser, compared to a backtracking parser, is that it never backtracks *in the 140 | input string*. This means that it is potentially more efficient than a 141 | backtracking parser, because it does not have to keep references to old 142 | positions in the input string when keeping backtracking states. I say 143 | potentially faster, because Attoparsec, probably the fastest parser combinator 144 | library for Haskell currently in existence, does full backtracking seemingly 145 | without suffering from any performance penalty. 146 | 147 | Both backtracking and parallel parsing have the same power in terms of the 148 | languages that they recognise. If we are only interested in recognition and 149 | our language is not LL(1) or not easily factorable into LL(1), backtracking 150 | parsing is a practical choice, since it often does less work than a 151 | parallel parser. Note, however, that there are cases where a backtracking 152 | parser has to do as much work as a parallel parser. If we are interested in 153 | getting *all* the ways a given string is parsable, parallel parsing is 154 | a natural choice. 155 | 156 | The limitations of parser combinators 157 | --------- 158 | 159 | * Language: 160 | 161 | Backtracking and parallel parsers are guaranteed to terminate only for any 162 | LL(k) language. For example, they do not handle left-recursive grammars, 163 | i.e. grammars where we have derivations of the form `p -->+ p rest`. 164 | 165 | As an example, they cannot handle the following grammar, because `expr` is 166 | left-recursive: 167 | 168 | ``` 169 | ident ::= 'x' | 'y' 170 | expr ::= ident | expr '+' ident 171 | ``` 172 | 173 | In practice, we can often perform *left-recursion removal*. The cost of 174 | this is that the intended meaning of the grammar is sometimes obscured. 175 | 176 | * Worst-case running time: 177 | 178 | There are many grammars for which parsing is exponential. 179 | 180 | Consider: 181 | 182 | ``` 183 | x ::= '(' x x ')' | EMPTY 184 | ``` 185 | 186 | Let's see what happens in trying to parse the string `"(())"` in parallel: 187 | 188 | ``` 189 | x ---->{*} '(' x x ')' | EMPTY 190 | -->{'('} x x ')' 191 | ---->{*} '(' x x ')' x ')' | x ')' 192 | ---->{*} '(' x x ')' x ')' | '(' x x ')' ')' | ')' 193 | -->{'('} x x ')' x ')' | x x ')' ')' 194 | ---->{*} '(' x x ')' x x ')' x ')' | x ')' x ')' | x x ')' ')' 195 | ---->{*} '(' x x ')' x x ')' x ')' | '(' x x ')' ')' x ')' | ')' x ')' | x x ')' ')' 196 | ---->{*} '(' x x ')' x x ')' x ')' | '(' x x ')' ')' x ')' | ')' x ')' | '(' x x ')' x ')' ')' | x ')' ')' 197 | ---->{*} '(' x x ')' x x ')' x ')' | '(' x x ')' ')' x ')' | ')' x ')' | '(' x x ')' x ')' ')' | '(' x x ')' ')' ')' | ')' ')' 198 | -->{')'} x ')' | ')' 199 | ---->{*} '(' x x ')' ')' | ')' | ')' 200 | -->{')'} EMPTY | EMPTY 201 | -------> SUCCESS 202 | ``` 203 | We can at least see that the number of parallel productions that we are 204 | processing blows up quite quickly. 205 | I find it easiest to see this problem in the parallel strategy, but note 206 | that backtracking parsing has to do the same amount of work as the parallel 207 | parser for certain inputs. 208 | 209 | This should also have convinced you that quite a lot of the work is duplicated. 210 | 211 | 212 | Both of these problems stem from the way that parser combinator grammars are 213 | written. 214 | Most recursive grammars written using parser combinators are not finite, even though 215 | they have a finite representation, and *there is no way to detect this*. 216 | 217 | This is similar to how we can't detect (without using dirty, unsafe tricks) 218 | that we will never reach the end of the list defined by `ones = 1 : ones`. 219 | 220 | In this light, let's look at the left-recursive grammar again: 221 | 222 | ``` 223 | expr ::= ident | expr '+' ident 224 | ``` 225 | 226 | As a parser combinator grammar, this is equivalent to the following, which 227 | elucidates the left-recursion problem: 228 | 229 | ``` 230 | expr ::= ident | (ident | (ident | (ident | ...) '+' ident) '+' ident) '+' ident 231 | ``` 232 | 233 | Since parser combinators have no way of distinguishing between non-terminals 234 | and their productions, they also have *no way* of sharing work between 235 | invocations of the same non-terminal, because they cannot detect that this 236 | has happened in the first place. 237 | 238 | 239 | How not to repeat yourself yourself 240 | ----- 241 | 242 | I know of two situations where there is potential to share work. 243 | 244 | The first is when we encounter the same non-terminal at the same position. 245 | As an example, there is no need to re-do the work of `x` in both branches 246 | of `a` in the following grammar. 247 | 248 | ``` 249 | a ::= x rest1 | x rest2 250 | ``` 251 | 252 | A better idea would be to just parse `x` 253 | once and carry on with `rest1 | rest2`. 254 | Pictorially, we want the following branching structure: 255 | ``` 256 | rest1 257 | / 258 | a---x 259 | \ 260 | rest2 261 | ``` 262 | We do not want the following, which we have in parallel parsing: 263 | ``` 264 | x--rest1 265 | / 266 | a 267 | \ 268 | x--rest2 269 | ``` 270 | 271 | A more general example is whenever we are parsing two unrelated branches at the same position, such as: 272 | ``` 273 | a ::= start1 x rest1 274 | b ::= start2 x rest2 275 | ``` 276 | If we are parsing `a` and `b` at the same time we also want to share the work 277 | of the non-terminal `x` whenever it's encountered at the same position: 278 | ``` 279 | a---start1 --rest1 280 | \ / 281 | x 282 | / \ 283 | b---start2 --rest2 284 | ``` 285 | What we don't want is to keep the two branches independent: 286 | ``` 287 | a---start1---x---rest1 288 | 289 | b---start2---x---rest2 290 | ``` 291 | 292 | It turns out that if we never re-do the work of a non-terminal, we can 293 | automatically gain support for left-recursive grammars, since the main problem 294 | there is the infinite expansion of recursive non-terminals. 295 | 296 | The second way we can share work is perhaps less obvious. Here I will use an 297 | extended BNF syntax where productions can have alternatives anywhere in the 298 | tree. As an example 299 | 300 | ``` 301 | a ::= (x | y) rest 302 | ``` 303 | means that `a` accepts either an `x` or a `y`, followed by `rest`. 304 | One (bad) way to handle `(x | y) rest` is to desugar it as follows: 305 | 306 | ``` 307 | (x | y) rest = x rest | y rest 308 | ``` 309 | 310 | Now let's say that in parsing `a`, both branches `x` and `y` are successful 311 | and finish at the same time. Unless `rest` is a non-terminal and falls 312 | into the work-sharing situation above, we will be parsing two instances of 313 | the same thing. It would be more efficient if we could share the work of 314 | `rest`. 315 | 316 | Pictorially, we want the following branching structure: 317 | ``` 318 | x-- 319 | / \ 320 | a rest 321 | \ / 322 | y-- 323 | 324 | ``` 325 | We do not want the following, which we sometimes have in combinator libraries. 326 | ``` 327 | x---rest 328 | / 329 | a 330 | \ 331 | y---rest 332 | 333 | ``` 334 | 335 | This second point also applies if instead of `x | y` we have a non-terminal 336 | that expands to multiple branches. 337 | 338 | 339 | The Earley library and the essence of Earley 340 | --- 341 | 342 | There are already good presentations of Earley's algorithm available, 343 | so I will not repeat the full definition here. To give some context, here's the 344 | core of the algorithm, adapted from 345 | [Wikipedia](https://en.wikipedia.org/wiki/Earley_parser): 346 | 347 | ----------- 348 | 349 | The state set at input position `k` is called `S(k)`. The parser is seeded with 350 | `S(0)` consisting of only the top-level rule. The parser then repeatedly executes 351 | three operations: prediction, scanning, and completion. 352 | 353 | * _Prediction_: For every state in `S(k)` of the form `(X -> a . Y b, j)` (where `j` 354 | is the origin position as above), add `(Y -> . y, k)` to `S(k)` for every production 355 | in the grammar with `Y` on the left-hand side `(Y -> y)`. 356 | * _Scanning_: If `a` is the next symbol in the input stream, for every state in 357 | `S(k)` of the form `(X -> a . 'a' b, j)`, add `(X -> a 'a' . b, j)` to `S(k+1)`. 358 | * _Completion_: For every state in `S(k)` of the form `(X -> y ., j)`, find 359 | states in `S(j)` of the form `(Y -> a . X b, i)` and add 360 | `(Y -> a X . b, i)` to `S(k)`. 361 | 362 | It is important to note that duplicate states are not added to the state set, 363 | only new ones. These three operations are repeated until no new states can be 364 | added to the set. The set is generally implemented as a queue of states to 365 | process, with the operation to be performed depending on what kind of state it 366 | is. 367 | 368 | ----------- 369 | 370 | The note that duplicate states are not added is very important. It means that: 371 | 372 | 1. If the _prediction_ step expands the same non-terminal 373 | multiple times at the same position, no more than one copy of each of the 374 | non-terminal's productions are added to the current state set. 375 | 376 | 2. If the _completion_ step completes a non-terminal multiple 377 | times at the same position, no more than one copy of each of the 378 | *completions* from the earlier state that it refers to are added to the 379 | current state set. 380 | 381 | These points are the essence of Earley parsing, and mean that we are work-sharing 382 | precisely as outlined above. 383 | 384 | State sets 385 | --- 386 | 387 | The state set does not have to be a set, as long as we follow the two points 388 | above (note that they make no use of set-specific properties, but only that we 389 | do not expand the same thing more than once per position). We can use whatever 390 | representation of state collections that we find appropriate as long as we do 391 | that. In the Earley library, productions can contain functions, e.g. 392 | of type `token -> Bool` for matching input tokens with arbitrary predicates, 393 | which means that they are not in general comparable (they are not in the `Ord` 394 | typeclass). Since this means that we cannot use `Set` or similar containers, we 395 | instead use lists of states. 396 | 397 | Additionally, we do not have to keep states from earlier positions around, 398 | as long as we have enough information to perform the completion step. 399 | 400 | The Earley library keeps only two lists of Earley states: One for the current 401 | position and one for the next position, and follows the two points above by 402 | machinery that will now be described. 403 | 404 | The representation of states is different from the classical presentations 405 | of Earley's algorithm. 406 | We represent a state as a production and a *continuation pointer*. We keep only 407 | the part of the production that is left to parse, i.e. we drop everything before 408 | "the dot" in Earley's states. We have a special `Final` state for 409 | when we are done. 410 | The continuation pointer is a mutable reference to a list of continuations, 411 | which correspond to possible completions in the original algorithm. 412 | For the moment we can think of a continuation as being the same kind 413 | of thing as a state, though technically they differ slightly because 414 | when we are also dealing with *parse results*, continuations accept 415 | such a parse result before they can meaningfully be followed. 416 | 417 | Let's have a look at an example. 418 | A state (a production and a mutable reference to a list of states) might 419 | look a little bit like this: 420 | 421 | ``` 422 | +------------------+ 423 | | | 424 | v | 425 | cont1 (prod1, ptr) 426 | cont2 427 | 428 | ``` 429 | 430 | I said that continuations are pretty much also states, so there might be 431 | more continuation pointers going left until we reach the special `Final` 432 | state. 433 | 434 | The parsing operations 435 | --- 436 | 437 | When parsing we have a list of such states for the current position, 438 | and a list for the next position. When all the states for the current 439 | position have been processed, we advance the position in the input string and 440 | make the next-states the current, and use the empty list for the new 441 | next-states. 442 | 443 | The *scanning* step is done just like in Earley's original algorithm; 444 | when processing `(prod1, ptr)`, 445 | if `prod1 = 'x' prod1'` and the current position in the input is an `'x'`, 446 | then we add `(prod1', ptr)` to the list for the next position. 447 | 448 | The *prediction* step, i.e. the expansion of a non-terminal, is a bit 449 | more peculiar. First we need to look at how a non-terminal is represented. 450 | 451 | We are using the generalised representation of productions from above where 452 | alternatives can occur anywhere in the tree (which also means that productions 453 | are in the standard Haskell `Alternative` typeclass), so we just need to associate 454 | a production (and not e.g. a list of productions) with every non-terminal. 455 | 456 | Every non-terminal is also associated with a mutable reference to a mutable 457 | reference (no, that is not a typo) to a list of continuations. So a non-terminal 458 | might look a bit like this: 459 | 460 | ``` 461 | cont1 462 | cont2 463 | cont3 464 | ^ 465 | +---+ 466 | | 467 | | 468 | innerPtr<--+ 469 | | 470 | | 471 | (prod, outerPtr) 472 | ``` 473 | 474 | The outermost reference (`outerPtr` above) is made sure to point to a fresh 475 | `innerPtr` that points to the empty list every time we advance the position in 476 | the input string. 477 | 478 | Since both the production and the pointer depend on the type of the 479 | non-terminal, we cannot easily store this information in e.g. a `Map` 480 | because it would have to contain elements of different types (i.e. be 481 | *heterogeneous*). 482 | 483 | The solution to this typed associated-information problem, perhaps obvious to 484 | some, is to store the associated information *in* the non-terminals. The 485 | representation of non-terminals in the library is thus exactly this associated 486 | information: a production and the reference reference. 487 | 488 | Now we are ready to do *prediction*. 489 | Let's say our current-list is `[(x prod1, ptr1), (x prod2, ptr2)]`, 490 | where the non-terminal `x = (xProd, xPtr)`, and our next-list is `[]`. 491 | This means that we have two states that begin with the same non-terminal, 492 | but end with different productions (`prod1` and `prod2`), and that additionally 493 | have continuations from before (`ptr1` and `ptr2`). 494 | 495 | Assuming that it hasn't been expanded before at this position, `x` looks 496 | something like this: 497 | 498 | ``` 499 | [ ] 500 | ^ 501 | +---+ 502 | | 503 | | 504 | innerXPtr<--+ 505 | | 506 | | 507 | (xProd, xPtr) 508 | ``` 509 | 510 | The *prediction* step takes place when we process our first state, 511 | `(x prod1, ptr1)`. 512 | We then add the continuation `(prod1, ptr1)` to the inner list that `x` refers 513 | to, and add the state `(xProd, innerXPtr)` to the current-list. 514 | 515 | When we process the second state `(x prod2, ptr2)`, we can detect that `x` has 516 | already been expanded at this position, because the continuation list is 517 | non-empty. When this happens we add `(prod2, ptr2)` to the inner list that `x` 518 | refers to, but we *do not* add any new state to the current-list --- because 519 | `xProd` is already there. At this point we have the following pointer 520 | structure: 521 | 522 | ``` 523 | (prod1, ptr1) 524 | (prod2, ptr2) current-list 525 | ^ (xProd, innerXPtr) 526 | | | 527 | +---+------------------------+ 528 | | 529 | innerXPtr<--+ 530 | | 531 | | 532 | (xProd, xPtr) 533 | ``` 534 | 535 | We can see in the above that we are sharing the work of parsing `x`, and simply 536 | keep track of the two continuations for when that has been done. 537 | Here we can also see why we need a double reference from non-terminals. We 538 | clear every non-terminal when we advance the position, but we still 539 | want to keep the pointers between any states and continuations. Clearing 540 | in the above picture would mean to mutate `xPtr` to point to a fresh inner 541 | reference that points to the empty list. However, the `innerXPtr` from any 542 | states in the next-list stay intact. 543 | 544 | Now we come to *completion*, which is what happens when we process a state 545 | of the form `(EMPTY, ptr)`. Then we simply look up the continuations 546 | by following `ptr` and add those to the current-list. 547 | Pretending that `xProd` was `EMPTY` In the above picture, we would add 548 | `(prod1, ptr1)` and `(prod2, ptr2)` to the current-list. 549 | 550 | Since we do not tie alternatives to non-terminals, we have an additional 551 | operation which happens when we encounter alternatives. In the original 552 | Earley algorithm this is baked into the *prediction* step, and in our 553 | formulation of the algorithm this operation is basically the same 554 | as *prediction*. 555 | 556 | When processing a state `((alt1 | alt2) prod, ptr)` we create 557 | a new continuation pointer, `newptr` that points to `(prod, ptr)`, 558 | and continue with the states `(alt1, newptr)` and `(alt2, newptr)`. 559 | With this operation in place we have to be more careful when we do *completion*. 560 | Just like we made sure that a non-terminal is only expanded once per position 561 | in the input we have to make sure that completion of a continuation is only 562 | done once per position. For now we can think of this as pairing a mutable 563 | boolean with each list of continuations that we point to, though this is 564 | slighly more complicated when we also have to deal with parse results. 565 | 566 | TODO simplifyCont/Leo's optimisation 567 | 568 | Recognition vs. parsing 569 | --- 570 | 571 | A recogniser for a language is a program that decides if a given input string 572 | is in the language. Parsing additionally creates a parse tree. So far we have 573 | only really discussed recognition. 574 | 575 | In Haskell it is convenient to give an `Applicative` interface to our parsers, 576 | which allows us to attach semantic actions to parsers without having to 577 | construct and interpret an intermediate parse tree. 578 | 579 | TODO explain delayed results and how they cope with infinitely and exponentially 580 | many result 581 | --------------------------------------------------------------------------------