├── Setup.hs
├── .gitignore
├── Text
    ├── Earley
    │   ├── Generator.hs
    │   ├── Parser.hs
    │   ├── Derived.hs
    │   ├── Mixfix.hs
    │   ├── Grammar.hs
    │   ├── Generator
    │   │   └── Internal.hs
    │   └── Parser
    │   │   └── Internal.hs
    └── Earley.hs
├── tests
    ├── Arbitrary.hs
    ├── Generator.hs
    ├── Main.hs
    ├── InlineAlts.hs
    ├── ReversedWords.hs
    ├── VeryAmbiguous.hs
    ├── Empty.hs
    ├── Constraint.hs
    ├── Issue14.hs
    ├── Mixfix.hs
    ├── UnbalancedPars.hs
    ├── Optional.hs
    ├── Issue11.hs
    ├── Lambda.hs
    └── Expr.hs
├── examples
    ├── VeryAmbiguous.hs
    ├── Infinite.hs
    ├── Words.hs
    ├── Expr.hs
    ├── Expr2.hs
    ├── RomanNumerals.hs
    ├── English.hs
    └── Mixfix.hs
├── experiments
├── LICENSE
├── CHANGELOG.md
├── bench
    └── BenchAll.hs
├── Earley.cabal
├── README.md
└── docs
    └── implementation.md


/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.hi
 2 | *.o
 3 | *.aux
 4 | *.hp
 5 | *.prof
 6 | dist/
 7 | .cabal-sandbox
 8 | cabal.sandbox.config
 9 | dist-newstyle/
10 | .stack-work
11 | 


--------------------------------------------------------------------------------
/Text/Earley/Generator.hs:
--------------------------------------------------------------------------------
 1 | module Text.Earley.Generator
 2 |   ( Result(..)
 3 |   , Generator
 4 |   , generator
 5 |   , language
 6 |   , upTo
 7 |   , exactly
 8 |   ) where
 9 | import Text.Earley.Generator.Internal
10 | 
11 | 


--------------------------------------------------------------------------------
/Text/Earley/Parser.hs:
--------------------------------------------------------------------------------
 1 | -- | Parsing.
 2 | module Text.Earley.Parser
 3 |   ( Report(..)
 4 |   , Result(..)
 5 |   , Parser
 6 |   , parser
 7 |   , allParses
 8 |   , fullParses
 9 |   , report
10 |   ) where
11 | import Text.Earley.Parser.Internal
12 | 


--------------------------------------------------------------------------------
/tests/Arbitrary.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RankNTypes #-}
 2 | module Arbitrary where
 3 | 
 4 | import qualified Test.QuickCheck as QC
 5 | 
 6 | import Text.Earley.Generator
 7 | 
 8 | -- | Generate an arbitrary member generated by a 'Generator'.
 9 | arbitrary :: Generator t a -> QC.Gen (a, [t])
10 | arbitrary gen = QC.sized $ \n -> QC.elements (take (1 `max` n) xs)
11 |   where
12 |     xs = language gen
13 | 
14 | 


--------------------------------------------------------------------------------
/examples/VeryAmbiguous.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo #-}
 2 | import Control.Applicative
 3 | import System.Environment
 4 | import Text.Earley
 5 | 
 6 | g :: Grammar r (Prod r Char Char ())
 7 | g = mdo
 8 |   s <- rule $ () <$ token 'b'
 9 |            <|> () <$ s <* s
10 |            <|> () <$ s <* s <* s
11 |            <?> 's'
12 |   return s
13 | 
14 | main :: IO ()
15 | main = do
16 |   xs:_ <- getArgs
17 |   print $ report (parser g) xs
18 | 


--------------------------------------------------------------------------------
/experiments:
--------------------------------------------------------------------------------
 1 | 2x slower with HashTable instead of Map for conts and next
 2 | 3x faster without ReaderT for ParserState
 3 | 6% faster without lens dependency
 4 | 10% faster with inlining
 5 | 45% faster with homemade, specialised, pipes
 6 | 55% faster with CPS transform
 7 | 16% faster without using Map of positions (only list)
 8 | slower but uses less memory with vector
 9 | 23% faster after refactoring
10 | 18% faster without strict fmap f Pure
11 | 30% faster with simpler driver
12 | 13% faster without parser monad
13 | 


--------------------------------------------------------------------------------
/tests/Generator.hs:
--------------------------------------------------------------------------------
 1 | module Generator where
 2 | import Control.Applicative
 3 | import Test.Tasty
 4 | import Test.Tasty.HUnit as HU
 5 | 
 6 | import Text.Earley
 7 | 
 8 | tests :: TestTree
 9 | tests = testGroup "Lambda"
10 |   [ HU.testCase "Generate exactly 0" $
11 |       exactly 0 (generator (pure $ pure ()) "") @?= [((), [])]
12 |   , HU.testCase "Generate upTo 0" $
13 |       upTo 0 (generator (pure $ pure ()) "") @?= [((), [])]
14 |   , HU.testCase "Generate exactly 1" $
15 |       exactly 1 (generator (pure $ pure ()) "") @?= []
16 |   , HU.testCase "Generate upTo 1" $
17 |       upTo 1 (generator (pure $ pure ()) "") @?= [((), [])]
18 |   ]
19 | 


--------------------------------------------------------------------------------
/Text/Earley.hs:
--------------------------------------------------------------------------------
 1 | -- | Parsing all context-free grammars using Earley's algorithm.
 2 | module Text.Earley
 3 |   ( -- * Context-free grammars
 4 |     Prod, terminal, (<?>), constraint, Grammar, rule
 5 |   , -- * Derived operators
 6 |     satisfy, token, namedToken, anyToken, list, listLike, matches
 7 |   , -- * Parsing
 8 |     Report(..), Parser.Result(..), Parser, parser, allParses, fullParses
 9 |   , -- * Recognition
10 |     report
11 |   , -- * Language generation
12 |     Generator, generator, language, upTo, exactly
13 |   )
14 |   where
15 | import Text.Earley.Derived
16 | import Text.Earley.Generator
17 | import Text.Earley.Grammar
18 | import Text.Earley.Parser as Parser
19 | 


--------------------------------------------------------------------------------
/examples/Infinite.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo #-}
 2 | import Control.Applicative
 3 | import Text.Earley
 4 | 
 5 | grammar :: Grammar r (Prod r () Char [Maybe Char])
 6 | grammar = mdo
 7 |   as <- rule $ pure []
 8 |             <|> (:) <$> optional (token 'a') <*> as
 9 |   return as
10 | 
11 | -- This grammar has an infinite number of results. We can still recognise the
12 | -- language, i.e. get a report, but we can't get the results, because in doing
13 | -- so the library will try to force a circular value.
14 | main :: IO ()
15 | main = do
16 |   let input = "aaa"
17 |   print $ report (parser grammar) input -- Works
18 |   print $ fullParses (parser grammar) input -- Hangs
19 | 


--------------------------------------------------------------------------------
/examples/Words.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo #-}
 2 | import Data.Char
 3 | import Control.Applicative
 4 | import System.Environment
 5 | 
 6 | import Text.Earley
 7 | 
 8 | grammar :: Grammar r (Prod r String Char [String])
 9 | grammar = mdo
10 |   whitespace  <- rule $ () <$ many (satisfy isSpace)
11 |   whitespace1 <- rule $ () <$ satisfy isSpace <* whitespace <?> "whitespace"
12 | 
13 |   ident <- rule
14 |     $ (:) <$> satisfy isAlpha <*> many (satisfy isAlphaNum)
15 |    <?> "identifier"
16 | 
17 |   expr <- rule
18 |     $  (:)   <$> ident <* whitespace1 <*> expr
19 |    <|> (:[]) <$> ident <* whitespace
20 | 
21 |   return expr
22 | 
23 | main :: IO ()
24 | main = do
25 |   x:_ <- getArgs
26 |   print $ fullParses (parser grammar) x
27 | 


--------------------------------------------------------------------------------
/tests/Main.hs:
--------------------------------------------------------------------------------
 1 | module Main where
 2 | import Test.Tasty
 3 | 
 4 | import qualified Constraint
 5 | import qualified Empty
 6 | import qualified Expr
 7 | import qualified Generator
 8 | import qualified InlineAlts
 9 | import qualified Issue11
10 | import qualified Issue14
11 | import qualified Lambda
12 | import qualified Mixfix
13 | import qualified Optional
14 | import qualified ReversedWords
15 | import qualified UnbalancedPars
16 | import qualified VeryAmbiguous
17 | 
18 | main :: IO ()
19 | main = defaultMain $ testGroup "Tests"
20 |   [ Constraint.tests
21 |   , Empty.tests
22 |   , Expr.tests
23 |   , Generator.tests
24 |   , InlineAlts.tests
25 |   , Issue11.tests
26 |   , Issue14.tests
27 |   , Lambda.tests
28 |   , Mixfix.tests
29 |   , Optional.tests
30 |   , ReversedWords.tests
31 |   , UnbalancedPars.tests
32 |   , VeryAmbiguous.tests
33 |   ]
34 | 


--------------------------------------------------------------------------------
/examples/Expr.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo #-}
 2 | import Control.Applicative
 3 | import Data.Char
 4 | import System.Environment
 5 | import Text.Earley
 6 | 
 7 | data Expr
 8 |   = Add Expr Expr
 9 |   | Mul Expr Expr
10 |   | Var String
11 |   deriving (Eq, Ord, Show)
12 | 
13 | expr :: Grammar r (Prod r String String Expr)
14 | expr = mdo
15 |   x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2
16 |             <|> x2
17 |             <?> "sum"
18 |   x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3
19 |             <|> x3
20 |             <?> "product"
21 |   x3 <- rule $ Var <$> (satisfy ident <?> "identifier")
22 |             <|> namedToken "(" *> x1 <* namedToken ")"
23 |   return x1
24 |   where
25 |     ident (x:_) = isAlpha x
26 |     ident _     = False
27 | 
28 | main :: IO ()
29 | main = do
30 |   x:_ <- getArgs
31 |   print $ fullParses (parser expr) $ words x
32 | 


--------------------------------------------------------------------------------
/tests/InlineAlts.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo, ScopedTypeVariables #-}
 2 | module InlineAlts where
 3 | import Control.Applicative
 4 | import Test.Tasty
 5 | import Test.Tasty.HUnit as HU
 6 | 
 7 | import Text.Earley
 8 | 
 9 | tests :: TestTree
10 | tests = testGroup "Inline alternatives"
11 |   [ HU.testCase "They work when parsed" $
12 |       let input = "ababbbaaabaa" in
13 |       allParses (parser inlineAlts) input @?= allParses (parser nonInlineAlts) input
14 |   , HU.testCase "They work when generated" $
15 |       take 1000 (language $ generator inlineAlts "ab") @?=
16 |       take 1000 (language $ generator nonInlineAlts "ab")
17 |   ]
18 | 
19 | inlineAlts :: Grammar r (Prod r Char Char String)
20 | inlineAlts = mdo
21 |   p <- rule $ pure []
22 |            <|> (:) <$> (namedToken 'a' <|> namedToken 'b') <*> p
23 |   return p
24 | 
25 | nonInlineAlts :: Grammar r (Prod r Char Char String)
26 | nonInlineAlts = mdo
27 |   ab <- rule $ namedToken 'a' <|> namedToken 'b'
28 |   p  <- rule $ pure [] <|> (:) <$> ab <*> p
29 |   return p
30 | 


--------------------------------------------------------------------------------
/tests/ReversedWords.hs:
--------------------------------------------------------------------------------
 1 | module ReversedWords where
 2 | import Control.Applicative
 3 | import Test.Tasty
 4 | import Test.Tasty.HUnit as HU
 5 | 
 6 | import Text.Earley
 7 | 
 8 | someWords :: Grammar r (Prod r () Char [String])
 9 | someWords = return $ flip (:) <$> (map reverse <$> some (list "word")) <*> list "stop"
10 | 
11 | tests :: TestTree
12 | tests = testGroup "Reversed words"
13 |   [ HU.testCase "Parse" $
14 |       let input = "wordwordstop"
15 |           l     = length input in
16 |       allParses (parser someWords) input
17 |       @?= (,) [(["stop", "drow", "drow"], l)] Report { position   = l
18 |                                                      , expected   = []
19 |                                                      , unconsumed = []
20 |                                                      }
21 |   , HU.testCase "Generate" $
22 |     upTo 16 (generator someWords "stopwrd")
23 |     @?=
24 |     [ (["stop", "drow"], "wordstop")
25 |     , (["stop", "drow", "drow"], "wordwordstop")
26 |     , (["stop","drow","drow","drow"],"wordwordwordstop")
27 |     ]
28 |   ]
29 | 


--------------------------------------------------------------------------------
/tests/VeryAmbiguous.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo, ScopedTypeVariables #-}
 2 | module VeryAmbiguous where
 3 | import Control.Applicative
 4 | import Test.Tasty
 5 | import Test.Tasty.HUnit      as HU
 6 | 
 7 | import Text.Earley
 8 | 
 9 | tests :: TestTree
10 | tests = testGroup "Very ambiguous"
11 |   [ HU.testCase "Gives the right number of results" $
12 |       length (fst $ fullParses (parser veryAmbiguous) $ replicate 8 'b') @?= 2871
13 |   , HU.testCase "Gives the correct report" $
14 |       report (parser veryAmbiguous) (replicate 3 'b') @?=
15 |       Report {position = 3, expected = "s", unconsumed = ""}
16 |   , HU.testCase "Parser agrees with generator" $ and (do
17 |       n <- [0..8]
18 |       let str = replicate n 'b'
19 |           numParses = length (fst $ fullParses (parser veryAmbiguous) str)
20 |           numGens = length $ exactly n $ generator veryAmbiguous "b"
21 |       return $ numParses == numGens)
22 |     @? "Parser agrees with generator"
23 |   ]
24 | 
25 | veryAmbiguous :: Grammar r (Prod r Char Char ())
26 | veryAmbiguous = mdo
27 |   s <- rule $ () <$ token 'b'
28 |            <|> () <$ s <* s
29 |            <|> () <$ s <* s <* s
30 |            <?> 's'
31 |   return s
32 | 


--------------------------------------------------------------------------------
/examples/Expr2.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ScopedTypeVariables, RecursiveDo #-}
 2 | import Data.Char
 3 | import System.Environment
 4 | import Control.Applicative
 5 | import Text.Earley
 6 | 
 7 | data Expr
 8 |   = Expr :+: Expr
 9 |   | Expr :*: Expr
10 |   | Var String
11 |   | Lit Int
12 |   deriving (Show)
13 | 
14 | grammar :: forall r. Grammar r (Prod r String Char Expr)
15 | grammar = mdo
16 | 
17 |   whitespace <- rule $ many $ satisfy isSpace
18 | 
19 |   let tok :: Prod r String Char a -> Prod r String Char a
20 |       tok p   = whitespace *> p
21 | 
22 |       sym x   = tok $ token x <?> [x]
23 | 
24 |       ident   = tok $ (:) <$> satisfy isAlpha <*> many (satisfy isAlphaNum) <?> "identifier"
25 |       num     = tok $ some (satisfy isDigit) <?> "number"
26 | 
27 |   expr0 <- rule
28 |      $ (Lit . read)  <$> num
29 |     <|> Var  <$> ident
30 |     <|> sym '(' *> expr2 <* sym ')'
31 | 
32 |   expr1 <- rule
33 |     $ (:*:) <$> expr1 <* sym '*' <*> expr0
34 |    <|> expr0
35 | 
36 |   expr2 <- rule
37 |     $ (:+:) <$> expr2 <* sym '+' <*> expr1
38 |    <|> expr1
39 | 
40 |   return $ expr2 <* whitespace
41 | 
42 | main :: IO ()
43 | main = do
44 |   x:_ <- getArgs
45 |   print $ fullParses (parser grammar) x
46 | 


--------------------------------------------------------------------------------
/tests/Empty.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ScopedTypeVariables #-}
 2 | module Empty where
 3 | import Control.Applicative
 4 | import Test.Tasty
 5 | import Test.Tasty.HUnit as HU
 6 | import Test.Tasty.QuickCheck as QC
 7 | 
 8 | import Text.Earley
 9 | 
10 | tests :: TestTree
11 | tests = testGroup "Empty productions"
12 |   [ QC.testProperty "The empty production doesn't parse anything" $
13 |     \(input :: String) ->
14 |       allParses (parser emptyGrammar) input
15 |       == (,) [] Report { position   = 0
16 |                        , expected   = []
17 |                        , unconsumed = input
18 |                        }
19 |   , HU.testCase "The empty production doesn't generate anything" $
20 |       language (generator emptyGrammar "abc") @?= []
21 |   , QC.testProperty "Many empty productions parse very little" $
22 |     \(input :: String) ->
23 |       allParses (parser manyEmpty) input
24 |       == (,) [([], 0)] Report { position   = 0
25 |                               , expected   = []
26 |                               , unconsumed = input
27 |                               }
28 |   , HU.testCase "Many empty productions generate very little" $
29 |       language (generator manyEmpty "blahc") @?= [([], "")]
30 |   ]
31 | 
32 | emptyGrammar :: Grammar r (Prod r () Char ())
33 | emptyGrammar = return empty
34 | 
35 | manyEmpty :: Grammar r (Prod r () Char [()])
36 | manyEmpty = return $ many empty <* pure "blah"
37 | 


--------------------------------------------------------------------------------
/examples/RomanNumerals.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo #-}
 2 | module Main where
 3 | 
 4 | import Control.Applicative ((<|>), (<**>))
 5 | import System.Environment (getArgs)
 6 | import Text.Earley
 7 | 
 8 | numeral :: String -> Int -> Prod r String Char Int
 9 | numeral str n = n <$ list str
10 | 
11 | romanNumeralsGrammar :: Grammar r (Prod r String Char Int)
12 | romanNumeralsGrammar = mdo
13 | 
14 |   thousands <- rule
15 |     $ pure 0
16 |     <|> numeral "M" 1000 <**> fmap (+) thousands
17 | 
18 |   le300 <- rule
19 |     $ pure 0
20 |     <|> numeral "C" 100
21 |     <|> numeral "CC" 200
22 |     <|> numeral "CCC" 300
23 | 
24 |   hundreds <- rule
25 |     $ le300
26 |     <|> numeral "CD" 400
27 |     <|> numeral "D" 500 <**> fmap (+) le300
28 |     <|> numeral "CM" 900
29 | 
30 |   le30 <- rule
31 |     $ pure 0
32 |     <|> numeral "X" 10
33 |     <|> numeral "XX" 20
34 |     <|> numeral "XXX" 30
35 | 
36 |   tens <- rule 
37 |     $ le30
38 |     <|> numeral "XL" 40
39 |     <|> numeral "L" 50 <**> fmap (+) le30
40 |     <|> numeral "XC" 90
41 | 
42 |   le3 <- rule
43 |     $ pure 0
44 |     <|> numeral "I" 1
45 |     <|> numeral "II" 2
46 |     <|> numeral "III" 3
47 | 
48 |   units <- rule
49 |     $ le3
50 |     <|> numeral "IV" 4
51 |     <|> numeral "V" 5 <**> fmap (+) le3
52 |     <|> numeral "IX" 9
53 | 
54 |   return
55 |     $ thousands
56 |     <**> fmap (+) hundreds
57 |     <**> fmap (+) tens
58 |     <**> fmap (+) units
59 | 
60 | 
61 | main :: IO ()
62 | main = do
63 |   x:_ <- getArgs
64 |   print $ fullParses (parser romanNumeralsGrammar) x
65 | 


--------------------------------------------------------------------------------
/Text/Earley/Derived.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE Rank2Types #-}
 2 | -- | Derived operators.
 3 | module Text.Earley.Derived where
 4 | import Control.Applicative hiding (many)
 5 | import Control.Monad (guard)
 6 | import Data.ListLike(ListLike)
 7 | import qualified Data.ListLike as ListLike
 8 | 
 9 | import Text.Earley.Grammar
10 | import Text.Earley.Parser
11 | 
12 | -- | Match a token that satisfies the given predicate. Returns the matched
13 | -- token. See also 'terminal'.
14 | {-# INLINE satisfy #-}
15 | satisfy :: (t -> Bool) -> Prod r e t t
16 | satisfy p = terminal ((<$) <*> guard . p)
17 | 
18 | -- | Match a single token.
19 | token :: Eq t => t -> Prod r e t t
20 | token x = satisfy (== x)
21 | 
22 | -- | Match a single token and give it the name of the token.
23 | namedToken :: Eq t => t -> Prod r t t t
24 | namedToken x = token x <?> x
25 | 
26 | -- | Match a single token with any value
27 | anyToken :: Prod r e t t
28 | anyToken = terminal Just
29 | 
30 | -- | Match a list of tokens in sequence.
31 | {-# INLINE list #-}
32 | list :: Eq t => [t] -> Prod r e t [t]
33 | list = listLike
34 | 
35 | -- | Match a 'ListLike' of tokens in sequence.
36 | {-# INLINE listLike #-}
37 | listLike :: (Eq t, ListLike i t) => i -> Prod r e t i
38 | listLike = ListLike.foldr (liftA2 ListLike.cons . satisfy . (==)) (pure ListLike.empty)
39 | 
40 | -- | Whether or not the grammar matches the input string. Equivalently,
41 | -- whether the given input is in the language described by the grammars.
42 | matches :: ListLike i t => (forall r. Grammar r (Prod r e t a)) -> i -> Bool
43 | matches grammar = not . null . fst . fullParses (parser grammar)
44 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014-2019, Olle Fredriksson
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of Olle Fredriksson nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/examples/English.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo #-}
 2 | import Control.Applicative
 3 | import Text.Earley
 4 | import qualified Data.HashSet as HS
 5 | 
 6 | type Noun      = String
 7 | type Verb      = String
 8 | type Adjective = String
 9 | 
10 | nouns, verbs, adjectives :: HS.HashSet String
11 | nouns      = HS.fromList ["parsers", "sentences", "grammars"]
12 | verbs      = HS.fromList ["parse", "munch", "annihilate", "confuse", "use"]
13 | adjectives = HS.fromList ["many", "great", "long", "confusing"]
14 | 
15 | 
16 | data Sentence = Sentence NounPhrase VerbPhrase
17 |   deriving Show
18 | data NounPhrase = NounPhrase Adjective NounPhrase
19 |                 | Noun Noun
20 |   deriving Show
21 | data VerbPhrase = VerbPhrase Verb NounPhrase
22 |                 | Verb Verb
23 |   deriving Show
24 | 
25 | sentence :: Grammar r (Prod r String String Sentence)
26 | sentence = mdo
27 |   noun       <- rule $ satisfy (`HS.member` nouns)      <?> "noun"
28 |   verb       <- rule $ satisfy (`HS.member` verbs)      <?> "verb"
29 |   adjective  <- rule $ satisfy (`HS.member` adjectives) <?> "adjective"
30 |   nounPhrase <- rule $  NounPhrase <$> adjective <*> nounPhrase
31 |                     <|> Noun <$> noun
32 |                     <?> "noun phrase"
33 |   verbPhrase <- rule $  VerbPhrase <$> verb <*> nounPhrase
34 |                     <|> Verb <$> verb
35 |                     <?> "verb phrase"
36 |   return $ Sentence <$> nounPhrase <*> verbPhrase <?> "sentence"
37 | 
38 | main :: IO ()
39 | main = do
40 |   let p = fullParses (parser sentence) . words
41 |   print $ p "parsers use grammars"
42 |   print $ p "parsers munch long sentences"
43 |   print $ p "many great sentences confuse parsers"
44 |   print $ p "parsers use use"
45 |   print $ p "grammars many great confusing"
46 | 


--------------------------------------------------------------------------------
/tests/Constraint.hs:
--------------------------------------------------------------------------------
 1 | module Constraint where
 2 | import Control.Applicative
 3 | import Data.List
 4 | import Data.Set(fromList)
 5 | 
 6 | import Test.Tasty
 7 | import Test.Tasty.HUnit as HU
 8 | 
 9 | import Text.Earley
10 | 
11 | oneToken :: Grammar r (Prod r () t t)
12 | oneToken = rule anyToken
13 | 
14 | someTokens :: Grammar r (Prod r () t [t])
15 | someTokens = rule (some anyToken)
16 | 
17 | tests :: TestTree
18 | tests = testGroup "New features"
19 |   [ HU.testCase "anyToken1" $
20 |       let input = "hello"
21 |           l     = length input in
22 |       allParses (parser oneToken) input
23 |       @?= (,) [('h', 1)] Report { position   = 1
24 |                                 , expected   = []
25 |                                 , unconsumed = drop 1 input
26 |                                 }
27 |   , HU.testCase "anyToken2" $
28 |       allParses (parser oneToken) ""
29 |       @?= (,) [] Report { position = 0
30 |                         , expected   = []
31 |                         , unconsumed = ""
32 |                         }
33 |   , HU.testCase "anyToken3" $
34 |       let input = "hello"
35 |           l     = length input in
36 |       allParses (parser someTokens) input
37 |       @?= (,) [(init, length init) | init <- inits input, not (null init) ]
38 |               Report { position   = l
39 |                      , expected   = []
40 |                      , unconsumed = []
41 |                      }
42 |   , HU.testCase "constraint" $
43 |       matches noRepeats "salut"
44 |       @?= True
45 |   , HU.testCase "constraint2" $
46 |       matches noRepeats "hello"
47 |       @?= False
48 |   , HU.testCase "constraint3" $
49 |       fromList (map fst $ exactly 2 $ generator noRepeats "abcd")
50 |       @?= fromList [[x, y] | x <- "abcd", y <- "abcd", x /= y]
51 |   ]
52 | 
53 | noRepeats = rule $
54 |   constraint (\x -> length x == length (fromList x)) $
55 |     many anyToken
56 | 


--------------------------------------------------------------------------------
/tests/Issue14.hs:
--------------------------------------------------------------------------------
 1 | module Issue14 where
 2 | import Control.Applicative
 3 | import Test.Tasty
 4 | import Test.Tasty.QuickCheck as QC
 5 | 
 6 | import Text.Earley
 7 | 
 8 | tests :: TestTree
 9 | tests = testGroup "Issue 14"
10 |   [ QC.testProperty "The same rule in alternatives gives many results" $
11 |     \x -> fullParses (parser (issue14 x)) ""
12 |     == (,) (replicate (issue14Length x) ())
13 |            Report { position = 0, expected = [], unconsumed = [] }
14 |   , QC.testProperty "The same rule in alternatives generates many results" $
15 |     \x -> language (generator (issue14 x) "")
16 |     == replicate (issue14Length x) ((), "")
17 |   ]
18 | 
19 | data Issue14 a
20 |   = Pure a
21 |   | Alt (Issue14 a) (Issue14 a)
22 |   | Ap (Issue14 a) (Issue14 a)
23 |   deriving (Eq, Ord, Show)
24 | 
25 | instance Arbitrary a => Arbitrary (Issue14 a) where
26 |   arbitrary = sized arbTree
27 |     where arbTree n | n > 0  = oneof [ Pure <$> arbitrary
28 |                                      , Alt <$> arbTree1 <*> arbTree1
29 |                                      , Ap <$> arbTree1 <*> arbTree1
30 |                                      ]
31 |                                      where arbTree1 = arbTree (n `div` 2)
32 |           arbTree _          = Pure <$> arbitrary
33 | 
34 |   shrink (Pure a)  = Pure <$> shrink a
35 |   shrink (Alt a b) = a : b : [Alt a' b | a' <- shrink a] ++ [Alt a b' | b' <- shrink b]
36 |   shrink (Ap a b)  = a : b : [Ap a' b | a' <- shrink a] ++ [Ap a b' | b' <- shrink b]
37 | 
38 | issue14Length :: Issue14 () -> Int
39 | issue14Length (Pure ()) = 1
40 | issue14Length (Alt a b) = issue14Length a + issue14Length b
41 | issue14Length (Ap a b)  = issue14Length a * issue14Length b
42 | 
43 | issue14 :: Issue14 () -> Grammar r (Prod r () Char ())
44 | issue14 tree = do
45 |   emptyRule <- rule $ pure ()
46 |   let x = go emptyRule tree
47 |   return x
48 |   where
49 |     go x (Pure ())   = x
50 |     go x (Alt b1 b2) = go x b1 <|> go x b2
51 |     go x (Ap b1 b2)  = go x b1 <* go x b2
52 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Unreleased
 2 | 
 3 | - Add `satisfyMaybe` for matching tokens with a predicate that returns a `Maybe` result
 4 | 
 5 | # 0.13.0.1
 6 | 
 7 | - Add a missing test module to the Cabal file
 8 | 
 9 | # 0.13.0.0
10 | 
11 | - Remove the previously deprecated operators `symbol`, `namedSymbol`, and `word`
12 | - Change `Prod`'s `Monoid` and `Semigroup` instances to lift their element instances instead of being the same as the `Alternative` instance
13 | - Add unbalanced parentheses/EOF test
14 | 
15 | # 0.12.1.0
16 | 
17 | - GHC 8.4.1 support
18 | - Update 'base' dependency bounds
19 | - Add `Semigroup` instance to the `Prod` type
20 | 
21 | # 0.12.0.1
22 | 
23 | - Update 'base' dependency bounds
24 | 
25 | # 0.12.0.0
26 | 
27 | - Add the `Generator` module for generating grammar members
28 | - Change (simplify) the type returned by `parser`, introducing  a `Parser` type synonym for it, and change the signature of `allParses`, `fullParses`, and `report` to accept a `Parser`
29 | - The `Text.Earley.Internal` module is now `Text.Earley.Parser.Internal`
30 | 
31 | # 0.11.0.1
32 | 
33 | - Add missing modules to Cabal file
34 | 
35 | # 0.11.0.0
36 | 
37 | - Add `IsString Prod` instance
38 | - Change the signature of `Terminal` to take a function `a -> Maybe b`, and add a new operator `terminal`
39 | - Move `satisfy` to the `Derived` module
40 | - Add the `token`, `namedToken`, and `list` operators
41 | - Deprecate the `symbol`, `namedSymbol`, and `word` operators (use the above instead)
42 | - Add the `listLike` operator
43 | 
44 | # 0.10.1.0
45 | 
46 | - Fix bug concerning nullable rules (#14)
47 | - Add `runGrammar`
48 | 
49 | # 0.10.0.1
50 | 
51 | - Add changelog
52 | 
53 | # 0.10
54 | 
55 | - Remove `Args`, and use `Results` instead
56 | - Make `parser` function not take input directly
57 | - Remove redundant type parameter to `Grammar`
58 | 
59 | # 0.9
60 | 
61 | - Optimise handling of nullable non-terminals
62 | - Pass a record of arguments in the parse routine
63 | - Add support for consecutive mixfix holes
64 | 


--------------------------------------------------------------------------------
/tests/Mixfix.hs:
--------------------------------------------------------------------------------
 1 | module Mixfix where
 2 | import Control.Applicative
 3 | import Test.Tasty
 4 | import Test.Tasty.HUnit as HU
 5 | 
 6 | import Text.Earley
 7 | import Text.Earley.Mixfix
 8 | 
 9 | tests :: TestTree
10 | tests = testGroup "Mixfix"
11 |   [ HU.testCase "1" $
12 |       let x = Ident [Just "x"] in
13 |       fullParses (parser mixfixGrammar) (words "if x then x else x")
14 |       @?= (,) [App ifthenelse [x, x, x]] Report {position = 6, expected = [], unconsumed = []}
15 |   , HU.testCase "2" $
16 |       let x = Ident [Just "x"] in
17 |       fullParses (parser mixfixGrammar) (words "prefix x postfix")
18 |       @?= (,) [App prefix [App postfix [x]]] Report {position = 3, expected = [], unconsumed = []}
19 |   , HU.testCase "3" $
20 |       let x = Ident [Just "x"] in
21 |       fullParses (parser mixfixGrammar) (words "x infix1 x infix2 x")
22 |       @?= (,) [App infix1 [x, App infix2 [x, x]]] Report {position = 5, expected = [], unconsumed = []}
23 |   , HU.testCase "4" $
24 |       let x = Ident [Just "x"] in
25 |       fullParses (parser mixfixGrammar) (words "[ x ]")
26 |       @?= (,) [App closed [x]] Report {position = 3, expected = [], unconsumed = []}
27 |   ]
28 | 
29 | data MixfixExpr = Ident (Holey String) | App (Holey String) [MixfixExpr]
30 |   deriving (Eq, Show)
31 | 
32 | mixfixGrammar :: Grammar r (Prod r String String MixfixExpr)
33 | mixfixGrammar = mixfixExpression table
34 |                                  (Ident . pure . Just <$> namedToken "x")
35 |                                  App
36 |   where
37 |     hident = map (fmap token)
38 |     table =
39 |       [ [(hident ifthenelse, RightAssoc)]
40 |       , [(hident prefix, RightAssoc)]
41 |       , [(hident postfix, LeftAssoc)]
42 |       , [(hident infix1, LeftAssoc)]
43 |       , [(hident infix2, RightAssoc)]
44 |       , [(hident closed, NonAssoc)]
45 |       ]
46 | 
47 | ifthenelse, prefix, postfix, infix1, infix2, closed :: Holey String
48 | ifthenelse = [Just "if", Nothing, Just "then", Nothing, Just "else", Nothing]
49 | prefix = [Just "prefix", Nothing]
50 | postfix = [Nothing, Just "postfix"]
51 | infix1 = [Nothing, Just "infix1", Nothing]
52 | infix2 = [Nothing, Just "infix2", Nothing]
53 | closed = [Just "[", Nothing, Just "]"]
54 | 


--------------------------------------------------------------------------------
/tests/UnbalancedPars.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE FlexibleContexts, RankNTypes, RecursiveDo, ScopedTypeVariables #-}
 2 | module UnbalancedPars where
 3 | 
 4 | import Data.Char (isAlpha)
 5 | 
 6 | import Control.Applicative
 7 | import Test.Tasty
 8 | import Test.Tasty.HUnit      as HU
 9 | 
10 | import Text.Earley
11 | 
12 | tests :: TestTree
13 | tests = testGroup "Unbalanced parentheses"
14 |   [ HU.testCase "Parses balanced" $
15 |       fst (fullParses' unbalancedPars
16 |         "((x))") @?= [(b . b) x]
17 |   , HU.testCase "Parses one unbalanced" $
18 |       fst (fullParses' unbalancedPars
19 |         "((x)") @?= [(u . b) x]
20 |   , HU.testCase "Parses two unbalanced" $
21 |       fst (fullParses' unbalancedPars
22 |         "((x") @?= [(u . u) x]
23 |   ]
24 |   where
25 |     -- [b]alanced
26 |     b :: Expr -> Expr
27 |     b e = ExprInBrackets "(" e ")"
28 | 
29 |     -- [u]nbalanced
30 |     u :: Expr -> Expr
31 |     u e = ExprInBrackets "(" e ""
32 | 
33 |     -- [x] variable
34 |     x :: Expr
35 |     x = Var 'x'
36 | 
37 | data Token = EOF | Char !Char
38 |   deriving (Eq, Ord, Show)
39 | 
40 | fullParses'
41 |   :: (forall r. Grammar r (Prod r e Token a))
42 |   -> String
43 |   -> ([a], Report e String)
44 | fullParses' g s =
45 |   let (res, rep) = allParses (parser $ (<* eof) <$> g) $ fmap Char s ++ repeat EOF
46 |   in
47 |     ( fst <$> res
48 |     , rep { unconsumed = go $ unconsumed rep }
49 |     )
50 |   where
51 |     go (Char c:xs) = c : go xs
52 |     go _ = []
53 | 
54 | data Expr =
55 |   Var Char | ExprInBrackets String Expr String
56 |   deriving (Eq, Ord, Show)
57 | 
58 | eof :: Prod r e Token Token
59 | eof = token EOF
60 | 
61 | leftPar :: Prod r e Token String
62 | leftPar = "(" <$ token (Char '(')
63 | 
64 | rightPar :: Prod r e Token String
65 | rightPar = ")" <$ token (Char ')')
66 | 
67 | var :: Prod r e Token Expr
68 | var = terminal $ \t -> case t of
69 |   Char c | isAlpha c -> Just $ Var c
70 |   _ -> Nothing
71 | 
72 | unbalancedPars :: Grammar r (Prod r String Token Expr)
73 | unbalancedPars = mdo
74 |   expr <- rule $ var <|> exprInBrackets
75 |   exprInBrackets <- rule $
76 |     ExprInBrackets
77 |       <$> leftPar
78 |       <*> expr
79 |       <*> (rightPar <|> ("" <$ eof))
80 |       <?> "parenthesized expression"
81 |   return expr
82 | 


--------------------------------------------------------------------------------
/examples/Mixfix.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE CPP, RecursiveDo #-}
 2 | import Control.Applicative
 3 | import Control.Arrow(first)
 4 | import Data.Maybe
 5 | #if !MIN_VERSION_base(4,8,0)
 6 | import Data.Monoid
 7 | #endif
 8 | import System.Environment
 9 | import Text.Earley
10 | import Text.Earley.Mixfix
11 | import qualified Data.HashSet as HS
12 | 
13 | holey :: String -> Holey String
14 | holey ""       = []
15 | holey ('_':xs) = Nothing : holey xs
16 | holey xs       = Just i : holey rest
17 |   where (i, rest) = span (/= '_') xs
18 | 
19 | data Expr = V (Holey String) | App Expr [Expr]
20 |   deriving Show
21 | 
22 | identTable :: [[(Holey String, Associativity)]]
23 | identTable = (map . map) (first holey)
24 |   [ [("_->_",          RightAssoc)]
25 |   , [("_,_",           NonAssoc)]
26 |   , [("if_then_else_", RightAssoc)]
27 |   , [("_|-_:_",        NonAssoc)]
28 |   , [("_+_",           LeftAssoc)]
29 |   , [("_*_",           LeftAssoc)]
30 |   ]
31 | 
32 | grammar :: Grammar r (Prod r String String Expr)
33 | grammar = mdo
34 |   ident     <- rule $ (V . pure . Just) <$> satisfy (not . (`HS.member` mixfixParts))
35 |                    <?> "identifier"
36 |   atom      <- rule $ ident
37 |                    <|> namedToken "(" *> expr <* namedToken ")"
38 |   normalApp <- rule $ atom
39 |                    <|> App <$> atom <*> some atom
40 |   expr      <- mixfixExpression table normalApp (App . V)
41 |   return expr
42 |   where
43 |     table = map (map $ first $ map $ fmap namedToken) identTable
44 |     mixfixParts = HS.fromList [s | xs <- identTable , (ys, _) <- xs
45 |                                  , Just s <- ys]
46 |                `mappend` HS.fromList ["(", ")"]
47 | 
48 | pretty :: Expr -> String
49 | pretty (V ps) = concatMap (fromMaybe "_") ps
50 | pretty (App e es) = "(" ++ pretty e ++ " " ++ unwords (map pretty es) ++ ")"
51 | 
52 | tokenize :: String -> [String]
53 | tokenize ""        = []
54 | tokenize (' ':xs)  = tokenize xs
55 | tokenize ('\n':xs) = tokenize xs
56 | tokenize (x:xs)
57 |   | x `HS.member` special = [x] : tokenize xs
58 |   | otherwise             = (x:as) : tokenize bs
59 |   where
60 |     (as, bs) = break (`HS.member` special) xs
61 |     special = HS.fromList "(), \n"
62 | 
63 | main :: IO ()
64 | main = do
65 |   x:_ <- getArgs
66 |   print $ first (map pretty) $ fullParses (parser grammar) $ tokenize x
67 | 


--------------------------------------------------------------------------------
/tests/Optional.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo, ScopedTypeVariables #-}
 2 | module Optional where
 3 | import Control.Applicative
 4 | import Test.Tasty
 5 | import Test.Tasty.HUnit as HU
 6 | 
 7 | import Text.Earley
 8 | 
 9 | tests :: TestTree
10 | tests = testGroup "Optional"
11 |   [ HU.testCase "Nothing" $
12 |       fullParses (parser $ return optional_) "b"
13 |       @?= (,) [(Nothing, 'b')] Report {position = 1, expected = "", unconsumed = ""}
14 |   , HU.testCase "Just" $
15 |       fullParses (parser $ return optional_) "ab"
16 |       @?= (,) [(Just 'a', 'b')] Report {position = 2, expected = "", unconsumed = ""}
17 |   , HU.testCase "Using rules Nothing" $
18 |       fullParses (parser optionalRule) "b"
19 |       @?= (,) [(Nothing, 'b')] Report {position = 1, expected = "", unconsumed = ""}
20 |   , HU.testCase "Using rules Just" $
21 |       fullParses (parser optionalRule) "ab"
22 |       @?= (,) [(Just 'a', 'b')] Report {position = 2, expected = "", unconsumed = ""}
23 |   , HU.testCase "Without continuation Nothing" $
24 |       fullParses (parser $ return $ optional $ namedToken 'a') ""
25 |       @?= (,) [Nothing] Report {position = 0, expected = "a", unconsumed = ""}
26 |   , HU.testCase "Without continuation Just" $
27 |       fullParses (parser $ return $ optional $ namedToken 'a') "a"
28 |       @?= (,) [Just 'a'] Report {position = 1, expected = "", unconsumed = ""}
29 |   , HU.testCase "Using rules without continuation Nothing" $
30 |       fullParses (parser $ rule $ optional $ namedToken 'a') ""
31 |       @?= (,) [Nothing] Report {position = 0, expected = "a", unconsumed = ""}
32 |   , HU.testCase "Using rules without continuation Just" $
33 |       fullParses (parser $ rule $ optional $ namedToken 'a') "a"
34 |       @?= (,) [Just 'a'] Report {position = 1, expected = "", unconsumed = ""}
35 |   , HU.testCase "Generate optional" $
36 |       language (generator (return optional_) "ab")
37 |       @?= [((Nothing, 'b'), "b"), ((Just 'a', 'b'), "ab")]
38 |   , HU.testCase "Generate optional using rules" $
39 |       language (generator optionalRule "ab")
40 |       @?= [((Nothing, 'b'), "b"), ((Just 'a', 'b'), "ab")]
41 |   , HU.testCase "Generate optional without continuation" $
42 |       language (generator (return $ optional $ namedToken 'a') "ab")
43 |       @?= [(Nothing, ""), (Just 'a', "a")]
44 |   , HU.testCase "Generate optional using rules without continuation" $
45 |       language (generator (rule $ optional $ namedToken 'a') "ab")
46 |       @?= [(Nothing, ""), (Just 'a', "a")]
47 |   ]
48 | 
49 | optional_ :: Prod r Char Char (Maybe Char, Char)
50 | optional_ = (,) <$> optional (namedToken 'a') <*> namedToken 'b'
51 | 
52 | optionalRule :: Grammar r (Prod r Char Char (Maybe Char, Char))
53 | optionalRule = mdo
54 |   test <- rule $ (,) <$> optional (namedToken 'a') <*> namedToken 'b'
55 |   return test
56 | 


--------------------------------------------------------------------------------
/tests/Issue11.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecursiveDo #-}
 2 | module Issue11 where
 3 | import Control.Applicative
 4 | import Test.Tasty
 5 | import Test.Tasty.HUnit as HU
 6 | 
 7 | import Text.Earley
 8 | import Text.Earley.Mixfix
 9 | 
10 | tests :: TestTree
11 | tests = testGroup "Issue 11"
12 |   [ let x = words "+ + 5 6 7" in
13 |     HU.testCase "1" $
14 |     fullParses (parser $ grammar LeftAssoc) x
15 |     @?= (,) [] Report {position = 1, expected = [], unconsumed = drop 1 x}
16 |   , let x = words "+ 5 + 6 7" in
17 |     HU.testCase "2" $
18 |     fullParses (parser $ grammar LeftAssoc) x
19 |     @?= (,) [] Report {position = 2, expected = [], unconsumed = drop 2 x}
20 |   , let x = words "+ 5 6" in
21 |     HU.testCase "3" $
22 |     fullParses (parser $ grammar LeftAssoc) x
23 |     @?= (,) [Plus (Var "5") (Var "6")]
24 |             Report {position = 3, expected = [], unconsumed = []}
25 |   , let x = words "+ + 5 6 7" in
26 |     HU.testCase "4" $
27 |     fullParses (parser $ grammar RightAssoc) x
28 |     @?= (,) [Plus (Plus (Var "5") (Var "6")) (Var "7")]
29 |             Report {position = 5, expected = [], unconsumed = []}
30 |   , let x = words "+ 5 + 6 7" in
31 |     HU.testCase "5" $
32 |     fullParses (parser $ grammar RightAssoc) x
33 |     @?= (,) [Plus (Var "5") (Plus (Var "6") (Var "7"))]
34 |             Report {position = 5, expected = [], unconsumed = []}
35 |   , let x = words "+ 5 6" in
36 |     HU.testCase "6" $
37 |     fullParses (parser $ grammar RightAssoc) x
38 |     @?= (,) [Plus (Var "5") (Var "6")]
39 |             Report {position = 3, expected = [], unconsumed = []}
40 |   , let x = words "+ + 5 6 7" in
41 |     HU.testCase "7" $
42 |     fullParses (parser $ grammar NonAssoc) x
43 |     @?= (,) [Plus (Plus (Var "5") (Var "6")) (Var "7")]
44 |             Report {position = 5, expected = [], unconsumed = []}
45 |   , let x = words "+ 5 + 6 7" in
46 |     HU.testCase "8" $
47 |     fullParses (parser $ grammar NonAssoc) x
48 |     @?= (,) [Plus (Var "5") (Plus (Var "6") (Var "7"))]
49 |             Report {position = 5, expected = [], unconsumed = []}
50 |   , let x = words "+ 5 6" in
51 |     HU.testCase "9" $
52 |     fullParses (parser $ grammar NonAssoc) x
53 |     @?= (,) [Plus (Var "5") (Var "6")]
54 |             Report {position = 3, expected = [], unconsumed = []}
55 |   ]
56 | 
57 | data AST
58 |   = Var String
59 |   | Plus AST AST
60 |   deriving (Eq, Ord, Show)
61 | 
62 | grammar :: Associativity -> Grammar r (Prod r String String AST)
63 | grammar a = mdo
64 |     atomicExpr <- rule $ Var <$> satisfy (/= "+")
65 | 
66 |     expr <- mixfixExpression
67 |                [[([Just (token "+"), Nothing, Nothing], a)]]
68 |                atomicExpr
69 |                (\x y -> case (x,y) of
70 |                   ([Just "+", Nothing, Nothing], [e1,e2]) -> Plus e1 e2
71 |                   _ -> undefined)
72 | 
73 |     return expr
74 | 


--------------------------------------------------------------------------------
/tests/Lambda.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE RecursiveDo #-}
  2 | module Lambda where
  3 | import Control.Applicative
  4 | import Data.List as List
  5 | import Data.Foldable
  6 | import Test.Tasty
  7 | import Test.Tasty.HUnit as HU
  8 | import Test.Tasty.QuickCheck as QC
  9 | 
 10 | import Text.Earley
 11 | 
 12 | import qualified Arbitrary
 13 | 
 14 | tests :: TestTree
 15 | tests = testGroup "Lambda"
 16 |   [ HU.testCase "Generate exactly 0" $
 17 |       exactly 0 gen @?= []
 18 |   , HU.testCase "Generate upTo 0" $
 19 |       upTo 0 gen @?= []
 20 |   , HU.testCase "Generate exactly 4" $
 21 |       sort (snd <$> exactly 4 gen)
 22 |       @?=
 23 |       ["(a)a","(a)b","(aa)","(ab)","(b)a","(b)b","(ba)","(bb)"
 24 |       ,"\\a.a","\\a.b","\\b.a","\\b.b","a(a)","a(b)","a+aa","a+ab"
 25 |       ,"a+ba","a+bb","aa+a","aa+b","aaaa","aaab","aaba","aabb"
 26 |       ,"ab+a","ab+b","abaa","abab","abba","abbb","b(a)","b(b)"
 27 |       ,"b+aa","b+ab","b+ba","b+bb","ba+a","ba+b","baaa","baab"
 28 |       ,"baba","babb","bb+a","bb+b","bbaa","bbab","bbba","bbbb"
 29 |       ]
 30 |   , HU.testCase "upTo contains exactly" $ List.and (do
 31 |       m <- [0..5]
 32 |       let ys = snd <$> upTo m gen
 33 |       n <- [0..m]
 34 |       (_, x) <- upTo n gen
 35 |       return $ x `List.elem` ys)
 36 |     @? "exactly contains upTo"
 37 |   , HU.testCase "language contains upTo" $ do
 38 |     let ys = snd <$> language gen
 39 |     List.and (do
 40 |       n <- [0..5]
 41 |       (_, x) <- upTo n gen
 42 |       return $ x `List.elem` ys)
 43 |     @? "exactly contains upTo"
 44 |   , QC.testProperty "Arbitrary" $ do
 45 |     let p = parser grammar
 46 |     (e, s) <- Arbitrary.arbitrary $ generator grammar tokens
 47 |     return
 48 |       $ [e] === fst (fullParses p $ prettyExpr 0 e)
 49 |       .&&. [e] === fst (fullParses p s)
 50 |   ]
 51 |   where
 52 |     gen = generator grammar tokens
 53 | 
 54 | data Expr
 55 |   = Var Char
 56 |   | Lam String Expr
 57 |   | App Expr Expr
 58 |   | Add Expr Expr
 59 |   deriving (Eq, Ord, Show)
 60 | 
 61 | prettyExpr :: Int -> Expr -> String
 62 | prettyExpr _ (Var c) = [c]
 63 | prettyExpr d (Lam xs e) = prettyParens (d > 0) $ "\\" ++ xs ++ "." ++ prettyExpr d e
 64 | prettyExpr d (Add a b) = prettyParens (d > 1) $ prettyExpr 2 a ++ "+" ++ prettyExpr 1 b
 65 | prettyExpr d (App a b) = prettyParens (d > 3) $ prettyExpr 3 a ++ prettyExpr 4 b
 66 | 
 67 | prettyParens :: Bool -> String -> String
 68 | prettyParens True s  = "(" ++ s ++ ")"
 69 | prettyParens False s = s
 70 | 
 71 | tokens :: String
 72 | tokens = "(\\ab.+*)"
 73 | 
 74 | instance Arbitrary Expr where
 75 |   arbitrary = sized go
 76 |     where
 77 |       var = elements "ab"
 78 |       go 0 = Var <$> var
 79 |       go n = oneof
 80 |         [ Var <$> var
 81 |         , Lam <$> (take 2 <$> listOf1 var) <*> go'
 82 |         , App <$> go' <*> go'
 83 |         , Add <$> go' <*> go'
 84 |         ]
 85 |         where
 86 |           go' = go (n `div` 10)
 87 | 
 88 |   shrink (Var _) = []
 89 |   shrink (Lam xs e) = e : [Lam xs' e' | xs' <- shrink xs, not (null xs), e' <- shrink e]
 90 |   shrink (App a b) = a : b : [App a' b' | a' <- shrink a, b' <- shrink b]
 91 |   shrink (Add a b) = a : b : [Add a' b' | a' <- shrink a, b' <- shrink b]
 92 | 
 93 | grammar :: Grammar r (Prod r String Char Expr)
 94 | grammar = mdo
 95 |   let v = asum (token <$> "ab")
 96 |         <?> "variable"
 97 |   x1 <- rule
 98 |     $ Lam <$ token '\\' <*> some v <* token '.' <*> x1
 99 |     <|> x2
100 |     <?> "lambda"
101 |   x2 <- rule
102 |     $ Add <$> x3 <* token '+' <*> x2
103 |     <|> x3
104 |     <?> "sum"
105 |   x3 <- rule
106 |     $ App <$> x3 <*> x4
107 |     <|> x4
108 |     <?> "application"
109 |   let x4 = Var <$> v
110 |         <|> token '(' *> x1 <* token ')'
111 |   return x1
112 | 


--------------------------------------------------------------------------------
/bench/BenchAll.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE RecursiveDo, FlexibleContexts #-}
  2 | module Main where
  3 | 
  4 | import Control.Applicative
  5 | import Control.Exception
  6 | import Control.DeepSeq
  7 | import Criterion.Main
  8 | import Data.Char
  9 | import Data.Maybe
 10 | import Text.Earley
 11 | import qualified Text.Parsec as Parsec
 12 | import qualified Text.Parsec.Pos as Parsec
 13 | 
 14 | data Expr
 15 |   = Add Expr Expr
 16 |   | Mul Expr Expr
 17 |   | Var String
 18 |   deriving (Eq, Ord, Show)
 19 | 
 20 | instance NFData Expr where
 21 |   rnf (Add a b) = rnf a `seq` rnf b
 22 |   rnf (Mul a b) = rnf a `seq` rnf b
 23 |   rnf (Var s)   = rnf s
 24 | 
 25 | type Token = String
 26 | 
 27 | tokenParens :: Bool -> [Token] -> [Token]
 28 | tokenParens True s  = ["("] ++ s ++ [")"]
 29 | tokenParens False s = s
 30 | 
 31 | tokenExpr :: Int -> Expr -> [Token]
 32 | tokenExpr _ (Var s) = [s]
 33 | tokenExpr d (Add a b) = tokenParens (d > 0) $ tokenExpr 0 a ++ ["+"] ++ tokenExpr 1 b
 34 | tokenExpr d (Mul a b) = tokenParens (d > 1) $ tokenExpr 1 a ++ ["*"] ++ tokenExpr 2 b
 35 | 
 36 | linearSum :: Int -> Expr
 37 | linearSum 1 = Var "x"
 38 | linearSum n = Add (linearSum $ n - 1) (Var "x")
 39 | 
 40 | treeSum :: Int -> Expr
 41 | treeSum 1 = Var "x"
 42 | treeSum n = let a = n `div` 2 -- will be at least 1
 43 |                 b = n - a
 44 |             in Add (treeSum a) (treeSum b)
 45 | 
 46 | -- Earley parser
 47 | 
 48 | expr :: Grammar r (Prod r String Token Expr)
 49 | expr = mdo
 50 |   x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2
 51 |             <|> x2
 52 |             <?> "sum"
 53 |   x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3
 54 |             <|> x3
 55 |             <?> "product"
 56 |   x3 <- rule $ Var <$> (satisfy isIdent <?> "identifier")
 57 |             <|> namedToken "(" *> x1 <* namedToken ")"
 58 |   return x1
 59 | 
 60 | isIdent :: String -> Bool
 61 | isIdent (x:_) = isAlpha x
 62 | isIdent _     = False
 63 | 
 64 | sepBy1 :: Prod r e t a -> Prod r e t op -> Grammar r (Prod r e t [a])
 65 | sepBy1 p op = mdo
 66 |   ops <- rule $ pure [] <|> (:) <$ op <*> p <*> ops
 67 |   rule $ (:) <$> p <*> ops
 68 | 
 69 | expr' :: Grammar r (Prod r String Token Expr)
 70 | expr' = mdo
 71 |   let var = Var <$> satisfy isIdent <|> token "(" *> mul <* token ")"
 72 |   mul <- fmap (foldl1 Mul) <$> add `sepBy1` token "*"
 73 |   add <- fmap (foldl1 Add) <$> var `sepBy1` token "+"
 74 |   return mul
 75 | 
 76 | parseEarley :: [Token] -> Maybe Expr
 77 | parseEarley input = listToMaybe (fst (fullParses (parser expr) input))
 78 | 
 79 | parseEarley' :: [Token] -> Maybe Expr
 80 | parseEarley' input = listToMaybe (fst (fullParses (parser expr') input))
 81 | 
 82 | -- Parsec parsec
 83 | 
 84 | type Parsec = Parsec.Parsec [Token] ()
 85 | 
 86 | parsecExpr :: Parsec Expr
 87 | parsecExpr = mul
 88 |   where mul   = foldl1 Mul <$> add `Parsec.sepBy1` t "*"
 89 |         add   = foldl1 Add <$> var `Parsec.sepBy1` t "+"
 90 |         ident = Parsec.token id pos $ \y -> if isIdent y then Just (Var y) else Nothing
 91 |         var   = ident <|> (t "(" *> mul <* t ")")
 92 |         t x   = Parsec.token id pos $ \y -> if x == y then Just x else Nothing
 93 |         pos   = const (Parsec.initialPos "")
 94 | 
 95 | parseParsec :: [Token] -> Maybe Expr
 96 | parseParsec =  either (const Nothing) Just . Parsec.parse parsecExpr ""
 97 | 
 98 | -- Our benchmark harness.
 99 | 
100 | linearInput :: Int -> (String, [Token])
101 | linearInput size = (show size, tokenExpr 0 $ linearSum size)
102 | 
103 | treeInput :: Int -> (String, [Token])
104 | treeInput size = (show size, tokenExpr 0 $ treeSum size)
105 | 
106 | inputBench :: (String, [Token]) -> Benchmark
107 | inputBench (name, input) = bench name $ nf id input
108 | 
109 | earleyBench :: (String, [Token]) -> Benchmark
110 | earleyBench (name, input) = bench name $ nf parseEarley input
111 | 
112 | earley'Bench :: (String, [Token]) -> Benchmark
113 | earley'Bench (name, input) = bench name $ nf parseEarley' input
114 | 
115 | parsecBench :: (String, [Token]) -> Benchmark
116 | parsecBench (name, input) = bench name $ nf parseParsec input
117 | 
118 | benchSizes :: [Int]
119 | benchSizes = [100, 200] -- [51, 101, 151, 201]
120 | 
121 | linearInputs :: [(String, [Token])]
122 | linearInputs = map linearInput benchSizes
123 | 
124 | treeInputs :: [(String, [Token])]
125 | treeInputs = map treeInput benchSizes
126 | 
127 | main :: IO ()
128 | main = do
129 |   evaluate (rnf linearInputs)
130 |   evaluate (rnf treeInputs)
131 |   defaultMain
132 |     [ -- bgroup "inputs" $ map inputBench linearInputs 
133 |       bgroup "earley" $ map earleyBench linearInputs
134 |     , bgroup "earley'" $ map earley'Bench linearInputs
135 |     , bgroup "parsec" $ map parsecBench linearInputs
136 |     -- , bgroup "inputsTree" $ map inputBench treeInputs
137 |     , bgroup "earleyTree" $ map earleyBench treeInputs
138 |     , bgroup "earley'Tree" $ map earley'Bench treeInputs
139 |     , bgroup "parsecTree" $ map parsecBench treeInputs
140 |     ]
141 | 
142 | 


--------------------------------------------------------------------------------
/Text/Earley/Mixfix.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, RecursiveDo #-}
  2 | module Text.Earley.Mixfix
  3 |   ( Associativity(..)
  4 |   , Holey
  5 |   , mixfixExpression
  6 |   , mixfixExpressionSeparate
  7 |   ) where
  8 | 
  9 | #if !MIN_VERSION_base(4,8,0)
 10 | import Control.Applicative
 11 | import Data.Traversable(sequenceA)
 12 | #endif
 13 | import Data.Either
 14 | import Data.Foldable(asum, foldrM)
 15 | import Text.Earley
 16 | 
 17 | replicateA :: Applicative f => Int -> f a -> f [a]
 18 | replicateA n = sequenceA . replicate n
 19 | 
 20 | consA :: Applicative f => f a -> f [a] -> f [a]
 21 | consA p q = (:) <$> p <*> q
 22 | 
 23 | data Associativity
 24 |   = LeftAssoc
 25 |   | NonAssoc
 26 |   | RightAssoc
 27 |   deriving (Eq, Show)
 28 | 
 29 | -- | An identifier with identifier parts ('Just's), and holes ('Nothing's)
 30 | -- representing the positions of its arguments.
 31 | --
 32 | -- Example (commonly written "if_then_else_"):
 33 | -- @['Just' "if", 'Nothing', 'Just' "then", 'Nothing', 'Just' "else", 'Nothing'] :: 'Holey' 'String'@
 34 | type Holey a = [Maybe a]
 35 | 
 36 | -- | Create a grammar for parsing mixfix expressions.
 37 | mixfixExpression
 38 |   :: [[(Holey (Prod r e t ident), Associativity)]]
 39 |   -- ^ A table of holey identifier parsers, with associativity information.
 40 |   -- The identifiers should be in groups of precedence levels listed from
 41 |   -- binding the least to the most tightly.
 42 |   --
 43 |   -- The associativity is taken into account when an identifier starts or ends
 44 |   -- with holes, or both. Internal holes (e.g. after "if" in "if_then_else_")
 45 |   -- start from the beginning of the table.
 46 |   --
 47 |   -- Note that this rule also applies to identifiers with multiple consecutive
 48 |   -- holes, e.g. "if__" --- the associativity then applies to both holes.
 49 |   -> Prod r e t expr
 50 |   -- ^ An atom, i.e. what is parsed at the lowest level. This will
 51 |   -- commonly be a (non-mixfix) identifier or a parenthesised expression.
 52 |   -> (Holey ident -> [expr] -> expr)
 53 |   -- ^ How to combine the successful application of a holey identifier to its
 54 |   -- arguments into an expression.
 55 |   -> Grammar r (Prod r e t expr)
 56 | mixfixExpression table atom app = mixfixExpressionSeparate table' atom
 57 |   where
 58 |     table' = [[(holey, assoc, app) | (holey, assoc) <- row] | row <- table]
 59 | 
 60 | -- | A version of 'mixfixExpression' with a separate semantic action for each
 61 | -- individual 'Holey' identifier.
 62 | mixfixExpressionSeparate
 63 |   :: [[(Holey (Prod r e t ident), Associativity, Holey ident -> [expr] -> expr)]]
 64 |   -- ^ A table of holey identifier parsers, with associativity information and
 65 |   -- semantic actions.  The identifiers should be in groups of precedence
 66 |   -- levels listed from binding the least to the most tightly.
 67 |   --
 68 |   -- The associativity is taken into account when an identifier starts or ends
 69 |   -- with holes, or both. Internal holes (e.g. after "if" in "if_then_else_")
 70 |   -- start from the beginning of the table.
 71 |   --
 72 |   -- Note that this rule also applies to identifiers with multiple consecutive
 73 |   -- holes, e.g. "if__" --- the associativity then applies to both holes.
 74 |   -> Prod r e t expr
 75 |   -- ^ An atom, i.e. what is parsed at the lowest level. This will
 76 |   -- commonly be a (non-mixfix) identifier or a parenthesised expression.
 77 |   -> Grammar r (Prod r e t expr)
 78 | mixfixExpressionSeparate table atom = mdo
 79 |   expr <- foldrM ($) atom $ map (level expr) table
 80 |   return expr
 81 |   where
 82 |     level expr idents next = mdo
 83 |       same <- rule $ asum $ next : map (mixfixIdent same) idents
 84 |       return same
 85 |       where
 86 |         -- Group consecutive holes and ident parts.
 87 |         grp [] = []
 88 |         grp (Nothing:ps) = case grp ps of
 89 |           Left n:rest -> (Left $! (n + 1)) : rest
 90 |           rest        -> Left 1            : rest
 91 |         grp (Just p:ps) = case grp ps of
 92 |           Right ps':rest -> Right (consA p ps')       : rest
 93 |           rest           -> Right (consA p $ pure []) : rest
 94 | 
 95 |         mixfixIdent same (ps, a, f) = f' <$> go (grp ps)
 96 |           where
 97 |             f' xs = f (concatMap (either (map $ const Nothing) $ map Just) xs)
 98 |                    $ concat $ lefts xs
 99 |             go ps' = case ps' of
100 |               [] -> pure []
101 |               [Right p] -> pure . Right <$> p
102 |               Left n:rest -> consA
103 |                 (Left <$> replicateA n (if a == RightAssoc then next
104 |                                                            else same))
105 |                 $ go rest
106 |               [Right p, Left n] -> consA
107 |                 (Right <$> p)
108 |                 $ pure . Left <$> replicateA n (if a == LeftAssoc then next
109 |                                                                   else same)
110 |               Right p:Left n:rest -> consA (Right <$> p)
111 |                 $ consA (Left <$> replicateA n expr)
112 |                 $ go rest
113 |               Right _:Right _:_ -> error
114 |                 $  "Earley.mixfixExpression: The impossible happened. "
115 |                 ++ "Please report this as a bug."
116 | 


--------------------------------------------------------------------------------
/Earley.cabal:
--------------------------------------------------------------------------------
  1 | name:                Earley
  2 | version:             0.13.0.1
  3 | synopsis:            Parsing all context-free grammars using Earley's algorithm.
  4 | description:         See <https://www.github.com/ollef/Earley> for more
  5 |                      information and
  6 |                      <https://github.com/ollef/Earley/tree/master/examples> for
  7 |                      examples.
  8 | license:             BSD3
  9 | license-file:        LICENSE
 10 | author:              Olle Fredriksson
 11 | maintainer:          fredriksson.olle@gmail.com
 12 | copyright:           (c) 2014-2019 Olle Fredriksson
 13 | category:            Parsing
 14 | build-type:          Simple
 15 | cabal-version:       >=1.10
 16 | tested-with:         GHC==7.6.3, GHC==7.8.4, GHC==7.10.3, GHC==8.0.2, GHC==8.2.1, GHC==8.4.1
 17 | 
 18 | extra-source-files:
 19 |                       README.md
 20 |                       CHANGELOG.md
 21 | 
 22 | Flag Examples
 23 |   Description: "Build examples"
 24 |   Default:     False
 25 |   Manual:      True
 26 | 
 27 | source-repository    head
 28 |   type:     git
 29 |   location: https://github.com/ollef/Earley.git
 30 | 
 31 | library
 32 |   exposed-modules:
 33 |                        Text.Earley,
 34 |                        Text.Earley.Derived,
 35 |                        Text.Earley.Generator,
 36 |                        Text.Earley.Generator.Internal,
 37 |                        Text.Earley.Grammar,
 38 |                        Text.Earley.Mixfix,
 39 |                        Text.Earley.Parser,
 40 |                        Text.Earley.Parser.Internal
 41 |   build-depends:       base >=4.6 && <5, ListLike >=4.1
 42 |   if impl(ghc < 8.0)
 43 |     build-depends:     semigroups >=0.18
 44 |   default-language:    Haskell2010
 45 |   ghc-options:         -Wall
 46 |                        -funbox-strict-fields
 47 | 
 48 | executable earley-english
 49 |   if !flag(examples)
 50 |     buildable:         False
 51 |   main-is:             English.hs
 52 |   ghc-options:         -Wall
 53 |   hs-source-dirs:      examples
 54 |   default-language:    Haskell2010
 55 |   build-depends:       base, Earley, unordered-containers >=0.2
 56 | 
 57 | executable earley-expr
 58 |   if !flag(examples)
 59 |     buildable:         False
 60 |   main-is:             Expr.hs
 61 |   ghc-options:         -Wall
 62 |   hs-source-dirs:      examples
 63 |   default-language:    Haskell2010
 64 |   build-depends:       base, Earley
 65 | 
 66 | executable earley-expr2
 67 |   if !flag(examples)
 68 |     buildable:         False
 69 |   main-is:             Expr2.hs
 70 |   ghc-options:         -Wall
 71 |   hs-source-dirs:      examples
 72 |   default-language:    Haskell2010
 73 |   build-depends:       base, Earley
 74 | 
 75 | executable earley-mixfix
 76 |   if !flag(examples)
 77 |     buildable:         False
 78 |   main-is:             Mixfix.hs
 79 |   ghc-options:         -Wall
 80 |   hs-source-dirs:      examples
 81 |   default-language:    Haskell2010
 82 |   build-depends:       base, Earley, unordered-containers
 83 | 
 84 | executable earley-roman-numerals
 85 |   if !flag(examples)
 86 |     buildable:         False
 87 |   main-is:             RomanNumerals.hs
 88 |   ghc-options:         -Wall
 89 |   hs-source-dirs:      examples
 90 |   default-language:    Haskell2010
 91 |   build-depends:       base, Earley
 92 | 
 93 | executable earley-very-ambiguous
 94 |   if !flag(examples)
 95 |     buildable:         False
 96 |   main-is:             VeryAmbiguous.hs
 97 |   ghc-options:         -Wall
 98 |   hs-source-dirs:      examples
 99 |   default-language:    Haskell2010
100 |   build-depends:       base, Earley
101 | 
102 | executable earley-words
103 |   if !flag(examples)
104 |     buildable:         False
105 |   main-is:             Words.hs
106 |   ghc-options:         -Wall
107 |   hs-source-dirs:      examples
108 |   default-language:    Haskell2010
109 |   build-depends:       base, Earley
110 | 
111 | executable earley-infinite
112 |   if !flag(examples)
113 |     buildable:         False
114 |   main-is:             Infinite.hs
115 |   ghc-options:         -Wall
116 |   hs-source-dirs:      examples
117 |   default-language:    Haskell2010
118 |   build-depends:       base, Earley
119 | 
120 | benchmark bench
121 |   type:                exitcode-stdio-1.0
122 |   hs-source-dirs:      . bench
123 |   main-is:             BenchAll.hs
124 |   build-depends:       base, Earley, ListLike, deepseq, criterion >=1.1, parsec >=3.1
125 |   if impl(ghc < 8.0)
126 |     build-depends:     semigroups >=0.18
127 |   default-language:    Haskell2010
128 |   ghc-options:         -Wall
129 | 
130 | test-suite tests
131 |   type:                exitcode-stdio-1.0
132 |   main-is:             Main.hs
133 |   ghc-options:         -Wall
134 |   hs-source-dirs:      tests
135 |   default-language:    Haskell2010
136 |   build-depends:       base, Earley, tasty >=0.10, tasty-quickcheck >=0.8, tasty-hunit >= 0.9, QuickCheck >= 2.8, containers >= 0.6
137 |   other-modules:
138 |                        Arbitrary,
139 |                        Constraint,
140 |                        Empty,
141 |                        Expr,
142 |                        Generator,
143 |                        InlineAlts,
144 |                        Issue11,
145 |                        Issue14,
146 |                        Lambda,
147 |                        Mixfix,
148 |                        Optional,
149 |                        ReversedWords,
150 |                        UnbalancedPars,
151 |                        VeryAmbiguous
152 | 


--------------------------------------------------------------------------------
/tests/Expr.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE RecursiveDo #-}
  2 | module Expr where
  3 | import Control.Applicative
  4 | import Data.Char
  5 | import Test.Tasty
  6 | import Test.Tasty.QuickCheck as QC
  7 | 
  8 | import Text.Earley
  9 | 
 10 | import qualified Arbitrary
 11 | 
 12 | tests :: TestTree
 13 | tests = testGroup "Expr"
 14 |   [ QC.testProperty "Left-recursive: parse . pretty = id" $
 15 |     \e -> [e] === parseLeftExpr (prettyLeftExpr 0 e)
 16 |   , QC.testProperty "Left-recursive: parse . pretty = id (generator)" $ do
 17 |     (e, s) <- Arbitrary.arbitrary $ generator leftExpr tokens
 18 |     return
 19 |       $ [e] === parseLeftExpr (prettyLeftExpr 0 e)
 20 |       .&&. [e] === parseLeftExpr (unwords s)
 21 |   , QC.testProperty "Right-recursive: parse . pretty = id" $
 22 |     \e -> [e] === parseRightExpr (prettyRightExpr 0 e)
 23 |   , QC.testProperty "Right-recursive: parse . pretty = id (generator)" $ do
 24 |     (e, s) <- Arbitrary.arbitrary $ generator rightExpr tokens
 25 |     return
 26 |       $ [e] === parseRightExpr (prettyRightExpr 0 e)
 27 |       .&&. [e] === parseRightExpr (unwords s)
 28 |   , QC.testProperty "Ambiguous: parse . pretty ≈ id" $
 29 |     \e -> e `elem` parseAmbiguousExpr (prettyLeftExpr 0 e)
 30 |       .&&. e `elem` parseAmbiguousExpr (prettyRightExpr 0 e)
 31 |       .&&. [e] == parseAmbiguousExpr (prettyAmbiguousExpr e)
 32 |   , QC.testProperty "Ambiguous: parse . pretty ≈ id (generator)" $ do
 33 |     (e, s) <- Arbitrary.arbitrary $ generator ambiguousExpr tokens
 34 |     return $ e `elem` parseAmbiguousExpr (prettyLeftExpr 0 e)
 35 |       .&&. e `elem` parseAmbiguousExpr (prettyRightExpr 0 e)
 36 |       .&&. [e] == parseAmbiguousExpr (prettyAmbiguousExpr e)
 37 |       .&&. e `elem` parseAmbiguousExpr (unwords s)
 38 |   ]
 39 | 
 40 | tokens :: [String]
 41 | tokens = pure <$> "abcxyz+*()"
 42 | 
 43 | parseLeftExpr :: String -> [Expr]
 44 | parseLeftExpr input = fst (fullParses (parser leftExpr) (lexExpr input))
 45 | 
 46 | parseRightExpr :: String -> [Expr]
 47 | parseRightExpr input = fst (fullParses (parser rightExpr) (lexExpr input))
 48 | 
 49 | parseAmbiguousExpr :: String -> [Expr]
 50 | parseAmbiguousExpr input = fst (fullParses (parser ambiguousExpr) (lexExpr input))
 51 | 
 52 | data Expr
 53 |   = Add Expr Expr
 54 |   | Mul Expr Expr
 55 |   | Var String
 56 |   deriving (Eq, Ord, Show)
 57 | 
 58 | instance Arbitrary Expr where
 59 |   arbitrary = sized arbExpr
 60 |     where arbIdent           = Var <$> elements ["a", "b", "c", "x", "y", "z"]
 61 |           arbExpr n | n > 0  = oneof [ arbIdent
 62 |                                      , Add <$> arbExpr1 <*> arbExpr1
 63 |                                      , Mul <$> arbExpr1 <*> arbExpr1
 64 |                                      ]
 65 |                                      where arbExpr1 = arbExpr (n `div` 3)
 66 |           arbExpr _          = arbIdent
 67 | 
 68 |   shrink (Var _)    = []
 69 |   shrink (Add a b)  = a : b : [ Add a' b | a' <- shrink a ] ++ [ Add a b' | b' <- shrink b ]
 70 |   shrink (Mul a b)  = a : b : [ Mul a' b | a' <- shrink a ] ++ [ Mul a b' | b' <- shrink b ]
 71 | 
 72 | leftExpr :: Grammar r (Prod r String String Expr)
 73 | leftExpr = mdo
 74 |   x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2
 75 |             <|> x2
 76 |             <?> "sum"
 77 |   x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3
 78 |             <|> x3
 79 |             <?> "product"
 80 |   x3 <- rule $ Var <$> (satisfy ident <?> "identifier")
 81 |             <|> namedToken "(" *> x1 <* namedToken ")"
 82 |   return x1
 83 |   where
 84 |     ident (x:_) = isAlpha x
 85 |     ident _     = False
 86 | 
 87 | rightExpr :: Grammar r (Prod r String String Expr)
 88 | rightExpr = mdo
 89 |   x1 <- rule $ Add <$> x2 <* namedToken "+" <*> x1
 90 |             <|> x2
 91 |             <?> "sum"
 92 |   x2 <- rule $ Mul <$> x3 <* namedToken "*" <*> x2
 93 |             <|> x3
 94 |             <?> "product"
 95 |   x3 <- rule $ Var <$> (satisfy ident <?> "identifier")
 96 |             <|> namedToken "(" *> x1 <* namedToken ")"
 97 |   return x1
 98 |   where
 99 |     ident (x:_) = isAlpha x
100 |     ident _     = False
101 | 
102 | ambiguousExpr :: Grammar r (Prod r String String Expr)
103 | ambiguousExpr = mdo
104 |   x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x1
105 |             <|> x2
106 |             <?> "sum"
107 |   x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x2
108 |             <|> x3
109 |             <?> "product"
110 |   x3 <- rule $ Var <$> (satisfy ident <?> "identifier")
111 |             <|> namedToken "(" *> x1 <* namedToken ")"
112 |   return x1
113 |   where
114 |     ident (x:_) = isAlpha x
115 |     ident _     = False
116 | 
117 | prettyParens :: Bool -> String -> String
118 | prettyParens True s  = "(" ++ s ++ ")"
119 | prettyParens False s = s
120 | 
121 | prettyLeftExpr :: Int -> Expr -> String
122 | prettyLeftExpr _ (Var s) = s
123 | prettyLeftExpr d (Add a b) = prettyParens (d > 0) $ prettyLeftExpr 0 a ++ " + " ++ prettyLeftExpr 1 b
124 | prettyLeftExpr d (Mul a b) = prettyParens (d > 1) $ prettyLeftExpr 1 a ++ " * " ++ prettyLeftExpr 2 b
125 | 
126 | prettyRightExpr :: Int -> Expr -> String
127 | prettyRightExpr _ (Var s) = s
128 | prettyRightExpr d (Add a b) = prettyParens (d > 0) $ prettyRightExpr 1 a ++ " + " ++ prettyRightExpr 0 b
129 | prettyRightExpr d (Mul a b) = prettyParens (d > 1) $ prettyRightExpr 2 a ++ " * " ++ prettyRightExpr 1 b
130 | 
131 | prettyAmbiguousExpr :: Expr -> String
132 | prettyAmbiguousExpr (Var s) = s
133 | prettyAmbiguousExpr (Add a b) = prettyParens True $ prettyAmbiguousExpr a ++ " + " ++ prettyAmbiguousExpr b
134 | prettyAmbiguousExpr (Mul a b) = prettyParens True $ prettyAmbiguousExpr a ++ " * " ++ prettyAmbiguousExpr b
135 | 
136 | -- @words@ like lexer, but consider parentheses as separate tokens
137 | lexExpr :: String -> [String]
138 | lexExpr ""        = []
139 | lexExpr ('(' : s) = "(" : lexExpr s
140 | lexExpr (')' : s) = ")" : lexExpr s
141 | lexExpr (c : s)
142 |   | isSpace c     = lexExpr s
143 |   | otherwise     = let (tok, rest) = span p (c : s)
144 |                     in tok : lexExpr rest
145 |   where p x       = not (x == '(' || x == ')' || isSpace x)
146 | 


--------------------------------------------------------------------------------
/Text/Earley/Grammar.hs:
--------------------------------------------------------------------------------
  1 | -- | Context-free grammars.
  2 | {-# LANGUAGE CPP, GADTs, RankNTypes #-}
  3 | module Text.Earley.Grammar
  4 |   ( Prod(..)
  5 |   , terminal
  6 |   , (<?>)
  7 |   , constraint
  8 |   , alts
  9 |   , Grammar(..)
 10 |   , rule
 11 |   , runGrammar
 12 |   ) where
 13 | import Control.Applicative
 14 | import Control.Monad
 15 | import Control.Monad.Fix
 16 | import Data.String (IsString(..))
 17 | #if !MIN_VERSION_base(4,8,0)
 18 | import Data.Monoid
 19 | #endif
 20 | import Data.Semigroup
 21 | 
 22 | infixr 0 <?>
 23 | 
 24 | -- | A production.
 25 | --
 26 | -- The type parameters are:
 27 | --
 28 | -- @a@: The return type of the production.
 29 | --
 30 | -- @t@ for terminal: The type of the terminals that the production operates
 31 | -- on.
 32 | --
 33 | -- @e@ for expected: The type of names, used for example to report expected
 34 | -- tokens.
 35 | --
 36 | -- @r@ for rule: The type of a non-terminal. This plays a role similar to the
 37 | -- @s@ in the type @ST s a@.  Since the 'parser' function expects the @r@ to be
 38 | -- universally quantified, there is not much to do with this parameter other
 39 | -- than leaving it universally quantified.
 40 | --
 41 | -- As an example, @'Prod' r 'String' 'Char' 'Int'@ is the type of a production that
 42 | -- returns an 'Int', operates on (lists of) characters and reports 'String'
 43 | -- names.
 44 | --
 45 | -- Most of the functionality of 'Prod's is obtained through its instances, e.g.
 46 | -- 'Functor', 'Applicative', and 'Alternative'.
 47 | data Prod r e t a where
 48 |   -- Applicative.
 49 |   Terminal    :: !(t -> Maybe a) -> !(Prod r e t (a -> b)) -> Prod r e t b
 50 |   NonTerminal :: !(r e t a) -> !(Prod r e t (a -> b)) -> Prod r e t b
 51 |   Pure        :: a -> Prod r e t a
 52 |   -- Monoid/Alternative. We have to special-case 'many' (though it can be done
 53 |   -- with rules) to be able to satisfy the Alternative interface.
 54 |   Alts        :: ![Prod r e t a] -> !(Prod r e t (a -> b)) -> Prod r e t b
 55 |   Many        :: !(Prod r e t a) -> !(Prod r e t ([a] -> b)) -> Prod r e t b
 56 |   -- Error reporting.
 57 |   Named       :: !(Prod r e t a) -> e -> Prod r e t a
 58 |   -- Non-context-free extension: conditioning on the parsed output.
 59 |   Constraint  :: !(Prod r e t a) -> (a -> Bool) -> Prod r e t a
 60 | 
 61 | -- | Match a token for which the given predicate returns @Just a@,
 62 | -- and return the @a@.
 63 | terminal :: (t -> Maybe a) -> Prod r e t a
 64 | terminal p = Terminal p $ Pure id
 65 | 
 66 | -- | A named production (used for reporting expected things).
 67 | (<?>) :: Prod r e t a -> e -> Prod r e t a
 68 | (<?>) = Named
 69 | 
 70 | -- | A parser that filters results, post-parsing
 71 | constraint :: (a -> Bool) -> Prod r e t a -> Prod r e t a
 72 | constraint = flip Constraint
 73 | 
 74 | -- | Lifted instance: @(<>) = 'liftA2' ('<>')@
 75 | instance Semigroup a => Semigroup (Prod r e t a) where
 76 |   (<>) = liftA2 (Data.Semigroup.<>)
 77 | 
 78 | -- | Lifted instance: @mempty = 'pure' 'mempty'@
 79 | instance Monoid a => Monoid (Prod r e t a) where
 80 |   mempty  = pure mempty
 81 |   mappend = (<>)
 82 | 
 83 | instance Functor (Prod r e t) where
 84 |   {-# INLINE fmap #-}
 85 |   fmap f (Terminal b p)    = Terminal b $ fmap (f .) p
 86 |   fmap f (NonTerminal r p) = NonTerminal r $ fmap (f .) p
 87 |   fmap f (Pure x)          = Pure $ f x
 88 |   fmap f (Alts as p)       = Alts as $ fmap (f .) p
 89 |   fmap f (Many p q)        = Many p $ fmap (f .) q
 90 |   fmap f (Named p n)       = Named (fmap f p) n
 91 | 
 92 | -- | Smart constructor for alternatives.
 93 | alts :: [Prod r e t a] -> Prod r e t (a -> b) -> Prod r e t b
 94 | alts as p = case as >>= go of
 95 |   []  -> empty
 96 |   [a] -> a <**> p
 97 |   as' -> Alts as' p
 98 |   where
 99 |     go (Alts [] _)         = []
100 |     go (Alts as' (Pure f)) = fmap f <$> as'
101 |     go (Named p' n)        = map (<?> n) $ go p'
102 |     go a                   = [a]
103 | 
104 | instance Applicative (Prod r e t) where
105 |   pure = Pure
106 |   {-# INLINE (<*>) #-}
107 |   Terminal b p    <*> q = Terminal b $ flip <$> p <*> q
108 |   NonTerminal r p <*> q = NonTerminal r $ flip <$> p <*> q
109 |   Pure f          <*> q = fmap f q
110 |   Alts as p       <*> q = alts as $ flip <$> p <*> q
111 |   Many a p        <*> q = Many a $ flip <$> p <*> q
112 |   Named p n       <*> q = Named (p <*> q) n
113 | 
114 | instance Alternative (Prod r e t) where
115 |   empty = Alts [] $ pure id
116 |   Named p m <|> q         = Named (p <|> q) m
117 |   p         <|> Named q n = Named (p <|> q) n
118 |   p         <|> q         = alts [p, q] $ pure id
119 |   many (Alts [] _) = pure []
120 |   many p           = Many p $ Pure id
121 |   some p           = (:) <$> p <*> many p
122 | 
123 | -- | String literals can be interpreted as 'Terminal's
124 | -- that match that string.
125 | --
126 | -- >>> :set -XOverloadedStrings
127 | -- >>> import Data.Text (Text)
128 | -- >>> let determiner = "the" <|> "a" <|> "an" :: Prod r e Text Text
129 | --
130 | instance (IsString t, Eq t, a ~ t) => IsString (Prod r e t a) where
131 |   fromString s = Terminal f $ Pure id
132 |     where
133 |       fs = fromString s
134 |       f t | t == fs = Just fs
135 |       f _ = Nothing
136 |   {-# INLINE fromString #-}
137 | 
138 | -- | A context-free grammar.
139 | --
140 | -- The type parameters are:
141 | --
142 | -- @a@: The return type of the grammar (often a 'Prod').
143 | --
144 | -- @r@ for rule: The type of a non-terminal. This plays a role similar to the
145 | -- @s@ in the type @ST s a@.  Since the 'parser' function expects the @r@ to be
146 | -- universally quantified, there is not much to do with this parameter other
147 | -- than leaving it universally quantified.
148 | --
149 | -- Most of the functionality of 'Grammar's is obtained through its instances,
150 | -- e.g.  'Monad' and 'MonadFix'. Note that GHC has syntactic sugar for
151 | -- 'MonadFix': use @{-\# LANGUAGE RecursiveDo \#-}@ and @mdo@ instead of
152 | -- @do@.
153 | data Grammar r a where
154 |   RuleBind :: Prod r e t a -> (Prod r e t a -> Grammar r b) -> Grammar r b
155 |   FixBind  :: (a -> Grammar r a) -> (a -> Grammar r b) -> Grammar r b
156 |   Return   :: a -> Grammar r a
157 | 
158 | instance Functor (Grammar r) where
159 |   fmap f (RuleBind ps h) = RuleBind ps (fmap f . h)
160 |   fmap f (FixBind g h)   = FixBind g (fmap f . h)
161 |   fmap f (Return x)      = Return $ f x
162 | 
163 | instance Applicative (Grammar r) where
164 |   pure  = return
165 |   (<*>) = ap
166 | 
167 | instance Monad (Grammar r) where
168 |   return = Return
169 |   RuleBind ps f >>= k = RuleBind ps (f >=> k)
170 |   FixBind f g   >>= k = FixBind f (g >=> k)
171 |   Return x      >>= k = k x
172 | 
173 | instance MonadFix (Grammar r) where
174 |   mfix f = FixBind f return
175 | 
176 | -- | Create a new non-terminal by giving its production.
177 | rule :: Prod r e t a -> Grammar r (Prod r e t a)
178 | rule p = RuleBind p return
179 | 
180 | -- | Run a grammar, given an action to perform on productions to be turned into
181 | -- non-terminals.
182 | runGrammar :: MonadFix m
183 |            => (forall e t a. Prod r e t a -> m (Prod r e t a))
184 |            -> Grammar r b -> m b
185 | runGrammar r grammar = case grammar of
186 |   RuleBind p k -> do
187 |     nt <- r p
188 |     runGrammar r $ k nt
189 |   Return a -> return a
190 |   FixBind f k -> do
191 |     a <- mfix $ runGrammar r <$> f
192 |     runGrammar r $ k a
193 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Earley [![Hackage](https://img.shields.io/hackage/v/Earley.svg)](https://hackage.haskell.org/package/Earley)
  2 | ======
  3 | 
  4 | [Go to the API documentation on Hackage.](https://hackage.haskell.org/package/Earley)
  5 | 
  6 | This ([Text.Earley](https://hackage.haskell.org/package/Earley/docs/Text-Earley.html)) is a library consisting of a few main parts:
  7 | 
  8 | ### [Text.Earley.Grammar](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Grammar.html)
  9 | 
 10 | An embedded context-free grammar (CFG) domain-specific language (DSL) with
 11 | semantic action specification in applicative style.
 12 | 
 13 | An example of a typical expression grammar working on an input tokenised
 14 | into strings is the following:
 15 | 
 16 | ```haskell
 17 |    expr :: Grammar r (Prod r String String Expr)
 18 |    expr = mdo
 19 |      x1 <- rule $ Add <$> x1 <* namedToken "+" <*> x2
 20 |                <|> x2
 21 |                <?> "sum"
 22 |      x2 <- rule $ Mul <$> x2 <* namedToken "*" <*> x3
 23 |                <|> x3
 24 |                <?> "product"
 25 |      x3 <- rule $ Var <$> (satisfy ident <?> "identifier")
 26 |                <|> namedToken "(" *> x1 <* namedToken ")"
 27 |      return x1
 28 |      where
 29 |        ident (x:_) = isAlpha x
 30 |        ident _     = False
 31 | ```
 32 | 
 33 | ### [Text.Earley.Parser](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Parser.html)
 34 | 
 35 | An implementation of (a modification of) the Earley parsing algorithm.
 36 | 
 37 | To invoke the parser on the above grammar, run e.g. (here using `words` as a
 38 | stupid tokeniser):
 39 | 
 40 | ```haskell
 41 |    fullParses (parser expr) $ words "a + b * ( c + d )"
 42 |    = ( [Add (Var "a") (Mul (Var "b") (Add (Var "c") (Var "d")))]
 43 |      , Report {...}
 44 |      )
 45 | ```
 46 | 
 47 | Note that we get a list of all the possible parses (though in this case
 48 | there is only one).
 49 | 
 50 | Another invocation, which shows the error reporting capabilities (giving the
 51 | last position that the parser reached and what it expected at that point),
 52 | is the following:
 53 | 
 54 | ```haskell
 55 |    fullParses (parser expr) $ words "a +"
 56 |    = ( []
 57 |      , Report { position   = 2
 58 |               , expected   = ["(","identifier","product"]
 59 |               , unconsumed = []
 60 |               }
 61 |      )
 62 | ```
 63 | 
 64 | ### [Text.Earley.Generator](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Generator.html)
 65 | 
 66 | Functionality to generate the members of the language that a grammar generates.
 67 | 
 68 | To get the language of a grammar given a list of allowed tokens, run e.g.:
 69 | 
 70 | ```haskell
 71 |    language (generator romanNumeralsGrammar "VIX")
 72 |    = [(0,""),(1,"I"),(5,"V"),(10,"X"),(20,"XX"),(11,"XI"),(15,"XV"),(6,"VI"),(9,"IX"),(4,"IV"),(2,"II"),(3,"III"),(19,"XIX"),(16,"XVI"),(14,"XIV"),(12,"XII"),(7,"VII"),(21,"XXI"),(25,"XXV"),(30,"XXX"),(31,"XXXI"),(35,"XXXV"),(8,"VIII"),(13,"XIII"),(17,"XVII"),(26,"XXVI"),(29,"XXIX"),(24,"XXIV"),(22,"XXII"),(18,"XVIII"),(36,"XXXVI"),(39,"XXXIX"),(34,"XXXIV"),(32,"XXXII"),(23,"XXIII"),(27,"XXVII"),(33,"XXXIII"),(28,"XXVIII"),(37,"XXXVII"),(38,"XXXVIII")]
 73 | ```
 74 | 
 75 | The above example shows the language generated by a [Roman numerals
 76 | grammar](examples/RomanNumerals.hs) limited to the tokens `'V'`, `'I'`, and
 77 | `'X'`.
 78 | 
 79 | ### [Text.Earley.Mixfix](https://hackage.haskell.org/package/Earley/docs/Text-Earley-Mixfix.html)
 80 | 
 81 | Helper functionality for creating parsers for expressions with mixfix
 82 | identifiers in the style of Agda.
 83 | 
 84 | How do I write grammars?
 85 | ------------------------
 86 | 
 87 | As hinted at above, the grammars are written inside `Grammar`, which is a
 88 | `Monad` and `MonadFix`.  For the library to be able to tame the recursion in
 89 | the grammars, we have to use the `rule` function whenever a production is
 90 | recursive.
 91 | 
 92 | Whenever you would write e.g.
 93 | ```haskell
 94 | ...
 95 | p = foo <|> bar <*> p
 96 | ...
 97 | ```
 98 | in a conventional combinator parser library, you instead write the following:
 99 | ```haskell
100 | grammar = mdo
101 |   ...
102 |   p <- rule $ foo <|> bar <*> p
103 |   ...
104 | ```
105 | 
106 | Apart from making it possible to do recursion (even left-recursion), `rule`s
107 | have an additional benefit: they control where work is shared, by the rule that
108 | any `rule` is only ever expanded once per position in the input string. If a
109 | `rule` is encountered more than once at a position, the work is shared.
110 | 
111 | Compared to parser generators and combinator libraries
112 | ------------------------------------------------------
113 | 
114 | This library differs from the main methods that are used to write parsers in
115 | the Haskell ecosystem:
116 | 
117 | * Compared to parser generators (YACC, Happy, etc.) it requires very little
118 |   pre-processing of the grammar. It also allows you to stay in the host
119 |   language for both grammar and parser, i.e. there is no use of a separate
120 |   tool. This also means that you are free to use the abstraction facilities of
121 |   Haskell when writing a grammar. Currently the library requires a linear
122 |   traversal of the grammar's rules before use, which is usually fast enough to
123 |   do at run time, but precludes infinite grammars.
124 | 
125 | * The grammar language is similar to that of many parser combinators (Parsec,
126 |   Attoparsec, parallel parsing processes, etc.), providing an applicative
127 |   interface, but the parser gracefully handles all finite CFGs, including those
128 |   with left-recursion. On the other hand, its productions are not monadic
129 |   meaning that it does not support context-sensitive or infinite grammars,
130 |   which are supported by many parser combinator libraries.
131 | 
132 |   Note: The `Grammar` type is a `Monad` (used to provide observable sharing)
133 |   but it lives a layer above productions. It cannot be used to decide what
134 |   production to use depending on the result of a previous production, i.e. it
135 |   does not give us monadic parsing.
136 | 
137 | The parsing algorithm
138 | ---------------------
139 | 
140 | The parsing algorithm that this library uses is based on [Earley's parsing
141 | algorithm](https://en.wikipedia.org/wiki/Earley_parser).  The algorithm has
142 | been modified to produce online parse results, to give good error messages, and
143 | to allow garbage collection of the item sets. Essentially, instead of storing a
144 | sequence of sets of items like in the original algorithm, the modified
145 | algorithm just stores pointers back to sets of reachable items.
146 | 
147 | The worst-case run time performance of the Earley parsing algorithm is cubic in
148 | the length of the input, but for large classes of grammars it is linear. It
149 | should however be noted that this library will likely be slower than most
150 | parser generators and parser combinator libraries.
151 | 
152 | The parser implements an optimisation similar to that presented in Joop M.I.M
153 | Leo's paper *A general context-free parsing algorithm running in linear time on
154 | every LR(k) grammar without using lookahead*, which removes indirections in
155 | sequences of non-ambiguous backpointers between item sets.
156 | 
157 | For more in-depth information about the internals of the library, there are
158 | [implementation notes](docs/implementation.md) currently being written.
159 | 
160 | Contact
161 | -------
162 | 
163 | Olle Fredriksson - https://github.com/ollef
164 | 


--------------------------------------------------------------------------------
/Text/Earley/Generator/Internal.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, BangPatterns, DeriveFunctor, GADTs, Rank2Types, RecursiveDo #-}
  2 | -- | This module exposes the internals of the package: its API may change
  3 | -- independently of the PVP-compliant version number.
  4 | module Text.Earley.Generator.Internal where
  5 | import Control.Applicative
  6 | import Control.Monad
  7 | import Control.Monad.ST.Lazy
  8 | import Data.Maybe(mapMaybe)
  9 | import Data.STRef.Lazy
 10 | import Text.Earley.Grammar
 11 | #if !MIN_VERSION_base(4,8,0)
 12 | import Data.Monoid
 13 | #endif
 14 | import Data.Semigroup
 15 | 
 16 | -------------------------------------------------------------------------------
 17 | -- * Concrete rules and productions
 18 | -------------------------------------------------------------------------------
 19 | -- | The concrete rule type that the generator uses
 20 | data Rule s r e t a = Rule
 21 |   { ruleProd  :: ProdR s r e t a
 22 |   , ruleConts :: !(STRef s (STRef s [Cont s r e t a r]))
 23 |   , ruleNulls :: !(Results s t a)
 24 |   }
 25 | 
 26 | mkRule :: ProdR s r e t a -> ST s (Rule s r e t a)
 27 | mkRule p = mdo
 28 |   c <- newSTRef =<< newSTRef mempty
 29 |   computeNullsRef <- newSTRef $ do
 30 |     writeSTRef computeNullsRef $ return []
 31 |     ns <- unResults $ prodNulls p
 32 |     writeSTRef computeNullsRef $ return ns
 33 |     return ns
 34 |   return $ Rule (removeNulls p) c (Results $ join $ readSTRef computeNullsRef)
 35 | 
 36 | prodNulls :: ProdR s r e t a -> Results s t a
 37 | prodNulls prod = case prod of
 38 |   Terminal {}     -> empty
 39 |   NonTerminal r p -> ruleNulls r <**> prodNulls p
 40 |   Pure a          -> pure a
 41 |   Alts as p       -> mconcat (map prodNulls as) <**> prodNulls p
 42 |   Many a p        -> prodNulls (pure [] <|> pure <$> a) <**> prodNulls p
 43 |   Named p _       -> prodNulls p
 44 |   Constraint p _  -> prodNulls p
 45 | 
 46 | -- | Remove (some) nulls from a production
 47 | removeNulls :: ProdR s r e t a -> ProdR s r e t a
 48 | removeNulls prod = case prod of
 49 |   Terminal {}      -> prod
 50 |   NonTerminal {}   -> prod
 51 |   Pure _           -> empty
 52 |   Alts as (Pure f) -> alts (map removeNulls as) $ Pure f
 53 |   Alts {}          -> prod
 54 |   Many {}          -> prod
 55 |   Named p n        -> Named (removeNulls p) n
 56 |   Constraint p n   -> Constraint (removeNulls p) n
 57 | 
 58 | type ProdR s r e t a = Prod (Rule s r) e t a
 59 | 
 60 | resetConts :: Rule s r e t a -> ST s ()
 61 | resetConts r = writeSTRef (ruleConts r) =<< newSTRef mempty
 62 | 
 63 | -------------------------------------------------------------------------------
 64 | -- * Delayed results
 65 | -------------------------------------------------------------------------------
 66 | newtype Results s t a = Results { unResults :: ST s [(a, [t])] }
 67 |   deriving Functor
 68 | 
 69 | lazyResults :: ST s [(a, [t])] -> ST s (Results s t a)
 70 | lazyResults stas = mdo
 71 |   resultsRef <- newSTRef $ do
 72 |     as <- stas
 73 |     writeSTRef resultsRef $ return as
 74 |     return as
 75 |   return $ Results $ join $ readSTRef resultsRef
 76 | 
 77 | instance Applicative (Results s t) where
 78 |   pure x = Results $ pure [(x, mempty)]
 79 |   (<*>) = ap
 80 | 
 81 | instance Alternative (Results t s) where
 82 |   empty = Results $ pure []
 83 |   Results sxs <|> Results sys = Results $ (<|>) <$> sxs <*> sys
 84 | 
 85 | instance Monad (Results t s) where
 86 |   return = pure
 87 |   Results stxs >>= f = Results $ do
 88 |     xs <- stxs
 89 |     concat <$> mapM (\(x, ts) -> fmap (\(y, ts') -> (y, ts' ++ ts)) <$> unResults (f x)) xs
 90 | 
 91 | instance Semigroup (Results s t a) where
 92 |   (<>) = (<|>)
 93 | 
 94 | instance Monoid (Results s t a) where
 95 |   mempty = empty
 96 |   mappend = (<>)
 97 | 
 98 | -------------------------------------------------------------------------------
 99 | -- * States and continuations
100 | -------------------------------------------------------------------------------
101 | data BirthPos
102 |   = Previous
103 |   | Current
104 |   deriving Eq
105 | 
106 | -- | An Earley state with result type @a@.
107 | data State s r e t a where
108 |   State :: !(ProdR s r e t a)
109 |         -> !(a -> Results s t b)
110 |         -> !BirthPos
111 |         -> !(Conts s r e t b c)
112 |         -> State s r e t c
113 |   Final :: !(Results s t a) -> State s r e t a
114 | 
115 | -- | A continuation accepting an @a@ and producing a @b@.
116 | data Cont s r e t a b where
117 |   Cont      :: !(a -> Results s t b)
118 |             -> !(ProdR s r e t (b -> c))
119 |             -> !(c -> Results s t d)
120 |             -> !(Conts s r e t d e')
121 |             -> Cont s r e t a e'
122 |   FinalCont :: (a -> Results s t c) -> Cont s r e t a c
123 | 
124 | data Conts s r e t a c = Conts
125 |   { conts     :: !(STRef s [Cont s r e t a c])
126 |   , contsArgs :: !(STRef s (Maybe (STRef s (Results s t a))))
127 |   }
128 | 
129 | newConts :: STRef s [Cont s r e t a c] -> ST s (Conts s r e t a c)
130 | newConts r = Conts r <$> newSTRef Nothing
131 | 
132 | contraMapCont :: (b -> Results s t a) -> Cont s r e t a c -> Cont s r e t b c
133 | contraMapCont f (Cont g p args cs) = Cont (f >=> g) p args cs
134 | contraMapCont f (FinalCont args)   = FinalCont (f >=> args)
135 | 
136 | contToState :: BirthPos -> Results s t a -> Cont s r e t a c -> State s r e t c
137 | contToState pos r (Cont g p args cs) = State p (\f -> r >>= g >>= args . f) pos cs
138 | contToState _   r (FinalCont args)   = Final $ r >>= args
139 | 
140 | -- | Strings of non-ambiguous continuations can be optimised by removing
141 | -- indirections.
142 | simplifyCont :: Conts s r e t b a -> ST s [Cont s r e t b a]
143 | simplifyCont Conts {conts = cont} = readSTRef cont >>= go False
144 |   where
145 |     go !_ [Cont g (Pure f) args cont'] = do
146 |       ks' <- simplifyCont cont'
147 |       go True $ map (contraMapCont $ g >=> args . f) ks'
148 |     go True ks = do
149 |       writeSTRef cont ks
150 |       return ks
151 |     go False ks = return ks
152 | 
153 | -------------------------------------------------------------------------------
154 | -- * Grammars
155 | -------------------------------------------------------------------------------
156 | -- | Given a grammar, construct an initial state.
157 | initialState :: ProdR s a e t a -> ST s (State s a e t a)
158 | initialState p = State p pure Previous <$> (newConts =<< newSTRef [FinalCont pure])
159 | 
160 | -------------------------------------------------------------------------------
161 | -- * Generation
162 | -------------------------------------------------------------------------------
163 | -- | The result of a generator.
164 | data Result s t a
165 |   = Ended (ST s [(a, [t])])
166 |     -- ^ The generator ended.
167 |   | Generated (ST s [(a, [t])]) (ST s (Result s t a))
168 |     -- ^ The generator produced a number of @a@s.  These are given as a
169 |     -- computation, @'ST' s [a]@ that constructs the 'a's when run.  The 'Int' is
170 |     -- the position in the input where these results were obtained, and the last
171 |     -- component is the continuation.
172 |   deriving Functor
173 | 
174 | data GenerationEnv s e t a = GenerationEnv
175 |   { results :: ![ST s [(a, [t])]]
176 |     -- ^ Results ready to be reported (when this position has been processed)
177 |   , next    :: ![State s a e t a]
178 |     -- ^ States to process at the next position
179 |   , reset   :: !(ST s ())
180 |     -- ^ Computation that resets the continuation refs of productions
181 |   , tokens  :: ![t]
182 |     -- ^ The possible tokens
183 |   }
184 | 
185 | {-# INLINE emptyGenerationEnv #-}
186 | emptyGenerationEnv :: [t] -> GenerationEnv s e t a
187 | emptyGenerationEnv ts = GenerationEnv
188 |   { results = mempty
189 |   , next    = mempty
190 |   , reset   = return ()
191 |   , tokens  = ts
192 |   }
193 | 
194 | -- | The internal generation routine
195 | generate :: [State s a e t a] -- ^ States to process at this position
196 |          -> GenerationEnv s e t a
197 |          -> ST s (Result s t a)
198 | generate [] env@GenerationEnv {next = []} = do
199 |   reset env
200 |   return $ Ended $ concat <$> sequence (results env)
201 | generate [] env = do
202 |   reset env
203 |   return $ Generated (concat <$> sequence (results env))
204 |          $ generate (next env) $ emptyGenerationEnv $ tokens env
205 | generate (st:ss) env = case st of
206 |   Final res -> generate ss env {results = unResults res : results env}
207 |   State pr args pos scont -> case pr of
208 |     Terminal f p -> generate ss env
209 |       { next = [State p (\g -> Results (pure $ map (\(t, a) -> (g a, [t])) xs) >>= args) Previous scont | xs <- [mapMaybe (\t -> (,) t <$> f t) $ tokens env], not $ null xs]
210 |             ++ next env
211 |       }
212 |     NonTerminal r p -> do
213 |       rkref <- readSTRef $ ruleConts r
214 |       ks    <- readSTRef rkref
215 |       writeSTRef rkref (Cont pure p args scont : ks)
216 |       ns    <- unResults $ ruleNulls r
217 |       let addNullState
218 |             | null ns = id
219 |             | otherwise = (:)
220 |                         $ State p (\f -> Results (pure ns) >>= args . f) pos scont
221 |       if null ks then do -- The rule has not been expanded at this position.
222 |         st' <- State (ruleProd r) pure Current <$> newConts rkref
223 |         generate (addNullState $ st' : ss)
224 |               env {reset = resetConts r >> reset env}
225 |       else -- The rule has already been expanded at this position.
226 |         generate (addNullState ss) env
227 |     Pure a
228 |       -- Skip following continuations that stem from the current position; such
229 |       -- continuations are handled separately.
230 |       | pos == Current -> generate ss env
231 |       | otherwise -> do
232 |         let argsRef = contsArgs scont
233 |         masref  <- readSTRef argsRef
234 |         case masref of
235 |           Just asref -> do -- The continuation has already been followed at this position.
236 |             modifySTRef asref $ mappend $ args a
237 |             generate ss env
238 |           Nothing    -> do -- It hasn't.
239 |             asref <- newSTRef $ args a
240 |             writeSTRef argsRef $ Just asref
241 |             ks  <- simplifyCont scont
242 |             res <- lazyResults $ unResults =<< readSTRef asref
243 |             let kstates = map (contToState pos res) ks
244 |             generate (kstates ++ ss)
245 |                   env {reset = writeSTRef argsRef Nothing >> reset env}
246 |     Alts as (Pure f) -> do
247 |       let args' = args . f
248 |           sts   = [State a args' pos scont | a <- as]
249 |       generate (sts ++ ss) env
250 |     Alts as p -> do
251 |       scont' <- newConts =<< newSTRef [Cont pure p args scont]
252 |       let sts = [State a pure Previous scont' | a <- as]
253 |       generate (sts ++ ss) env
254 |     Many p q -> mdo
255 |       r <- mkRule $ pure [] <|> (:) <$> p <*> NonTerminal r (Pure id)
256 |       generate (State (NonTerminal r q) args pos scont : ss) env
257 |     Named pr' _ -> generate (State pr' args pos scont : ss) env
258 |     Constraint pr' c -> generate (State pr' (test >=> args) pos scont : ss) env
259 |       where test x = if c x then return x else empty
260 | 
261 | type Generator t a = forall s. ST s (Result s t a)
262 | 
263 | -- | Create a language generator for given grammar and list of allowed tokens.
264 | generator
265 |   :: (forall r. Grammar r (Prod r e t a))
266 |   -> [t]
267 |   -> Generator t a
268 | generator g ts = do
269 |   let nt x = NonTerminal x $ pure id
270 |   s <- initialState =<< runGrammar (fmap nt . mkRule) g
271 |   generate [s] $ emptyGenerationEnv ts
272 | 
273 | -- | Run a generator, returning all members of the language.
274 | --
275 | -- The members are returned as parse results paired with the list of tokens
276 | -- used to produce the result.
277 | -- The elements of the returned list of results are sorted by their length in
278 | -- ascending order.  If there are multiple results of the same length they are
279 | -- returned in an unspecified order.
280 | language
281 |   :: Generator t a
282 |   -> [(a, [t])]
283 | language gen = runST $ gen >>= go
284 |   where
285 |     go :: Result s t a -> ST s [(a, [t])]
286 |     go r = case r of
287 |       Ended mas -> mas
288 |       Generated mas k -> do
289 |         as <- mas
290 |         (as ++) <$> (go =<< k)
291 | 
292 | -- | @upTo n gen@ runs the generator @gen@, returning all members of the
293 | -- language that are of length less than or equal to @n@.
294 | --
295 | -- The members are returned as parse results paired with the list of tokens
296 | -- used to produce the result.
297 | -- The elements of the returned list of results are sorted by their length in
298 | -- ascending order.  If there are multiple results of the same length they are
299 | -- returned in an unspecified order.
300 | upTo
301 |   :: Int
302 |   -> Generator t a
303 |   -> [(a, [t])]
304 | upTo len gen = runST $ gen >>= go 0
305 |   where
306 |     go :: Int -> Result s t a -> ST s [(a, [t])]
307 |     go curLen r | curLen <= len = case r of
308 |       Ended mas -> mas
309 |       Generated mas k -> do
310 |         as <- mas
311 |         (as ++) <$> (go (curLen + 1) =<< k)
312 |     go _ _ = return []
313 | 
314 | -- | @exactly n gen@ runs the generator @gen@, returning all members of the
315 | -- language that are of length equal to @n@.
316 | --
317 | -- The members are returned as parse results paired with the list of tokens
318 | -- used to produce the result.
319 | -- If there are multiple results they are returned in an unspecified order.
320 | exactly
321 |   :: Int
322 |   -> Generator t a
323 |   -> [(a, [t])]
324 | exactly len _ | len < 0 = []
325 | exactly len gen = runST $ gen >>= go 0
326 |   where
327 |     go :: Int -> Result s t a -> ST s [(a, [t])]
328 |     go !curLen r = case r of
329 |       Ended mas
330 |         | curLen == len -> mas
331 |         | otherwise -> return []
332 |       Generated mas k
333 |         | curLen == len -> mas
334 |         | otherwise -> go (curLen + 1) =<< k
335 | 


--------------------------------------------------------------------------------
/Text/Earley/Parser/Internal.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, BangPatterns, DeriveFunctor, GADTs, TupleSections, Rank2Types, RecursiveDo #-}
  2 | -- | This module exposes the internals of the package: its API may change
  3 | -- independently of the PVP-compliant version number.
  4 | module Text.Earley.Parser.Internal where
  5 | import Control.Applicative
  6 | import Control.Arrow
  7 | import Control.Monad
  8 | import Control.Monad.ST
  9 | import Data.ListLike(ListLike)
 10 | import qualified Data.ListLike as ListLike
 11 | import Data.STRef
 12 | import Text.Earley.Grammar
 13 | #if !MIN_VERSION_base(4,8,0)
 14 | import Data.Monoid
 15 | #endif
 16 | import Data.Semigroup
 17 | 
 18 | -------------------------------------------------------------------------------
 19 | -- * Concrete rules and productions
 20 | -------------------------------------------------------------------------------
 21 | -- | The concrete rule type that the parser uses
 22 | data Rule s r e t a = Rule
 23 |   { ruleProd  :: ProdR s r e t a
 24 |   , ruleConts :: !(STRef s (STRef s [Cont s r e t a r]))
 25 |   , ruleNulls :: !(Results s a)
 26 |   }
 27 | 
 28 | mkRule :: ProdR s r e t a -> ST s (Rule s r e t a)
 29 | mkRule p = mdo
 30 |   c <- newSTRef =<< newSTRef mempty
 31 |   computeNullsRef <- newSTRef $ do
 32 |     writeSTRef computeNullsRef $ return []
 33 |     ns <- unResults $ prodNulls p
 34 |     writeSTRef computeNullsRef $ return ns
 35 |     return ns
 36 |   return $ Rule (removeNulls p) c (Results $ join $ readSTRef computeNullsRef)
 37 | 
 38 | prodNulls :: ProdR s r e t a -> Results s a
 39 | prodNulls prod = case prod of
 40 |   Terminal {}     -> empty
 41 |   NonTerminal r p -> ruleNulls r <**> prodNulls p
 42 |   Pure a          -> pure a
 43 |   Alts as p       -> mconcat (map prodNulls as) <**> prodNulls p
 44 |   Many a p        -> prodNulls (pure [] <|> pure <$> a) <**> prodNulls p
 45 |   Named p _       -> prodNulls p
 46 |   Constraint p _  -> prodNulls p
 47 | 
 48 | -- | Remove (some) nulls from a production
 49 | removeNulls :: ProdR s r e t a -> ProdR s r e t a
 50 | removeNulls prod = case prod of
 51 |   Terminal {}      -> prod
 52 |   NonTerminal {}   -> prod
 53 |   Pure _           -> empty
 54 |   Alts as (Pure f) -> alts (map removeNulls as) $ Pure f
 55 |   Alts {}          -> prod
 56 |   Many {}          -> prod
 57 |   Named p n        -> Named (removeNulls p) n
 58 |   Constraint p n   -> Constraint (removeNulls p) n
 59 | 
 60 | type ProdR s r e t a = Prod (Rule s r) e t a
 61 | 
 62 | resetConts :: Rule s r e t a -> ST s ()
 63 | resetConts r = writeSTRef (ruleConts r) =<< newSTRef mempty
 64 | 
 65 | -------------------------------------------------------------------------------
 66 | -- * Delayed results
 67 | -------------------------------------------------------------------------------
 68 | newtype Results s a = Results { unResults :: ST s [a] }
 69 |   deriving Functor
 70 | 
 71 | lazyResults :: ST s [a] -> ST s (Results s a)
 72 | lazyResults stas = mdo
 73 |   resultsRef <- newSTRef $ do
 74 |     as <- stas
 75 |     writeSTRef resultsRef $ return as
 76 |     return as
 77 |   return $ Results $ join $ readSTRef resultsRef
 78 | 
 79 | instance Applicative (Results s) where
 80 |   pure = Results . pure . pure
 81 |   (<*>) = ap
 82 | 
 83 | instance Alternative (Results s) where
 84 |   empty = Results $ pure []
 85 |   Results sxs <|> Results sys = Results $ (<|>) <$> sxs <*> sys
 86 | 
 87 | instance Monad (Results s) where
 88 |   return = pure
 89 |   Results stxs >>= f = Results $ do
 90 |     xs <- stxs
 91 |     concat <$> mapM (unResults . f) xs
 92 | 
 93 | instance Semigroup (Results s a) where
 94 |   (<>) = (<|>)
 95 | 
 96 | instance Monoid (Results s a) where
 97 |   mempty = empty
 98 |   mappend = (<>)
 99 | 
100 | -------------------------------------------------------------------------------
101 | -- * States and continuations
102 | -------------------------------------------------------------------------------
103 | data BirthPos
104 |   = Previous
105 |   | Current
106 |   deriving Eq
107 | 
108 | -- | An Earley state with result type @a@.
109 | data State s r e t a where
110 |   State :: !(ProdR s r e t a)
111 |         -> !(a -> Results s b)
112 |         -> !BirthPos
113 |         -> !(Conts s r e t b c)
114 |         -> State s r e t c
115 |   Final :: !(Results s a) -> State s r e t a
116 | 
117 | -- | A continuation accepting an @a@ and producing a @b@.
118 | data Cont s r e t a b where
119 |   Cont      :: !(a -> Results s b)
120 |             -> !(ProdR s r e t (b -> c))
121 |             -> !(c -> Results s d)
122 |             -> !(Conts s r e t d e')
123 |             -> Cont s r e t a e'
124 |   FinalCont :: (a -> Results s c) -> Cont s r e t a c
125 | 
126 | data Conts s r e t a c = Conts
127 |   { conts     :: !(STRef s [Cont s r e t a c])
128 |   , contsArgs :: !(STRef s (Maybe (STRef s (Results s a))))
129 |   }
130 | 
131 | newConts :: STRef s [Cont s r e t a c] -> ST s (Conts s r e t a c)
132 | newConts r = Conts r <$> newSTRef Nothing
133 | 
134 | contraMapCont :: (b -> Results s a) -> Cont s r e t a c -> Cont s r e t b c
135 | contraMapCont f (Cont g p args cs) = Cont (f >=> g) p args cs
136 | contraMapCont f (FinalCont args)   = FinalCont (f >=> args)
137 | 
138 | contToState :: BirthPos -> Results s a -> Cont s r e t a c -> State s r e t c
139 | contToState pos r (Cont g p args cs) = State p (\f -> r >>= g >>= args . f) pos cs
140 | contToState _   r (FinalCont args)   = Final $ r >>= args
141 | 
142 | -- | Strings of non-ambiguous continuations can be optimised by removing
143 | -- indirections.
144 | simplifyCont :: Conts s r e t b a -> ST s [Cont s r e t b a]
145 | simplifyCont Conts {conts = cont} = readSTRef cont >>= go False
146 |   where
147 |     go !_ [Cont g (Pure f) args cont'] = do
148 |       ks' <- simplifyCont cont'
149 |       go True $ map (contraMapCont $ g >=> args . f) ks'
150 |     go True ks = do
151 |       writeSTRef cont ks
152 |       return ks
153 |     go False ks = return ks
154 | 
155 | -------------------------------------------------------------------------------
156 | -- * Grammars
157 | -------------------------------------------------------------------------------
158 | -- | Given a grammar, construct an initial state.
159 | initialState :: ProdR s a e t a -> ST s (State s a e t a)
160 | initialState p = State p pure Previous <$> (newConts =<< newSTRef [FinalCont pure])
161 | 
162 | -------------------------------------------------------------------------------
163 | -- * Parsing
164 | -------------------------------------------------------------------------------
165 | -- | A parsing report, which contains fields that are useful for presenting
166 | -- errors to the user if a parse is deemed a failure.  Note however that we get
167 | -- a report even when we successfully parse something.
168 | data Report e i = Report
169 |   { position   :: Int -- ^ The final position in the input (0-based) that the
170 |                       -- parser reached.
171 |   , expected   :: [e] -- ^ The named productions processed at the final
172 |                       -- position.
173 |   , unconsumed :: i   -- ^ The part of the input string that was not consumed,
174 |                       -- which may be empty.
175 |   } deriving (Eq, Ord, Read, Show)
176 | 
177 | -- | The result of a parse.
178 | data Result s e i a
179 |   = Ended (Report e i)
180 |     -- ^ The parser ended.
181 |   | Parsed (ST s [a]) Int i (ST s (Result s e i a))
182 |     -- ^ The parser parsed a number of @a@s.  These are given as a computation,
183 |     -- @'ST' s [a]@ that constructs the 'a's when run.  We can thus save some
184 |     -- work by ignoring this computation if we do not care about the results.
185 |     -- The 'Int' is the position in the input where these results were
186 |     -- obtained, the @i@ the rest of the input, and the last component is the
187 |     -- continuation.
188 |   deriving Functor
189 | 
190 | data ParseEnv s e i t a = ParseEnv
191 |   { results :: ![ST s [a]]
192 |     -- ^ Results ready to be reported (when this position has been processed)
193 |   , next    :: ![State s a e t a]
194 |     -- ^ States to process at the next position
195 |   , reset   :: !(ST s ())
196 |     -- ^ Computation that resets the continuation refs of productions
197 |   , names   :: ![e]
198 |     -- ^ Named productions encountered at this position
199 |   , curPos  :: !Int
200 |     -- ^ The current position in the input string
201 |   , input   :: !i
202 |     -- ^ The input string
203 |   }
204 | 
205 | {-# INLINE emptyParseEnv #-}
206 | emptyParseEnv :: i -> ParseEnv s e i t a
207 | emptyParseEnv i = ParseEnv
208 |   { results = mempty
209 |   , next    = mempty
210 |   , reset   = return ()
211 |   , names   = mempty
212 |   , curPos  = 0
213 |   , input   = i
214 |   }
215 | 
216 | {-# SPECIALISE parse :: [State s a e t a]
217 |                      -> ParseEnv s e [t] t a
218 |                      -> ST s (Result s e [t] a) #-}
219 | -- | The internal parsing routine
220 | parse :: ListLike i t
221 |       => [State s a e t a] -- ^ States to process at this position
222 |       -> ParseEnv s e i t a
223 |       -> ST s (Result s e i a)
224 | parse [] env@ParseEnv {results = [], next = []} = do
225 |   reset env
226 |   return $ Ended Report
227 |     { position   = curPos env
228 |     , expected   = names env
229 |     , unconsumed = input env
230 |     }
231 | parse [] env@ParseEnv {results = []} = do
232 |   reset env
233 |   parse (next env)
234 |         (emptyParseEnv $ ListLike.drop 1 $ input env) {curPos = curPos env + 1}
235 | parse [] env = do
236 |   reset env
237 |   return $ Parsed (concat <$> sequence (results env)) (curPos env) (input env)
238 |          $ parse [] env {results = [], reset = return ()}
239 | parse (st:ss) env = case st of
240 |   Final res -> parse ss env {results = unResults res : results env}
241 |   State pr args pos scont -> case pr of
242 |     Terminal f p -> case ListLike.uncons (input env) >>= f . fst of
243 |       Just a -> parse ss env {next = State p (args . ($ a)) Previous scont
244 |                                    : next env}
245 |       Nothing -> parse ss env
246 |     NonTerminal r p -> do
247 |       rkref <- readSTRef $ ruleConts r
248 |       ks    <- readSTRef rkref
249 |       writeSTRef rkref (Cont pure p args scont : ks)
250 |       ns    <- unResults $ ruleNulls r
251 |       let addNullState
252 |             | null ns = id
253 |             | otherwise = (:)
254 |                         $ State p (\f -> Results (pure $ map f ns) >>= args) pos scont
255 |       if null ks then do -- The rule has not been expanded at this position.
256 |         st' <- State (ruleProd r) pure Current <$> newConts rkref
257 |         parse (addNullState $ st' : ss)
258 |               env {reset = resetConts r >> reset env}
259 |       else -- The rule has already been expanded at this position.
260 |         parse (addNullState ss) env
261 |     Pure a
262 |       -- Skip following continuations that stem from the current position; such
263 |       -- continuations are handled separately.
264 |       | pos == Current -> parse ss env
265 |       | otherwise -> do
266 |         let argsRef = contsArgs scont
267 |         masref  <- readSTRef argsRef
268 |         case masref of
269 |           Just asref -> do -- The continuation has already been followed at this position.
270 |             modifySTRef asref $ mappend $ args a
271 |             parse ss env
272 |           Nothing    -> do -- It hasn't.
273 |             asref <- newSTRef $ args a
274 |             writeSTRef argsRef $ Just asref
275 |             ks  <- simplifyCont scont
276 |             res <- lazyResults $ unResults =<< readSTRef asref
277 |             let kstates = map (contToState pos res) ks
278 |             parse (kstates ++ ss)
279 |                   env {reset = writeSTRef argsRef Nothing >> reset env}
280 |     Alts as (Pure f) -> do
281 |       let args' = args . f
282 |           sts   = [State a args' pos scont | a <- as]
283 |       parse (sts ++ ss) env
284 |     Alts as p -> do
285 |       scont' <- newConts =<< newSTRef [Cont pure p args scont]
286 |       let sts = [State a pure Previous scont' | a <- as]
287 |       parse (sts ++ ss) env
288 |     Many p q -> mdo
289 |       r <- mkRule $ pure [] <|> (:) <$> p <*> NonTerminal r (Pure id)
290 |       parse (State (NonTerminal r q) args pos scont : ss) env
291 |     Named pr' n -> parse (State pr' args pos scont : ss)
292 |                          env {names = n : names env}
293 |     Constraint pr' c -> parse (State pr' (test >=> args) pos scont : ss) env
294 |       where test x = if c x then return x else empty
295 | 
296 | type Parser e i a = forall s. i -> ST s (Result s e i a)
297 | 
298 | {-# INLINE parser #-}
299 | -- | Create a parser from the given grammar.
300 | parser
301 |   :: ListLike i t
302 |   => (forall r. Grammar r (Prod r e t a))
303 |   -> Parser e i a
304 | parser g i = do
305 |   let nt x = NonTerminal x $ pure id
306 |   s <- initialState =<< runGrammar (fmap nt . mkRule) g
307 |   parse [s] $ emptyParseEnv i
308 | 
309 | -- | Return all parses from the result of a given parser. The result may
310 | -- contain partial parses. The 'Int's are the position at which a result was
311 | -- produced.
312 | --
313 | -- The elements of the returned list of results are sorted by their position in
314 | -- ascending order.  If there are multiple results at the same position they
315 | -- are returned in an unspecified order.
316 | allParses
317 |   :: Parser e i a
318 |   -> i
319 |   -> ([(a, Int)], Report e i)
320 | allParses p i = runST $ p i >>= go
321 |   where
322 |     go :: Result s e i a -> ST s ([(a, Int)], Report e i)
323 |     go r = case r of
324 |       Ended rep           -> return ([], rep)
325 |       Parsed mas cpos _ k -> do
326 |         as <- mas
327 |         fmap (first (map (, cpos) as ++)) $ go =<< k
328 | 
329 | {-# INLINE fullParses #-}
330 | -- | Return all parses that reached the end of the input from the result of a
331 | -- given parser.
332 | --
333 | -- If there are multiple results they are returned in an unspecified order.
334 | fullParses
335 |   :: ListLike i t
336 |   => Parser e i a
337 |   -> i
338 |   -> ([a], Report e i)
339 | fullParses p i = runST $ p i >>= go
340 |   where
341 |     go :: ListLike i t => Result s e i a -> ST s ([a], Report e i)
342 |     go r = case r of
343 |       Ended rep -> return ([], rep)
344 |       Parsed mas _ i' k
345 |         | ListLike.null i' -> do
346 |           as <- mas
347 |           fmap (first (as ++)) $ go =<< k
348 |         | otherwise -> go =<< k
349 | 
350 | {-# INLINE report #-}
351 | -- | See e.g. how far the parser is able to parse the input string before it
352 | -- fails.  This can be much faster than getting the parse results for highly
353 | -- ambiguous grammars.
354 | report
355 |   :: Parser e i a
356 |   -> i
357 |   -> Report e i
358 | report p i = runST $ p i >>= go
359 |   where
360 |     go :: Result s e i a -> ST s (Report e i)
361 |     go r = case r of
362 |       Ended rep      -> return rep
363 |       Parsed _ _ _ k -> go =<< k
364 | 


--------------------------------------------------------------------------------
/docs/implementation.md:
--------------------------------------------------------------------------------
  1 | % Earley's internals
  2 | % Olle Fredriksson
  3 | % Late 2015
  4 | 
  5 | This is an attempt to document the internals of the Earley library.  The
  6 | writing of this document was spurred both by a request to provide this
  7 | documentation to help contributors and by some recent discussion regarding the
  8 | readability of the implementation.
  9 | 
 10 | I'll first start with some background that many of you probably already know
 11 | about. Feel free to skip ahead or skim.
 12 | 
 13 | Background
 14 | ===
 15 | 
 16 | Let's start by having a look at how parser combinator libraries usually work.
 17 | I'm aware that this doesn't give the full picture in terms of available
 18 | libraries for parsing --- it ignores e.g. parser generators --- but I believe
 19 | this way actually gives the best intuition for how the Earley library works.
 20 | On the surface I think that the Earley library is more similar to combinator
 21 | libraries than to parser generators because of the (Applicative, embedded in a
 22 | powerful host language) interface that it provides. However, the
 23 | *functionality* that it provides may more accurately be compared to parser
 24 | generator libraries.
 25 | 
 26 | Parser combinators
 27 | -------
 28 | 
 29 | I know of three ways that parser combinator libraries handle alternatives:
 30 | 
 31 | * *Greedy*, or LL(1): Commit to an alternative the instant it has consumed a
 32 |   symbol.  Always start with the leftmost alternative. This is classical
 33 |   recursive descent.
 34 | * *Backtracking*: Go back and try the other alternatives if an
 35 |   alternative fails (i.e. depth-first).
 36 | * *Parallel*: Try all alternatives in parallel (i.e. breadth-first)
 37 | 
 38 | Some examples of libraries in these categories are
 39 | 
 40 | * [Parsec](https://hackage.haskell.org/package/parsec), which is greedy by default, but backtracks from wherever the `try`
 41 |   combinator is used,
 42 | * [Attoparsec](https://hackage.haskell.org/package/attoparsec), which is backtracking, and
 43 | * [Parsek](https://hackage.haskell.org/package/parsek) (a.k.a. parallel parsing processes), which is parallel.
 44 | 
 45 | The following is a small expression language given in BNF:
 46 | 
 47 | ```
 48 | ident ::= 'x' | 'y'
 49 | expr  ::= ident | ident '+' expr
 50 | ```
 51 | 
 52 | An example string in this language is `"x+y"`.  Let's investigate how the three
 53 | different kinds of alternative handling cope with this input string.
 54 | 
 55 | * Greedy:
 56 | 
 57 |     This starts by choosing the first `expr` branch and parsing an `'x'`,
 58 |     i.e.  the derivation
 59 |     ```
 60 |     expr ---->{1} ident
 61 |          ---->{1} 'x'
 62 |          -->{'x'} EMPTY
 63 |          -------> FAIL
 64 |     ```
 65 |     I use the following kinds of transitions in the derivation:
 66 |     - ` a  rest ---->{i} a_i rest`: expand the leftmost non-terminal's `i`th production.
 67 |     - `'x' rest -->{'x'} rest`: consume token 'x' from the input.
 68 | 
 69 |     Since the parser has managed to parse a symbol at this point, it has
 70 |     committed to the first `expr` branch and makes no further progress. Here I
 71 |     use `EMPTY` to denote the empty production (sometimes also called epsilon).
 72 | * Backtracking:
 73 | 
 74 |     We take the transition `FAIL ->{back} y` to mean backtracking to some
 75 |     previous state. I'll use indentation to indicate what the state is; this is
 76 |     slightly informal (in an implementation we would have to save some piece of
 77 |     state), but hopefully clear enough.
 78 |     ```
 79 |     expr ---->{1} ident
 80 |          ---->{1} 'x'
 81 |          ------> FAIL
 82 |          ->{back}
 83 |     expr ---->{2} ident '+' expr
 84 |          ---->{1} 'x' '+' expr 
 85 |          -->{'x'} '+' expr
 86 |          -->{'+'} expr
 87 |          ---->{1} ident ---->{1} 'x'
 88 |                         -------> FAIL
 89 |                         ->{back}
 90 |                   ident ---->{2} 'y'
 91 |                         -->{'y'} EMPTY
 92 |                         -->{EOF} SUCCESS
 93 |     ```
 94 | * Parallel:
 95 | 
 96 |     Here we use the syntax `a rest ---->{*} (a_1 rest | ... | a_k rest)`
 97 |     for expanding all the productions of a non-terminal.
 98 |     ```
 99 |     expr ---->{*} ident | ident '+' expr
100 |          ---->{*} 'x' | 'y' | 'x' '+' expr | 'y' '+' expr
101 |          -->{'x'} EMPTY | '+' expr
102 |          -->{'+'} expr
103 |          ---->{*} ident | ident '+' expr
104 |          ---->{*} 'x' | 'y' | 'x' '+' expr | 'y' '+' expr
105 |          -->{'y'} EMPTY | '+' expr
106 |          -->{EOF} SUCCESS
107 |     ```
108 | 
109 | We saw that greedy parsing did not succeed like the others. So what is it good
110 | for? First of all, we can rewrite the grammar and make it work. For example, as
111 | follows:
112 | 
113 | ```
114 | ident ::= 'x' | 'y'
115 | tail ::= '+' ident tail | EMPTY
116 | expr  ::= ident tail
117 | ```
118 | 
119 | For the example string, `"x+y"`, we get the following derivation:
120 | 
121 | ```
122 | expr ---->{1} ident tail
123 |      ---->{1} 'x' tail
124 |      -->{'x'} tail
125 |      ---->{1} '+' ident tail
126 |      -->{'+'} ident tail ---->{1} 'x' tail
127 |                          -------> FAIL
128 |                          ->{back}
129 |               ident tail ---->{2} 'y' tail
130 |                          -->{'y'} tail ---->{1} '+' ident tail
131 |                                        -------> FAIL
132 |                                        ->{back}
133 |                                   tail ---->{2} EMPTY
134 |                                        -------> SUCCESS
135 | ```
136 | 
137 | Note that this parser also backtracks, but only if it hasn't consumed
138 | any input in the branch it backtracks from. The nice property of a greedy
139 | parser, compared to a backtracking parser, is that it never backtracks *in the
140 | input string*.  This means that it is potentially more efficient than a
141 | backtracking parser, because it does not have to keep references to old
142 | positions in the input string when keeping backtracking states. I say
143 | potentially faster, because Attoparsec, probably the fastest parser combinator
144 | library for Haskell currently in existence, does full backtracking seemingly
145 | without suffering from any performance penalty.
146 | 
147 | Both backtracking and parallel parsing have the same power in terms of the
148 | languages that they recognise.  If we are only interested in recognition and
149 | our language is not LL(1) or not easily factorable into LL(1), backtracking
150 | parsing is a practical choice, since it often does less work than a
151 | parallel parser. Note, however, that there are cases where a backtracking
152 | parser has to do as much work as a parallel parser. If we are interested in
153 | getting *all* the ways a given string is parsable, parallel parsing is
154 | a natural choice.
155 | 
156 | The limitations of parser combinators
157 | ---------
158 | 
159 | * Language:
160 | 
161 |     Backtracking and parallel parsers are guaranteed to terminate only for any
162 |     LL(k) language. For example, they do not handle left-recursive grammars,
163 |     i.e. grammars where we have derivations of the form `p -->+ p rest`.
164 | 
165 |     As an example, they cannot handle the following grammar, because `expr` is
166 |     left-recursive:
167 | 
168 |     ```
169 |     ident ::= 'x' | 'y'
170 |     expr  ::= ident | expr '+' ident
171 |     ```
172 | 
173 |     In practice, we can often perform *left-recursion removal*. The cost of
174 |     this is that the intended meaning of the grammar is sometimes obscured.
175 | 
176 | * Worst-case running time:
177 | 
178 |     There are many grammars for which parsing is exponential.
179 | 
180 |     Consider:
181 | 
182 |     ```
183 |     x ::= '(' x x ')' | EMPTY
184 |     ```
185 | 
186 |     Let's see what happens in trying to parse the string `"(())"` in parallel:
187 | 
188 |     ```
189 |     x ---->{*} '(' x x ')' | EMPTY
190 |       -->{'('} x x ')'
191 |       ---->{*} '(' x x ')' x ')' | x ')'
192 |       ---->{*} '(' x x ')' x ')' | '(' x x ')' ')' | ')'
193 |       -->{'('} x x ')' x ')' | x x ')' ')'
194 |       ---->{*} '(' x x ')' x x ')' x ')' | x ')' x ')' | x x ')' ')'
195 |       ---->{*} '(' x x ')' x x ')' x ')' | '(' x x ')' ')' x ')' | ')' x ')' | x x ')' ')'
196 |       ---->{*} '(' x x ')' x x ')' x ')' | '(' x x ')' ')' x ')' | ')' x ')' | '(' x x ')' x ')' ')' | x ')' ')'
197 |       ---->{*} '(' x x ')' x x ')' x ')' | '(' x x ')' ')' x ')' | ')' x ')' | '(' x x ')' x ')' ')' | '(' x x ')' ')' ')' | ')' ')'
198 |       -->{')'} x ')' | ')'
199 |       ---->{*} '(' x x ')' ')' | ')' | ')'
200 |       -->{')'} EMPTY | EMPTY
201 |       -------> SUCCESS
202 |     ```
203 |     We can at least see that the number of parallel productions that we are
204 |     processing blows up quite quickly.
205 |     I find it easiest to see this problem in the parallel strategy, but note
206 |     that backtracking parsing has to do the same amount of work as the parallel
207 |     parser for certain inputs.
208 | 
209 |     This should also have convinced you that quite a lot of the work is duplicated.
210 | 
211 | 
212 | Both of these problems stem from the way that parser combinator grammars are
213 | written.
214 | Most recursive grammars written using parser combinators are not finite, even though
215 | they have a finite representation, and *there is no way to detect this*.
216 | 
217 | This is similar to how we can't detect (without using dirty, unsafe tricks)
218 | that we will never reach the end of the list defined by `ones = 1 : ones`.
219 | 
220 | In this light, let's look at the left-recursive grammar again:
221 | 
222 | ```
223 | expr ::= ident | expr '+' ident
224 | ```
225 | 
226 | As a parser combinator grammar, this is equivalent to the following, which
227 | elucidates the left-recursion problem:
228 | 
229 | ```
230 | expr ::= ident | (ident | (ident | (ident | ...) '+' ident) '+' ident) '+' ident
231 | ```
232 | 
233 | Since parser combinators have no way of distinguishing between non-terminals
234 | and their productions, they also have *no way* of sharing work between
235 | invocations of the same non-terminal, because they cannot detect that this
236 | has happened in the first place.
237 | 
238 | 
239 | How not to repeat yourself yourself
240 | -----
241 | 
242 | I know of two situations where there is potential to share work.
243 | 
244 | The first is when we encounter the same non-terminal at the same position.
245 | As an example, there is no need to re-do the work of `x` in both branches
246 | of `a` in the following grammar.
247 | 
248 | ```
249 | a ::= x rest1 | x rest2
250 | ```
251 | 
252 | A better idea would be to just parse `x`
253 | once and carry on with `rest1 | rest2`.
254 | Pictorially, we want the following branching structure:
255 | ```
256 |        rest1
257 |       /
258 |  a---x
259 |       \
260 |        rest2
261 | ```
262 | We do not want the following, which we have in parallel parsing:
263 | ```
264 |    x--rest1
265 |   /
266 |  a
267 |   \
268 |    x--rest2
269 | ```
270 | 
271 | A more general example is whenever we are parsing two unrelated branches at the same position, such as:
272 | ```
273 | a ::= start1 x rest1
274 | b ::= start2 x rest2
275 | ```
276 | If we are parsing `a` and `b` at the same time we also want to share the work
277 | of the non-terminal `x` whenever it's encountered at the same position:
278 | ```
279 | a---start1   --rest1
280 |           \ /
281 |            x
282 |           / \
283 | b---start2   --rest2
284 | ```
285 | What we don't want is to keep the two branches independent:
286 | ```
287 | a---start1---x---rest1
288 | 
289 | b---start2---x---rest2
290 | ```
291 | 
292 | It turns out that if we never re-do the work of a non-terminal, we can
293 | automatically gain support for left-recursive grammars, since the main problem
294 | there is the infinite expansion of recursive non-terminals.
295 | 
296 | The second way we can share work is perhaps less obvious. Here I will use an
297 | extended BNF syntax where productions can have alternatives anywhere in the
298 | tree. As an example
299 | 
300 | ```
301 | a ::= (x | y) rest
302 | ```
303 | means that `a` accepts either an `x` or a `y`, followed by `rest`.
304 | One (bad) way to handle `(x | y) rest` is to desugar it as follows:
305 | 
306 | ```
307 | (x | y) rest = x rest | y rest
308 | ```
309 | 
310 | Now let's say that in parsing `a`, both branches `x` and `y` are successful
311 | and finish at the same time.  Unless `rest` is a non-terminal and falls
312 | into the work-sharing situation above, we will be parsing two instances of
313 | the same thing. It would be more efficient if we could share the work of
314 | `rest`.
315 | 
316 | Pictorially, we want the following branching structure:
317 | ```
318 |   x--
319 |  /   \
320 | a     rest
321 |  \   /
322 |   y--
323 | 
324 | ```
325 | We do not want the following, which we sometimes have in combinator libraries.
326 | ```
327 |   x---rest
328 |  /
329 | a
330 |  \
331 |   y---rest
332 | 
333 | ```
334 | 
335 | This second point also applies if instead of `x | y` we have a non-terminal
336 | that expands to multiple branches.
337 | 
338 | 
339 | The Earley library and the essence of Earley
340 | ---
341 | 
342 | There are already good presentations of Earley's algorithm available,
343 | so I will not repeat the full definition here. To give some context, here's the
344 | core of the algorithm, adapted from
345 | [Wikipedia](https://en.wikipedia.org/wiki/Earley_parser):
346 | 
347 | -----------
348 | 
349 | The state set at input position `k` is called `S(k)`. The parser is seeded with
350 | `S(0)` consisting of only the top-level rule. The parser then repeatedly executes
351 | three operations: prediction, scanning, and completion.
352 | 
353 | * _Prediction_: For every state in `S(k)` of the form `(X -> a . Y b, j)` (where `j`
354 |     is the origin position as above), add `(Y -> . y, k)` to `S(k)` for every production
355 |     in the grammar with `Y` on the left-hand side `(Y -> y)`.
356 | * _Scanning_: If `a` is the next symbol in the input stream, for every state in
357 |     `S(k)` of the form `(X -> a . 'a' b, j)`, add `(X -> a 'a' . b, j)` to `S(k+1)`.
358 | * _Completion_: For every state in `S(k)` of the form `(X -> y ., j)`, find
359 |     states in `S(j)` of the form `(Y -> a . X b, i)` and add
360 |     `(Y -> a X . b, i)` to `S(k)`.
361 | 
362 | It is important to note that duplicate states are not added to the state set,
363 | only new ones. These three operations are repeated until no new states can be
364 | added to the set. The set is generally implemented as a queue of states to
365 | process, with the operation to be performed depending on what kind of state it
366 | is.
367 | 
368 | -----------
369 | 
370 | The note that duplicate states are not added is very important. It means that:
371 | 
372 | 1. If the _prediction_ step expands the same non-terminal
373 |     multiple times at the same position, no more than one copy of each of the
374 |     non-terminal's productions are added to the current state set.
375 | 
376 | 2. If the _completion_ step completes a non-terminal multiple
377 |     times at the same position, no more than one copy of each of the
378 |     *completions* from the earlier state that it refers to are added to the
379 |     current state set.
380 | 
381 | These points are the essence of Earley parsing, and mean that we are work-sharing
382 | precisely as outlined above.
383 | 
384 | State sets
385 | ---
386 | 
387 | The state set does not have to be a set, as long as we follow the two points
388 | above (note that they make no use of set-specific properties, but only that we
389 | do not expand the same thing more than once per position).  We can use whatever
390 | representation of state collections that we find appropriate as long as we do
391 | that. In the Earley library, productions can contain functions, e.g.
392 | of type `token -> Bool` for matching input tokens with arbitrary predicates,
393 | which means that they are not in general comparable (they are not in the `Ord`
394 | typeclass). Since this means that we cannot use `Set` or similar containers, we
395 | instead use lists of states.
396 | 
397 | Additionally, we do not have to keep states from earlier positions around,
398 | as long as we have enough information to perform the completion step.
399 | 
400 | The Earley library keeps only two lists of Earley states: One for the current
401 | position and one for the next position, and follows the two points above by
402 | machinery that will now be described.
403 | 
404 | The representation of states is different from the classical presentations
405 | of Earley's algorithm.
406 | We represent a state as a production and a *continuation pointer*. We keep only
407 | the part of the production that is left to parse, i.e. we drop everything before
408 | "the dot" in Earley's states. We have a special `Final` state for
409 | when we are done.
410 | The continuation pointer is a mutable reference to a list of continuations,
411 | which correspond to possible completions in the original algorithm.
412 | For the moment we can think of a continuation as being the same kind
413 | of thing as a state, though technically they differ slightly because
414 | when we are also dealing with *parse results*, continuations accept
415 | such a parse result before they can meaningfully be followed.
416 | 
417 | Let's have a look at an example.
418 | A state (a production and a mutable reference to a list of states) might
419 | look a little bit like this:
420 | 
421 | ```
422 |   +------------------+
423 |   |                  |
424 |   v                  |
425 | cont1       (prod1, ptr)
426 | cont2
427 | 
428 | ```
429 | 
430 | I said that continuations are pretty much also states, so there might be
431 | more continuation pointers going left until we reach the special `Final`
432 | state.
433 | 
434 | The parsing operations
435 | ---
436 | 
437 | When parsing we have a list of such states for the current position,
438 | and a list for the next position. When all the states for the current
439 | position have been processed, we advance the position in the input string and
440 | make the next-states the current, and use the empty list for the new
441 | next-states.
442 | 
443 | The *scanning* step is done just like in Earley's original algorithm;
444 | when processing `(prod1, ptr)`,
445 | if `prod1 = 'x' prod1'` and the current position in the input is an `'x'`,
446 | then we add `(prod1', ptr)` to the list for the next position.
447 | 
448 | The *prediction* step, i.e. the expansion of a non-terminal, is a bit
449 | more peculiar. First we need to look at how a non-terminal is represented.
450 | 
451 | We are using the generalised representation of productions from above where
452 | alternatives can occur anywhere in the tree (which also means that productions
453 | are in the standard Haskell `Alternative` typeclass), so we just need to associate
454 | a production (and not e.g. a list of productions) with every non-terminal.
455 | 
456 | Every non-terminal is also associated with a mutable reference to a mutable
457 | reference (no, that is not a typo) to a list of continuations. So a non-terminal
458 | might look a bit like this:
459 | 
460 | ```
461 |       cont1
462 |       cont2
463 |       cont3
464 |        ^
465 |    +---+
466 |    |
467 |    |
468 | innerPtr<--+
469 |            |
470 |            |
471 | (prod, outerPtr)
472 | ```
473 | 
474 | The outermost reference (`outerPtr` above) is made sure to point to a fresh
475 | `innerPtr` that points to the empty list every time we advance the position in
476 | the input string.
477 | 
478 | Since both the production and the pointer depend on the type of the
479 | non-terminal, we cannot easily store this information in e.g. a `Map`
480 | because it would have to contain elements of different types (i.e. be
481 | *heterogeneous*).
482 | 
483 | The solution to this typed associated-information problem, perhaps obvious to
484 | some, is to store the associated information *in* the non-terminals.  The
485 | representation of non-terminals in the library is thus exactly this associated
486 | information: a production and the reference reference.
487 | 
488 | Now we are ready to do *prediction*.
489 | Let's say our current-list is `[(x prod1, ptr1), (x prod2, ptr2)]`,
490 | where the non-terminal `x = (xProd, xPtr)`, and our next-list is `[]`.
491 | This means that we have two states that begin with the same non-terminal,
492 | but end with different productions (`prod1` and `prod2`), and that additionally
493 | have continuations from before (`ptr1` and `ptr2`).
494 | 
495 | Assuming that it hasn't been expanded before at this position, `x` looks
496 | something like this:
497 | 
498 | ```
499 |       [ ]
500 |        ^
501 |    +---+
502 |    |
503 |    |
504 | innerXPtr<--+
505 |             |
506 |             |
507 |   (xProd, xPtr)
508 | ```
509 | 
510 | The *prediction* step takes place when we process our first state,
511 | `(x prod1, ptr1)`.
512 | We then add the continuation `(prod1, ptr1)` to the inner list that `x` refers
513 | to, and add the state `(xProd, innerXPtr)` to the current-list.
514 | 
515 | When we process the second state `(x prod2, ptr2)`, we can detect that `x` has
516 | already been expanded at this position, because the continuation list is
517 | non-empty. When this happens we add `(prod2, ptr2)` to the inner list that `x`
518 | refers to, but we *do not* add any new state to the current-list --- because
519 | `xProd` is already there. At this point we have the following pointer
520 | structure:
521 | 
522 | ```
523 |  (prod1, ptr1)
524 |  (prod2, ptr2)         current-list
525 |        ^            (xProd, innerXPtr)
526 |        |                        |
527 |    +---+------------------------+
528 |    |
529 | innerXPtr<--+
530 |             |
531 |             |
532 |   (xProd, xPtr)
533 | ```
534 | 
535 | We can see in the above that we are sharing the work of parsing `x`, and simply
536 | keep track of the two continuations for when that has been done.
537 | Here we can also see why we need a double reference from non-terminals. We
538 | clear every non-terminal when we advance the position, but we still
539 | want to keep the pointers between any states and continuations. Clearing
540 | in the above picture would mean to mutate `xPtr` to point to a fresh inner
541 | reference that points to the empty list. However, the `innerXPtr` from any
542 | states in the next-list stay intact.
543 | 
544 | Now we come to *completion*, which is what happens when we process a state
545 | of the form `(EMPTY, ptr)`. Then we simply look up the continuations
546 | by following `ptr` and add those to the current-list.
547 | Pretending that `xProd` was `EMPTY` In the above picture, we would add
548 | `(prod1, ptr1)` and `(prod2, ptr2)` to the current-list.
549 | 
550 | Since we do not tie alternatives to non-terminals, we have an additional
551 | operation which happens when we encounter alternatives. In the original
552 | Earley algorithm this is baked into the *prediction* step, and in our
553 | formulation of the algorithm this operation is basically the same
554 | as *prediction*.
555 | 
556 | When processing a state `((alt1 | alt2) prod, ptr)` we create
557 | a new continuation pointer, `newptr` that points to `(prod, ptr)`,
558 | and continue with the states `(alt1, newptr)` and `(alt2, newptr)`.
559 | With this operation in place we have to be more careful when we do *completion*.
560 | Just like we made sure that a non-terminal is only expanded once per position
561 | in the input we have to make sure that completion of a continuation is only
562 | done once per position. For now we can think of this as pairing a mutable
563 | boolean with each list of continuations that we point to, though this is
564 | slighly more complicated when we also have to deal with parse results.
565 | 
566 | TODO simplifyCont/Leo's optimisation
567 | 
568 | Recognition vs. parsing
569 | ---
570 | 
571 | A recogniser for a language is a program that decides if a given input string
572 | is in the language. Parsing additionally creates a parse tree.  So far we have
573 | only really discussed recognition.
574 | 
575 | In Haskell it is convenient to give an `Applicative` interface to our parsers,
576 | which allows us to attach semantic actions to parsers without having to
577 | construct and interpret an intermediate parse tree.
578 | 
579 | TODO explain delayed results and how they cope with infinitely and exponentially
580 | many result
581 | 


--------------------------------------------------------------------------------