├── .gitignore ├── AddLinks.lhs ├── AnIssueWithTokenParsers.lhs ├── ApplicativeStyle.lhs ├── CombinatorReview.lhs ├── ErrorMessages.lhs ├── FromClause.lhs ├── FunctionsAndTypesForParsing.lhs ├── GettingStarted.lhs ├── IntroductionToCombinators.lhs ├── LICENSE ├── ParseFile.lhs ├── ParseSQLQueries.lhs ├── ParseString.lhs ├── ParsingTpch.lhs ├── PrettyPrinting0.lhs ├── QueryExpressions.lhs ├── README.asciidoc ├── Setup.hs ├── SimpleSQLQueryParser0.lhs ├── Text └── Parsec │ └── String │ ├── Char.hs │ ├── Combinator.hs │ ├── Expr.hs │ └── Parsec.hs ├── TextParsecExpr.lhs ├── TextParsecPerm.lhs ├── TextParsecToken.lhs ├── ValueExpressions.lhs ├── VerySimpleExpressions.lhs ├── main.css ├── make_website.sh ├── render ├── FixReadme.lhs └── Render.lhs └── source_text /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /nohup.out 3 | /cabal.sandbox.config 4 | /.cabal-sandbox 5 | /.ghc.environment.x86_64-linux-8.6.5 6 | /dist-newstyle/ 7 | /dist/ -------------------------------------------------------------------------------- /AddLinks.lhs: -------------------------------------------------------------------------------- 1 | 2 | Little hack to add links to the navigation bars when generating the 3 | html website. 4 | 5 | > main :: IO () 6 | > main = interact addLinks 7 | 8 | 9 | > addLinks :: String -> String 10 | > addLinks [] = error "not found" 11 | > addLinks ('<':'/':'u':'l':'>':'\n':'<':'/':'d':'i':'v':'>':xs) = 12 | > "" ++ linkSection ++ "\n" ++ xs 13 | > addLinks (x:xs) = x : addLinks xs 14 | 15 | > linkSection :: String 16 | > linkSection = 17 | > "
\n\ 18 | > \\n" 25 | -------------------------------------------------------------------------------- /AnIssueWithTokenParsers.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[an-issue-with-token-parsers]] 3 | = An issue with token parsers 4 | 5 | This is a tutorial about an issue with the token parsing we have so 6 | far. 7 | 8 | > import Text.Parsec.String (Parser) 9 | > import Text.Parsec.String.Parsec (try) 10 | > import Text.Parsec.String.Combinator (many1, notFollowedBy) 11 | > import Text.Parsec.String.Char (digit, string, oneOf, satisfy, char, letter) 12 | > 13 | > import Control.Applicative ((<$>), (<*>), (<*), many, (<$), (<|>)) 14 | > import Control.Monad (void, guard) 15 | > 16 | > import qualified Text.Parsec.String.Expr as E 17 | > import FunctionsAndTypesForParsing 18 | 19 | Here is a simplified expression type and parser: 20 | 21 | > data SimpleExpr = Num Integer 22 | > | BinaryOp SimpleExpr String SimpleExpr 23 | > deriving (Eq,Show) 24 | > 25 | > simpleExpr :: Parser SimpleExpr 26 | > simpleExpr = E.buildExpressionParser table num 27 | > 28 | > table :: [[E.Operator SimpleExpr]] 29 | > table = [[binary "<=" E.AssocNone 30 | > ,binary ">=" E.AssocNone] 31 | > ,[binary "<" E.AssocNone 32 | > ,binary ">" E.AssocNone] 33 | > ] 34 | > where 35 | > binary name assoc = 36 | > E.Infix (mkBinOp name <$ symbol name) assoc 37 | > mkBinOp nm a b = BinaryOp a nm b 38 | > 39 | > num :: Parser SimpleExpr 40 | > num = Num <$> integer 41 | > 42 | > whitespace :: Parser () 43 | > whitespace = void $ many $ oneOf " \n\t" 44 | > 45 | > lexeme :: Parser a -> Parser a 46 | > lexeme p = p <* whitespace 47 | > 48 | > integer :: Parser Integer 49 | > integer = read <$> lexeme (many1 digit) 50 | > 51 | > symbol :: String -> Parser String 52 | > symbol s = lexeme $ string s 53 | 54 | Let's try it out: 55 | 56 | ``` 57 | *Main> regularParse simpleExpr "1=2" 58 | Right (Num 1) 59 | 60 | *Main> regularParse simpleExpr "1>=2" 61 | Right (BinaryOp (Num 1) ">=" (Num 2)) 62 | 63 | *Main> regularParse simpleExpr "1>2" 64 | Left (line 1, column 2): 65 | unexpected "2" 66 | expecting ">=" 67 | ``` 68 | 69 | What happened? The parser tried to parse > as >=, failed, and since 70 | the failure consumed some input (the first >), it failed completely. 71 | 72 | We are going to change the symbol parser to fix this. Here is a 73 | parameterized version of the simpleExpr parser so we can try a few 74 | variations out. 75 | 76 | > simpleExprP :: (String -> Parser String) -> Parser SimpleExpr 77 | > simpleExprP sym = E.buildExpressionParser (tableP sym) num 78 | > 79 | > tableP :: (String -> Parser String) -> [[E.Operator SimpleExpr]] 80 | > tableP sym = [[binary "<=" E.AssocNone 81 | > ,binary ">=" E.AssocNone] 82 | > ,[binary "<" E.AssocNone 83 | > ,binary ">" E.AssocNone]] 84 | > where 85 | > binary name assoc = 86 | > E.Infix (mkBinOp name <$ sym name) assoc 87 | > mkBinOp nm a b = BinaryOp a nm b 88 | 89 | Let's reproduce the failure: 90 | 91 | ``` 92 | *Main> regularParse (simpleExprP symbol) "1>=2" 93 | Right (BinaryOp (Num 1) ">=" (Num 2)) 94 | 95 | *Main> regularParse (simpleExprP symbol) "1>2" 96 | Left (line 1, column 2): 97 | unexpected "2" 98 | expecting ">=" 99 | ``` 100 | 101 | We are going to look at two possible solutions. 102 | 103 | 1. Let's use `try`: 104 | 105 | ``` 106 | *Main> regularParse (simpleExprP (try . symbol)) "1>=2" 107 | Right (BinaryOp (Num 1) ">=" (Num 2)) 108 | 109 | *Main> regularParse (simpleExprP (try . symbol)) "1>2" 110 | Right (BinaryOp (Num 1) ">" (Num 2)) 111 | ``` 112 | 113 | This seems to have done the job. There is still a problem 114 | though. Consider a case when the precedence is the other way round - 115 | the `<` and `>` are higher precedence than `<=` and `>=`, 116 | 117 | > simpleExprP1 :: (String -> Parser String) -> Parser SimpleExpr 118 | > simpleExprP1 sym = E.buildExpressionParser (tableP1 sym) num 119 | > 120 | > tableP1 :: (String -> Parser String) -> [[E.Operator SimpleExpr]] 121 | > tableP1 sym = [[binary "<" E.AssocNone 122 | > ,binary ">" E.AssocNone] 123 | > ,[binary "<=" E.AssocNone 124 | > ,binary ">=" E.AssocNone]] 125 | > where 126 | > binary name assoc = 127 | > E.Infix (mkBinOp name <$ sym name) assoc 128 | > mkBinOp nm a b = BinaryOp a nm b 129 | 130 | ``` 131 | *Main> regularParse (simpleExprP1 (try . symbol)) "1>2" 132 | Right (BinaryOp (Num 1) ">" (Num 2)) 133 | 134 | *Main> regularParse (simpleExprP1 (try . symbol)) "1>=2" 135 | Left (line 1, column 3): 136 | unexpected "=" 137 | expecting digit 138 | ``` 139 | 140 | Although the precendence order is a little contrived in this case, 141 | this issue could easily crop up for real when we start adding more 142 | operators. Let's fix it now. 143 | 144 | This could be solved by adding a `try` at a earlier place in the 145 | parsing. Because of how the `buildExpressionParser` function works, 146 | it's not obvious where the `try` could go. 147 | 148 | Let's try tackling the problem in a different way. One way of looking 149 | at this is to consider that the symbol parser stops parsing too soon: 150 | 151 | ``` 152 | *Main> parseWithLeftOver (symbol ">") ">=" 153 | Right (">","=") 154 | ``` 155 | 156 | What it should do is keep parsing symbol characters until it gets a 157 | result string which can't be a symbol, and stop one character before 158 | this.. 159 | 160 | Here is a slightly naive way of doing it, which will be good enough 161 | for quite a while: 162 | 163 | > symbol1 :: String -> Parser String 164 | > symbol1 s = try $ lexeme $ do 165 | > u <- many1 (oneOf "<>=+-^%/*") 166 | > guard (s == u) 167 | > return s 168 | 169 | Here is a similar alternative: 170 | 171 | > symbol2 :: String -> Parser String 172 | > symbol2 s = try $ lexeme $ do 173 | > void $ string s 174 | > notFollowedBy (oneOf "<>=+-^%/*") 175 | > return s 176 | 177 | Let's try them out: 178 | 179 | ``` 180 | *Main> parseWithLeftOver (symbol1 ">") ">=" 181 | Left (line 1, column 3): 182 | unexpected end of input 183 | 184 | *Main> parseWithLeftOver (symbol1 ">") ">" 185 | Right (">","") 186 | 187 | *Main> parseWithLeftOver (symbol1 ">") ">= 3" 188 | Left (line 1, column 3): 189 | unexpected " " 190 | 191 | *Main> parseWithLeftOver (symbol1 ">=") ">= 3" 192 | Right (">="," 3") 193 | 194 | 195 | ``` 196 | The error messages don't seem very good, but it parses and fails to 197 | parse correctly. 198 | 199 | ``` 200 | *Main> parseWithLeftOver (symbol2 ">") ">=" 201 | Left (line 1, column 3): 202 | unexpected '=' 203 | 204 | *Main> parseWithLeftOver (symbol2 ">") ">" 205 | Right (">","") 206 | 207 | *Main> parseWithLeftOver (symbol2 ">") ">= 3" 208 | Left (line 1, column 3): 209 | unexpected '=' 210 | 211 | *Main> parseWithLeftOver (symbol2 ">=") ">= 3" 212 | Right (">="," 3") 213 | ``` 214 | 215 | This one appears to give better error messages in this limited 216 | scenario, apart from that they both work the same. 217 | 218 | Let's try them out in the full expression parser: 219 | 220 | ``` 221 | *Main> regularParse (simpleExprP symbol1) "1>=2" 222 | Right (BinaryOp (Num 1) ">=" (Num 2)) 223 | 224 | *Main> regularParse (simpleExprP symbol1) "1>2" 225 | Right (BinaryOp (Num 1) ">" (Num 2)) 226 | 227 | *Main> regularParse (simpleExprP symbol2) "1>=2" 228 | Right (BinaryOp (Num 1) ">=" (Num 2)) 229 | 230 | *Main> regularParse (simpleExprP symbol2) "1>2" 231 | Right (BinaryOp (Num 1) ">" (Num 2)) 232 | ``` 233 | 234 | They both work fine here. Let's see some error messages in this 235 | context. 236 | 237 | ``` 238 | *Main> parseWithEof (simpleExprP symbol1) "1>*2" 239 | Left (line 1, column 4): 240 | unexpected "2" 241 | expecting operator 242 | 243 | *Main> parseWithEof (simpleExprP symbol2) "1>*2" 244 | Left (line 1, column 4): 245 | unexpected '*' 246 | expecting operator 247 | ``` 248 | 249 | Both error messages are a bit crap. So much for the second variation 250 | producing better error messages. 251 | 252 | Let's look at the equivalent issue with respect to keyword parsing. We 253 | can get a similar problem here. 254 | 255 | > keyword :: String -> Parser String 256 | > keyword s = try $ string s 257 | 258 | ``` 259 | *Main> parseWithEof (keyword "not") "not" 260 | Right "not" 261 | 262 | *Main> parseWithEof (keyword "not") "nothing" 263 | Left (line 1, column 4): 264 | unexpected 'h' 265 | expecting end of input 266 | 267 | *Main> parseWithEof (keyword "not" <|> keyword "nothing") "nothing" 268 | Left (line 1, column 4): 269 | unexpected 'h' 270 | expecting end of input 271 | 272 | *Main> parseWithEof (keyword "nothing" <|> keyword "not") "nothing" 273 | Right "nothing" 274 | ``` 275 | 276 | We can fix this overlapping prefix issue by reordering the 277 | choices. But let's fix the `keyword` parser in a similar way to the 278 | symbol parser. 279 | 280 | TODO: I don't know if symbol is the right name, I don't think Parsec 281 | usually uses symbol in this way. Maybe it should be called operator. 282 | 283 | > identifier :: Parser String 284 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar) 285 | > where 286 | > firstChar = letter <|> char '_' 287 | > nonFirstChar = digit <|> firstChar 288 | 289 | > keyword1 :: String -> Parser String 290 | > keyword1 k = try $ do 291 | > i <- identifier 292 | > guard (i == k) 293 | > return k 294 | 295 | TODO: later note in error messages about choosing identifier here 296 | instead of e.g. many1 letter. 297 | 298 | ``` 299 | *Main> parseWithEof (keyword1 "not") "not" 300 | Right "not" 301 | 302 | *Main> parseWithEof (keyword1 "not") "nothing" 303 | Left (line 1, column 8): 304 | unexpected end of input 305 | expecting digit, letter or "_" 306 | 307 | *Main> parseWithEof (keyword1 "not" <|> keyword1 "nothing") "nothing" 308 | Right "nothing" 309 | 310 | *Main> parseWithEof (keyword1 "nothing" <|> keyword1 "not") "nothing" 311 | Right "nothing" 312 | 313 | *Main> parseWithEof (keyword1 "not" <|> keyword1 "nothing") "not" 314 | Right "not" 315 | 316 | *Main> parseWithEof (keyword1 "nothing" <|> keyword1 "not") "not" 317 | Right "not" 318 | ``` 319 | 320 | Try implementing the `keyword2` parser which uses `notFollowedBy` 321 | instead of `guard`, using something analogous to the change from 322 | `symbol1` to `symbol2` above. 323 | 324 | After this, you can try reimplementing the expression parser from the 325 | Text.Parsec.Expr tutorial using the new symbol and keyword parsers. 326 | -------------------------------------------------------------------------------- /ApplicativeStyle.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[applicative-style-parsing-code]] 3 | = Applicative style parsing code 4 | 5 | Now we can go back over the expression parsing code written in the 6 | last tutorial, and make it much more concise, and also make it more 7 | readable. We are going to do this mainly by using functions from the 8 | typeclass Applicative. 9 | 10 | Remember you can (and should) use the functions `regularParse` and its 11 | variations (TODO list them here) to try out the all these parsers in 12 | ghci, and you can write your own variations to experiment with if you 13 | are unsure about anything. 14 | 15 | > import Text.Parsec.String (Parser) 16 | > import Text.Parsec.String.Char (oneOf, char, digit, letter, satisfy) 17 | > import Text.Parsec.String.Combinator (many1, chainl1) 18 | > import Control.Applicative ((<$>), (<*>), (<*), (*>), (<|>), many, (<$)) 19 | > import Control.Monad (void, ap) 20 | > import Data.Char (isLetter, isDigit) 21 | > import FunctionsAndTypesForParsing 22 | 23 | Here is the SimpleExpr type again: 24 | 25 | > data SimpleExpr = Num Integer 26 | > | Var String 27 | > | Add SimpleExpr SimpleExpr 28 | > | Parens SimpleExpr 29 | > deriving (Eq,Show) 30 | 31 | Here is the basic pattern behind most of the rewrites we are going to 32 | cover. Here is a function which takes a constructor and two parsers 33 | for the two arguments for the constructor. It parses the two 34 | arguments, then applies the constructor to the results: 35 | 36 | > myParser1 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c 37 | > myParser1 ctor pa pb = do 38 | > a <- pa 39 | > b <- pb 40 | > return $ ctor a b 41 | 42 | TODO: concrete example, plus examples at the bottom of this section 43 | (for ctor <$> a, ctor <$> a <*> b, ctor <$> a <*> b <*> c). 44 | 45 | This can be rewritten without the do syntactic sugar like this: 46 | 47 | > myParser2 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c 48 | > myParser2 ctor pa pb = 49 | > pa >>= \a -> pb >>= \b -> return $ ctor a b 50 | 51 | And can also be rewritten like this: 52 | 53 | > myParser3 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c 54 | > myParser3 ctor pa pb = ctor `fmap` pa `ap` pb 55 | 56 | (This uses functions from Applicative instead of Monad.) We replace 57 | the use of `>>=` with `fmap` and `ap`. This isn't always possible, but 58 | it often is. 59 | 60 | Here is the version using the operators for `fmap` and `ap` (`fmap` 61 | changed to `<$>`, and `ap` changed to `<*>`). 62 | 63 | > myParser4 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c 64 | > myParser4 ctor pa pb = ctor <$> pa <*> pb 65 | 66 | This style takes less typing, and is often much simpler to write and 67 | read. 68 | 69 | This pattern 'scales', you can use: 70 | 71 | ``` 72 | Ctor <$> pa 73 | ``` 74 | 75 | for a single argument constructor. This might also be familiar to you 76 | as 77 | 78 | ``` 79 | fmap Ctor pa 80 | ``` 81 | 82 | or 83 | 84 | ``` 85 | Ctor `fmap` pa 86 | ``` 87 | 88 | All of which mean the same thing, just slightly different spellings. 89 | 90 | This can also be written using Monad operators: 91 | 92 | ``` 93 | pa >>= liftM ctor 94 | ``` 95 | 96 | or 97 | 98 | ``` 99 | liftM ctor =<< pa 100 | ``` 101 | 102 | (`liftM` is in the module `Control.Monad`) 103 | 104 | These `liftM` versions effectively mean the same thing as the previous 105 | versions with `fmap` and `<$>`. 106 | 107 | You can use 108 | 109 | ``` 110 | Ctor <$> pa <*> pb <*> pc 111 | ``` 112 | 113 | for three args, and so on. So you use `<$>` between the pure 114 | constructor and the first arg, then `<*>` between each subsequent arg. 115 | 116 | Let's go over the simple expression parsers and try to rewrite them 117 | using this style. We will see a few other new functions. I will break 118 | things down into a lot of steps. 119 | 120 | == lexeme 121 | 122 | Here is the old lexeme parser, 'D' suffix for 'do notation'. 123 | 124 | > lexemeD :: Parser a -> Parser a 125 | > lexemeD p = do 126 | > x <- p 127 | > whitespace 128 | > return x 129 | 130 | > whitespace :: Parser () 131 | > whitespace = void $ many $ oneOf " \n\t" 132 | 133 | First, we can move the `whitespace` from its own separate line. 134 | 135 | > lexemeA0 :: Parser a -> Parser a 136 | > lexemeA0 p = do 137 | > x <- p <* whitespace 138 | > return x 139 | 140 | The expression `pa <* pb` means run `pa`, then run `pb`, and return 141 | the result of `pa`. It is sort of equivalent to this code: 142 | 143 | ```haskell 144 | (<*) :: Parser a -> Parser b -> Parser a 145 | (<*) pa pb = do 146 | a <- pa 147 | void pb 148 | return a 149 | ``` 150 | 151 | (It isn't implemented this way, since `(<*)` only needs Applicative 152 | and not Monad.) 153 | 154 | Now we can use the usual monad syntax rewrites, first eliminate the 155 | name `x`. 156 | 157 | > lexemeA1 :: Parser a -> Parser a 158 | > lexemeA1 p = do 159 | > p <* whitespace 160 | 161 | Now remove the redundant do: 162 | 163 | > lexemeA :: Parser a -> Parser a 164 | > lexemeA p = p <* whitespace 165 | 166 | == num 167 | 168 | Now let's tackle the `num` parser. 169 | 170 | > numD :: Parser SimpleExpr 171 | > numD = do 172 | > n <- lexemeD $ many1 digit 173 | > return $ Num $ read n 174 | 175 | Let's move 'read' to the first line. 176 | 177 | > numA0 :: Parser SimpleExpr 178 | > numA0 = do 179 | > n <- read <$> lexemeA (many1 digit) 180 | > return $ Num n 181 | 182 | This uses `(<$>)` which we saw above. You may have done code rewrites 183 | like this using `fmap` with IO in other Haskell code. 184 | 185 | Now let's move the `Num` ctor as well: 186 | 187 | > numA1 :: Parser SimpleExpr 188 | > numA1 = do 189 | > n <- (Num . read) <$> lexemeA (many1 digit) 190 | > return n 191 | 192 | You can also write it in this way: 193 | 194 | > numA2 :: Parser SimpleExpr 195 | > numA2 = do 196 | > n <- Num <$> read <$> lexemeA (many1 digit) 197 | > return n 198 | 199 | Why does this work? It it equivalent to the previous version partly 200 | because of the applicative laws. 201 | 202 | Let's break it down: 203 | 204 | > numA2'' :: Parser SimpleExpr 205 | > numA2'' = do 206 | > n <- numb 207 | > return n 208 | > where 209 | > numb :: Parser SimpleExpr 210 | > numb = Num <$> int 211 | > int :: Parser Integer 212 | > int = read <$> lexemeA (many1 digit) 213 | 214 | In terms of style, which do you think looks better: 215 | `(a . b) <$> p` or `a <$> b <$> p`. 216 | 217 | The next step for num, we can eliminate the temporary name `n` and the 218 | `do`: 219 | 220 | > numA3 :: Parser SimpleExpr 221 | > numA3 = (Num . read) <$> lexemeA (many1 digit) 222 | 223 | In more 'industrial' parser code, I would usually write some 224 | tokenization parsers separately like this: 225 | 226 | > integerA4 :: Parser Integer 227 | > integerA4 = read <$> lexemeA (many1 digit) 228 | 229 | Then the num expression parser looks like this: 230 | 231 | > numA4 :: Parser SimpleExpr 232 | > numA4 = Num <$> integerA4 233 | 234 | and we also get a integer parser which we can reuse if we need to 235 | parse an integer in another context. 236 | 237 | == var 238 | 239 | Here is the previous var parser: 240 | 241 | > varD :: Parser SimpleExpr 242 | > varD = lexemeA $ do 243 | > fc <- firstChar 244 | > rest <- many nonFirstChar 245 | > return $ Var (fc:rest) 246 | > where 247 | > firstChar = satisfy (\a -> isLetter a || a == '_') 248 | > nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_') 249 | 250 | The first thing we can do is to make the `firstChar` and 251 | `nonFirstChar` a little easier to read, using `(<|>)`, `char`, 252 | `letter` and `digit`: 253 | 254 | > varA0 :: Parser SimpleExpr 255 | > varA0 = lexemeA $ do 256 | > fl <- firstChar 257 | > rest <- many nonFirstChar 258 | > return $ Var (fl:rest) 259 | > where 260 | > firstChar = letter <|> char '_' 261 | > nonFirstChar = digit <|> firstChar 262 | 263 | Here is another way of making the function a little better: 264 | 265 | > varA0' :: Parser SimpleExpr 266 | > varA0' = lexemeA $ do 267 | > fl <- satisfy validFirstChar 268 | > rest <- many (satisfy validNonFirstChar) 269 | > return $ Var (fl:rest) 270 | > where 271 | > validFirstChar a = isLetter a || a == '_' 272 | > validNonFirstChar a = validFirstChar a || isDigit a 273 | 274 | We can lift the `(:)` using the Applicative operators. 275 | 276 | > varA1 :: Parser SimpleExpr 277 | > varA1 = do 278 | > i <- iden 279 | > return $ Var i 280 | > where 281 | > iden = lexemeA ((:) <$> firstChar <*> many nonFirstChar) 282 | > firstChar = letter <|> char '_' 283 | > nonFirstChar = digit <|> firstChar 284 | 285 | We used the prefix version of `(:)` to use it with `(<$>)` and 286 | `(<*>)`. The lexemeA call was moved to the `iden` helper function. 287 | 288 | Now tidy it up using `(<$>)` with the `Var` constructor: 289 | 290 | > varA2 :: Parser SimpleExpr 291 | > varA2 = Var <$> iden 292 | > where 293 | > iden = lexemeA ((:) <$> firstChar <*> many nonFirstChar) 294 | > firstChar = letter <|> char '_' 295 | > nonFirstChar = digit <|> firstChar 296 | 297 | We could also split the `iden` into a separate top level function, 298 | with the same idea as with splitting the `integer` parser. 299 | 300 | == parens 301 | 302 | Here is the starting point: 303 | 304 | > parensD :: Parser SimpleExpr 305 | > parensD = do 306 | > void $ lexemeA $ char '(' 307 | > e <- simpleExprD 308 | > void $ lexemeA $ char ')' 309 | > return $ Parens e 310 | 311 | Here is the rewrite in one step: 312 | 313 | > parensA0 :: Parser SimpleExpr 314 | > parensA0 = 315 | > Parens <$> (lexemeA (char '(') 316 | > *> simpleExprD 317 | > <* lexemeA (char ')')) 318 | 319 | // f'in asciidoc, the first operator following is (*>) 320 | 321 | Here you can see that there is a `(\*>)` which works in the opposite 322 | direction to `(<*)`. The precendence of these operators means that we 323 | have to use some extra parentheses (!) here. 324 | 325 | TODO: lost the chained <*. Put something below about this so there is 326 | a concrete example. 327 | 328 | == simple expr 329 | 330 | Here is the old version: 331 | 332 | > termD :: Parser SimpleExpr 333 | > termD = numD <|> varD <|> parensD 334 | 335 | > simpleExprD :: Parser SimpleExpr 336 | > simpleExprD = chainl1 termD op 337 | > where 338 | > op = do 339 | > void $ lexemeA $ char '+' 340 | > return Add 341 | 342 | We can simplify the `op` function using the techniques we've already 343 | seen: 344 | 345 | > simpleExprA0 :: Parser SimpleExpr 346 | > simpleExprA0 = chainl1 termD op 347 | > where op = lexemeA (char '+') *> return Add 348 | 349 | The pattern `p *> return f` can use a different operator like this: 350 | `f <$ p`. Here it is in the expression parser: 351 | 352 | > simpleExprA1 :: Parser SimpleExpr 353 | > simpleExprA1 = chainl1 termD op 354 | > where op = Add <$ lexemeA (char '+') 355 | 356 | You could also write the `op` parser inline: 357 | 358 | > simpleExprA2 :: Parser SimpleExpr 359 | > simpleExprA2 = chainl1 termD (Add <$ lexemeA (char '+')) 360 | 361 | Maybe this last step makes it less readable? 362 | 363 | == summary 364 | 365 | Here is the finished job for all the simple expression code without 366 | separate token parsers: 367 | 368 | > num' :: Parser SimpleExpr 369 | > num' = (Num . read) <$> lexemeA (many1 digit) 370 | 371 | > var' :: Parser SimpleExpr 372 | > var' = Var <$> iden 373 | > where 374 | > iden = lexemeA ((:) <$> firstChar <*> many nonFirstChar) 375 | > firstChar = letter <|> char '_' 376 | > nonFirstChar = digit <|> firstChar 377 | 378 | > parens' :: Parser SimpleExpr 379 | > parens' = 380 | > Parens <$> (lexemeA (char '(') 381 | > *> simpleExpr' 382 | > <* lexemeA (char ')')) 383 | 384 | > term' :: Parser SimpleExpr 385 | > term' = num' <|> var' <|> parens' 386 | 387 | > simpleExpr' :: Parser SimpleExpr 388 | > simpleExpr' = chainl1 term' op 389 | > where op = Add <$ lexemeA (char '+') 390 | 391 | Here they are with separate token parsers and a helper function: 392 | 393 | > lexeme :: Parser a -> Parser a 394 | > lexeme p = p <* whitespace 395 | 396 | > identifier :: Parser String 397 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar) 398 | > where 399 | > firstChar = letter <|> char '_' 400 | > nonFirstChar = digit <|> firstChar 401 | 402 | > integer :: Parser Integer 403 | > integer = read <$> lexeme (many1 digit) 404 | 405 | Here is a lexeme wrapper for parsing single character symbols. 406 | 407 | > symbol :: Char -> Parser () 408 | > symbol c = void $ lexeme $ char c 409 | 410 | Here is another little helper function. It barely pays its way in this 411 | short example, but even though it is only used once, I think it is 412 | worth it to make the code clearer. 413 | 414 | > betweenParens :: Parser a -> Parser a 415 | > betweenParens p = symbol '(' *> p <* symbol ')' 416 | 417 | Now the expression parsers: 418 | 419 | > num :: Parser SimpleExpr 420 | > num = Num <$> integer 421 | 422 | > var :: Parser SimpleExpr 423 | > var = Var <$> identifier 424 | 425 | > parens :: Parser SimpleExpr 426 | > parens = Parens <$> betweenParens simpleExpr 427 | 428 | > term :: Parser SimpleExpr 429 | > term = num <|> var <|> parens 430 | 431 | > simpleExpr :: Parser SimpleExpr 432 | > simpleExpr = chainl1 term' op 433 | > where op = Add <$ lexemeA (char '+') 434 | 435 | Splitting the lexer parser layer out means that we have one place 436 | where we have to remember to add `lexeme` wrappers, and also I think 437 | makes the code easier to follow. 438 | -------------------------------------------------------------------------------- /CombinatorReview.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[combinator-review]] 3 | = Combinator review 4 | 5 | In this tutorial we will go through all the functions in 6 | Text.Parsec.Combinator, and some useful ones in Control.Applicative 7 | and Control.Monad as well. 8 | 9 | > import Text.Parsec (ParseError) 10 | > import Text.Parsec.String (Parser) 11 | > import Text.Parsec.String.Parsec (try) 12 | > import Text.Parsec.String.Char (oneOf, char, digit 13 | > ,string, letter, satisfy) 14 | > import Text.Parsec.String.Combinator (many1, choice, chainl1, between 15 | > ,count, option, optionMaybe, optional) 16 | > import Control.Applicative ((<$>), (<*>), (<$), (<*), (*>), (<|>), many) 17 | > import Control.Monad (void, ap, mzero) 18 | > import Data.Char (isLetter, isDigit) 19 | > import FunctionsAndTypesForParsing 20 | 21 | == Text.Parsec.Combinator 22 | 23 | You should look at the source for these functions and try to 24 | understand how they are implemented. 25 | 26 | 27 | 28 | The style of the source code in the Parsec library sources is a little 29 | different to what we used at the end of the last tutorial. You can try 30 | reimplementing each of the Text.Parsec.Combinator module functions 31 | using the Applicative style. See if you can find a way to reassure 32 | yourself that the rewritten versions you make are correct, perhaps via 33 | writing automated tests, or perhaps some other method. 34 | 35 | You should be able to easily understand the implementation of all the 36 | functions in Text.Parsec.Combinator except possibly `anyToken` and 37 | `notFollowedBy`. 38 | 39 | === choice 40 | 41 | ```haskell 42 | choice :: [Parser a] -> Parser a 43 | ``` 44 | 45 | `choice ps` tries to apply the parsers in the list `ps` in order, 46 | until one of them succeeds. It returns the value of the succeeding 47 | parser. 48 | 49 | > a :: Parser Char 50 | > a = char 'a' 51 | > 52 | > b :: Parser Char 53 | > b = char 'b' 54 | > 55 | > aOrB :: Parser Char 56 | > aOrB = choice [a,b] 57 | 58 | ``` 59 | *Main> regularParse aOrB "a" 60 | Right 'a' 61 | 62 | *Main> regularParse aOrB "b" 63 | Right 'b' 64 | 65 | *Main> regularParse aOrB "c" 66 | Left (line 1, column 1): 67 | unexpected "c" 68 | expecting "a" or "b" 69 | ``` 70 | 71 | ==== using with try 72 | 73 | If a parser fails with `(<|>)` or `choice`, then it will only try the 74 | next parser if the last parser consumed no input. 75 | 76 | TODO: make the parsers return the keyword and update the examples 77 | 78 | > byKeyword :: Parser () 79 | > byKeyword = void $ string "by" 80 | > 81 | > betweenKeyword :: Parser () 82 | > betweenKeyword = void $ string "between" 83 | 84 | Since both of these have the same prefix - b - if we combine them 85 | using `choice` then it doesn't work correctly: 86 | 87 | ``` 88 | *Main> regularParse byKeyword "by" 89 | Right () 90 | 91 | *Main> regularParse byKeyword "between" 92 | Left (line 1, column 1): 93 | unexpected "e" 94 | expecting "by" 95 | 96 | *Main> regularParse betweenKeyword "between" 97 | Right () 98 | 99 | *Main> regularParse betweenKeyword "by" 100 | Left (line 1, column 1): 101 | unexpected "y" 102 | expecting "between" 103 | 104 | *Main> regularParse (choice [betweenKeyword,byKeyword]) "between" 105 | Right () 106 | 107 | *Main> regularParse (choice [betweenKeyword,byKeyword]) "by" 108 | Left (line 1, column 1): 109 | unexpected "y" 110 | expecting "between" 111 | 112 | *Main> regularParse (choice [byKeyword,betweenKeyword]) "between" 113 | Left (line 1, column 1): 114 | unexpected "e" 115 | expecting "by" 116 | 117 | *Main> regularParse (choice [byKeyword,betweenKeyword]) "by" 118 | Right () 119 | ``` 120 | 121 | If we use `try` on the first option, then it all works fine. 122 | 123 | ``` 124 | *Main> regularParse (choice [try byKeyword,betweenKeyword]) "by" 125 | Right () 126 | 127 | *Main> regularParse (choice [try byKeyword,betweenKeyword]) "between" 128 | Right () 129 | ``` 130 | 131 | === count 132 | 133 | ```haskell 134 | count :: Int -> Parser a -> Parser [a] 135 | ``` 136 | 137 | `count n p` parses `n` occurrences of `p`. If `n` is smaller or equal 138 | to zero, the parser is equivalent to `return []`. It returns a list of 139 | the n values returned by `p`. 140 | 141 | ``` 142 | *Main> regularParse (count 5 a) "aaaaa" 143 | Right "aaaaa" 144 | 145 | *Main> regularParse (count 5 a) "aaaa" 146 | Left (line 1, column 5): 147 | unexpected end of input 148 | expecting "a" 149 | 150 | *Main> regularParse (count 5 a) "aaaab" 151 | Left (line 1, column 5): 152 | unexpected "b" 153 | expecting "a" 154 | 155 | *Main> regularParse (count 5 aOrB) "aabaa" 156 | Right "aabaa" 157 | ``` 158 | 159 | === between 160 | 161 | ```haskell 162 | between :: Parser open -> Parser close -> Parser a -> Parser a 163 | ``` 164 | 165 | `between open close p` parses `open`, followed by `p` and 166 | `close`. It returns the value returned by `p`. 167 | 168 | We can replace the betweenParens from the previous tutorial using 169 | this: 170 | 171 | > betweenParens :: Parser a -> Parser a 172 | > betweenParens p = between (symbol '(') (symbol ')') p 173 | 174 | It hardly seems worth it to make this change, but it might be slightly 175 | quicker to read and understand if you aren't already familiar with 176 | some code or haven't viewed it for a while. This is good for 'code 177 | maintenance', where we need to fix bugs or add new features quickly to 178 | code we haven't looked at for two years or something. 179 | 180 | Here are the support functions for this parser. 181 | 182 | > symbol :: Char -> Parser Char 183 | > symbol c = lexeme $ char c 184 | 185 | > lexeme :: Parser a -> Parser a 186 | > lexeme p = p <* whitespace 187 | 188 | > whitespace :: Parser () 189 | > whitespace = void $ oneOf " \n\t" 190 | 191 | === option 192 | 193 | ```haskell 194 | option :: a -> Parser a -> Parser a 195 | ``` 196 | 197 | `option x p` tries to apply parser `p`. If `p` fails without 198 | consuming input, it returns the value `x`, otherwise the value returned 199 | by `p`. 200 | 201 | ``` 202 | *Main> regularParse (option "" (count 5 aOrB)) "aaaaa" 203 | Right "aaaaa" 204 | 205 | *Main> regularParse (option "" (count 5 aOrB)) "caaaa" 206 | Right "" 207 | 208 | *Main> regularParse (option "" (count 5 aOrB)) "aaaa" 209 | Left (line 1, column 5): 210 | unexpected end of input 211 | expecting "a" or "b" 212 | 213 | *Main> regularParse (option "" (count 5 aOrB)) "aaaac" 214 | Left (line 1, column 5): 215 | unexpected "c" 216 | expecting "a" or "b" 217 | 218 | *Main> regularParse (option "" (try (count 5 aOrB))) "aaaa" 219 | Right "" 220 | ``` 221 | 222 | === optionMaybe 223 | 224 | ```haskell 225 | optionMaybe :: Parser a -> Parser (Maybe a) 226 | ``` 227 | 228 | `optionMaybe p` tries to apply parser `p`. If `p` fails without consuming 229 | input, it returns `Nothing`, otherwise it returns `Just` the value returned 230 | by `p`. 231 | 232 | ``` 233 | *Main> regularParse (optionMaybe (count 5 aOrB)) "aaaaa" 234 | Right (Just "aaaaa") 235 | 236 | *Main> regularParse (optionMaybe (count 5 aOrB)) "caaaa" 237 | Right Nothing 238 | 239 | *Main> regularParse (optionMaybe (count 5 aOrB)) "caaa" 240 | Right Nothing 241 | 242 | *Main> regularParse (optionMaybe (count 5 aOrB)) "aaaa" 243 | Left (line 1, column 5): 244 | unexpected end of input 245 | expecting "a" or "b" 246 | 247 | *Main> regularParse (optionMaybe (count 5 aOrB)) "aaaac" 248 | Left (line 1, column 5): 249 | unexpected "c" 250 | expecting "a" or "b" 251 | 252 | *Main> regularParse (optionMaybe (try $ count 5 aOrB)) "aaaac" 253 | Right Nothing 254 | ``` 255 | 256 | === optional 257 | 258 | ```haskell 259 | optional :: Parser a -> Parser () 260 | ``` 261 | 262 | `optional p` tries to apply parser `p`. It will parse `p` or 263 | nothing. It only fails if `p` fails after consuming input. It discards 264 | the result of `p`. 265 | 266 | ``` 267 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "aaaaa" 268 | Right ((),"") 269 | 270 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "caaaa" 271 | Right ((),"caaaa") 272 | 273 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "caaa" 274 | Right ((),"caaa") 275 | 276 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "aaaa" 277 | Left (line 1, column 5): 278 | unexpected end of input 279 | expecting "a" or "b" 280 | 281 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "aaaac" 282 | Left (line 1, column 5): 283 | unexpected "c" 284 | expecting "a" or "b" 285 | 286 | *Main> parseWithLeftOver (optional (try $ count 5 aOrB)) "aaaac" 287 | Right ((),"aaaac") 288 | ``` 289 | 290 | === skipMany1 291 | 292 | ```haskell 293 | skipMany1 :: Parser a -> Parser () 294 | ``` 295 | 296 | `skipMany1 p` applies the parser `p` one or more times, skipping its result. 297 | 298 | === many1 299 | 300 | ```haskell 301 | many1 :: Parser a -> Parser [a] 302 | ``` 303 | 304 | many1 p applies the parser p one or more times. Returns a list of the 305 | returned values of p. 306 | 307 | ```haskell 308 | word = many1 letter 309 | ``` 310 | 311 | === sepBy 312 | 313 | ```haskell 314 | sepBy :: Parser a -> Parser sep -> Parser [a] 315 | ``` 316 | 317 | sepBy p sep parses zero or more occurrences of p, separated by 318 | sep. Returns a list of values returned by p. 319 | 320 | ```haskell 321 | commaSep p = p `sepBy` (symbol ",") 322 | ``` 323 | 324 | === sepBy1 325 | 326 | ```haskell 327 | sepBy1 :: Parser a -> Parser sep -> Parser [a] 328 | ``` 329 | 330 | sepBy1 p sep parses one or more occurrences of p, separated by 331 | sep. Returns a list of values returned by p. 332 | 333 | === endBy 334 | 335 | ```haskell 336 | endBy :: Parser a -> Parser sep -> Parser [a] 337 | ``` 338 | 339 | endBy p sep parses zero or more occurrences of p, seperated and ended 340 | by sep. Returns a list of values returned by p. 341 | 342 | ```haskell 343 | cStatements = cStatement `endBy` semi 344 | ``` 345 | 346 | === endBy1 347 | 348 | ```haskell 349 | endBy1 :: Parser a -> Parser sep -> Parser [a] 350 | ``` 351 | 352 | endBy1 p sep parses one or more occurrences of p, seperated and ended 353 | by sep. Returns a list of values returned by p. 354 | 355 | === sepEndBy 356 | 357 | ```haskell 358 | sepEndBy :: Parser a -> Parser sep -> Parser [a] 359 | ``` 360 | 361 | sepEndBy p sep parses zero or more occurrences of p, separated and 362 | optionally ended by sep, ie. haskell style statements. Returns a list 363 | of values returned by p. 364 | 365 | ```haskell 366 | haskellStatements = haskellStatement `sepEndBy` semi 367 | ``` 368 | 369 | === sepEndBy1 370 | 371 | ```haskell 372 | sepEndBy1 :: Parser a -> Parser sep -> Parser [a] 373 | ``` 374 | 375 | sepEndBy1 p sep parses one or more occurrences of p, separated and 376 | optionally ended by sep. Returns a list of values returned by p. 377 | 378 | === chainl 379 | 380 | ```haskell 381 | chainl :: Parser a -> Parser (a -> a -> a) -> a -> Parser a 382 | ``` 383 | 384 | chainl p op x parser zero or more occurrences of p, separated by 385 | op. Returns a value obtained by a left associative application of all 386 | functions returned by op to the values returned by p. If there are 387 | zero occurrences of p, the value x is returned. 388 | 389 | === chainl1 390 | 391 | ```haskell 392 | chainl1 :: Parser a -> Parser (a -> a -> a) -> Parser a 393 | ``` 394 | 395 | chainl1 p op x parser one or more occurrences of p, separated by op 396 | Returns a value obtained by a left associative application of all 397 | functions returned by op to the values returned by p. . This parser 398 | can for example be used to eliminate left recursion which typically 399 | occurs in expression grammars. 400 | 401 | === chainr 402 | 403 | ```haskell 404 | chainr :: Parser a -> Parser (a -> a -> a) -> a -> Parser a 405 | ``` 406 | 407 | chainr p op x parser zero or more occurrences of p, separated by op 408 | Returns a value obtained by a right associative application of all 409 | functions returned by op to the values returned by p. If there are no 410 | occurrences of p, the value x is returned. 411 | 412 | === chainr1 413 | 414 | ```haskell 415 | chainr1 :: Parser a -> Parser (a -> a -> a) -> Parser a 416 | ``` 417 | 418 | chainr1 p op x parser one or more occurrences of `p`, separated by op 419 | Returns a value obtained by a right associative application of all 420 | functions returned by op to the values returned by p. 421 | 422 | === eof 423 | 424 | ```haskell 425 | eof :: Parser () 426 | ``` 427 | 428 | This parser only succeeds at the end of the input. This is not a 429 | primitive parser but it is defined using notFollowedBy. 430 | 431 | ```haskell 432 | eof = notFollowedBy anyToken "end of input" 433 | ``` 434 | 435 | The () operator is used for error messages. We will come back to 436 | error messages after writing the basic SQL parser. 437 | 438 | === notFollowedBy 439 | 440 | ```haskell 441 | notFollowedBy :: Show a => Parser a -> Parser () 442 | ``` 443 | 444 | notFollowedBy p only succeeds when parser p fails. This parser does 445 | not consume any input. This parser can be used to implement the 446 | 'longest match' rule. For example, when recognizing keywords (for 447 | example let), we want to make sure that a keyword is not followed by a 448 | legal identifier character, in which case the keyword is actually an 449 | identifier (for example lets). We can program this behaviour as 450 | follows: 451 | 452 | ```haskell 453 | keywordLet = try (do{ string "let" 454 | ; notFollowedBy alphaNum 455 | }) 456 | ``` 457 | 458 | === manyTill 459 | 460 | ```haskell 461 | manyTill :: Parser a -> Parser end -> Parser [a] 462 | ``` 463 | 464 | manyTill p end applies parser p zero or more times until parser end 465 | succeeds. Returns the list of values returned by p. This parser can be 466 | used to scan comments: 467 | 468 | ```haskell 469 | simpleComment = do{ string "")) 471 | } 472 | ``` 473 | // asciidoc: the string should be "-->" 474 | 475 | Note the overlapping parsers anyChar and string "-\->", and therefore 476 | the use of the try combinator. 477 | 478 | === lookAhead 479 | 480 | ```haskell 481 | lookAhead :: Parser a -> Parser a 482 | ``` 483 | 484 | lookAhead p parses p without consuming any input. 485 | 486 | If p fails and consumes some input, so does lookAhead. Combine with 487 | try if this is undesirable. 488 | 489 | === anyToken 490 | 491 | ```haskell 492 | anyToken :: Parser Char 493 | ``` 494 | 495 | The parser anyToken accepts any kind of token. It is for example used 496 | to implement eof. Returns the accepted token. 497 | 498 | == Control.Applicative 499 | 500 | Here are the functions from Applicative that are used: 501 | 502 | `(<$>)`, `(<*>)`, `(<$)`, `(<*)`, `(*>)`, `(<|>)`, `many` 503 | 504 | TODO: examples for all of these 505 | 506 | We've already seen all of these, except `(<$)`. This is often used to 507 | parse a keyword and return a no argument constructor: 508 | 509 | > data Something = Type1 | Type2 | Type3 510 | 511 | > something :: Parser Something 512 | > something = choice [Type1 <$ string "type1" 513 | > ,Type2 <$ string "type2" 514 | > ,Type3 <$ string "type3"] 515 | 516 | // the first symbol following should be (<**>) 517 | // the backslashes are for asciidoc 518 | // todo: do this in the render.lhs? 519 | // I wish I understood why you need backslashes in some places 520 | // and not others 521 | 522 | There is also `(<\**>)` which is `(<*>)` with the arguments flipped. 523 | 524 | TODO: double check using these from Parsec instead of 525 | Control.Applicative: possible performance implictions? 526 | 527 | == Control.Monad 528 | 529 | === return 530 | 531 | One use of return is to always succeed, and return a value: 532 | 533 | > alwaysX :: Parser Char 534 | > alwaysX = return 'x' 535 | 536 | ``` 537 | *Main> parseWithLeftOver (a <|> alwaysX) "a" 538 | Right ('a',"") 539 | 540 | *Main> parseWithLeftOver (a <|> alwaysX) "b" 541 | Right ('x',"b") 542 | ``` 543 | 544 | === mzero 545 | 546 | This function is used in the implementation of `choice`: 547 | 548 | > choice' :: [Parser a] -> Parser a 549 | > choice' ps = foldr (<|>) mzero ps 550 | 551 | 552 | 553 | TODO: go through a bunch of functions + do notation examples 554 | 555 | >>= 556 | =<< 557 | >> 558 | void 559 | mapM, mapM_ 560 | sequence,sequence_ 561 | guard 562 | return 563 | mzero 564 | mplus 565 | when, unless 566 | liftMN 567 | ap 568 | quick note about fail, will return to this in the error messages stage 569 | 570 | TODO: using trace 571 | -------------------------------------------------------------------------------- /ErrorMessages.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[error-messages]] 3 | = Error messages 4 | 5 | In this tutorial, we will start looking at the error messages 6 | generated by Parsec and how we can influence to improve them. 7 | 8 | TODO: 9 | 10 | review the different methods of generating failure in parsec and what 11 | they look like, including choice. 12 | 13 | examine how rearranging parsers can change the error messages, 14 | committing to a parsing branch to improve the messages 15 | 16 | also possibly look at adding additional parsing to detect common 17 | errors to provide a nicer error messages 18 | 19 | maybe talk about the idea of being permissive in the parsing proper 20 | stage, then doing additional syntax-type checks on the ast after 21 | 22 | 23 | == ways of generating errors in parsec 24 | 25 | == how parsec combines errors or drops them into the void 26 | 27 | 28 | == Parsing tpch 29 | 30 | Let's try the parser out on the TPC-H queries. 31 | 32 | ``` 33 | ``` 34 | 35 | Summary of errors so far: 36 | q1: typed literal: type_name 'literal value' 37 | q2: scalar subquery 38 | q3: typed literal 39 | q4: typed literal 40 | q5: typed literal 41 | q6: typed literal 42 | q7: ?? 43 | q8: extract?? 44 | q9: extract ?? 45 | q10: typed literal 46 | q11: scalar sub query 47 | q12: ?? 48 | q13: not like?? 49 | q14: decimal literal 50 | q15: cte 51 | q16: count distinct 52 | q17: decimal literal 53 | q18: in subquery 54 | q19: in literal list 55 | q20: in subquery 56 | q21: exists subquery 57 | q22: substring 58 | 59 | TODO: try to figure out each issue. Not very easy: the error messages 60 | are not very good. 61 | 62 | Do some ad hoc stuff to try to improve each error message? Keep 63 | simplifying expressions then try to understand specific principles. 64 | Then make some generalizations that can be made across the parser 65 | 66 | 67 | == value expressions in isolation 68 | 69 | We can start with the real error code now by considering the 70 | valueexpression parser in isolation. 71 | 72 | ideas: 73 | error inside function parameter list 74 | bad identifier 75 | unrecognised operator 76 | unrecognised chars 77 | unmatching parens 78 | keyword errors in case 79 | unterminated string lit 80 | 3 dot components 81 | binary op with missing second arg 82 | associativity errors with binary ops? 83 | blacklist errors 84 | 85 | == query expressions 86 | 87 | mispelling keywords at this level 88 | clauses in wrong order 89 | adding extra tokens or removing them at clause/value expression 90 | boundaries 91 | 92 | == summarizing? 93 | 94 | == updated parsing code with improved error message behaviour 95 | 96 | -------------------------------------------------------------------------------- /FromClause.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[from-clause]] 3 | = From clause 4 | 5 | In this tutorial, we extend the from clause support to the following: 6 | we will support implicit and explicit joins, including keywords 7 | natural, inner, outer, left, right, full, cross, on and using, plus 8 | parens and simple aliases (e.g. select a from t u, but not select a 9 | from t(a,b)). We don't support oracle outer join syntax (+) or the 10 | other 'pre-ANSI' variations on this theme. No lateral keyword or apply 11 | or pivot. 12 | 13 | > {-# LANGUAGE TupleSections #-} 14 | > 15 | > --import Text.Groom (groom) 16 | > --import qualified Text.Parsec as P 17 | > import Text.Parsec.String (Parser) 18 | > import Text.Parsec.String.Parsec (try) 19 | > import Text.Parsec.String.Combinator 20 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>)) 21 | > import Control.Monad 22 | > import Data.Maybe () 23 | > import qualified Test.HUnit as H 24 | > import FunctionsAndTypesForParsing 25 | > import Debug.Trace 26 | 27 | TODO: should explicitly import from these two modules (and same in 28 | QueryExpressions.lhs) 29 | 30 | > import ValueExpressions (ValueExpr(..), valueExpr, identifier, makeTest, parens) 31 | > import QueryExpressions (selectList,whereClause,groupByClause,having,orderBy 32 | > ,commaSep1, keyword_,blackListIdentifier) 33 | 34 | == Abstract syntax 35 | 36 | Here are is the updated `QueryExpr` and the new `TableRef` abstract 37 | syntax types. 38 | 39 | > data QueryExpr 40 | > = Select 41 | > {qeSelectList :: [(ValueExpr,Maybe String)] 42 | > ,qeFrom :: [TableRef] 43 | > ,qeWhere :: Maybe ValueExpr 44 | > ,qeGroupBy :: [ValueExpr] 45 | > ,qeHaving :: Maybe ValueExpr 46 | > ,qeOrderBy :: [ValueExpr] 47 | > } deriving (Eq,Show) 48 | > 49 | > makeSelect :: QueryExpr 50 | > makeSelect = Select {qeSelectList = [] 51 | > ,qeFrom = [] 52 | > ,qeWhere = Nothing 53 | > ,qeGroupBy = [] 54 | > ,qeHaving = Nothing 55 | > ,qeOrderBy = []} 56 | > 57 | > data TableRef = TRSimple String 58 | > | TRJoin TableRef JoinType TableRef (Maybe JoinCondition) 59 | > | TRParens TableRef 60 | > | TRAlias TableRef String 61 | > | TRQueryExpr QueryExpr 62 | > deriving (Eq,Show) 63 | 64 | This syntax for table references can represent invalid syntax, for 65 | instance two nested aliases. The justification for this is that 66 | sometimes trying to accurately represent only exactly what is valid 67 | creates something much more complex. Maybe this is a good tradeoff in 68 | this situation, and maybe not. 69 | 70 | > data JoinType = JoinInner | JoinLeft | JoinRight | JoinFull | JoinCross 71 | > deriving (Eq,Show) 72 | > 73 | > data JoinCondition = JoinOn ValueExpr 74 | > | JoinUsing [String] 75 | > | JoinNatural 76 | > deriving (Eq,Show) 77 | 78 | With the join condition, we've done the opposite to TableRef - we've 79 | combined `natural` and `on`/`using`, since only one of these can be 80 | present, even though this departs a little from the concrete 81 | syntax. 82 | 83 | First we will develop the standalone from clause parser, then we will 84 | update the query expression syntax and parsing to incorporate our new 85 | from clause parser. 86 | 87 | == simple table name 88 | 89 | Let's start with something simple: a from clause can be multiple comma 90 | separated tablerefs, aka an implicit join. 91 | 92 | > multipleTRSimpleTests :: [(String, [TableRef])] 93 | > multipleTRSimpleTests = [("from a,b", [TRSimple "a", TRSimple "b"])] 94 | > 95 | > from0 :: Parser [TableRef] 96 | > from0 = keyword_ "from" *> commaSep1 (TRSimple <$> identifier) 97 | 98 | ``` 99 | *Main> H.runTestTT $ H.TestList $ map (makeTest from0) multipleTRSimpleTests 100 | Cases: 1 Tried: 1 Errors: 0 Failures: 0 101 | Counts {cases = 1, tried = 1, errors = 0, failures = 0} 102 | ``` 103 | 104 | Let's do the query expression, parens and alias first, before tackling 105 | joins. 106 | 107 | == subquery 108 | 109 | Here is the example: 110 | 111 | > trQueryExprTests :: [(String, [TableRef])] 112 | > trQueryExprTests = 113 | > [("from (select a from t)" 114 | > ,[TRQueryExpr $ makeSelect {qeSelectList = [(Iden "a", Nothing)] 115 | > ,qeFrom = [TRSimple "t"]}])] 116 | 117 | Here is the query expression parser we can use: 118 | 119 | > queryExpr1 :: Parser [TableRef] -> Parser QueryExpr 120 | > queryExpr1 from' = Select 121 | > <$> selectList 122 | > <*> option [] from' 123 | > <*> optionMaybe whereClause 124 | > <*> option [] groupByClause 125 | > <*> optionMaybe having 126 | > <*> option [] orderBy 127 | > 128 | > from1 :: Parser [TableRef] 129 | > from1 = 130 | > keyword_ "from" *> commaSep1 trefTerm 131 | > where 132 | > trefTerm = choice [TRSimple <$> identifier 133 | > ,TRQueryExpr <$> parens (queryExpr1 from1)] 134 | 135 | == parens 136 | 137 | We can't do a sensible example for these right now - we need explicit 138 | joins and then the parens can be used to override the associativity of 139 | a three way join, or to specify over what part of the expression to 140 | apply an alias. 141 | 142 | > trParensTests :: [(String, [TableRef])] 143 | > trParensTests = [("from (a)", [TRParens $ TRSimple "a"])] 144 | 145 | We can write some more tests for parens after we've done the explicit 146 | joins. 147 | 148 | > from2 :: Parser [TableRef] 149 | > from2 = 150 | > keyword_ "from" *> commaSep1 trefTerm 151 | > where 152 | > trefTerm = choice [TRSimple <$> identifier 153 | > ,try (TRQueryExpr <$> parens (queryExpr1 from2)) 154 | > ,TRParens <$> parens trefTerm] 155 | 156 | == alias 157 | 158 | > trAliasTests :: [(String,[TableRef])] 159 | > trAliasTests = [("from a as b", [TRAlias (TRSimple "a") "b"]) 160 | > ,("from a b", [TRAlias (TRSimple "a") "b"])] 161 | 162 | The alias can be treated like a postfix operator. 163 | 164 | > suffixWrapper :: (a -> Parser a) -> a -> Parser a 165 | > suffixWrapper p a = p a <|> return a 166 | 167 | TODO: ?? not sure about this 168 | 169 | > from3 :: Parser [TableRef] 170 | > from3 = 171 | > keyword_ "from" *> commaSep1 trefTerm 172 | > where 173 | > trefTerm = choice [TRSimple <$> identifier 174 | > ,try (TRQueryExpr <$> parens (queryExpr1 from3)) 175 | > ,TRParens <$> parens trefTerm] 176 | > >>= suffixWrapper alias 177 | > alias tr = TRAlias tr <$> (optional (keyword_ "as") *> identifier) 178 | 179 | How to make it keep nesting? 180 | 181 | == joins 182 | 183 | Here is a casual sketch of the target grammar: 184 | 185 | ``` 186 | tref 187 | (cross | [natural] 188 | ([inner] 189 | | left [outer] 190 | | right [outer] 191 | | full [outer] 192 | ) 193 | join tref 194 | [on expr | using (...)] 195 | ``` 196 | 197 | Let's start with parsers for the 'join operator' in the middle and for 198 | the join condition: 199 | 200 | === join type 201 | 202 | > joinType :: Parser JoinType 203 | > joinType = choice 204 | > [JoinCross <$ keyword_ "cross" <* keyword_ "join" 205 | > ,JoinInner <$ keyword_ "inner" <* keyword_ "join" 206 | > ,JoinLeft <$ keyword_ "left" 207 | > <* optional (keyword_ "outer") 208 | > <* keyword_ "join" 209 | > ,JoinRight <$ keyword_ "right" 210 | > <* optional (keyword_ "outer") 211 | > <* keyword_ "join" 212 | > ,JoinFull <$ keyword_ "full" 213 | > <* optional (keyword_ "outer") 214 | > <* keyword_ "join" 215 | > ,JoinInner <$ keyword_ "join"] 216 | 217 | ``` 218 | *Main> parseWithEof joinType "cross join" 219 | Right JoinCross 220 | 221 | *Main> parseWithEof joinType "inner join" 222 | Right JoinInner 223 | 224 | *Main> parseWithEof joinType "left outer join" 225 | Right JoinLeft 226 | 227 | *Main> parseWithEof joinType "left join" 228 | Right JoinLeft 229 | 230 | *Main> parseWithEof joinType "right outer join" 231 | Right JoinRight 232 | 233 | *Main> parseWithEof joinType "right join" 234 | Right JoinRight 235 | 236 | *Main> parseWithEof joinType "full outer join" 237 | Right JFull 238 | 239 | *Main> parseWithEof joinType "full join" 240 | Right JoinFull 241 | 242 | *Main> parseWithEof joinType "join" 243 | Right JoinInner 244 | ``` 245 | 246 | I thought about factoring out the common bits with the joinType parser: 247 | 248 | > joinType0 :: Parser JoinType 249 | > joinType0 = choice 250 | > [choice 251 | > [JoinCross <$ try (keyword_ "cross") 252 | > ,JoinInner <$ try (keyword_ "inner") 253 | > ,choice [JoinLeft <$ try (keyword_ "left") 254 | > ,JoinRight <$ try (keyword_ "right") 255 | > ,JoinFull <$ try (keyword_ "full")] 256 | > <* optional (try $ keyword_ "outer")] 257 | > <* keyword_ "join" 258 | > ,JoinInner <$ keyword_ "join"] 259 | 260 | But I think the longer version is much easier to follow, even if it is 261 | a little more boring. 262 | 263 | === join condition 264 | 265 | The idea with the join condition is that we pass a bool to say whether 266 | we've already seen the 'natural' keyword. If so, then we don't try to 267 | parse 'on' or 'using'. 268 | 269 | > joinCondition :: Bool -> Parser JoinCondition 270 | > joinCondition nat = 271 | > choice [guard nat >> return JoinNatural 272 | > ,keyword_ "on" >> JoinOn <$> valueExpr [] 273 | > ,keyword_ "using" >> JoinUsing <$> parens (commaSep1 identifier) 274 | > ] 275 | 276 | ``` 277 | *Main> parseWithEof (joinCondition False) "on a" 278 | Right (JoinOn (Iden "a")) 279 | 280 | *Main> parseWithEof (joinCondition False) "on a + b" 281 | Right (JoinOn (BinOp (Iden "a") "+" (Iden "b"))) 282 | 283 | *Main> parseWithEof (joinCondition False) "using (a,b)" 284 | Right (JoinUsing ["a","b"]) 285 | 286 | *Main> parseWithEof (joinCondition True) "using (a,b)" 287 | Left (line 1, column 1): 288 | unexpected 'u' 289 | expecting end of input 290 | 291 | *Main> parseWithEof (joinCondition True) "" 292 | Right JoinNatural 293 | ``` 294 | 295 | === simple binary join 296 | 297 | Let's try some simple binary joins: 298 | 299 | > simpleBinaryJoinTests :: [(String,[TableRef])] 300 | > simpleBinaryJoinTests = 301 | > [("from a join b" 302 | > ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b") Nothing]) 303 | > 304 | > ,("from a natural join b" 305 | > ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b") (Just JoinNatural)]) 306 | > 307 | > ,("from a join b on a.x = b.y" 308 | > ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b") 309 | > (Just $ JoinOn $ BinOp (DIden "a" "x") "="(DIden "b" "y"))]) 310 | > 311 | > ,("from a join b using(x,y)" 312 | > ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b") 313 | > (Just $ JoinUsing ["x","y"])]) 314 | > 315 | > ,("from a cross join b" 316 | > ,[TRJoin (TRSimple "a") JoinCross (TRSimple "b") Nothing]) 317 | > 318 | > ] 319 | 320 | We want to parse the first table, then optionally parse the 'natural' 321 | keyword, then the join type, then the second table, then optionally 322 | parse the join condition. 323 | 324 | > from4 :: Parser [TableRef] 325 | > from4 = keyword_ "from" >> (:[]) <$> do 326 | > t0 <- simpleTref 327 | > nat <- option False (True <$ keyword_ "natural") 328 | > jt <- joinType 329 | > t1 <- simpleTref 330 | > jc <- optionMaybe (joinCondition nat) 331 | > return $ TRJoin t0 jt t1 jc 332 | > where 333 | > simpleTref = TRSimple <$> identifier 334 | 335 | Let's start extending this into the full target parser. In this next 336 | version, I've tried to combine all the versions we've seen so far. 337 | 338 | > from5 :: Parser [TableRef] 339 | > from5 = keyword_ "from" >> commaSep1 tref 340 | > where 341 | > tref = nonJoinTref >>= suffixWrapper joinTrefSuffix 342 | > joinTrefSuffix t0 = do 343 | > nat <- option False (True <$ keyword_ "natural") 344 | > jt <- joinType 345 | > t1 <- nonJoinTref 346 | > jc <- optionMaybe (joinCondition nat) 347 | > return $ TRJoin t0 jt t1 jc 348 | > nonJoinTref = choice [TRSimple <$> identifier 349 | > ,try (TRQueryExpr <$> parens (queryExpr1 from5)) 350 | > ,TRParens <$> parens tref] 351 | > >>= suffixWrapper alias 352 | > alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> identifier)) 353 | 354 | ``` 355 | *Main> H.runTestTT $ H.TestList $ map (makeTest from5) (multipleTRSimpleTests ++ trQueryExprTests ++ trParensTests ++ trAliasTests ++ simpleBinaryJoinTests) 356 | ### Failure in: 5:from a join b 357 | (line 1, column 14): 358 | unexpected end of input 359 | expecting digit, letter, "_", "--" or "/*" 360 | ### Failure in: 6:from a natural join b 361 | from a natural join b 362 | expected: [TRJoin (TRSimple "a") JoinInner (TRSimple "b") (Just JoinNatural)] 363 | but got: [TRJoin (TRAlias (TRSimple "a") "natural") JoinInner (TRSimple "b") Nothing] 364 | ### Failure in: 7:from a join b on a.x = b.y 365 | (line 1, column 15): 366 | unexpected "o" 367 | expecting "--" or "/*" 368 | ### Failure in: 8:from a join b using(x,y) 369 | (line 1, column 15): 370 | unexpected "u" 371 | expecting "--" or "/*" 372 | ### Failure in: 9:from a cross join b 373 | from a cross join b 374 | expected: [TRJoin (TRSimple "a") JoinCross (TRSimple "b") Nothing] 375 | but got: [TRJoin (TRAlias (TRSimple "a") "cross") JoinInner (TRSimple "b") Nothing] 376 | Cases: 10 Tried: 10 Errors: 0 Failures: 5 377 | Counts {cases = 10, tried = 10, errors = 0, failures = 5} 378 | ``` 379 | 380 | What's going wrong? If you look at some of the issues, it looks like 381 | we are getting keywords parsed as aliases. Let's fix that first: 382 | 383 | > from6 :: Parser [TableRef] 384 | > from6 = keyword_ "from" >> commaSep1 tref 385 | > where 386 | > tref = nonJoinTref >>= suffixWrapper joinTrefSuffix 387 | > joinTrefSuffix t0 = do 388 | > nat <- option False (True <$ keyword_ "natural") 389 | > jt <- joinType 390 | > t1 <- nonJoinTref 391 | > jc <- optionMaybe (joinCondition nat) 392 | > return $ TRJoin t0 jt t1 jc 393 | > nonJoinTref = choice [TRSimple <$> identifier 394 | > ,try (TRQueryExpr <$> parens (queryExpr1 from6)) 395 | > ,TRParens <$> parens tref] 396 | > >>= suffixWrapper alias 397 | > alias tr = TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier) 398 | > aliasIdentifier = blackListIdentifier 399 | > ["natural" 400 | > ,"inner" 401 | > ,"outer" 402 | > ,"cross" 403 | > ,"left" 404 | > ,"right" 405 | > ,"full" 406 | > ,"join" 407 | > ,"on" 408 | > ,"using"] 409 | 410 | That didn't solve the problem. I think we also have a problem since 411 | the `alias` can now fail after consuming input, we need to use `try`. 412 | 413 | > from7 :: Parser [TableRef] 414 | > from7 = keyword_ "from" >> commaSep1 tref 415 | > where 416 | > tref = nonJoinTref >>= suffixWrapper joinTrefSuffix 417 | > joinTrefSuffix t0 = do 418 | > nat <- option False (True <$ keyword_ "natural") 419 | > jt <- joinType 420 | > t1 <- nonJoinTref 421 | > jc <- optionMaybe (joinCondition nat) 422 | > return $ TRJoin t0 jt t1 jc 423 | > nonJoinTref = choice [TRSimple <$> identifier 424 | > ,try (TRQueryExpr <$> parens (queryExpr1 from7)) 425 | > ,TRParens <$> parens tref] 426 | > >>= suffixWrapper (try . alias) 427 | > alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier)) 428 | > aliasIdentifier = blackListIdentifier 429 | > ["natural" 430 | > ,"inner" 431 | > ,"outer" 432 | > ,"cross" 433 | > ,"left" 434 | > ,"right" 435 | > ,"full" 436 | > ,"join" 437 | > ,"on" 438 | > ,"using"] 439 | 440 | The final step is to make it parse n-way explicit joins. 441 | 442 | > threeWayJoinTests :: [(String,[TableRef])] 443 | > threeWayJoinTests = 444 | > [("from a join b join c" 445 | > ,[TRJoin 446 | > (TRJoin (TRSimple "a") JoinInner (TRSimple "b") Nothing) 447 | > JoinInner (TRSimple "c") Nothing])] 448 | 449 | 450 | > from8 :: Parser [TableRef] 451 | > from8 = keyword_ "from" >> commaSep1 tref 452 | > where 453 | > tref = nonJoinTref >>= suffixWrapper joinTrefSuffix 454 | > joinTrefSuffix t0 = (do 455 | > nat <- option False (True <$ keyword_ "natural") 456 | > TRJoin t0 <$> joinType 457 | > <*> nonJoinTref 458 | > <*> optionMaybe (joinCondition nat)) 459 | > >>= suffixWrapper joinTrefSuffix 460 | > nonJoinTref = choice [TRSimple <$> identifier 461 | > ,try (TRQueryExpr <$> parens (queryExpr1 from8)) 462 | > ,TRParens <$> parens tref] 463 | > >>= suffixWrapper (try . alias) 464 | > alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier)) 465 | > aliasIdentifier = blackListIdentifier 466 | > ["natural" 467 | > ,"inner" 468 | > ,"outer" 469 | > ,"cross" 470 | > ,"left" 471 | > ,"right" 472 | > ,"full" 473 | > ,"join" 474 | > ,"on" 475 | > ,"using"] 476 | 477 | We get left associative with this code. I don't know if this is correct. 478 | 479 | We should do some more testing to make sure this code is good. TODO 480 | 481 | == query expressions 482 | 483 | Let's create the full query expression parser now: 484 | 485 | > queryExprJoinTests :: [(String,QueryExpr)] 486 | > queryExprJoinTests = 487 | > [("select a from t" 488 | > ,ms [TRSimple "t"]) 489 | > 490 | > ,("select a from t,u" 491 | > ,ms [TRSimple "t", TRSimple "u"]) 492 | > 493 | > ,("select a from t inner join u on expr" 494 | > ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u") 495 | > (Just $ JoinOn $ Iden "expr")]) 496 | > 497 | > ,("select a from t left join u on expr" 498 | > ,ms [TRJoin (TRSimple "t") JoinLeft (TRSimple "u") 499 | > (Just $ JoinOn $ Iden "expr")]) 500 | > 501 | > ,("select a from t right join u on expr" 502 | > ,ms [TRJoin (TRSimple "t") JoinRight (TRSimple "u") 503 | > (Just $ JoinOn $ Iden "expr")]) 504 | > 505 | > ,("select a from t full join u on expr" 506 | > ,ms [TRJoin (TRSimple "t") JoinFull (TRSimple "u") 507 | > (Just $ JoinOn $ Iden "expr")]) 508 | > 509 | > ,("select a from t cross join u" 510 | > ,ms [TRJoin (TRSimple "t") JoinCross (TRSimple "u") Nothing]) 511 | > 512 | > ,("select a from t natural inner join u" 513 | > ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u") 514 | > (Just JoinNatural)]) 515 | > 516 | > ,("select a from t inner join u using(a,b)" 517 | > ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u") 518 | > (Just $ JoinUsing ["a", "b"])]) 519 | > 520 | > ,("select a from (select a from t)" 521 | > ,ms [TRQueryExpr $ ms [TRSimple "t"]]) 522 | > 523 | > ,("select a from t as u" 524 | > ,ms [TRAlias (TRSimple "t") "u"]) 525 | > 526 | > ,("select a from t u" 527 | > ,ms [TRAlias (TRSimple "t") "u"]) 528 | > 529 | > ,("select a from (t cross join u) as u" 530 | > ,ms [TRAlias (TRParens $ TRJoin (TRSimple "t") JoinCross 531 | > (TRSimple "u") Nothing) "u"]) 532 | > ] 533 | > where 534 | > ms f = makeSelect {qeSelectList = [(Iden "a",Nothing)] 535 | > ,qeFrom = f} 536 | 537 | Here are all the other query expression tests updated with the new 538 | QueryExpr type. 539 | 540 | > singleSelectItemTests :: [(String,QueryExpr)] 541 | > singleSelectItemTests = 542 | > [("select 1", makeSelect {qeSelectList = [(NumLit 1,Nothing)]})] 543 | 544 | > multipleSelectItemsTests :: [(String,QueryExpr)] 545 | > multipleSelectItemsTests = 546 | > [("select a" 547 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)]}) 548 | > ,("select a,b" 549 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 550 | > ,(Iden "b",Nothing)]}) 551 | > ,("select 1+2,3+4" 552 | > ,makeSelect {qeSelectList = 553 | > [(BinOp (NumLit 1) "+" (NumLit 2),Nothing) 554 | > ,(BinOp (NumLit 3) "+" (NumLit 4),Nothing)]}) 555 | > ] 556 | 557 | > selectListTests :: [(String,QueryExpr)] 558 | > selectListTests = 559 | > [("select a as a, b as b" 560 | > ,makeSelect {qeSelectList = [(Iden "a", Just "a") 561 | > ,(Iden "b", Just "b")]}) 562 | > 563 | > ,("select a a, b b" 564 | > ,makeSelect {qeSelectList = [(Iden "a", Just "a") 565 | > ,(Iden "b", Just "b")]}) 566 | > ] ++ multipleSelectItemsTests 567 | > ++ singleSelectItemTests 568 | 569 | > fromTests :: [(String,QueryExpr)] 570 | > fromTests = 571 | > [("select a from t" 572 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)] 573 | > ,qeFrom = [TRSimple "t"]})] 574 | 575 | > whereTests :: [(String,QueryExpr)] 576 | > whereTests = 577 | > [("select a from t where a = 5" 578 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)] 579 | > ,qeFrom = [TRSimple "t"] 580 | > ,qeWhere = Just $ BinOp (Iden "a") "=" (NumLit 5)}) 581 | > ] 582 | 583 | > groupByTests :: [(String,QueryExpr)] 584 | > groupByTests = 585 | > [("select a,sum(b) from t group by a" 586 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 587 | > ,(App "sum" [Iden "b"],Nothing)] 588 | > ,qeFrom = [TRSimple "t"] 589 | > ,qeGroupBy = [Iden "a"] 590 | > }) 591 | > 592 | > ,("select a,b,sum(c) from t group by a,b" 593 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 594 | > ,(Iden "b",Nothing) 595 | > ,(App "sum" [Iden "c"],Nothing)] 596 | > ,qeFrom = [TRSimple "t"] 597 | > ,qeGroupBy = [Iden "a",Iden "b"] 598 | > }) 599 | > ] 600 | 601 | > havingTests :: [(String,QueryExpr)] 602 | > havingTests = 603 | > [("select a,sum(b) from t group by a having sum(b) > 5" 604 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 605 | > ,(App "sum" [Iden "b"],Nothing)] 606 | > ,qeFrom = [TRSimple "t"] 607 | > ,qeGroupBy = [Iden "a"] 608 | > ,qeHaving = Just $ BinOp (App "sum" [Iden "b"]) ">" (NumLit 5) 609 | > }) 610 | > ] 611 | 612 | > orderByTests :: [(String,QueryExpr)] 613 | > orderByTests = 614 | > [("select a from t order by a" 615 | > ,ms [Iden "a"]) 616 | > 617 | > ,("select a from t order by a, b" 618 | > ,ms [Iden "a", Iden "b"]) 619 | > ] 620 | > where 621 | > ms o = makeSelect {qeSelectList = [(Iden "a",Nothing)] 622 | > ,qeFrom = [TRSimple "t"] 623 | > ,qeOrderBy = o} 624 | 625 | ``` 626 | *Main> H.runTestTT $ H.TestList $ map (makeTest (queryExpr1 from8)) (selectListTests ++ fromTests ++ whereTests ++ groupByTests ++ havingTests ++ orderByTests ++ queryExprJoinTests) 627 | ### Failure in: 7:select a from t where a = 5 628 | (line 1, column 25): 629 | unexpected "=" 630 | expecting "--" or "/*" 631 | ### Failure in: 8:select a,sum(b) from t group by a 632 | (line 1, column 33): 633 | unexpected "a" 634 | expecting "--" or "/*" 635 | ### Failure in: 9:select a,b,sum(c) from t group by a,b 636 | (line 1, column 35): 637 | unexpected "a" 638 | expecting "--" or "/*" 639 | ### Failure in: 10:select a,sum(b) from t group by a having sum(b) > 5 640 | (line 1, column 33): 641 | unexpected "a" 642 | expecting "--" or "/*" 643 | ### Failure in: 11:select a from t order by a 644 | (line 1, column 26): 645 | unexpected "a" 646 | expecting "--" or "/*" 647 | ### Failure in: 12:select a from t order by a, b 648 | (line 1, column 26): 649 | unexpected "a" 650 | expecting "--" or "/*" 651 | Cases: 26 Tried: 26 Errors: 0 Failures: 6 652 | Counts {cases = 26, tried = 26, errors = 0, failures = 6} 653 | ``` 654 | 655 | The problem is the table alias parser is trying to parse keywords 656 | again. Here is the `from` parser with the alias name blacklist 657 | expanded. 658 | 659 | > from :: Parser [TableRef] 660 | > from = keyword_ "from" >> commaSep1 tref 661 | > where 662 | > tref = nonJoinTref >>= suffixWrapper joinTrefSuffix 663 | > joinTrefSuffix t0 = (do 664 | > nat <- option False (True <$ keyword_ "natural") 665 | > TRJoin t0 <$> joinType 666 | > <*> nonJoinTref 667 | > <*> optionMaybe (joinCondition nat)) 668 | > >>= suffixWrapper joinTrefSuffix 669 | > nonJoinTref = choice [TRSimple <$> identifier 670 | > ,try (TRQueryExpr <$> parens queryExpr) 671 | > ,TRParens <$> parens tref] 672 | > >>= suffixWrapper (try . alias) 673 | > alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier)) 674 | > aliasIdentifier = blackListIdentifier 675 | > [-- join keywords 676 | > "natural" 677 | > ,"inner" 678 | > ,"outer" 679 | > ,"cross" 680 | > ,"left" 681 | > ,"right" 682 | > ,"full" 683 | > ,"join" 684 | > ,"on" 685 | > ,"using" 686 | > -- subsequent clause keywords 687 | > ,"where" 688 | > ,"group" 689 | > ,"having" 690 | > ,"order" 691 | > ] 692 | 693 | Here is the final query expression parser: 694 | 695 | > queryExpr :: Parser QueryExpr 696 | > queryExpr = Select 697 | > <$> selectList 698 | > <*> option [] from 699 | > <*> optionMaybe whereClause 700 | > <*> option [] groupByClause 701 | > <*> optionMaybe having 702 | > <*> option [] orderBy 703 | -------------------------------------------------------------------------------- /FunctionsAndTypesForParsing.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[functions-and-types-for-parsing]] 3 | = Functions and types for parsing 4 | 5 | In this file is the source and explanation for the parsing functions 6 | which we've been using, and some limited notes about the wrappers and 7 | full types in Parsec. 8 | 9 | > module FunctionsAndTypesForParsing where 10 | > 11 | > import Text.Parsec (ParseError) 12 | > import Text.Parsec.String (Parser) 13 | > import Text.Parsec.String.Parsec (parse) 14 | > import Text.Parsec.String.Char (oneOf) 15 | > import Text.Parsec.String.Combinator (eof,manyTill,anyToken) 16 | > import Control.Applicative ((<$>), (<*>), (<*), (*>), many) 17 | > import Control.Monad (void) 18 | 19 | 20 | == Functions for parsing 21 | 22 | Here are the testing functions which were used earlier: 23 | 24 | The basic parse function: this is a pretty simple wrapper. The parse 25 | function from parsec just adds a filename to use in parse errors, 26 | which is set as the empty string here. 27 | 28 | > regularParse :: Parser a -> String -> Either ParseError a 29 | > regularParse p = parse p "" 30 | 31 | 'parse' is a basic function in the family of functions for running 32 | parsers in Parsec. You can compose the parser functions in the Parser 33 | monad, then run the top level function using 'parse' and get back an 34 | 'Either ParserError a' as the result. There are a few alternatives to 35 | 'parse' in Parsec, mostly when you are using a more general parser 36 | type instead of 'Parser a' (which is an alias for 'ParsecT String () 37 | Identity a'). Have a look in the Text.Parsec.Prim module for these 38 | . 39 | 40 | This function will run the parser, but additionally fail if it doesn't 41 | consume all the input. 42 | 43 | > parseWithEof :: Parser a -> String -> Either ParseError a 44 | > parseWithEof p = parse (p <* eof) "" 45 | 46 | This function will apply the parser, then also return any left over 47 | input which wasn't parsed. 48 | 49 | > parseWithLeftOver :: Parser a -> String -> Either ParseError (a,String) 50 | > parseWithLeftOver p = parse ((,) <$> p <*> leftOver) "" 51 | > where leftOver = manyTill anyToken eof 52 | 53 | TODO: what happens when you use 'many anyToken <* eof' variations 54 | instead? Maybe should talk about greediness? Or talk about it in a 55 | better place in the tutorial. 56 | 57 | > parseWithWSEof :: Parser a -> String -> Either ParseError a 58 | > parseWithWSEof p = parseWithEof (whiteSpace *> p) 59 | > where whiteSpace = void $ many $ oneOf " \n\t" 60 | 61 | You should have a look at the two helper executables, and see if you 62 | can understand the code now. You can see them online here: 63 | 64 | 65 | 66 | 67 | 68 | == type signatures revisited 69 | 70 | todo: update this to refer to real parsec instead of the string 71 | wrappers here. 72 | 73 | I think you should always use type signatures with Parsec. Because the 74 | Parsec code is really generalized, without the type GHC will refuse to 75 | compile this code. Try commenting out the type signature above and 76 | loading into ghci to see the error message. 77 | 78 | There is an alternative: you can get this code to compile without a 79 | type signature by using the NoMonomorphismRestriction language 80 | pragma. You can also see the type signature that GHC will choose for 81 | this function by commenting the type signature and using -Wall and 82 | -XNoMonomorphismRestriction together. Using NoMonomorphismRestriction 83 | is a popular solution to these sorts of problems in haskell. 84 | 85 | It's up to you whether you prefer to always write type signatures when 86 | you are developing parsing code, or use the NoMonomorphismRestriction 87 | pragma. Even if you can use NoMonomorphismRestriction, when using 88 | explicit type signatures you usually get much simpler compiler error 89 | messages. 90 | 91 | 92 | == Parser 93 | 94 | The definition of Parser and a partial explanation of the full type 95 | signature. 96 | 97 | ``` 98 | type Parser = Parsec String () 99 | ``` 100 | 101 | This means that a function returning Parser a parses from a String 102 | with () as the initial state. 103 | 104 | The Parsec type is defined like this: 105 | 106 | ``` 107 | type Parsec s u = ParsecT s u Identity 108 | ``` 109 | 110 | ParsecT is a monad transformer, I think it is the primitive one in the 111 | Parsec library, and the 'Parsec' type is a type alias which sets the 112 | base monad to be Identity. 113 | 114 | Here is the haddock for the ParsecT type: 115 | 116 | `ParsecT s u m a` is a parser with stream type `s`, user state type `u`, 117 | underlying monad `m` and return type `a`. 118 | 119 | The full types that you see like this: 120 | 121 | ``` 122 | satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char 123 | ``` 124 | 125 | refer to the same things (stream type s, user state type u, underlying 126 | monad m). 127 | 128 | We are using String as the stream type (i.e. the input type), () as 129 | the user state type (this effectively means no user state, since () 130 | only has one value), and the underlying monad is Identity: we are 131 | using no other underlying monad, so `Parser a` expands to `ParsecT 132 | String () Identity a`. 133 | 134 | I.e. the source is String, the user state is (), and the underlying monad 135 | is Identity. 136 | 137 | == Other information 138 | 139 | TODO: Here is some other information on Parsec and Haskell: 140 | links, tutorials on fp, section in rwh, lyah?, old parsec docs, 141 | parsec docs on hackage, other parser combinator libs (uu, polyparse, 142 | trifecta?) 143 | -------------------------------------------------------------------------------- /GettingStarted.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[getting-started]] 3 | = Getting started 4 | 5 | This is an introduction to parsing with Haskell and Parsec. 6 | 7 | Prerequisites: you should know some basic Haskell and have GHC and 8 | cabal-install installed (installing the Haskell Platform will give you 9 | this). 10 | 11 | This tutorial was originally written using GHC 7.6.3 and Parsec 3.1.3, 12 | which are the versions which come with the Haskell Platform 13 | 2013.2.0.0. It should also work fine with GHC 7.8.4 and GHC 7.10.2 and 14 | through to at least the latest release of Parsec 3.1.x. 15 | 16 | This tutorial was written using Literate Haskell files available here: 17 | link:https://github.com/JakeWheat/intro_to_parsing[]. 18 | 19 | I recommend you download them all, and follow along in your favourite 20 | editor, and use GHCi to experiment. To download the intro_to_parsing 21 | files, use: 22 | 23 | ``` 24 | git clone https://github.com/JakeWheat/intro_to_parsing.git 25 | ``` 26 | 27 | Here are the imports. 28 | 29 | > import Text.Parsec.String (Parser) 30 | > import Text.Parsec.String.Char (anyChar) 31 | > import Text.Parsec.String.Char 32 | > import FunctionsAndTypesForParsing (regularParse, parseWithEof, parseWithLeftOver) 33 | > import Data.Char 34 | > import Text.Parsec.String.Combinator (many1) 35 | 36 | == First parser 37 | 38 | The first parser: 39 | 40 | ``` 41 | anyChar :: Parser Char 42 | ``` 43 | 44 | This parser is in the module `Text.Parsec.Char`. There is a wrapper in 45 | this tutorial's project, `Text.Parsec.String.Char`, which gives this 46 | function a simplified type. 47 | 48 | Whenever we write a parser which parses to a value of type `a`, we give 49 | it the return type of `Parser a`. In this case, we parse a character 50 | so the return type is `Parser Char`. The `Parser` type itself is in 51 | the module `Text.Parsec.String`. We will cover this in more detail 52 | later. 53 | 54 | Let's use this parser. I will assume you have GHC and cabal-install 55 | installed (which provides the 'cabal' executable) and both are in your 56 | PATH. The Haskell Platform is one way that provides this. 57 | 58 | Change to the directory where you downloaded the intro_to_parsing 59 | source files (which will contain the GettingStarted.lhs file). Then 60 | you can set up a cabal sandbox and be ready to work with the code by 61 | running the following commands in that directory: 62 | 63 | ``` 64 | cabal v1-update 65 | cabal v1-sandbox init 66 | cabal v1-install parsec HUnit 67 | cabal v1-repl 68 | ``` 69 | 70 | Now you will get the ghci prompt. Type in ':l GettingStarted.lhs'. You 71 | can run the parser using a wrapper, enter the following at the ghci 72 | prompt: `regularParse anyChar "a"`. 73 | 74 | Here is a transcript of running ghci via 'cabal repl': 75 | 76 | ``` 77 | $ cabal repl 78 | Warning: The repl command is a part of the legacy v1 style of cabal usage. 79 | 80 | Please switch to using either the new project style and the new-repl command 81 | or the legacy v1-repl alias as new-style projects will become the default in 82 | the next version of cabal-install. Please file a bug if you cannot replicate a 83 | working v1- use case with the new-style commands. 84 | 85 | For more information, see: https://wiki.haskell.org/Cabal/NewBuild 86 | 87 | GHCi, version 8.6.5: http://www.haskell.org/ghc/ :? for help 88 | Prelude> :l GettingStarted.lhs 89 | [1 of 5] Compiling Text.Parsec.String.Char ( Text/Parsec/String/Char.hs, interpreted ) 90 | [2 of 5] Compiling Text.Parsec.String.Combinator ( Text/Parsec/String/Combinator.hs, interpreted ) 91 | [3 of 5] Compiling Text.Parsec.String.Parsec ( Text/Parsec/String/Parsec.hs, interpreted ) 92 | [4 of 5] Compiling FunctionsAndTypesForParsing ( FunctionsAndTypesForParsing.lhs, interpreted ) 93 | [5 of 5] Compiling Main ( GettingStarted.lhs, interpreted ) 94 | Ok, five modules loaded. 95 | *Main> regularParse anyChar "a" 96 | Right 'a' 97 | *Main> 98 | ``` 99 | 100 | You can exit ghci by entering ':quit' or using Ctrl-d. From now on, to 101 | start ghci again, you can just change to the directory with 102 | GettingStarted.lhs and run 'cabal repl'. ghci should have readline 103 | support so you can browse through your command history using up and 104 | down arrow, etc. 105 | 106 | This is the type of `regularParse`. It is wrapper which takes a parser 107 | function such as anyChar, and wraps it so you can parse a string to 108 | either a parse error, or the return value from your parser function: 109 | 110 | ``` 111 | regularParse :: Parser a -> String -> Either ParseError a 112 | ``` 113 | 114 | 115 | Here are some examples of running this parser on various input: 116 | 117 | ``` 118 | *Main> regularParse anyChar "a" 119 | Right 'a' 120 | 121 | *Main> regularParse anyChar "b" 122 | Right 'b' 123 | 124 | *Main> regularParse anyChar "0" 125 | Right '0' 126 | 127 | *Main> regularParse anyChar " " 128 | Right ' ' 129 | 130 | *Main> regularParse anyChar "\n" 131 | Right '\n' 132 | 133 | *Main> regularParse anyChar "aa" 134 | Right 'a' 135 | 136 | *Main> regularParse anyChar "" 137 | Left (line 1, column 1): 138 | unexpected end of input 139 | 140 | *Main> regularParse anyChar " a" 141 | Right ' ' 142 | ``` 143 | 144 | 145 | You can see that if there are no characters, we get an error. 146 | Otherwise, it takes the first character and returns it, and throws 147 | away any trailing characters. The details of the helper function 148 | `regularParse` will come later. 149 | 150 | Here are two alternatives to `regularParse` you can also use for 151 | experimenting for the time being: 152 | 153 | ``` 154 | parseWithEof :: Parser a -> String -> Either ParseError a 155 | ``` 156 | 157 | ``` 158 | parseWithLeftOver :: Parser a -> String -> Either ParseError (a,String) 159 | ``` 160 | 161 | These can be useful when you are not sure if your parser is consuming 162 | all your input string or not. The eof parser will error if you haven't 163 | consumed all the input, and the leftover parser can instead tell you 164 | what was not consumed from the input. 165 | 166 | ``` 167 | *Main> regularParse anyChar "a" 168 | Right 'a' 169 | 170 | *Main> parseWithEof anyChar "a" 171 | Right 'a' 172 | 173 | *Main> parseWithLeftOver anyChar "a" 174 | Right ('a',"") 175 | 176 | *Main> *Main> regularParse anyChar "" 177 | Left (line 1, column 1): 178 | unexpected end of input 179 | 180 | *Main> parseWithEof anyChar "" 181 | Left (line 1, column 1): 182 | unexpected end of input 183 | 184 | *Main> parseWithLeftOver anyChar "" 185 | Left (line 1, column 1): 186 | unexpected end of input 187 | 188 | *Main> regularParse anyChar "aa" 189 | Right 'a' 190 | 191 | *Main> parseWithEof anyChar "aa" 192 | Left (line 1, column 2): 193 | unexpected 'a' 194 | expecting end of input 195 | 196 | *Main> parseWithLeftOver anyChar "aa" 197 | Right ('a',"a") 198 | 199 | *Main> parseWithLeftOver anyChar "abc" 200 | Right ('a',"bc") 201 | 202 | ``` 203 | 204 | You can use these functions and ghci to experiment. Try running all 205 | the parsers in ghci on various input strings as you work through the 206 | document to get a good feel for all the different features. Tip: you 207 | can also write the parsers inline in the function call, for example: 208 | 209 | ``` 210 | *Main> regularParse (many1 digit) "1" 211 | Right "1" 212 | 213 | *Main> regularParse (many1 digit) "122" 214 | Right "122" 215 | ``` 216 | 217 | This can be used to quickly try out new ad hoc parsing code. 218 | 219 | == Type signatures 220 | 221 | The real Parsec functions have quite complex type signatures. This 222 | makes a lot of things very tricky before you understand them, and can 223 | make some of the error messages you'll see really difficult to 224 | understand. I've created some wrapper modules, which set the types of 225 | all the functions from Parsec we use to be much more restricted. This 226 | will make the types easy to understand, and reduce the amount of 227 | tricky to understand compiler errors you get. You can use this 228 | approach when writing your own parser code with Parsec. These wrapper 229 | modules are created with the following name pattern: 230 | `Text.Parsec.Char` -> `Text.Parsec.String.Char`. 231 | 232 | Later on, we will look at the general types in more detail. 233 | 234 | == Text.Parsec.Char 235 | 236 | Let's go through some of the functions in `Text.Parsec.Char` module from 237 | the Parsec package. The haddock is available here: 238 | . 239 | 240 | Here is the `satisfy` function, with its full type signature. 241 | 242 | ``` 243 | satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char 244 | ``` 245 | 246 | This is one of the main primitive functions in Parsec. This looks at 247 | the next character from the current input, and if the function (`Char 248 | -> Bool`) returns true for this character, it 'pops' it from the input 249 | and returns it. In this way, the current position in the input string 250 | is tracked behind the scenes. 251 | 252 | In the simplified type wrappers, the `satisfy` function's type is this: 253 | 254 | ``` 255 | satisfy :: (Char -> Bool) -> Parser Char 256 | ``` 257 | 258 | This makes it a bit clearer what it is doing. All the functions in 259 | `Text.Parsec.Char` are reproduced in the local `Text.Parsec.String.Char` 260 | module with simplified types 261 | (). 262 | 263 | Here are some examples of satisfy in action. 264 | 265 | ``` 266 | *Main> parseWithEof (satisfy (=='a')) "a" 267 | Right 'a' 268 | 269 | *Main> parseWithEof (satisfy (=='b')) "a" 270 | Left (line 1, column 1): 271 | unexpected "a" 272 | 273 | *Main> parseWithEof (satisfy (`elem` "abc")) "a" 274 | Right 'a' 275 | 276 | *Main> parseWithEof (satisfy (`elem` "abc")) "d" 277 | Left (line 1, column 1): 278 | unexpected "d" 279 | 280 | *Main> parseWithEof (satisfy isDigit) "d" 281 | Left (line 1, column 1): 282 | unexpected "d" 283 | 284 | *Main> parseWithEof (satisfy isDigit) "1" 285 | Right '1' 286 | ``` 287 | 288 | You can see that it is easy to use `==`, or `elem` or one of the 289 | functions from the Data.Char module. 290 | 291 | If you look at the docs on hackage 292 | , 293 | you can view the source. The implementations of most of the functions 294 | in `Text.Parsec.Char` are straightforward. I recommend you look at the 295 | source for all of these functions. 296 | 297 | You can see in the source that the `satisfy` function is a little more 298 | primitive than the other functions. 299 | 300 | Here is the parser we used above in the `anyChar` parser: 301 | 302 | ``` 303 | anyChar :: Parser Char 304 | ``` 305 | 306 | If you look at the source via the haddock link above, you can see it 307 | uses `satisfy`. 308 | 309 | Here are some other simple wrappers of `satisfy` from 310 | `Text.Parsec.Char` which use different validation functions. 311 | 312 | The `char` parser parses a specific character which you supply: 313 | 314 | ``` 315 | char :: Char -> Parser Char 316 | ``` 317 | 318 | ``` 319 | *Main> regularParse (char 'a') "a" 320 | Right 'a' 321 | 322 | *Main> regularParse (char 'a') "b" 323 | Left (line 1, column 1): 324 | unexpected "b" 325 | expecting "a" 326 | ``` 327 | 328 | These parsers all parse single hardcoded characters 329 | 330 | ``` 331 | space :: Parser Char 332 | newline :: Parser Char 333 | tab :: Parser Char 334 | ``` 335 | 336 | They all return a `Char`. You might be able to guess what `Char` each 337 | of them returns, you can double check your intuition using ghci. 338 | 339 | These parser all parse one character from a hardcoded set of 340 | characters: 341 | 342 | ``` 343 | upper :: Parser Char 344 | lower :: Parser Char 345 | alphaNum :: Parser Char 346 | letter :: Parser Char 347 | digit :: Parser Char 348 | hexDigit :: Parser Char 349 | octDigit :: Parser Char 350 | ``` 351 | 352 | In these cases, the return value is less redundant. 353 | 354 | `oneOf` and `noneOf` parse any of the characters in the given list 355 | 356 | ``` 357 | oneOf :: [Char] -> Parser Char 358 | noneOf :: [Char] -> Parser Char 359 | ``` 360 | 361 | These are just simple wrappers of satisfy using `elem`. 362 | 363 | 364 | You should try all these parsers out in ghci, e.g.: 365 | 366 | ``` 367 | 368 | regularParse space " " 369 | 370 | regularParse upper "A" 371 | 372 | regularParse (char 'b') "B" 373 | 374 | regularParse (oneOf "abc") "c" 375 | 376 | ``` 377 | 378 | Here are the final functions in `Text.Parsec.Char`: 379 | 380 | `string` matches a complete string, one character at a time. I think 381 | the implementation of this function is like it is for efficiency when 382 | parsing from, e.g., `Data.Text.Text`, instead of `String`, but I'm not 383 | sure. We will skip the detailed explanation of the implementation for 384 | now. 385 | 386 | ``` 387 | string :: String -> Parser String 388 | ``` 389 | 390 | ``` 391 | *Main> regularParse (string "one") "one" 392 | Right "one" 393 | 394 | *Main> regularParse (string "one") "two" 395 | Left (line 1, column 1): 396 | unexpected "t" 397 | expecting "one" 398 | ``` 399 | 400 | Here is the `spaces` parser, which, if you look at the source, you can 401 | see uses a combinator (`skipMany`). We will cover this combinator 402 | shortly. 403 | 404 | ``` 405 | spaces :: Parser () 406 | ``` 407 | 408 | ``` 409 | *Main> regularParse spaces "" 410 | Right () 411 | 412 | *Main> regularParse spaces " " 413 | Right () 414 | 415 | *Main> regularParse spaces " " 416 | Right () 417 | 418 | *Main> regularParse spaces " a " 419 | Right () 420 | 421 | *Main> regularParse spaces "a a " 422 | Right () 423 | ``` 424 | 425 | It always succeeds. 426 | 427 | == A couple of helper executables 428 | 429 | Here are two exes which you can use to parse either a string or a file 430 | to help you experiment. This will save you having to figure out how to 431 | write this boilerplate until later. 432 | 433 | 434 | 435 | 436 | 437 | Now you can easily experiment using ghci, or with a string on the 438 | command line, or by putting the text to parse into a file and parsing 439 | that. 440 | -------------------------------------------------------------------------------- /IntroductionToCombinators.lhs: -------------------------------------------------------------------------------- 1 | 2 | = Introduction to combinators 3 | 4 | In this tutorial we will develop a parser for a very simple expression 5 | language, and start learning about the set of combinators which comes 6 | with Parsec. 7 | 8 | > import Text.Parsec (ParseError) 9 | > import Text.Parsec.String (Parser) 10 | > import Text.Parsec.String.Parsec (try) 11 | > import Text.Parsec.String.Char (oneOf, char, digit 12 | > ,string, letter, satisfy) 13 | > import Text.Parsec.String.Combinator (many1, choice, chainl1, between) 14 | > import Control.Applicative ((<$>), (<*>), (<$), (<*), (*>), (<|>), many) 15 | > import Control.Monad (void, ap) 16 | > import Data.Char (isLetter, isDigit) 17 | > import FunctionsAndTypesForParsing 18 | 19 | Let's create a very simple expression language: 20 | 21 | > data SimpleExpr = Num Integer 22 | > | Var String 23 | > | Add SimpleExpr SimpleExpr 24 | > | Parens SimpleExpr 25 | > deriving (Eq,Show) 26 | 27 | It's a bit simple and almost useless at the moment, but we will expand 28 | on this a lot in later tutorials. 29 | 30 | Here are some examples: 31 | 32 | > simpleExprExamples :: [(String,SimpleExpr)] 33 | > simpleExprExamples = 34 | > [("a", Var "a") 35 | > ,("1", Num 1) 36 | > ,("2 + 3", Add (Num 2) (Num 3)) 37 | > ,("(42)", Parens (Num 42))] 38 | 39 | TODO: some more complex examples 40 | 41 | Let's write a simple parser for these, and introduce a few things 42 | along the way. 43 | 44 | We will write a parser for each constructor separately, then look at 45 | how we can write a parser for all of them together. 46 | 47 | ==== Num 48 | 49 | To parse a number, we need to parse one or more digits, and then read 50 | the resulting string. We can use the combinator 'many1' to help with 51 | this. We will also use do notation. 52 | 53 | > num :: Parser SimpleExpr 54 | > num = do 55 | > n <- many1 digit 56 | > return (Num (read n)) 57 | 58 | Let's try it out. TODO: examples 59 | 60 | How does it work? First, we parse one or more (many1) digits (digit), 61 | and give the result the name 'n'. Then we convert the string to an 62 | integer using read, and wrap it in a Num constructor. 63 | 64 | The many1 function's type looks like this: 65 | 66 | ``` 67 | many1 :: Parser a -> Parser [a] 68 | ``` 69 | 70 | It applies the parser given one or more times, returning the result. 71 | 72 | TODO: example show what happens when you use 'many' instead of 'many1' 73 | 74 | ==== Var 75 | 76 | For var, we have to decide on a syntax for the identifiers. Let's go 77 | for a common choice: identifiers must start with a letter or 78 | underscore, and then they can be followed by zero or more letters, 79 | underscores or digits in any combination. 80 | 81 | > var :: Parser SimpleExpr 82 | > var = do 83 | > fc <- firstChar 84 | > rest <- many nonFirstChar 85 | > return (Var (fc:rest)) 86 | > where 87 | > firstChar = satisfy (\a -> isLetter a || a == '_') 88 | > nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_') 89 | 90 | This time, we create two helper parsers: firstChar - which parses a 91 | letter or underscore, and nonFirstChar = which parses a digit, letter 92 | or underscore. This time, we use the 'many' function instead of 93 | 'many1': TODO - demonstrate why using examples only. 94 | 95 | > add :: Parser SimpleExpr 96 | > add = do 97 | > e0 <- num 98 | > void $ char '+' 99 | > e1 <- num 100 | > return (Add e0 e1) 101 | 102 | There are two simplifications here. We don't have a general simple 103 | expression parser yet, so it only supports parsing adding two number 104 | literals, and we don't deal with whitespace yet, so you must write 105 | "1+2" and not "1 + 2". We will deal with both of these issues below. 106 | 107 | We used void to ignore the return of "char '+'". This is not required, 108 | but supresses a warning which you should get (since you are using 109 | -Wall, right?) and I think it is also good style to explicitly say 110 | that the result is being ignored. 111 | 112 | To use -Wall in ghci, enter the following at the prompt: 113 | 114 | ``` 115 | *Main> :set -Wall 116 | ``` 117 | 118 | Try it out, then replace the line 119 | 120 | ``` 121 | void $ char '+' 122 | ``` 123 | 124 | with 125 | 126 | ``` 127 | char '+' 128 | ``` 129 | 130 | And check you see the warning. Another way of avoiding the warning is 131 | to write this: 132 | 133 | ``` 134 | _ <- char '+' 135 | ``` 136 | 137 | ==== parens 138 | 139 | > parens :: Parser SimpleExpr 140 | > parens = do 141 | > void $ char '(' 142 | > e <- num 143 | > void $ char ')' 144 | > return (Parens e) 145 | 146 | The same two issues from the 'add' parser apply here: whitespace and 147 | lack of a general simple expression parser, so it just uses the num 148 | parser again instead. Now we will tackle the whitespace issue. 149 | 150 | === whitespace and lexeme parsing 151 | 152 | Here is a parser which will skip zero or more whitespace characters. 153 | 154 | > whiteSpace :: Parser () 155 | > whiteSpace = void $ many $ oneOf " \n\t" 156 | 157 | In the original parsec documentation, one of the concepts mentioned is 158 | the idea of 'lexeme' parsing. This is a style in which every token 159 | parser should also consume and ignore any trailing whitespace. This is 160 | a simple convention which with a bit of care allows skipping 161 | whitespace exactly once wherever it needs to be skipped. To complete 162 | the lexeme style, we should also always skip leading whitespace at the 163 | top level only. 164 | 165 | > parseWithWhitespace :: Parser a -> String -> Either ParseError a 166 | > parseWithWhitespace p = parseWithEof wrapper 167 | > where 168 | > wrapper = do 169 | > whiteSpace 170 | > p 171 | 172 | the wrapper function can also use (>>) to make it a bit shorter: 173 | 174 | ``` 175 | wrapper = whiteSpace >> p 176 | ``` 177 | 178 | Here is a shorter version of this function using (>>): 179 | 180 | > parseWithWhitespace' :: Parser a -> String -> Either ParseError a 181 | > parseWithWhitespace' p = parseWithEof (whiteSpace >> p) 182 | 183 | Here is the num parser rewritten in the lexeme style: 184 | 185 | > lexeme :: Parser a -> Parser a 186 | > lexeme p = do 187 | > x <- p 188 | > whiteSpace 189 | > return x 190 | 191 | TODO: review the placement of the function 'lexeme' in all the code 192 | below. Maybe something could be said about all the places that it can 193 | be put. 194 | 195 | > num' :: Parser SimpleExpr 196 | > num' = lexeme $ do 197 | > n <- many1 digit 198 | > return (Num (read n)) 199 | 200 | Here it is in action: 201 | 202 | ``` 203 | *Main Data.List> parseWithEof num "1" 204 | Right (Num 1) 205 | 206 | *Main Data.List> parseWithEof num " 1" 207 | Left (line 1, column 1): 208 | unexpected " " 209 | expecting digit 210 | 211 | *Main Data.List> parseWithEof num "1 " 212 | Left (line 1, column 2): 213 | unexpected ' ' 214 | expecting digit or end of input 215 | 216 | *Main Data.List> parseWithEof num " 1 " 217 | Left (line 1, column 1): 218 | unexpected " " 219 | expecting digit 220 | 221 | *Main Data.List> parseWithWhitespace num' "1" 222 | Right (Num 1) 223 | 224 | *Main Data.List> parseWithWhitespace num' " 1" 225 | Right (Num 1) 226 | 227 | *Main Data.List> parseWithWhitespace num' "1 " 228 | Right (Num 1) 229 | 230 | *Main Data.Lst> parseWithWhitespace num' " 1 " 231 | Right (Num 1) 232 | ``` 233 | 234 | Here are the other functions in lexeme style. 235 | 236 | > var' :: Parser SimpleExpr 237 | > var' = lexeme $ do 238 | > fl <- firstChar 239 | > rest <- many nonFirstChar 240 | > return (Var (fl:rest)) 241 | > where 242 | > firstChar = satisfy (\a -> isLetter a || a == '_') 243 | > nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_') 244 | 245 | > add' :: Parser SimpleExpr 246 | > add' = do 247 | > e0 <- num' 248 | > void $ lexeme $ char '+' 249 | > e1 <- num' 250 | > return (Add e0 e1) 251 | 252 | > parens' :: Parser SimpleExpr 253 | > parens' = do 254 | > void $ lexeme $ char '(' 255 | > e <- num' 256 | > void $ lexeme $ char ')' 257 | > return (Parens e) 258 | 259 | In this style, you have to be slightly careful to make sure you call 260 | whitespace at the right points. 261 | 262 | Let's try and implement the simpleExpr parser. We can use a function 263 | called 'choice': 264 | 265 | > numOrVar :: Parser SimpleExpr 266 | > numOrVar = choice [num', var'] 267 | 268 | It tries each parser one at a time, finishing with the first one that 269 | succeeds. There are some more details about this later on. 270 | 271 | Here is another way to write the numOrVar parser: 272 | 273 | > numOrVar' :: Parser SimpleExpr 274 | > numOrVar' = num' <|> var' 275 | 276 | In general, you can write 'choice [p0, p1, p2, ...]' as 'p0 <|> p1 <|> 277 | p2 <|> ...'. 278 | 279 | TODO: a bunch of examples 280 | 281 | Here is the first version of the simpleExpr parser: 282 | 283 | > simpleExpr :: Parser SimpleExpr 284 | > simpleExpr = choice [num', var', add', parens'] 285 | 286 | TODO: a bunch of examples 287 | 288 | It works well for some of the parsers, but fails with add': the num' 289 | always partially succeeds first, then fails, so the add' is never 290 | tried. We can rearrange this parser like this: 291 | 292 | > simpleExpr' :: Parser SimpleExpr 293 | > simpleExpr' = choice [add', num', var', parens'] 294 | 295 | TODO: examples again 296 | 297 | We have another problem now: when we start parsing the add', see a 298 | num, then fail, it gives up completely. This is because the 'choice' 299 | function (and <|>) will only try the next parser if the parser that 300 | failed consumed no input before failing. 301 | 302 | TODO: show examples with satify and choice on 1 char match/no match, 303 | and two char match/no match. 304 | 305 | Here is one way to fix it: 306 | 307 | > simpleExpr'' :: Parser SimpleExpr 308 | > simpleExpr'' = choice [try add', num', var', parens'] 309 | 310 | The try function implements backtracking. When this is used in a 311 | choice like this, it means that if the add' parser fails, it will undo 312 | the consumed input and carry on with the next option, instead of 313 | failing completely. If there is another place using try higher up in 314 | the call stack, then we will continue there, otherwise the whole parse 315 | will fail immediately. 316 | 317 | The same happens with <|>, we can implement the simpleExpr parser like 318 | this also: 319 | 320 | > simpleExpr''' :: Parser SimpleExpr 321 | > simpleExpr''' = try add' <|> num' <|> var' <|> parens' 322 | 323 | TODO: show the examples all working 324 | 325 | Now we can make 'parens' and 'add' use a general simple expression 326 | parser. Parens is simple: 327 | 328 | > parens'' :: Parser SimpleExpr 329 | > parens'' = do 330 | > void $ lexeme $ char '(' 331 | > e <- simpleExpr''' 332 | > void $ lexeme $ char ')' 333 | > return (Parens e) 334 | 335 | There is a problem implementing 'add' in the same way: 336 | 337 | > add'' :: Parser SimpleExpr 338 | > add'' = do 339 | > e0 <- simpleExpr''' 340 | > void $ lexeme $ char '+' 341 | > e1 <- simpleExpr''' 342 | > return (Add e0 e1) 343 | 344 | It will never return since it calls simpleExpr''' which calls add'' 345 | again. 346 | 347 | Let's look at another problem: 348 | 349 | ``` 350 | *Main> parseWithWhitespace simpleExpr''' " 1 + 1 + 1" 351 | Left (line 1, column 8): 352 | unexpected '+' 353 | expecting end of input 354 | ``` 355 | 356 | Our parser will only parse one operator, and not a chain of them. 357 | 358 | Here is one way to solve it: 359 | 360 | > simpleExpr4 :: Parser SimpleExpr 361 | > simpleExpr4 = do 362 | > e <- term 363 | > maybeAddSuffix e 364 | > where 365 | > maybeAddSuffix e = addSuffix e <|> return e 366 | > addSuffix e0 = do 367 | > void $ lexeme $ char '+' 368 | > e1 <- term 369 | > maybeAddSuffix (Add e0 e1) 370 | > term = num' <|> var' <|> parens' 371 | 372 | TODO: explain how this works in much more detail 373 | 374 | This forces our Add operator to be left associative. This also solves 375 | the previous problem with add calling simpleExpr recursively. There is 376 | lots of discussion about these issues in Parsing theory documents you 377 | can find online, etc.. 378 | 379 | == general simple expression parser 380 | 381 | Here is the all the parser code written out again for clarity. 382 | 383 | > numD :: Parser SimpleExpr 384 | > numD = lexeme $ do 385 | > n <- many1 digit 386 | > return $ Num $ read n 387 | 388 | > varD :: Parser SimpleExpr 389 | > varD = lexeme $ do 390 | > fl <- firstChar 391 | > rest <- many nonFirstChar 392 | > return $ Var (fl:rest) 393 | > where 394 | > firstChar = satisfy (\a -> isLetter a || a == '_') 395 | > nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_') 396 | 397 | > parensD :: Parser SimpleExpr 398 | > parensD = do 399 | > void $ lexeme $ char '(' 400 | > e <- simpleExprD 401 | > void $ lexeme $ char ')' 402 | > return $ Parens e 403 | 404 | > simpleExprD :: Parser SimpleExpr 405 | > simpleExprD = do 406 | > e <- term 407 | > maybeAddSuffix e 408 | > where 409 | > maybeAddSuffix e = 410 | > choice [addSuffix e 411 | > ,return e] 412 | > addSuffix e0 = do 413 | > void $ lexeme $ char '+' 414 | > e1 <- term 415 | > maybeAddSuffix (Add e0 e1) 416 | > term = numD <|> varD <|> parensD 417 | 418 | 419 | === Testing with the examples 420 | 421 | TODO: write a little manual tester that accepts a parser and a list of 422 | examples, and checks they all parse correctly. 423 | 424 | === Testing with quickcheck 425 | 426 | Let's see if we can check with quickcheck. It's a bit tricky testing 427 | parsers in this way, but one way to do something useful is to generate 428 | random asts, convert them to concrete syntax, parse them, and check 429 | the result. We can write a simple 'pretty printer' to convert an ast 430 | to concrete syntax. 431 | 432 | ==== a pretty printer 433 | 434 | TODO: a really simple pretty printer just pasting strings together, no 435 | layout. 436 | 437 | ==== the quick check code 438 | 439 | TODO: write a quickcheck property and arbitary instance and show 440 | running it at the ghci prompt 441 | 442 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2019, Jake Wheat 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Jake Wheat nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /ParseFile.lhs: -------------------------------------------------------------------------------- 1 | 2 | = Parsing a file executable 3 | 4 | This is the example exe file which will parse a string from the file 5 | given on the command line. You can substitute your own parser for the 6 | parser given below. 7 | 8 | 9 | > import System.Environment 10 | > import Text.Parsec 11 | > import Text.Parsec.String 12 | > import Control.Monad 13 | 14 | > main :: IO () 15 | > main = do 16 | > a <- getArgs 17 | > case a of 18 | > [str] -> parseFromFile myParser str >>= either print print 19 | > _ -> error "please pass one argument with the file containing the text to parse" 20 | 21 | This is the parser which you can replace with your own code: 22 | 23 | > myParser :: Parser () 24 | > myParser = void $ string "correct" 25 | 26 | Here is an example of running this program: 27 | 28 | $ echo x > source_text && runhaskell ParseFile.lhs source_text 29 | "source_text" (line 1, column 1): 30 | unexpected "x" 31 | expecting "correct" 32 | $ echo correct > source_text && runhaskell ParseFile.lhs source_text 33 | () 34 | -------------------------------------------------------------------------------- /ParseSQLQueries.lhs: -------------------------------------------------------------------------------- 1 | 2 | = Parsing a file with SQL queries 3 | 4 | This is an example exe which parses a list of SQL queries in a file 5 | separated by semi colons. 6 | 7 | > import System.Environment 8 | > import Text.Parsec (ParseError, errorPos, sourceName, sourceLine, sourceColumn) 9 | > import Text.Parsec.String 10 | > import Text.Parsec.String.Char 11 | > import Text.Parsec.String.Combinator 12 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>), many) 13 | > import Control.Monad 14 | > import SimpleSQLQueryParser0 15 | > import Data.List 16 | > import qualified PrettyPrinting0 as P 17 | 18 | > main :: IO () 19 | > main = do 20 | > a <- getArgs 21 | > case a of 22 | > [str] -> parseFromFile myParser str 23 | > >>= either (putStrLn . showError) 24 | > (putStrLn . intercalate "\n" . map P.prettyQueryExpr) 25 | > _ -> error "please pass one argument with the file containing the queries to parse" 26 | 27 | > myParser :: Parser [QueryExpr] 28 | > myParser = whitespace 29 | > *> sepBy1 queryExpr semi 30 | > <* optional semi 31 | > <* eof 32 | > where semi = void $ lexeme $ char ';' 33 | 34 | > showError :: ParseError -> String 35 | > showError e = 36 | > let p = errorPos e 37 | > in sourceName p ++ ":" ++ show (sourceLine p) ++ ":" 38 | > ++ show (sourceColumn p) ++ ":\n" 39 | > ++ show e 40 | -------------------------------------------------------------------------------- /ParseString.lhs: -------------------------------------------------------------------------------- 1 | 2 | = Parsing a string executable 3 | 4 | This is the example exe file which will parse a string given on the 5 | command line. You can substitute your own parser for the parser given 6 | below. 7 | 8 | 9 | > import System.Environment 10 | > import Text.Parsec 11 | > import Text.Parsec.String 12 | > import Control.Monad 13 | 14 | > main :: IO () 15 | > main = do 16 | > a <- getArgs 17 | > case a of 18 | > [str] -> either print print $ parse myParser "" str 19 | > _ -> error "please pass one argument with the string to parse" 20 | 21 | This is the parser which you can replace with your own code: 22 | 23 | > myParser :: Parser () 24 | > myParser = void $ string "correct" 25 | 26 | Here is an example of running this program: 27 | 28 | ``` 29 | $ runhaskell ParseString.lhs x 30 | (line 1, column 1): 31 | unexpected "x" 32 | expecting "correct" 33 | 34 | $ runhaskell ParseString.lhs correct 35 | () 36 | ``` 37 | -------------------------------------------------------------------------------- /ParsingTpch.lhs: -------------------------------------------------------------------------------- 1 | 2 | = Parsing TPC-H queries 3 | 4 | In this file we are going to get the TPC-H queries parsing. This can 5 | be an example of taking a parser, identifying some missing features, 6 | and adding them. The TPC-H queries are slightly more complex than 7 | trivial, and will also serve to exercise the human readable layout 8 | feature of the pretty printer. 9 | 10 | Let's try the parser out on the TPC-H queries. 11 | 12 | ``` 13 | ``` 14 | 15 | Summary of errors so far: 16 | q1: typed literal: type_name 'literal value' 17 | q2: scalar subquery 18 | q3: typed literal 19 | q4: typed literal 20 | q5: typed literal 21 | q6: typed literal 22 | q7: ?? 23 | q8: extract?? 24 | q9: extract ?? 25 | q10: typed literal 26 | q11: scalar sub query 27 | q12: ?? 28 | q13: not like?? 29 | q14: decimal literal 30 | q15: cte 31 | q16: count distinct 32 | q17: decimal literal 33 | q18: in subquery 34 | q19: in literal list 35 | q20: in subquery 36 | q21: exists subquery 37 | q22: substring 38 | -------------------------------------------------------------------------------- /PrettyPrinting0.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[pretty-printing]] 3 | = Pretty printing 4 | 5 | Here is a pretty printer for the parser version 0. 6 | 7 | It uses Text.PrettyPrint module. I think this module is a bit out of 8 | date these days, it would be nice to update this code to a more 9 | fashionable pretty printing library. I haven't put much 10 | comments/explanation here. 11 | 12 | > module PrettyPrinting0 where 13 | 14 | > import Prelude hiding ((<>)) 15 | > import SimpleSQLQueryParser0 (ValueExpr(..), QueryExpr(..), TableRef(..) 16 | > ,JoinType(..), JoinCondition(..)) 17 | > import qualified SimpleSQLQueryParser0 as S 18 | > import Text.PrettyPrint (render, vcat, text, (<>), (<+>), empty, parens, 19 | > nest, Doc, punctuate, comma, sep, quotes, 20 | > doubleQuotes, hsep) 21 | > import Data.Maybe (maybeToList, catMaybes) 22 | > import FunctionsAndTypesForParsing 23 | > --import Text.Parsec.String.Combinator (eof) 24 | > import qualified Test.HUnit as H 25 | > import Text.Parsec (parse, ParseError) 26 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>), many) 27 | > --import Text.Parsec.String (Parser) 28 | 29 | The basic concept in this pretty printer is we convert the ast into 30 | `Doc` values, then convert these into a string. We have a bunch of 31 | functions to convert `String` to `Doc`s, and combine these docs with 32 | different layouts. 33 | 34 | By using a pretty printer library, we can get human readable source 35 | very easily compared with trying to convert directly to strings 36 | ourselves. 37 | 38 | == api 39 | 40 | > prettyQueryExpr :: QueryExpr -> String 41 | > prettyQueryExpr = render . queryExpr 42 | 43 | == value expressions 44 | 45 | > valueExpr :: ValueExpr -> Doc 46 | > valueExpr (StringLit s) = quotes $ text s 47 | > valueExpr (NumLit i) = text $ show i 48 | > valueExpr (Iden s) = text s 49 | > valueExpr (DIden q i) = text q <> text "." <> text i 50 | > valueExpr Star = text "*" 51 | > valueExpr (DStar q) = text q <> text "." <> text "*" 52 | > valueExpr (App f es) = text f <> parens (commaSep $ map valueExpr es) 53 | > valueExpr (PrefOp op e) = sep[text op, valueExpr e] 54 | > valueExpr (BinOp e0 op e1) = sep [valueExpr e0, text op, valueExpr e1] 55 | > valueExpr (Case test whens els) = 56 | > sep [text "case" <+> maybe empty valueExpr test 57 | > ,nest 5 $ sep (map wh whens 58 | > ++ [maybe empty 59 | > (\e -> text "else" <+> valueExpr e) 60 | > els]) 61 | > ,text "end"] 62 | > where wh (w,t) = sep [text "when" <+> valueExpr w 63 | > ,text "then" <+> valueExpr t] 64 | > valueExpr (Parens e) = parens $ valueExpr e 65 | 66 | == query expressions 67 | 68 | > queryExpr :: QueryExpr -> Doc 69 | > queryExpr (Select sl fr wh gb hv ob) = sep 70 | > [text "select" <+> nest 7 (commaSep $ map selectItem sl) 71 | > -- from 72 | > ,ml fr $ \f -> text "from" <+> nest 7 (commaSep $ map tref f) 73 | > -- where 74 | > ,me wh $ \w -> text "where" <+> nest 6 (valueExpr w) 75 | > -- group by 76 | > ,ml gb $ \g -> text "group by" <+> nest 9 (commaSep $ map valueExpr g) 77 | > -- having 78 | > ,me hv $ \h -> text "having" <+> nest 6 (valueExpr h) 79 | > -- order by 80 | > ,ml ob $ \o -> text "order by" <+> nest 9 (commaSep $ map valueExpr o) 81 | > ] 82 | > where 83 | > selectItem (e,a) = valueExpr e <+> me a (\a' -> text "as" <+> text a') 84 | > 85 | > tref (TRSimple t) = text t 86 | > tref (TRParens t) = parens $ tref t 87 | > tref (TRAlias t a) = tref t <+> text "as" <+> text a 88 | > tref (TRQueryExpr q) = parens $ queryExpr q 89 | > tref (TRJoin t0 jt t1 jc) = sep 90 | > [tref t0 91 | > ,joinName jt jc <+> tref t1 92 | > ,case jc of 93 | > Just (JoinOn e) -> text "on" <+> valueExpr e 94 | > Just (JoinUsing is) -> text "using" <+> parens (commaSep $ map text is) 95 | > Just JoinNatural -> empty 96 | > Nothing -> empty] 97 | > joinName jt jc = 98 | > hsep [case jc of 99 | > Just JoinNatural -> text "natural" 100 | > _ -> empty 101 | > ,case jt of 102 | > JoinInner -> text "inner join" 103 | > JoinCross -> text "cross join" 104 | > JoinLeft -> text "left join" 105 | > JoinRight -> text "right join" 106 | > JoinFull -> text "full join"] 107 | > me e r = maybe empty r e 108 | > ml [] _ = empty 109 | > ml l r = r l 110 | 111 | == helpers 112 | 113 | > commaSep :: [Doc] -> Doc 114 | > commaSep = sep . punctuate comma 115 | 116 | Have a look at the haddock for this module and see if you can work out 117 | how the code above works. 118 | 119 | ``` 120 | *PrettyPrinting0> either (error . show) valueExpr (parseWithEof (S.valueExpr []) "a and b") 121 | a and b 122 | 123 | *PrettyPrinting0> either (error . show) queryExpr (parseWithEof S.queryExpr "select a from t inner join u using(a,b)") 124 | select a from t inner join u using (a, b) 125 | ``` 126 | 127 | == tests 128 | 129 | Now we can do some tests: we take the previous test data, and for each 130 | test add an additional test which pretty prints then parses the 131 | results to see that it is unchanged. 132 | 133 | > makeTest :: (Eq a, Show a) => 134 | > (String -> Either ParseError a) 135 | > -> (a -> String) 136 | > -> (String,a) 137 | > -> H.Test 138 | > makeTest parser pretty (src,expected) = H.TestLabel src $ H.TestCase $ do 139 | > let gote = parser src 140 | > case gote of 141 | > Left e -> H.assertFailure $ show e 142 | > Right got -> do 143 | > H.assertEqual src expected got 144 | > let prsql = pretty got 145 | > gotpretty = parser prsql 146 | > case gotpretty of 147 | > Left e -> H.assertFailure $ "pretty: " ++ prsql ++ "\n" ++ show e 148 | > Right gotp -> H.assertEqual ("pretty: " ++ prsql) expected gotp 149 | 150 | TODO: fix parsing issue 151 | 152 | ``` 153 | *PrettyPrinting0> H.runTestTT $ H.TestList $ map (makeTest S.parseQueryExpr (render . queryExpr)) S.allQueryExprTests 154 | Cases: 26 Tried: 26 Errors: 0 Failures: 0 155 | Counts {cases = 26, tried = 26, errors = 0, failures = 0} 156 | *PrettyPrinting0> H.runTestTT $ H.TestList $ map (makeTest S.parseValueExpr (render . valueExpr)) S.allValueExprTests 157 | ### Failure in: 34:case when a=1 then 2 when a=3 then 4 else 5 end 158 | (line 1, column 11): 159 | unexpected "a" 160 | expecting "--" or "/*" 161 | Cases: 35 Tried: 35 Errors: 0 Failures: 1 162 | Counts {cases = 35, tried = 35, errors = 0, failures = 1} 163 | ``` 164 | -------------------------------------------------------------------------------- /QueryExpressions.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[query-expressions]] 3 | = Query expressions 4 | 5 | We can now start on the 'select' parser. In the SQL standard, it 6 | refers to these things as 'query expressions' to distinguish then from 7 | 'value expressions', so we will reuse this language here. 8 | 9 | The subset of SQL we will support is this: 10 | 11 | TODO: write lots of examples here 12 | 13 | select queries only, no union, intersect or except. No common table 14 | expressions. 15 | 16 | we will support all the value expressions that the value expression 17 | parser above supports 18 | 19 | we will support select lists with optional aliases, with the 'as' 20 | optional in the alias 21 | 22 | we will support the * as in 'select * from t', and the variation 23 | 'select t.* from t', but not the alias version select * as (a,b,c) 24 | from t. 25 | 26 | we support two part dotted identifiers in value expressions, but no 27 | other sort (such as 3-part dotted value expression identifiers, or 28 | schema qualified table names or function names). 29 | 30 | for the from clause, we will only support optional 'from table_name'. 31 | 32 | supports where 33 | 34 | we will support regular group by lists, but not the new group by 35 | options in SQL2003 (group by (), grouping sets, cube, rollup). 36 | 37 | we will support having 38 | 39 | we support order by, with multiple columns, but not explicit asc or 40 | desc , and no 'nulls first' or 'nulls last' syntax. 41 | 42 | 43 | No support for offset and fetch first, or variations. 44 | 45 | > {-# LANGUAGE TupleSections #-} 46 | > module QueryExpressions where 47 | > 48 | > --import Text.Groom (groom) 49 | > --import qualified Text.Parsec as P 50 | > import Text.Parsec.String (Parser) 51 | > import Text.Parsec (try,optionMaybe, optional, sepBy1,option) 52 | > import Control.Applicative ((<$>),(*>),(<*>)) 53 | > import Control.Monad (void,guard) 54 | > --import Debug.Trace 55 | > --import Data.List (intercalate) 56 | > import Data.Maybe () 57 | > import qualified Test.HUnit as H 58 | > import FunctionsAndTypesForParsing 59 | 60 | > import ValueExpressions (ValueExpr(..), valueExpr, identifier, symbol, keyword, comma) 61 | 62 | Here is the datatype for query expressions to get started with. In 63 | this tutorial, we will only support an optional single table in the 64 | from clause, and this will be expanded in the next tutorial. 65 | 66 | TODO: rearrange the from/where/groupby/having/orderby to a separate 67 | datatype which is optional in a select, and the from part is mandatory 68 | in this new type. This follows the standard and is more accurate 69 | syntax. 70 | 71 | > data QueryExpr 72 | > = Select 73 | > {qeSelectList :: [(ValueExpr,Maybe String)] 74 | > ,qeFrom :: Maybe String 75 | > ,qeWhere :: Maybe ValueExpr 76 | > ,qeGroupBy :: [ValueExpr] 77 | > ,qeHaving :: Maybe ValueExpr 78 | > ,qeOrderBy :: [ValueExpr] 79 | > } deriving (Eq,Show) 80 | 81 | Here is a default value which can be used to easily construct query 82 | expression values. 83 | 84 | > makeSelect :: QueryExpr 85 | > makeSelect = Select {qeSelectList = [] 86 | > ,qeFrom = Nothing 87 | > ,qeWhere = Nothing 88 | > ,qeGroupBy = [] 89 | > ,qeHaving = Nothing 90 | > ,qeOrderBy = []} 91 | 92 | == select lists 93 | 94 | Let's start with something simple: 95 | 96 | ``` 97 | select [value expr] 98 | ``` 99 | 100 | TODO: shorten the names of these examples 101 | 102 | > singleSelectItemTests :: [(String,QueryExpr)] 103 | > singleSelectItemTests = 104 | > [("select 1", makeSelect {qeSelectList = [(NumLit 1,Nothing)]})] 105 | 106 | Here are a couple of wrappers for `symbol` and `keyword` which wrap 107 | them with void. 108 | 109 | > keyword_ :: String -> Parser () 110 | > keyword_ = void . keyword 111 | 112 | > symbol_ :: String -> Parser () 113 | > symbol_ = void . symbol 114 | 115 | > singleSelectItem :: Parser QueryExpr 116 | > singleSelectItem = do 117 | > keyword_ "select" 118 | > e <- valueExpr [] 119 | > return $ makeSelect {qeSelectList = [(e,Nothing)]} 120 | 121 | You can use the old test runner to check these: 122 | 123 | ``` 124 | *QueryExpressions> H.runTestTT $ H.TestList $ map (makeTest singleSelectItem) parseSingleSelectItemTestData 125 | Cases: 1 Tried: 1 Errors: 0 Failures: 0 126 | Counts {cases = 1, tried = 1, errors = 0, failures = 0} 127 | ``` 128 | 129 | Let's rewrite it in the Applicative and mostly point free style. 130 | 131 | > singleSelectItemApplicative :: Parser QueryExpr 132 | > singleSelectItemApplicative = 133 | > (\sl -> makeSelect {qeSelectList = sl}) 134 | > <$> (keyword_ "select" *> (((:[]) . (,Nothing)) <$> valueExpr [])) 135 | 136 | That didn't go so well. Using a wrapper: 137 | 138 | > singleSelectItemApplicative' :: Parser QueryExpr 139 | > singleSelectItemApplicative' = 140 | > ms <$> (keyword_ "select" *> valueExpr []) 141 | > where 142 | > ms e = makeSelect {qeSelectList = [(e,Nothing)]} 143 | 144 | Now let's write something that supports multiple value expressions, 145 | e.g. 146 | 147 | ``` 148 | select 1+2, 3+4; 149 | ``` 150 | 151 | > multipleSelectItemsTests :: [(String,QueryExpr)] 152 | > multipleSelectItemsTests = 153 | > [("select a" 154 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)]}) 155 | > ,("select a,b" 156 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 157 | > ,(Iden "b",Nothing)]}) 158 | > ,("select 1+2,3+4" 159 | > ,makeSelect {qeSelectList = 160 | > [(BinOp (NumLit 1) "+" (NumLit 2),Nothing) 161 | > ,(BinOp (NumLit 3) "+" (NumLit 4),Nothing)]}) 162 | > ] 163 | 164 | > selectMultipleItems :: Parser QueryExpr 165 | > selectMultipleItems = do 166 | > keyword_ "select" 167 | > es <- commaSep1 (valueExpr []) 168 | > return $ makeSelect {qeSelectList = map (,Nothing) es} 169 | 170 | > commaSep1 :: Parser a -> Parser [a] 171 | > commaSep1 = (`sepBy1` comma) 172 | 173 | === aliases 174 | 175 | We can write names for the columns produced from a select list using 176 | the keyword `as`, and we can miss out the `as`: 177 | 178 | ```sql 179 | select a as a1, b as b1, f(c) as c1; 180 | 181 | -- no as 182 | select a a1, b b1; 183 | ``` 184 | 185 | > selectListTests :: [(String,QueryExpr)] 186 | > selectListTests = 187 | > [("select a as a1, b as b1" 188 | > ,makeSelect {qeSelectList = [(Iden "a", Just "a1") 189 | > ,(Iden "b", Just "b1")]}) 190 | > ,("select a a1, b b1" 191 | > ,makeSelect {qeSelectList = [(Iden "a", Just "a1") 192 | > ,(Iden "b", Just "b1")]}) 193 | > ] ++ multipleSelectItemsTests 194 | > ++ singleSelectItemTests 195 | 196 | Finally, here is the select list parser and the helper for select 197 | items: 198 | 199 | > selectItem0 :: Parser (ValueExpr, Maybe String) 200 | > selectItem0 = (,) <$> valueExpr [] <*> optionMaybe (try alias) 201 | > where alias = optional (keyword_ "as") *> identifier 202 | 203 | > selectList0 :: Parser [(ValueExpr, Maybe String)] 204 | > selectList0 = keyword_ "select" *> commaSep1 selectItem0 205 | 206 | > queryExpr0 :: Parser QueryExpr 207 | > queryExpr0 = mkSelect <$> selectList0 208 | > where mkSelect sl = makeSelect {qeSelectList = sl} 209 | 210 | == from clause 211 | 212 | > from :: Parser String 213 | > from = keyword_ "from" *> identifier 214 | 215 | > fromTests :: [(String,QueryExpr)] 216 | > fromTests = 217 | > [("select a from t" 218 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)] 219 | > ,qeFrom = (Just "t")})] 220 | 221 | > queryExpr1 :: Parser QueryExpr 222 | > queryExpr1 = mkSelect 223 | > <$> selectList0 224 | > <*> optionMaybe from 225 | > where mkSelect sl fr = makeSelect {qeSelectList = sl 226 | > ,qeFrom = fr} 227 | 228 | ``` 229 | *QueryExpressions> H.runTestTT $ H.TestList $ map (makeTest queryExpr1) (selectListTests ++ fromTests) 230 | ### Failure in: 6:select a from t 231 | (line 1, column 16): 232 | unexpected end of input 233 | expecting digit, letter, "_", "--" or "/*" 234 | Cases: 7 Tried: 7 Errors: 0 Failures: 1 235 | Counts {cases = 7, tried = 7, errors = 0, failures = 1} 236 | ``` 237 | 238 | This is a keyword issue again. We are parsing the `from` as if it was 239 | a column alias and then getting stuck. 240 | 241 | > blackListIdentifier :: [String] -> Parser String 242 | > blackListIdentifier bl = do 243 | > i <- identifier 244 | > guard (i `notElem` bl) 245 | > return i 246 | 247 | > selectItem :: Parser (ValueExpr, Maybe String) 248 | > selectItem = (,) <$> valueExpr [] <*> optionMaybe (try alias) 249 | > where alias = optional (keyword_ "as") *> blackListIdentifier ["from"] 250 | 251 | > selectList :: Parser [(ValueExpr, Maybe String)] 252 | > selectList = keyword_ "select" *> commaSep1 selectItem 253 | 254 | > queryExpr2 :: Parser QueryExpr 255 | > queryExpr2 = mkSelect 256 | > <$> selectList 257 | > <*> optionMaybe from 258 | > where mkSelect sl fr = makeSelect {qeSelectList = sl 259 | > ,qeFrom = fr} 260 | 261 | That did the job for now. 262 | 263 | == where 264 | 265 | The where, group by, having, and order by parsers are simple. 266 | 267 | > whereTests :: [(String,QueryExpr)] 268 | > whereTests = 269 | > [("select a from t where a = 5" 270 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)] 271 | > ,qeFrom = Just "t" 272 | > ,qeWhere = Just $ BinOp (Iden "a") "=" (NumLit 5)}) 273 | > ] 274 | 275 | > whereClause :: Parser ValueExpr 276 | > whereClause = keyword_ "where" *> valueExpr [] 277 | 278 | > queryExpr3 :: Parser QueryExpr 279 | > queryExpr3 = mkSelect 280 | > <$> selectList 281 | > <*> optionMaybe from 282 | > <*> optionMaybe whereClause 283 | > where mkSelect sl fr wh = 284 | > makeSelect {qeSelectList = sl 285 | > ,qeFrom = fr 286 | > ,qeWhere = wh} 287 | 288 | == group by 289 | 290 | > groupByTests :: [(String,QueryExpr)] 291 | > groupByTests = 292 | > [("select a,sum(b) from t group by a" 293 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 294 | > ,(App "sum" [Iden "b"],Nothing)] 295 | > ,qeFrom = Just "t" 296 | > ,qeGroupBy = [Iden "a"] 297 | > }) 298 | > ,("select a,b,sum(c) from t group by a,b" 299 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 300 | > ,(Iden "b",Nothing) 301 | > ,(App "sum" [Iden "c"],Nothing)] 302 | > ,qeFrom = Just "t" 303 | > ,qeGroupBy = [Iden "a",Iden "b"] 304 | > }) 305 | > ] 306 | 307 | > groupByClause :: Parser [ValueExpr] 308 | > groupByClause = keyword_ "group" *> keyword_ "by" 309 | > *> commaSep1 (valueExpr []) 310 | 311 | > queryExpr4 :: Parser QueryExpr 312 | > queryExpr4 = mkSelect 313 | > <$> selectList 314 | > <*> optionMaybe from 315 | > <*> optionMaybe whereClause 316 | > <*> option [] groupByClause 317 | > where mkSelect sl fr wh gr = 318 | > makeSelect {qeSelectList = sl 319 | > ,qeFrom = fr 320 | > ,qeWhere = wh 321 | > ,qeGroupBy = gr} 322 | 323 | == having 324 | 325 | > havingTests :: [(String,QueryExpr)] 326 | > havingTests = 327 | > [("select a,sum(b) from t group by a having sum(b) > 5" 328 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 329 | > ,(App "sum" [Iden "b"],Nothing)] 330 | > ,qeFrom = (Just "t") 331 | > ,qeGroupBy = [Iden "a"] 332 | > ,qeHaving = Just $ BinOp (App "sum" [Iden "b"]) ">" (NumLit 5) 333 | > }) 334 | > ] 335 | 336 | > having :: Parser ValueExpr 337 | > having = keyword_ "having" *> (valueExpr []) 338 | 339 | > queryExpr5 :: Parser QueryExpr 340 | > queryExpr5 = mkSelect 341 | > <$> selectList 342 | > <*> optionMaybe from 343 | > <*> optionMaybe whereClause 344 | > <*> option [] groupByClause 345 | > <*> optionMaybe having 346 | > where mkSelect sl fr wh gr hv = 347 | > makeSelect {qeSelectList = sl 348 | > ,qeFrom = fr 349 | > ,qeWhere = wh 350 | > ,qeGroupBy = gr 351 | > ,qeHaving = hv} 352 | 353 | Looking nice so far. Did you run the tests for each stage? 354 | 355 | 356 | == order by 357 | 358 | > orderByTests :: [(String,QueryExpr)] 359 | > orderByTests = 360 | > [("select a from t order by a" 361 | > ,ms [Iden "a"]) 362 | > ,("select a from t order by a, b" 363 | > ,ms [Iden "a", Iden "b"]) 364 | > ] 365 | > where 366 | > ms o = makeSelect {qeSelectList = [(Iden "a",Nothing)] 367 | > ,qeFrom = (Just "t") 368 | > ,qeOrderBy = o} 369 | 370 | > orderBy :: Parser [ValueExpr] 371 | > orderBy = keyword_ "order" *> keyword_ "by" 372 | > *> commaSep1 (valueExpr []) 373 | 374 | > queryExpr6 :: Parser QueryExpr 375 | > queryExpr6 = Select 376 | > <$> selectList 377 | > <*> optionMaybe from 378 | > <*> optionMaybe whereClause 379 | > <*> option [] groupByClause 380 | > <*> optionMaybe having 381 | > <*> option [] orderBy 382 | 383 | ``` 384 | *QueryExpressions> H.runTestTT $ H.TestList $ map (makeTest queryExpr6) (selectListTests ++ fromTests ++ whereTests ++ groupByTests ++ havingTests ++ orderByTests) 385 | Cases: 13 Tried: 13 Errors: 0 Failures: 0 386 | Counts {cases = 13, tried = 13, errors = 0, failures = 0} 387 | ``` 388 | 389 | TODO: talk about putting the maybes/default values inside from, where, etc?? 390 | -------------------------------------------------------------------------------- /README.asciidoc: -------------------------------------------------------------------------------- 1 | = Intro to Parsing with Parsec in Haskell 2 | 3 | == Overview 4 | 5 | WIP, a tutorial which demonstrates the basics of Parsec and goes on to 6 | build a SQL query parser. 7 | 8 | You can view this tutorial as HTML online here: 9 | 10 | http://jakewheat.github.io/intro_to_parsing/ 11 | 12 | and you can view the files directly in the github repository here: 13 | 14 | https://github.com/JakeWheat/intro_to_parsing 15 | 16 | === Summary of sections 17 | 18 | // the first link for each section will work in the readme on github, 19 | // the second link is for the rendered html and doesn't work here 20 | 21 | link:GettingStarted.lhs[] 22 | 23 | // <> 24 | 25 | Introduction to parsing with Parsec, including a review of 26 | Text.Parsec.Char functions. 27 | 28 | link:VerySimpleExpressions.lhs[] 29 | 30 | // <> 31 | 32 | Creating a very simple expression language parser, and introducing 33 | some functions from Text.Parsec.Combinator. 34 | 35 | link:ApplicativeStyle.lhs[] 36 | 37 | // <> 38 | 39 | Rewriting the simple expression parser code in a more succinct style. 40 | 41 | link:CombinatorReview.lhs[] 42 | 43 | // <> 44 | 45 | Review and examples of all functions from Text.Parsec.Combinator, and 46 | some from Control.Applicative and Control.Monad. 47 | 48 | link:FunctionsAndTypesForParsing.lhs[] 49 | 50 | // <> 51 | 52 | The utility functions used in the previous tutorials, plus some notes 53 | on types in Parsec. 54 | 55 | link:TextParsecExpr.lhs[] 56 | 57 | // <> 58 | 59 | This covers using the Text.Parsec.Expr for expression parsing with 60 | prefix, postfix and infix operators with fixity. 61 | 62 | link:AnIssueWithTokenParsers.lhs[] 63 | 64 | // <> 65 | 66 | Looks at an issue we have with the way the symbol parser in the 67 | Text.Parsec.Expr tutorial was used, and some possible fixes. 68 | 69 | link:TextParsecPerm.lhs[] 70 | 71 | // <> 72 | 73 | This covers the Text.Parsec.Perm module which is used for parsing 74 | different things in flexible order. 75 | 76 | link:TextParsecToken.lhs[] 77 | 78 | // <> 79 | 80 | This covers Text.Parsec.Token which can be used to create token 81 | parsers easily. 82 | 83 | link:ValueExpressions.lhs[] 84 | 85 | // <> 86 | 87 | This covers building a parser a subset of value expressions from SQL, 88 | which are an extension of the simple expression types and parsers 89 | covered in previous tutorials. 90 | 91 | link:QueryExpressions.lhs[] 92 | 93 | // <> 94 | 95 | This covers building a parser to parse query expressions with select 96 | lists, simple from, where, group by, having and order by. 97 | 98 | link:FromClause.lhs[] 99 | 100 | // <> 101 | 102 | This extend the parser for query expressions to support a from clause 103 | with much more features including joins. 104 | 105 | link:SimpleSQLQueryParser0.lhs[] 106 | 107 | // <> 108 | 109 | Here is the code from ValueExpressions, QueryExpressions and 110 | FromClause plus tests put together and rearranged as a coherent 111 | standalone module. 112 | 113 | link:PrettyPrinting0.lhs[] 114 | 115 | // <> 116 | 117 | This quick module covers a simple pretty printer for our SQL ast. 118 | 119 | link:ErrorMessages.lhs[] 120 | 121 | // <> 122 | 123 | In this document, we will explore error messages with parsec and how 124 | restructuring parser code can lead to better or worse error messages. 125 | 126 | === Going further 127 | 128 | If you are interested in SQL parsing, check out the project to build 129 | a complete SQL parser here: 130 | http://jakewheat.github.io/simple-sql-parser/latest. The parsing code in the 131 | simple-sql-parser project is based on this tutorial code. 132 | 133 | //// 134 | 135 | Later documents 136 | 137 | Additional provisional documents not yet started: 138 | 139 | Parsing TPC-H queries 140 | 141 | We will use the tpch queries as examples to help improve the pretty 142 | printer. First there are a few extra bits of syntax to be able to 143 | parse these queries 144 | 145 | Pretty printing part 2 146 | 147 | some tweaks to the pretty printer to improve the layout for the tpch 148 | queries 149 | 150 | Writing tests 151 | 152 | Here we will take the ad hoc tests and build an organised test suite 153 | with a wrapper for hunit, wrapper for test.framework wrapper and maybe 154 | tasty 155 | 156 | Refactored project + cabal package 157 | 158 | In this tutorial, we will take the sql parser, pretty printer and 159 | tests, and create a complete cabal package. 160 | 161 | TODO: talk about robustness and the casual way the parser has been put 162 | together and the casual way issues have been tackled. 163 | 164 | Writing a command line sql interface 165 | 166 | quick experiment to try to implement the front end for a multiline sql 167 | command line using fake incremental parsing which parsec doesn't 168 | support directly. 169 | 170 | Position annotation 171 | 172 | In this tutorial, we will add position annotation to the parsing, so 173 | that a later stage could, e.g., provide type error messages with the 174 | correct line and column numbers. 175 | 176 | Dialects 177 | 178 | In this tutorial, we will discuss how we can support other SQL dialects 179 | 180 | Separate lexer 181 | 182 | In this tutorial, we will look at creating a proper separate lexer to 183 | see how it is done, and remark on what the tradeoffs seem to be. 184 | 185 | Fixing fixity 186 | 187 | Parsing full SQL expressions is a mess, and trying to do the fixity at 188 | parse time has many downsides. Here is another approach, to ignore 189 | fixity at parse time and fix it in a pass on the ast after parsing. 190 | 191 | Quasiquotes 192 | 193 | In this tutorial, we will create quasiquoters for sql query 194 | expressions and value expressions, and see how powerful this can be 195 | 196 | 197 | Something about syntax highlighting, generating documentation + links? 198 | 199 | //// 200 | 201 | === Extras 202 | 203 | * link:ParseString.lhs[] 204 | 205 | an executable which contains the boilerplate to run a parsec parser on 206 | a string passed as an argument 207 | 208 | * link:ParseFile.lhs[] 209 | 210 | an executable which contains the boilerplate to run a parsec parser on 211 | a file passed as an argument 212 | 213 | Contact: +++jakewheatmail@gmail.com+++ 214 | 215 | License: BSD3 216 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /SimpleSQLQueryParser0.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[simple-sql-query-parser]] 3 | = Simple SQL query parser 4 | 5 | Here is the complete syntax, parser and tests for the value and query 6 | expressions so far as a self contained module 7 | 8 | > module SimpleSQLQueryParser0 where 9 | > 10 | > import Text.Parsec.String (Parser) 11 | > import Text.Parsec.String.Parsec (try) 12 | > import Text.Parsec.String.Char 13 | > import Text.Parsec.String.Combinator 14 | > import Text.Parsec (parse,ParseError) 15 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>), many) 16 | > import qualified Text.Parsec.String.Expr as E 17 | > import Control.Monad 18 | > --import Data.List (intercalate) 19 | > import Data.Maybe () 20 | > import qualified Test.HUnit as H 21 | > import FunctionsAndTypesForParsing 22 | > import Debug.Trace 23 | 24 | == Supported SQL 25 | 26 | === comments 27 | 28 | ```sql 29 | -- single line comment 30 | /* 31 | multiline 32 | comment 33 | */ 34 | ``` 35 | 36 | The `/* */` comments do not nest. 37 | 38 | === value expressions 39 | 40 | ==== literals 41 | 42 | postive integral literals and string literals with single quote, 43 | without escaping of single quote within string (so there is no way to 44 | create a string literal with a single quote in it). 45 | 46 | ```sql 47 | 1 48 | 500 49 | 'string literal' 50 | ``` 51 | ==== identifiers 52 | 53 | Unquoted identifiers only, an identifier may start with a letter or 54 | underscore, and contain letters, underscores and digits. 55 | 56 | ```sql 57 | a 58 | something 59 | _test_ 60 | a123 61 | ``` 62 | 63 | ==== dotted identifiers 64 | 65 | Supports two part dotted identifiers only. Both parts must parse 66 | according to the rules for regular identifiers. 67 | 68 | ```sql 69 | t.a 70 | something.something_else 71 | ``` 72 | 73 | ==== star 74 | 75 | Star, plus dotted star using the identifier rules for the first part. 76 | 77 | ```sql 78 | * 79 | t.* 80 | ``` 81 | 82 | ==== function application 83 | 84 | The function name must parse as a valid identifier. 85 | 86 | ```sql 87 | f() 88 | g(1) 89 | h(2,'something') 90 | ``` 91 | 92 | ==== operators 93 | 94 | Here is the range of operators supported. 95 | 96 | ```sql 97 | a = b 98 | a > b 99 | a < b 100 | a >= b 101 | a <= b 102 | a != b 103 | a <> b 104 | a and b 105 | a or b 106 | 1 + 2 107 | 1 - 2 108 | 1 * 2 109 | 1 / 2 110 | 'some' || 'thing' 111 | a like b 112 | not a 113 | ``` 114 | 115 | ==== case expression 116 | 117 | ```sql 118 | case a 119 | when 3 then 'got three' 120 | when 5 then 'got five' 121 | else 'neither' 122 | end 123 | ``` 124 | 125 | ```sql 126 | case 127 | when a = 3 then 'a is three' 128 | when b = 4 then 'b is four' 129 | else 'neither' 130 | end 131 | ``` 132 | 133 | The else branch is optional in both cases. 134 | 135 | ==== parentheses 136 | 137 | ```sql 138 | (1 + 2) * 3 139 | ``` 140 | 141 | Parentheses are explicit in the abstract syntax. 142 | 143 | === query expressions 144 | 145 | TODO: examples 146 | 147 | select queries only, no union, intersect or except. No common table 148 | expressions. 149 | 150 | select list aliases, with the 'as' optional in the alias 151 | 152 | 'select * from t' 153 | 'select t.* from t' 154 | but not the alias version 'select * as (a,b,c) from t'. 155 | 156 | from clause 157 | 158 | implicit and explicit joins, including keywords 159 | natural, inner, outer, left, right, full, cross, on and using, plus 160 | parens and simple aliases (e.g. select a from t u, but not select a 161 | from t(a,b)). 162 | 163 | where 164 | 165 | group by lists 166 | but not the new group by options in SQL2003 (group by (), grouping 167 | sets, cube, rollup). 168 | 169 | having 170 | 171 | order by, with multiple columns, but not explicit asc or 172 | desc , and no 'nulls first' or 'nulls last' syntax. 173 | 174 | No support for offset and fetch first, or variations. 175 | 176 | 177 | == Abstract syntax 178 | 179 | > data ValueExpr = StringLit String 180 | > | NumLit Integer 181 | > | Iden String 182 | > | DIden String String -- a.b 183 | > | Star 184 | > | DStar String -- t.* 185 | > | App String [ValueExpr] 186 | > | PrefOp String ValueExpr 187 | > | BinOp ValueExpr String ValueExpr 188 | > | Case (Maybe ValueExpr) -- test value 189 | > [(ValueExpr,ValueExpr)] -- when branches 190 | > (Maybe ValueExpr) -- else value 191 | > | Parens ValueExpr 192 | > deriving (Eq,Show) 193 | 194 | > data QueryExpr 195 | > = Select 196 | > {qeSelectList :: [(ValueExpr,Maybe String)] 197 | > ,qeFrom :: [TableRef] 198 | > ,qeWhere :: Maybe ValueExpr 199 | > ,qeGroupBy :: [ValueExpr] 200 | > ,qeHaving :: Maybe ValueExpr 201 | > ,qeOrderBy :: [ValueExpr] 202 | > } deriving (Eq,Show) 203 | 204 | > makeSelect :: QueryExpr 205 | > makeSelect = Select {qeSelectList = [] 206 | > ,qeFrom = [] 207 | > ,qeWhere = Nothing 208 | > ,qeGroupBy = [] 209 | > ,qeHaving = Nothing 210 | > ,qeOrderBy = []} 211 | 212 | > data TableRef = TRSimple String 213 | > | TRJoin TableRef JoinType TableRef (Maybe JoinCondition) 214 | > | TRParens TableRef 215 | > | TRAlias TableRef String 216 | > | TRQueryExpr QueryExpr 217 | > deriving (Eq,Show) 218 | 219 | > data JoinType = JoinInner | JoinLeft | JoinRight | JoinFull | JoinCross 220 | > deriving (Eq,Show) 221 | 222 | > data JoinCondition = JoinOn ValueExpr 223 | > | JoinUsing [String] 224 | > | JoinNatural 225 | > deriving (Eq,Show) 226 | 227 | == Value expression parsing 228 | 229 | === term components 230 | 231 | > num :: Parser ValueExpr 232 | > num = NumLit <$> integer 233 | 234 | > iden :: [String] -> Parser ValueExpr 235 | > iden blacklist = Iden <$> identifierBlacklist blacklist 236 | 237 | > parensValue :: Parser ValueExpr 238 | > parensValue = Parens <$> parens (valueExpr []) 239 | 240 | > stringLit :: Parser ValueExpr 241 | > stringLit = StringLit <$> stringToken 242 | 243 | > dIden :: Parser ValueExpr 244 | > dIden = DIden <$> identifier <*> (dot *> identifier) 245 | 246 | > star :: Parser ValueExpr 247 | > star = Star <$ symbol "*" 248 | 249 | > dstar :: Parser ValueExpr 250 | > dstar = DStar <$> (identifier <* dot <* symbol "*") 251 | 252 | > app :: Parser ValueExpr 253 | > app = App <$> identifier <*> parens (commaSep $ valueExpr []) 254 | 255 | === case 256 | 257 | > caseValue :: Parser ValueExpr 258 | > caseValue = 259 | > Case 260 | > <$> (keyword "case" *> optionMaybe caseVal) 261 | > <*> many1 whenClause 262 | > <*> optionMaybe elseClause 263 | > <* keyword "end" 264 | > where 265 | > whenClause = (,) <$> (keyword "when" *> caseVal) 266 | > <*> (keyword "then" *> caseVal) 267 | > elseClause = keyword "else" *> caseVal 268 | > caseVal = valueExpr blackList 269 | > blackList = ["case", "when", "then", "else", "end"] 270 | 271 | === term 272 | 273 | > term :: [String] -> Parser ValueExpr 274 | > term blackList = choice [caseValue 275 | > ,try app 276 | > ,try dstar 277 | > ,try dIden 278 | > ,iden blackList 279 | > ,num 280 | > ,parensValue 281 | > ,stringLit 282 | > ,star] 283 | 284 | === operators 285 | 286 | > table :: [[E.Operator ValueExpr]] 287 | > table = [[prefix "-", prefix "+"] 288 | > ,[binary "^" E.AssocLeft] 289 | > ,[binary "*" E.AssocLeft 290 | > ,binary "/" E.AssocLeft 291 | > ,binary "%" E.AssocLeft] 292 | > ,[binary "+" E.AssocLeft 293 | > ,binary "-" E.AssocLeft] 294 | > ,[binary "<=" E.AssocRight 295 | > ,binary ">=" E.AssocRight 296 | > ,binaryK "like" E.AssocNone 297 | > ,binary "!=" E.AssocRight 298 | > ,binary "<>" E.AssocRight 299 | > ,binary "||" E.AssocRight] 300 | > ,[binary "<" E.AssocNone 301 | > ,binary ">" E.AssocNone] 302 | > ,[binary "=" E.AssocRight] 303 | > ,[prefixK "not"] 304 | > ,[binaryK "and" E.AssocLeft] 305 | > ,[binaryK "or" E.AssocLeft]] 306 | > where 307 | > binary name assoc = 308 | > E.Infix (mkBinOp name <$ symbol name) assoc 309 | > mkBinOp nm a b = BinOp a nm b 310 | > prefix name = E.Prefix (PrefOp name <$ symbol name) 311 | > binaryK name assoc = 312 | > E.Infix (mkBinOp name <$ keyword name) assoc 313 | > prefixK name = E.Prefix (PrefOp name <$ keyword name) 314 | 315 | === valueExpr 316 | 317 | > valueExpr :: [String] -> Parser ValueExpr 318 | > valueExpr blackList = E.buildExpressionParser table (term blackList) 319 | 320 | == Query expression parsing 321 | 322 | > selectList :: Parser [(ValueExpr, Maybe String)] 323 | > selectList = keyword_ "select" *> commaSep1 selectItem 324 | 325 | > selectItem :: Parser (ValueExpr, Maybe String) 326 | > selectItem = (,) <$> valueExpr [] <*> optionMaybe (try alias) 327 | > where alias = optional (keyword_ "as") *> identifierBlacklist ["from"] 328 | 329 | > whereClause :: Parser ValueExpr 330 | > whereClause = keyword_ "where" *> valueExpr [] 331 | 332 | > groupByClause :: Parser [ValueExpr] 333 | > groupByClause = keyword_ "group" *> keyword_ "by" 334 | > *> commaSep1 (valueExpr []) 335 | 336 | > having :: Parser ValueExpr 337 | > having = keyword_ "having" *> (valueExpr []) 338 | 339 | > orderBy :: Parser [ValueExpr] 340 | > orderBy = keyword_ "order" *> keyword_ "by" 341 | > *> commaSep1 (valueExpr []) 342 | 343 | === from clause 344 | 345 | > from :: Parser [TableRef] 346 | > from = keyword_ "from" >> commaSep1 tref 347 | > where 348 | > tref = nonJoinTref >>= suffixWrapper joinTrefSuffix 349 | > joinTrefSuffix t0 = (do 350 | > nat <- option False (True <$ keyword_ "natural") 351 | > TRJoin t0 <$> joinType 352 | > <*> nonJoinTref 353 | > <*> optionMaybe (joinCondition nat)) 354 | > >>= suffixWrapper joinTrefSuffix 355 | > nonJoinTref = choice [TRSimple <$> identifier 356 | > ,try (TRQueryExpr <$> parens queryExpr) 357 | > ,TRParens <$> parens tref] 358 | > >>= suffixWrapper (try . alias) 359 | > alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier)) 360 | > aliasIdentifier = identifierBlacklist 361 | > [-- join keywords 362 | > "natural" 363 | > ,"inner" 364 | > ,"outer" 365 | > ,"cross" 366 | > ,"left" 367 | > ,"right" 368 | > ,"full" 369 | > ,"join" 370 | > ,"on" 371 | > ,"using" 372 | > -- subsequent clause keywords 373 | > ,"where" 374 | > ,"group" 375 | > ,"having" 376 | > ,"order" 377 | > ] 378 | 379 | > joinType :: Parser JoinType 380 | > joinType = choice 381 | > [JoinCross <$ keyword_ "cross" <* keyword_ "join" 382 | > ,JoinInner <$ keyword_ "inner" <* keyword_ "join" 383 | > ,JoinLeft <$ keyword_ "left" 384 | > <* optional (keyword_ "outer") 385 | > <* keyword_ "join" 386 | > ,JoinRight <$ keyword_ "right" 387 | > <* optional (keyword_ "outer") 388 | > <* keyword_ "join" 389 | > ,JoinFull <$ keyword_ "full" 390 | > <* optional (keyword_ "outer") 391 | > <* keyword_ "join" 392 | > ,JoinInner <$ keyword_ "join"] 393 | 394 | > joinCondition :: Bool -> Parser JoinCondition 395 | > joinCondition nat = 396 | > choice [guard nat >> return JoinNatural 397 | > ,keyword_ "on" >> JoinOn <$> valueExpr [] 398 | > ,keyword_ "using" >> JoinUsing <$> parens (commaSep1 identifier) 399 | > ] 400 | 401 | === queryExpr 402 | 403 | > queryExpr :: Parser QueryExpr 404 | > queryExpr = Select 405 | > <$> selectList 406 | > <*> option [] from 407 | > <*> optionMaybe whereClause 408 | > <*> option [] groupByClause 409 | > <*> optionMaybe having 410 | > <*> option [] orderBy 411 | 412 | == tokens 413 | 414 | > whitespace :: Parser () 415 | > whitespace = 416 | > choice [simpleWhitespace *> whitespace 417 | > ,lineComment *> whitespace 418 | > ,blockComment *> whitespace 419 | > ,return ()] 420 | > where 421 | > lineComment = try (string "--") 422 | > *> manyTill anyChar (void (char '\n') <|> eof) 423 | > blockComment = try (string "/*") 424 | > *> manyTill anyChar (try $ string "*/") 425 | > simpleWhitespace = void $ many1 (oneOf " \t\n") 426 | 427 | > lexeme :: Parser a -> Parser a 428 | > lexeme p = p <* whitespace 429 | 430 | > integer :: Parser Integer 431 | > integer = read <$> lexeme (many1 digit) 432 | 433 | > identifier :: Parser String 434 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar) 435 | > where 436 | > firstChar = letter <|> char '_' 437 | > nonFirstChar = digit <|> firstChar 438 | 439 | > symbol :: String -> Parser String 440 | > symbol s = try $ lexeme $ do 441 | > u <- many1 (oneOf "<>=+-^%/*!|") 442 | > guard (s == u) 443 | > return s 444 | 445 | > openParen :: Parser Char 446 | > openParen = lexeme $ char '(' 447 | 448 | > closeParen :: Parser Char 449 | > closeParen = lexeme $ char ')' 450 | 451 | > stringToken :: Parser String 452 | > stringToken = lexeme (char '\'' *> manyTill anyChar (char '\'')) 453 | 454 | > dot :: Parser Char 455 | > dot = lexeme $ char '.' 456 | 457 | > comma :: Parser Char 458 | > comma = lexeme $ char ',' 459 | 460 | == helper functions 461 | 462 | > keyword :: String -> Parser String 463 | > keyword k = try $ do 464 | > i <- identifier 465 | > guard (i == k) 466 | > return k 467 | 468 | > parens :: Parser a -> Parser a 469 | > parens = between openParen closeParen 470 | 471 | > commaSep :: Parser a -> Parser [a] 472 | > commaSep = (`sepBy` comma) 473 | 474 | > keyword_ :: String -> Parser () 475 | > keyword_ = void . keyword 476 | 477 | > symbol_ :: String -> Parser () 478 | > symbol_ = void . symbol 479 | 480 | > commaSep1 :: Parser a -> Parser [a] 481 | > commaSep1 = (`sepBy1` comma) 482 | 483 | > identifierBlacklist :: [String] -> Parser String 484 | > identifierBlacklist bl = do 485 | > i <- identifier 486 | > guard (i `notElem` bl) 487 | > return i 488 | 489 | > suffixWrapper :: (a -> Parser a) -> a -> Parser a 490 | > suffixWrapper p a = p a <|> return a 491 | 492 | == the parser api 493 | 494 | > parseQueryExpr :: String -> Either ParseError QueryExpr 495 | > parseQueryExpr = parse (whitespace *> queryExpr <* eof) "" 496 | 497 | > parseValueExpr :: String -> Either ParseError ValueExpr 498 | > parseValueExpr = parse (whitespace *> valueExpr [] <* eof) "" 499 | 500 | 501 | == tests 502 | 503 | > data TestItem = Group String [TestItem] 504 | > | ValueExpressionTest String ValueExpr 505 | > | QueryExpressionTest String QueryExpr 506 | 507 | > numLitTests :: [(String,ValueExpr)] 508 | > numLitTests = 509 | > [("1", NumLit 1) 510 | > ,("54321", NumLit 54321)] 511 | > 512 | > idenTests :: [(String,ValueExpr)] 513 | > idenTests = 514 | > [("test", Iden "test") 515 | > ,("_something3", Iden "_something3")] 516 | > 517 | > operatorTests :: [(String,ValueExpr)] 518 | > operatorTests = 519 | > map (\o -> (o ++ " a", PrefOp o (Iden "a"))) ["not", "+", "-"] 520 | > ++ map (\o -> ("a " ++ o ++ " b", BinOp (Iden "a") o (Iden "b"))) 521 | > ["=",">","<", ">=", "<=", "!=", "<>" 522 | > ,"and", "or", "+", "-", "*", "/", "||", "like"] 523 | > 524 | > parensTests :: [(String,ValueExpr)] 525 | > parensTests = [("(1)", Parens (NumLit 1))] 526 | 527 | > basicTests :: [(String,ValueExpr)] 528 | > basicTests = numLitTests ++ idenTests ++ operatorTests ++ parensTests 529 | 530 | > stringLiteralTests :: [(String,ValueExpr)] 531 | > stringLiteralTests = 532 | > [("''", StringLit "") 533 | > ,("'test'", StringLit "test")] 534 | 535 | > dIdenTests :: [(String,ValueExpr)] 536 | > dIdenTests = 537 | > [("t.a", DIden "t" "a")] 538 | 539 | > starTests :: [(String,ValueExpr)] 540 | > starTests = [("*", Star)] 541 | 542 | 543 | > dStarTests :: [(String,ValueExpr)] 544 | > dStarTests = [("t.*", DStar "t")] 545 | 546 | > appTests :: [(String,ValueExpr)] 547 | > appTests = [("f()", App "f" []) 548 | > ,("f(1)", App "f" [NumLit 1]) 549 | > ,("f(1,a)", App "f" [NumLit 1, Iden "a"])] 550 | 551 | > caseTests :: [(String,ValueExpr)] 552 | > caseTests = 553 | > [("case a when 1 then 2 end" 554 | > ,Case (Just $ Iden "a") [(NumLit 1,NumLit 2)] Nothing) 555 | > 556 | > ,("case a when 1 then 2 when 3 then 4 end" 557 | > ,Case (Just $ Iden "a") 558 | > [(NumLit 1, NumLit 2) 559 | > ,(NumLit 3, NumLit 4)] 560 | > Nothing) 561 | > 562 | > ,("case a when 1 then 2 when 3 then 4 else 5 end" 563 | > ,Case (Just $ Iden "a") 564 | > [(NumLit 1, NumLit 2) 565 | > ,(NumLit 3, NumLit 4)] 566 | > (Just $ NumLit 5)) 567 | > 568 | > ,("case when a=1 then 2 when a=3 then 4 else 5 end" 569 | > ,Case Nothing 570 | > [(BinOp (Iden "a") "=" (NumLit 1), NumLit 2) 571 | > ,(BinOp (Iden "a") "=" (NumLit 3), NumLit 4)] 572 | > (Just $ NumLit 5)) 573 | > ] 574 | 575 | > allValueExprTests :: [(String,ValueExpr)] 576 | > allValueExprTests = concat [basicTests 577 | > ,stringLiteralTests 578 | > ,dIdenTests 579 | > ,starTests 580 | > ,dStarTests 581 | > ,appTests 582 | > ,caseTests] 583 | 584 | > singleSelectItemTests :: [(String,QueryExpr)] 585 | > singleSelectItemTests = 586 | > [("select 1", makeSelect {qeSelectList = [(NumLit 1,Nothing)]})] 587 | 588 | > multipleSelectItemsTests :: [(String,QueryExpr)] 589 | > multipleSelectItemsTests = 590 | > [("select a" 591 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)]}) 592 | > ,("select a,b" 593 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 594 | > ,(Iden "b",Nothing)]}) 595 | > ,("select 1+2,3+4" 596 | > ,makeSelect {qeSelectList = 597 | > [(BinOp (NumLit 1) "+" (NumLit 2),Nothing) 598 | > ,(BinOp (NumLit 3) "+" (NumLit 4),Nothing)]}) 599 | > ] 600 | 601 | > selectListTests :: [(String,QueryExpr)] 602 | > selectListTests = 603 | > [("select a as a, b as b" 604 | > ,makeSelect {qeSelectList = [(Iden "a", Just "a") 605 | > ,(Iden "b", Just "b")]}) 606 | > ,("select a a, b b" 607 | > ,makeSelect {qeSelectList = [(Iden "a", Just "a") 608 | > ,(Iden "b", Just "b")]}) 609 | > ] ++ multipleSelectItemsTests 610 | > ++ singleSelectItemTests 611 | 612 | > fromTests :: [(String,QueryExpr)] 613 | > fromTests = 614 | > [("select a from t" 615 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)] 616 | > ,qeFrom = [TRSimple "t"]})] 617 | 618 | > whereTests :: [(String,QueryExpr)] 619 | > whereTests = 620 | > [("select a from t where a = 5" 621 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing)] 622 | > ,qeFrom = [TRSimple "t"] 623 | > ,qeWhere = Just $ BinOp (Iden "a") "=" (NumLit 5)}) 624 | > ] 625 | 626 | > groupByTests :: [(String,QueryExpr)] 627 | > groupByTests = 628 | > [("select a,sum(b) from t group by a" 629 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 630 | > ,(App "sum" [Iden "b"],Nothing)] 631 | > ,qeFrom = [TRSimple "t"] 632 | > ,qeGroupBy = [Iden "a"] 633 | > }) 634 | > ,("select a,b,sum(c) from t group by a,b" 635 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 636 | > ,(Iden "b",Nothing) 637 | > ,(App "sum" [Iden "c"],Nothing)] 638 | > ,qeFrom = [TRSimple "t"] 639 | > ,qeGroupBy = [Iden "a",Iden "b"] 640 | > }) 641 | > ] 642 | 643 | > havingTests :: [(String,QueryExpr)] 644 | > havingTests = 645 | > [("select a,sum(b) from t group by a having sum(b) > 5" 646 | > ,makeSelect {qeSelectList = [(Iden "a",Nothing) 647 | > ,(App "sum" [Iden "b"],Nothing)] 648 | > ,qeFrom = [TRSimple "t"] 649 | > ,qeGroupBy = [Iden "a"] 650 | > ,qeHaving = Just $ BinOp (App "sum" [Iden "b"]) ">" (NumLit 5) 651 | > }) 652 | > ] 653 | 654 | > orderByTests :: [(String,QueryExpr)] 655 | > orderByTests = 656 | > [("select a from t order by a" 657 | > ,ms [Iden "a"]) 658 | > ,("select a from t order by a, b" 659 | > ,ms [Iden "a", Iden "b"]) 660 | > ] 661 | > where 662 | > ms o = makeSelect {qeSelectList = [(Iden "a",Nothing)] 663 | > ,qeFrom = [TRSimple "t"] 664 | > ,qeOrderBy = o} 665 | 666 | > queryExprJoinTests :: [(String,QueryExpr)] 667 | > queryExprJoinTests = 668 | > [("select a from t" 669 | > ,ms [TRSimple "t"]) 670 | > ,("select a from t,u" 671 | > ,ms [TRSimple "t", TRSimple "u"]) 672 | > 673 | > ,("select a from t inner join u on expr" 674 | > ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u") 675 | > (Just $ JoinOn $ Iden "expr")]) 676 | > 677 | > ,("select a from t left join u on expr" 678 | > ,ms [TRJoin (TRSimple "t") JoinLeft (TRSimple "u") 679 | > (Just $ JoinOn $ Iden "expr")]) 680 | > 681 | > ,("select a from t right join u on expr" 682 | > ,ms [TRJoin (TRSimple "t") JoinRight (TRSimple "u") 683 | > (Just $ JoinOn $ Iden "expr")]) 684 | > 685 | > ,("select a from t full join u on expr" 686 | > ,ms [TRJoin (TRSimple "t") JoinFull (TRSimple "u") 687 | > (Just $ JoinOn $ Iden "expr")]) 688 | > 689 | > ,("select a from t cross join u" 690 | > ,ms [TRJoin (TRSimple "t") JoinCross (TRSimple "u") Nothing]) 691 | > 692 | > ,("select a from t natural inner join u" 693 | > ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u") 694 | > (Just JoinNatural)]) 695 | > 696 | > ,("select a from t inner join u using(a,b)" 697 | > ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u") 698 | > (Just $ JoinUsing ["a", "b"])]) 699 | > 700 | > ,("select a from (select a from t)" 701 | > ,ms [TRQueryExpr $ ms [TRSimple "t"]]) 702 | > 703 | > ,("select a from t as u" 704 | > ,ms [TRAlias (TRSimple "t") "u"]) 705 | > 706 | > ,("select a from t u" 707 | > ,ms [TRAlias (TRSimple "t") "u"]) 708 | > 709 | > ,("select a from (t cross join u) as u" 710 | > ,ms [TRAlias (TRParens $ TRJoin (TRSimple "t") JoinCross 711 | > (TRSimple "u") Nothing) "u"]) 712 | > ] 713 | > where 714 | > ms f = makeSelect {qeSelectList = [(Iden "a",Nothing)] 715 | > ,qeFrom = f} 716 | 717 | > allQueryExprTests :: [(String,QueryExpr)] 718 | > allQueryExprTests = concat [selectListTests ++ fromTests ++ whereTests ++ groupByTests ++ havingTests ++ orderByTests ++ queryExprJoinTests] 719 | 720 | todo: use external api parsing code 721 | 722 | > makeTest :: (Eq a, Show a) => Parser a -> (String,a) -> H.Test 723 | > makeTest parser (src,expected) = H.TestLabel src $ H.TestCase $ do 724 | > let gote = parse (whitespace *> parser <* eof) "" src 725 | > case gote of 726 | > Left e -> H.assertFailure $ show e 727 | > Right got -> H.assertEqual src expected got 728 | -------------------------------------------------------------------------------- /Text/Parsec/String/Char.hs: -------------------------------------------------------------------------------- 1 | module Text.Parsec.String.Char where 2 | 3 | {- 4 | 5 | Wrappers for the Text.Parsec.Char module with the types fixed to 6 | 'Text.Parsec.String.Parser a', i.e. the stream is String, no user 7 | state, Identity monad. 8 | 9 | -} 10 | 11 | import qualified Text.Parsec.Char as C 12 | import Text.Parsec.String (Parser) 13 | 14 | spaces :: Parser () 15 | spaces = C.spaces 16 | 17 | space :: Parser Char 18 | space = C.space 19 | 20 | newline :: Parser Char 21 | newline = C.newline 22 | 23 | tab :: Parser Char 24 | tab = C.tab 25 | 26 | upper :: Parser Char 27 | upper = C.upper 28 | 29 | lower :: Parser Char 30 | lower = C.lower 31 | 32 | alphaNum :: Parser Char 33 | alphaNum = C.alphaNum 34 | 35 | letter :: Parser Char 36 | letter = C.letter 37 | 38 | digit :: Parser Char 39 | digit = C.digit 40 | 41 | hexDigit :: Parser Char 42 | hexDigit = C.hexDigit 43 | 44 | octDigit :: Parser Char 45 | octDigit = C.octDigit 46 | 47 | char :: Char -> Parser Char 48 | char = C.char 49 | 50 | string :: String -> Parser String 51 | string = C.string 52 | 53 | anyChar :: Parser Char 54 | anyChar = C.anyChar 55 | 56 | oneOf :: [Char] -> Parser Char 57 | oneOf = C.oneOf 58 | 59 | noneOf :: [Char] -> Parser Char 60 | noneOf = C.noneOf 61 | 62 | satisfy :: (Char -> Bool) -> Parser Char 63 | satisfy = C.satisfy 64 | -------------------------------------------------------------------------------- /Text/Parsec/String/Combinator.hs: -------------------------------------------------------------------------------- 1 | 2 | module Text.Parsec.String.Combinator where 3 | 4 | {- 5 | 6 | Wrappers for the Text.Parsec.Combinator module with the types fixed to 7 | 'Text.Parsec.String.Parser a', i.e. the stream is String, no user 8 | state, Identity monad. 9 | 10 | -} 11 | 12 | import qualified Text.Parsec.Combinator as C 13 | import Text.Parsec.String (Parser) 14 | 15 | choice :: [Parser a] -> Parser a 16 | choice = C.choice 17 | 18 | 19 | count :: Int -> Parser a -> Parser [a] 20 | count = C.count 21 | 22 | between :: Parser open -> Parser close -> Parser a -> Parser a 23 | between = C.between 24 | 25 | 26 | option :: a -> Parser a -> Parser a 27 | option = C.option 28 | 29 | optionMaybe :: Parser a -> Parser (Maybe a) 30 | optionMaybe = C.optionMaybe 31 | 32 | optional :: Parser a -> Parser () 33 | optional = C.optional 34 | 35 | skipMany1 :: Parser a -> Parser () 36 | skipMany1 = C.skipMany1 37 | 38 | many1 :: Parser a -> Parser [a] 39 | many1 = C.many1 40 | 41 | sepBy :: Parser a -> Parser sep -> Parser [a] 42 | sepBy = C.sepBy 43 | 44 | sepBy1 :: Parser a -> Parser sep -> Parser [a] 45 | sepBy1 = C.sepBy1 46 | 47 | endBy :: Parser a -> Parser sep -> Parser [a] 48 | endBy = C.endBy 49 | 50 | endBy1 :: Parser a -> Parser sep -> Parser [a] 51 | endBy1 = C.endBy1 52 | 53 | sepEndBy :: Parser a -> Parser sep -> Parser [a] 54 | sepEndBy = C.sepEndBy 55 | 56 | sepEndBy1 :: Parser a -> Parser sep -> Parser [a] 57 | sepEndBy1 = C.sepEndBy1 58 | 59 | chainl :: Parser a -> Parser (a -> a -> a) -> a -> Parser a 60 | chainl = C.chainl 61 | 62 | chainl1 :: Parser a -> Parser (a -> a -> a) -> Parser a 63 | chainl1 = C.chainl1 64 | 65 | chainr :: Parser a -> Parser (a -> a -> a) -> a -> Parser a 66 | chainr = C.chainr 67 | 68 | chainr1 :: Parser a -> Parser (a -> a -> a) -> Parser a 69 | chainr1 = C.chainr1 70 | 71 | eof :: Parser () 72 | eof = C.eof 73 | 74 | notFollowedBy :: Show a => Parser a -> Parser () 75 | notFollowedBy = C.notFollowedBy 76 | 77 | manyTill :: Parser a -> Parser end -> Parser [a] 78 | manyTill = C.manyTill 79 | 80 | lookAhead :: Parser a -> Parser a 81 | lookAhead = C.lookAhead 82 | 83 | anyToken :: Parser Char 84 | anyToken = C.anyToken 85 | -------------------------------------------------------------------------------- /Text/Parsec/String/Expr.hs: -------------------------------------------------------------------------------- 1 | 2 | module Text.Parsec.String.Expr (buildExpressionParser 3 | ,Operator(..) 4 | ,OperatorTable 5 | ,E.Assoc(..) 6 | )where 7 | 8 | {- 9 | 10 | Wrappers for the Text.Parsec.Expr module with simplified types. 11 | 12 | -} 13 | 14 | import Text.Parsec.String (Parser) 15 | import qualified Text.Parsec.Expr as E 16 | 17 | -- not sure if this is neccessary, or a type alias would be good 18 | -- enough 19 | data Operator a = Infix (Parser (a -> a -> a)) E.Assoc 20 | | Prefix (Parser (a -> a)) 21 | | Postfix (Parser (a -> a)) 22 | 23 | type OperatorTable a = [[Operator a]] 24 | 25 | buildExpressionParser :: OperatorTable a -> Parser a -> Parser a 26 | buildExpressionParser t = E.buildExpressionParser (map (map f) t) 27 | where 28 | f (Infix p a) = E.Infix p a 29 | f (Prefix p) = E.Prefix p 30 | f (Postfix p) = E.Postfix p 31 | -------------------------------------------------------------------------------- /Text/Parsec/String/Parsec.hs: -------------------------------------------------------------------------------- 1 | 2 | module Text.Parsec.String.Parsec where 3 | 4 | {- 5 | 6 | wrapper for some Text.Parsec functions which use a simplified type 7 | 8 | -} 9 | 10 | import Text.Parsec.String (Parser) 11 | import qualified Text.Parsec as P 12 | 13 | 14 | try :: Parser a -> Parser a 15 | try = P.try 16 | 17 | parse :: Parser a -> P.SourceName -> String -> Either P.ParseError a 18 | parse = P.parse 19 | -------------------------------------------------------------------------------- /TextParsecExpr.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[parsing-expressions-with-fixity]] 3 | = Parsing expressions with fixity 4 | 5 | Text.Parsec.Expr allows building expression parsers with a range of 6 | operators with different precedences and associativities 7 | easily. Fixity is the (not completely standard) term for precendence 8 | and associativity together. 9 | 10 | `Text.Parsec.Expr` can be great to quickly get a parser for simple 11 | expressions or a simple programming language with simple expressions 12 | up and running. 13 | 14 | > import Text.Parsec.String (Parser) 15 | > import Text.Parsec.String.Combinator (many1, between) 16 | > import Text.Parsec.String.Char (letter, char, digit, string, oneOf) 17 | > 18 | > import Control.Applicative ((<$>), (<*>), (<*), (<|>), many, (<$)) 19 | > import Control.Monad (void) 20 | > 21 | > import qualified Text.Parsec.String.Expr as E 22 | > import FunctionsAndTypesForParsing 23 | 24 | Let's extend the SimpleExpression type and parsers to cover a range of 25 | operators with different precedences and associativity. 26 | 27 | == expressions with plus and times 28 | 29 | Let's start with a simple case: + and * with the usual fixity. Here is 30 | the abstract syntax: 31 | 32 | > data PlusTimesExpr = PteVar String 33 | > | PteNum Integer 34 | > | PteParens PlusTimesExpr 35 | > | Plus PlusTimesExpr PlusTimesExpr 36 | > | Times PlusTimesExpr PlusTimesExpr 37 | > deriving (Eq,Show) 38 | 39 | > plusTimesExpr :: Parser PlusTimesExpr 40 | > plusTimesExpr = E.buildExpressionParser pteTable pteTerm 41 | 42 | > pteTable :: [[E.Operator PlusTimesExpr]] 43 | > pteTable = [[E.Infix (Times <$ symbol "*") E.AssocLeft] 44 | > ,[E.Infix (Plus <$ symbol "+") E.AssocLeft] 45 | > ] 46 | 47 | Here you can see the operator parsers are the same as the previous 48 | `SimpleExpr` parser which used `chainl1`: `Times <$ symbol "*"` and 49 | `Plus <$ symbol "+"`. We just wrapped these up in the `E.Infix` 50 | constructor with the associativity, and put them in a list of lists 51 | which represents the precendence classes. 52 | 53 | Here is the term parser and components. All this is just the same as 54 | the `SimpleExpr` parser a previous tutorial. 55 | 56 | > pteTerm :: Parser PlusTimesExpr 57 | > pteTerm = pteVar <|> pteNum <|> pteParens 58 | 59 | > pteNum :: Parser PlusTimesExpr 60 | > pteNum = PteNum <$> integer 61 | 62 | > pteVar :: Parser PlusTimesExpr 63 | > pteVar = PteVar <$> identifier 64 | 65 | > pteParens :: Parser PlusTimesExpr 66 | > pteParens = PteParens <$> between (symbol "(") (symbol ")") plusTimesExpr 67 | 68 | support functions: 69 | 70 | > whitespace :: Parser () 71 | > whitespace = void $ many $ oneOf " \n\t" 72 | 73 | > lexeme :: Parser a -> Parser a 74 | > lexeme p = p <* whitespace 75 | 76 | > integer :: Parser Integer 77 | > integer = read <$> lexeme (many1 digit) 78 | 79 | > identifier :: Parser String 80 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar) 81 | > where 82 | > firstChar = letter <|> char '_' 83 | > nonFirstChar = digit <|> firstChar 84 | 85 | > symbol :: String -> Parser String 86 | > symbol s = lexeme $ string s 87 | 88 | Here you can see the precendence in action: 89 | 90 | ``` 91 | *Main> regularParse plusTimesExpr "a + b * c" 92 | Right (Plus (PteVar "a") (Times (PteVar "b") (PteVar "c"))) 93 | 94 | *Main> regularParse plusTimesExpr "a * b + c" 95 | Right (Plus (Times (PteVar "a") (PteVar "b")) (PteVar "c")) 96 | ``` 97 | 98 | == a full featured expression type 99 | 100 | Now let's try a much bigger example with lots more operators. Now we 101 | are thinking ahead to the first version of the SQL query parser, and 102 | preparing for this. 103 | 104 | Here are our new operators in precedence order: 105 | 106 | === unary + - 107 | 108 | ``` 109 | +a 110 | -3 111 | ``` 112 | 113 | === exponentiation 114 | 115 | ``` 116 | a ^ 3 117 | ``` 118 | 119 | associativity: left 120 | 121 | === multiplication, division, modulo 122 | 123 | ``` 124 | a * 3 125 | 3 / b 126 | a % 5 127 | ``` 128 | 129 | associativity: left 130 | 131 | === addition, subtraction 132 | 133 | ``` 134 | a + b 135 | a - b 136 | ``` 137 | 138 | associativity: left 139 | 140 | === less than, greater than 141 | 142 | ``` 143 | a < b 144 | a > b 145 | ``` 146 | 147 | associativity: none 148 | 149 | === equals 150 | 151 | ``` 152 | a = 3 153 | ``` 154 | 155 | associativity: right 156 | 157 | === not 158 | 159 | ``` 160 | not a 161 | ``` 162 | 163 | === and 164 | 165 | ``` 166 | a and b 167 | ``` 168 | 169 | associativity: left 170 | 171 | === or 172 | 173 | ``` 174 | a or b 175 | ``` 176 | 177 | associativity: left 178 | 179 | Here is the abstract syntax type: 180 | 181 | > data SimpleExpr = Num Integer 182 | > | Var String 183 | > | Parens SimpleExpr 184 | > | PrefixOp String SimpleExpr 185 | > | BinaryOp SimpleExpr String SimpleExpr 186 | > deriving (Eq,Show) 187 | 188 | Here is the new expression parser: 189 | 190 | > simpleExpr :: Parser SimpleExpr 191 | > simpleExpr = E.buildExpressionParser table term 192 | 193 | > table :: [[E.Operator SimpleExpr]] 194 | > table = [[prefix "-", prefix "+"] 195 | > ,[binary "^" E.AssocLeft] 196 | > ,[binary "*" E.AssocLeft 197 | > ,binary "/" E.AssocLeft 198 | > ,binary "%" E.AssocLeft] 199 | > ,[binary "+" E.AssocLeft 200 | > ,binary "-" E.AssocLeft] 201 | > ,[binary "<" E.AssocNone 202 | > ,binary ">" E.AssocNone] 203 | > ,[binary "=" E.AssocRight] 204 | > ,[prefix "not"] 205 | > ,[binary "and" E.AssocLeft] 206 | > ,[binary "or" E.AssocLeft] 207 | > ] 208 | > where 209 | > binary name assoc = 210 | > E.Infix (mkBinOp name <$ symbol name) assoc 211 | > mkBinOp nm a b = BinaryOp a nm b 212 | > prefix name = E.Prefix (PrefixOp name <$ symbol name) 213 | 214 | TODO: expand and explain the bits. 215 | 216 | Here is the term parser. 217 | 218 | > term :: Parser SimpleExpr 219 | > term = var <|> num <|> parens 220 | 221 | > num :: Parser SimpleExpr 222 | > num = Num <$> integer 223 | 224 | > var :: Parser SimpleExpr 225 | > var = Var <$> identifier 226 | 227 | > parens :: Parser SimpleExpr 228 | > parens = between (symbol "(") (symbol ")") simpleExpr 229 | 230 | TODO: write lots of parsing examples, including parse failures with 231 | ambiguity. 232 | 233 | issue: double prefix op, link to bug on parsec bug tracker. 234 | 235 | The source in Text.Parsec.Expr is not too big. You can have a look and 236 | try to understand it. There are several standard approaches in parsing 237 | theory to parse expressions with data driven precendences and 238 | associativity. I don't know which one Text.Parsec.Expr uses, but if 239 | you find these and read about them, then the source of 240 | Text.Parsec.Expr might be a bit more understandable. 241 | -------------------------------------------------------------------------------- /TextParsecPerm.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[permutation-parsing]] 3 | = Permutation parsing 4 | 5 | This can parse a bunch of different things in any order. TODO: 6 | examples. 7 | -------------------------------------------------------------------------------- /TextParsecToken.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[token-parsing]] 3 | = Token parsing 4 | 5 | This can be used quickly create a set of token parsers handling lots 6 | of little issues which you otherwise have to deal with manually. 7 | 8 | -------------------------------------------------------------------------------- /VerySimpleExpressions.lhs: -------------------------------------------------------------------------------- 1 | 2 | [[very-simple-expression-parsing]] 3 | = Very simple expression parsing 4 | 5 | In this tutorial we will develop a parser for a very simple expression 6 | language, and start learning about the set of combinators which comes 7 | with Parsec. 8 | 9 | > import Text.Parsec (ParseError) 10 | > import Text.Parsec.String (Parser) 11 | > import Text.Parsec.String.Parsec (try) 12 | > import Text.Parsec.String.Char (oneOf, char, digit, satisfy) 13 | > import Text.Parsec.String.Combinator (many1, choice, chainl1) 14 | > import Control.Applicative ((<|>), many) 15 | > import Control.Monad (void) 16 | > import Data.Char (isLetter, isDigit) 17 | > import FunctionsAndTypesForParsing 18 | 19 | == num 20 | 21 | The first element we will have in this expression language is positive 22 | integral numbers: 23 | 24 | > numberExamples :: [(String,Integer)] 25 | > numberExamples = [("1", 1) 26 | > ,("23", 23)] 27 | 28 | TODO: make examples with parsing failures for all of the example 29 | scripts below? 30 | 31 | To parse a number, we need to parse one or more digits, and then read 32 | the resulting string. We can use the combinator `many1` to help with 33 | this. We will also use do notation. 34 | 35 | > num :: Parser Integer 36 | > num = do 37 | > n <- many1 digit 38 | > return (read n) 39 | 40 | Let's try it out. 41 | 42 | ``` 43 | *Main> regularParse num "1" 44 | Right 1 45 | 46 | *Main> regularParse num "123456" 47 | Right 123456 48 | 49 | *Main> regularParse num "aa" 50 | Left (line 1, column 1): 51 | unexpected "a" 52 | expecting digit 53 | 54 | ``` 55 | 56 | How does it work? First, we parse one or more (`many1`) digits (`digit`), 57 | and give the result the name 'n'. Then we convert the string to an 58 | integer using `read`. 59 | 60 | The `many1` function's type looks like this: 61 | 62 | ``` 63 | many1 :: Parser a -> Parser [a] 64 | ``` 65 | 66 | It applies the parser given one or more times, returning the result. 67 | 68 | Let's see what happens when we use the `many` combinator which parses 69 | zero or more items instead of one or more. 70 | 71 | > num1 :: Parser Integer 72 | > num1 = do 73 | > n <- many digit 74 | > return (read n) 75 | 76 | ``` 77 | *Main> regularParse num1 "1" 78 | Right 1 79 | 80 | *Main> regularParse num1 "123456" 81 | Right 123456 82 | 83 | *Main> regularParse num1 "aa" 84 | Right *** Exception: Prelude.read: no parse 85 | ``` 86 | 87 | == var 88 | 89 | For var, we have to decide on a syntax for the identifiers. Let's go 90 | for a common choice: identifiers must start with a letter or 91 | underscore, and then they can be followed by zero or more letters, 92 | underscores or digits in any combination. 93 | 94 | > varExamples :: [(String,String)] 95 | > varExamples = [("test", "test") 96 | > ,("_stuff", "_stuff") 97 | > ,("_1234", "_1234")] 98 | 99 | > var :: Parser String 100 | > var = do 101 | > fc <- firstChar 102 | > rest <- many nonFirstChar 103 | > return (fc:rest) 104 | > where 105 | > firstChar = satisfy (\a -> isLetter a || a == '_') 106 | > nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_') 107 | 108 | This time, we create two helper parsers: `firstChar`, which parses a 109 | letter or underscore, and `nonFirstChar` which parses a digit, letter 110 | or underscore. This time, we use the `many` function instead of 111 | `many1`. 112 | 113 | Try it out in ghci. I like to try things which you expect to work, and 114 | also to try things which you expect to not work and make sure you get 115 | an error. 116 | 117 | == parens 118 | 119 | The parens parser will eventually parse any expression inside 120 | parentheses. First it will just parse integers inside parentheses. 121 | 122 | > data Parentheses = Parentheses Integer 123 | > deriving (Eq,Show) 124 | > 125 | > parensExamples :: [(String, Parentheses)] 126 | > parensExamples = [("(1)", Parentheses 1) 127 | > ,("(17)", Parentheses 17)] 128 | 129 | > parens :: Parser Parentheses 130 | > parens = do 131 | > void $ char '(' 132 | > e <- many1 digit 133 | > void $ char ')' 134 | > return (Parentheses (read e)) 135 | 136 | There is a new function: `void`. This might be familiar to you 137 | already. This is used to ignore the result of the `char` parser, since 138 | we are not interested in this value. You can also write the function 139 | without `void`, but ghc will give you a warning if you have warnings 140 | turned on. 141 | 142 | One way of turning warnings on in ghci is to enter `:set -Wall` at the 143 | ghci prompt. 144 | 145 | > parens' :: Parser Parentheses 146 | > parens' = do 147 | > char '(' 148 | > e <- many1 digit 149 | > char ')' 150 | > return (Parentheses (read e)) 151 | 152 | ``` 153 | *Main> :set -Wall 154 | *Main> :l "VerySimpleExpressions.lhs" 155 | 156 | ... 157 | 158 | FirstRealParsing.lhs:140:7: Warning: 159 | A do-notation statement discarded a result of type Char. 160 | Suppress this warning by saying "_ <- char '('", 161 | or by using the flag -fno-warn-unused-do-bind 162 | 163 | FirstRealParsing.lhs:142:7: Warning: 164 | A do-notation statement discarded a result of type Char. 165 | Suppress this warning by saying "_ <- char ')'", 166 | or by using the flag -fno-warn-unused-do-bind 167 | 168 | ... 169 | 170 | ``` 171 | 172 | As you can see, another way to suppress the warning is to use 173 | `_ <- char '('`. 174 | 175 | One issue with this parser is that it doesn't handle whitespace: 176 | 177 | ``` 178 | *Main> regularParse parens "(1)" 179 | Right (Parentheses 1) 180 | 181 | *Main> regularParse parens "( 1)" 182 | Left (line 1, column 2): 183 | unexpected " " 184 | expecting digit 185 | 186 | *Main> regularParse parens "(1 )" 187 | Left (line 1, column 3): 188 | unexpected " " 189 | expecting digit or ")" 190 | ``` 191 | 192 | We will look at this issue below. 193 | 194 | == add 195 | 196 | Now we will write a little parser to parse strings like 'a+b' where a 197 | and b are numbers. 198 | 199 | > data SingleAdd = SingleAdd Integer Integer 200 | > deriving (Eq,Show) 201 | > 202 | > singleAddExamples :: [(String, SingleAdd)] 203 | > singleAddExamples = [("1+2", SingleAdd 1 2) 204 | > ,("101+202", SingleAdd 101 202)] 205 | 206 | > add :: Parser SingleAdd 207 | > add = do 208 | > e0 <- many1 digit 209 | > void $ char '+' 210 | > e1 <- many1 digit 211 | > return (SingleAdd (read e0) (read e1)) 212 | 213 | It has the same whitespace issues as the parens parser. 214 | 215 | ``` 216 | *Main> regularParse add "1+2" 217 | Right (SingleAdd 1 2) 218 | 219 | *Main> regularParse add "1 +2" 220 | Left (line 1, column 2): 221 | unexpected " " 222 | expecting digit or "+" 223 | 224 | ``` 225 | 226 | == whitespace 227 | 228 | Here is a parser which will skip zero or more whitespace characters. 229 | 230 | > whitespace :: Parser () 231 | > whitespace = void $ many $ oneOf " \n\t" 232 | 233 | We can use this to make our parsers handle whitespace better. 234 | 235 | ``` 236 | *Main> regularParse whitespace " " 237 | Right () 238 | *Main> regularParse whitespace " " 239 | Right () 240 | *Main> regularParse whitespace "\t" 241 | Right () 242 | *Main> regularParse whitespace " \n " 243 | Right () 244 | *Main> regularParse whitespace "" 245 | Right () 246 | ``` 247 | 248 | Notice that it always succeeds. 249 | 250 | Here is the parens parser rewritten with a common approach to 251 | whitespace handling: 252 | 253 | > parensW :: Parser Parentheses 254 | > parensW = do 255 | > whitespace 256 | > void $ char '(' 257 | > whitespace 258 | > e <- many1 digit 259 | > whitespace 260 | > void $ char ')' 261 | > whitespace 262 | > return (Parentheses (read e)) 263 | 264 | ``` 265 | *Main> regularParse parensW "(1)" 266 | Right (Parentheses 1) 267 | 268 | *Main> regularParse parensW " (1)" 269 | Right (Parentheses 1) 270 | 271 | *Main> regularParse parensW " (1 )" 272 | Right (Parentheses 1) 273 | 274 | *Main> regularParse parensW " ( 1 ) " 275 | Right (Parentheses 1) 276 | ``` 277 | 278 | Looks good. 279 | 280 | In the original parsec documentation, one of the concepts mentioned is 281 | the idea of 'lexeme' parsing. This is a style in which every token 282 | parser should also consume and ignore any trailing whitespace. 283 | 284 | This is a simple convention which with a bit of care allows skipping 285 | whitespace exactly once wherever it needs to be skipped. To complete 286 | the lexeme style, we should also always skip leading whitespace at the 287 | top level only. This feels more elegant than spamming all the parsing 288 | code with many calls to `whitespace`. 289 | 290 | > lexeme :: Parser a -> Parser a 291 | > lexeme p = do 292 | > x <- p 293 | > whitespace 294 | > return x 295 | 296 | > parseWithWhitespace :: Parser a -> String -> Either ParseError a 297 | > parseWithWhitespace p = parseWithEof wrapper 298 | > where 299 | > wrapper = do 300 | > whitespace 301 | > p 302 | 303 | 304 | Here is the parens parser rewritten to use lexeme: 305 | 306 | > parensL :: Parser Parentheses 307 | > parensL = do 308 | > void $ lexeme $ char '(' 309 | > e <- lexeme $ many1 digit 310 | > void $ lexeme $ char ')' 311 | > return (Parentheses (read e)) 312 | 313 | ``` 314 | *Main> parseWithWhitespace parensL "(1)" 315 | Right (Parentheses 1) 316 | 317 | *Main> parseWithWhitespace parensL " (1)" 318 | Right (Parentheses 1) 319 | 320 | *Main> parseWithWhitespace parensL " ( 1)" 321 | Right (Parentheses 1) 322 | 323 | *Main> parseWithWhitespace parensL " ( 1 ) " 324 | Right (Parentheses 1) 325 | ``` 326 | 327 | The `parseWithWhitespace` function can also use `(>>)` to make it a 328 | bit shorter, `wrapper = whiteSpace >> p`. 329 | 330 | Here is the shorter version of this function using `(>>)`: 331 | 332 | > parseWithWhitespace' :: Parser a -> String -> Either ParseError a 333 | > parseWithWhitespace' p = parseWithEof (whitespace >> p) 334 | 335 | Try rewriting the SingleAdd parser to use `lexeme`, and test it out to 336 | convince yourself that it skips whitespace correctly. 337 | 338 | == simple expr 339 | 340 | Now we are ready to write a parser which parses simple expressions 341 | made from these components. Here is the data type for these 342 | expressions. 343 | 344 | > data SimpleExpr = Num Integer 345 | > | Var String 346 | > | Add SimpleExpr SimpleExpr 347 | > | Parens SimpleExpr 348 | > deriving (Eq,Show) 349 | 350 | It's so simple that it is almost useless at the moment. 351 | 352 | > simpleExprExamples :: [(String,SimpleExpr)] 353 | > simpleExprExamples = 354 | > [("a", Var "a") 355 | > ,("1", Num 1) 356 | > ,("2 + 3", Add (Num 2) (Num 3)) 357 | > ,("(42)", Parens (Num 42))] 358 | 359 | TODO: some more complex examples 360 | 361 | Here are all our component parsers with `lexeme`, and with the 362 | `SimpleExpr` constructors: 363 | 364 | > numE :: Parser SimpleExpr 365 | > numE = do 366 | > n <- lexeme $ many1 digit 367 | > return $ Num $ read n 368 | 369 | There doesn't seem to be a unique obviously correct place to put the 370 | lexeme call in the var parser: 371 | 372 | > varE :: Parser SimpleExpr 373 | > varE = lexeme $ do 374 | > fc <- firstChar 375 | > rest <- many nonFirstChar 376 | > return $ Var (fc:rest) 377 | > where 378 | > firstChar = satisfy (\a -> isLetter a || a == '_') 379 | > nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_') 380 | 381 | Here is an alternative, with the call to lexeme in a different place, 382 | but gives effectively the same function. 383 | 384 | > varE' :: Parser SimpleExpr 385 | > varE' = do 386 | > fc <- firstChar 387 | > rest <- lexeme $ many nonFirstChar 388 | > return $ Var (fc:rest) 389 | > where 390 | > firstChar = satisfy (\a -> isLetter a || a == '_') 391 | > nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_') 392 | 393 | > parensE :: Parser SimpleExpr 394 | > parensE = do 395 | > void $ lexeme $ char '(' 396 | > e <- lexeme $ many1 digit 397 | > void $ lexeme $ char ')' 398 | > return $ Parens $ Num $ read e 399 | 400 | In the parens parser, we can reuse the `numE` parser like this: 401 | 402 | > parensE' :: Parser SimpleExpr 403 | > parensE' = do 404 | > void $ lexeme $ char '(' 405 | > e <- numE 406 | > void $ lexeme $ char ')' 407 | > return $ Parens e 408 | 409 | Here is the add parser using `numE` also. 410 | 411 | > addE :: Parser SimpleExpr 412 | > addE = do 413 | > e0 <- numE 414 | > void $ lexeme $ char '+' 415 | > e1 <- numE 416 | > return $ Add e0 e1 417 | 418 | === choice 419 | 420 | To combine these, we can use an operator called `(<|>)`: 421 | 422 | > numOrVar :: Parser SimpleExpr 423 | > numOrVar = numE <|> varE 424 | 425 | It tries the first parser, and it if fails (without consuming any 426 | input), it tries the second parser. More about the 'consuming input' 427 | concept later. 428 | 429 | Here is another way to write the numOrVar parser: 430 | 431 | > numOrVar' :: Parser SimpleExpr 432 | > numOrVar' = choice [numE,varE] 433 | 434 | `choice` is just wrapper around `(<|>)`. You can choose which one to 435 | use based on which is more readable in each particular case. 436 | 437 | ``` 438 | *Main> parseWithWhitespace numOrVar "a" 439 | Right (Var "a") 440 | 441 | *Main> parseWithWhitespace numOrVar "1" 442 | Right (Num 1) 443 | 444 | *Main> parseWithWhitespace numOrVar "!" 445 | Left (line 1, column 1): 446 | unexpected "!" 447 | expecting digit 448 | ``` 449 | 450 | Here is the first version of the simpleExpr parser: 451 | 452 | > simpleExpr :: Parser SimpleExpr 453 | > simpleExpr = numE <|> varE <|> addE <|> parensE 454 | 455 | ``` 456 | *Main> parseWithWhitespace simpleExpr "12" 457 | Right (Num 12) 458 | 459 | *Main> parseWithWhitespace simpleExpr "aa" 460 | Right (Var "aa") 461 | 462 | *Main> parseWithWhitespace simpleExpr "1+2" 463 | Left (line 1, column 2): 464 | unexpected '+' 465 | expecting digit or end of input 466 | 467 | *Main> parseWithWhitespace simpleExpr "(1)" 468 | Right (Parens (Num 1)) 469 | 470 | *Main> parseWithWhitespace simpleExpr "(aa)" 471 | Left (line 1, column 2): 472 | unexpected "a" 473 | expecting digit 474 | 475 | ``` 476 | 477 | It works well for some of the parsers. One problem is that the `addE` 478 | and `parensE` parsers don't parse general expressions as the 479 | components, but just `numE`. Another problem is that the `addE` 480 | doesn't work at all: the `numE` parser parses the first number, and 481 | the `addE` parser is never tried. This is an example of `(<|>)` not 482 | trying the second parser if the first parser succeeds, even if a later 483 | alternative would consume more input or successfully parse the whole 484 | input. 485 | 486 | Let's try and rearrange the order: 487 | 488 | > simpleExpr1 :: Parser SimpleExpr 489 | > simpleExpr1 = addE <|> numE <|> varE <|> parensE 490 | 491 | ``` 492 | *Main> parseWithWhitespace simpleExpr1 "12" 493 | Left (line 1, column 3): 494 | unexpected end of input 495 | expecting digit or "+" 496 | 497 | *Main> parseWithWhitespace simpleExpr1 "aa" 498 | Right (Var "aa") 499 | 500 | *Main> parseWithWhitespace simpleExpr1 "1+2" 501 | Right (Add (Num 1) (Num 2)) 502 | 503 | *Main> parseWithWhitespace simpleExpr1 "(1)" 504 | Right (Parens (Num 1)) 505 | 506 | ``` 507 | 508 | We swapped one problem for another. Let's fix this using the `try` 509 | function. 510 | 511 | 512 | > simpleExpr2 :: Parser SimpleExpr 513 | > simpleExpr2 = try addE <|> numE <|> varE <|> parensE 514 | 515 | ``` 516 | *Main> parseWithWhitespace simpleExpr2 "12" 517 | Right (Num 12) 518 | 519 | *Main> parseWithWhitespace simpleExpr2 "aa" 520 | Right (Var "aa") 521 | 522 | *Main> parseWithWhitespace simpleExpr2 "1+2" 523 | Right (Add (Num 1) (Num 2)) 524 | 525 | *Main> parseWithWhitespace simpleExpr2 "(1)" 526 | Right (Parens (Num 1)) 527 | ``` 528 | 529 | Now everything seems to work fine. The `try` function is very powerful 530 | and easy to use, and can be used where in a more traditional parsing 531 | approach you would have to use left factoring or something else. 532 | 533 | The `try` function implements backtracking. When this is used with 534 | `(<|>)`, it means that if the first parser fails, it will undo the 535 | consumed input and carry on with the next option, instead of failing 536 | completely. This works even if the `try` is nested deeply within the 537 | first parser given to `(<|>)`. 538 | 539 | `try` has its downsides (some of which we will see later), and I 540 | usually try to minimise its use or eliminate it completely. I found I 541 | often got into a complete mess when I used `try` too much when writing 542 | parsers for something a little tricky like SQL, and that although 543 | doing some left-factoring appeared at first to be tedious and appeared 544 | to make the code less readable, I eventually decided that for me it 545 | made the code more readable since what was happening was more 546 | transparent. 547 | 548 | Now we are going to fix this parser to parse arbitrarily nested 549 | expressions. In a way, the method used will roughly mean we are left 550 | factoring the `numE` and `addE` common prefix. 551 | 552 | Here is the naive implementation: 553 | 554 | > parensE3 :: Parser SimpleExpr 555 | > parensE3 = do 556 | > void $ lexeme $ char '(' 557 | > e <- simpleExpr3 558 | > void $ lexeme $ char ')' 559 | > return $ Parens e 560 | 561 | 562 | > addE3 :: Parser SimpleExpr 563 | > addE3 = do 564 | > e0 <- simpleExpr3 565 | > void $ lexeme $ char '+' 566 | > e1 <- simpleExpr3 567 | > return $ Add e0 e1 568 | 569 | 570 | > simpleExpr3 :: Parser SimpleExpr 571 | > simpleExpr3 = try addE3 <|> numE <|> varE <|> parensE3 572 | 573 | If you run this parser, it will enter an infinite loop, since 574 | `simpleExpr3` and `addE3` will keep calling each other recursively 575 | without making any progress. 576 | 577 | ``` 578 | *Main> parseWithWhitespace simpleExpr3 "a+b" 579 | C-c Interrupted. 580 | ``` 581 | Let's try without `add`. 582 | 583 | > parensE4 :: Parser SimpleExpr 584 | > parensE4 = do 585 | > void $ lexeme $ char '(' 586 | > e <- simpleExpr4 587 | > void $ lexeme $ char ')' 588 | > return $ Parens e 589 | 590 | > simpleExpr4 :: Parser SimpleExpr 591 | > simpleExpr4 = numE <|> varE <|> parensE4 592 | 593 | ``` 594 | *Main> parseWithWhitespace simpleExpr4 "a" 595 | Right (Var "a") 596 | 597 | *Main> parseWithWhitespace simpleExpr4 "1" 598 | Right (Num 1) 599 | 600 | *Main> parseWithWhitespace simpleExpr4 "(1)" 601 | Right (Parens (Num 1)) 602 | 603 | *Main> parseWithWhitespace simpleExpr4 "((a))" 604 | Right (Parens (Parens (Var "a"))) 605 | 606 | ``` 607 | 608 | At least this part seems to work OK. 609 | 610 | Let's try to stop the add parser from calling itself indirectly: 611 | 612 | Here is a parameterized parens parser where we supply the nested 613 | expression parser as an argument. This is used here to try to make the 614 | code easier to follow and avoid rewriting this parser out again and 615 | again in full. 616 | 617 | > parensEN :: Parser SimpleExpr -> Parser SimpleExpr 618 | > parensEN simpleExprImpl = do 619 | > void $ lexeme $ char '(' 620 | > e <- simpleExprImpl 621 | > void $ lexeme $ char ')' 622 | > return $ Parens e 623 | 624 | Here is a new parser, which parses expressions except add. 625 | 626 | > term :: Parser SimpleExpr -> Parser SimpleExpr 627 | > term simpleExprImpl = numE <|> varE <|> parensEN simpleExprImpl 628 | 629 | > term5 :: Parser SimpleExpr 630 | > term5 = term simpleExpr5 631 | 632 | > addE5 :: Parser SimpleExpr 633 | > addE5 = do 634 | > e0 <- term5 635 | > void $ lexeme $ char '+' 636 | > e1 <- term5 637 | > return $ Add e0 e1 638 | 639 | > simpleExpr5 :: Parser SimpleExpr 640 | > simpleExpr5 = try addE5 <|> term5 641 | 642 | ``` 643 | *Main> parseWithWhitespace simpleExpr5 "1" 644 | Right (Num 1) 645 | 646 | *Main> parseWithWhitespace simpleExpr5 "a" 647 | Right (Var "a") 648 | 649 | *Main> parseWithWhitespace simpleExpr5 "(a)" 650 | Right (Parens (Var "a")) 651 | 652 | *Main> parseWithWhitespace simpleExpr5 "1+2" 653 | Right (Add (Num 1) (Num 2)) 654 | 655 | *Main> parseWithWhitespace simpleExpr5 "1+a" 656 | Right (Add (Num 1) (Var "a")) 657 | 658 | *Main> parseWithWhitespace simpleExpr5 "(1+a)" 659 | Right (Parens (Add (Num 1) (Var "a"))) 660 | 661 | *Main> parseWithWhitespace simpleExpr5 "1+a+b" 662 | Left (line 1, column 4): 663 | unexpected '+' 664 | expecting end of input 665 | ``` 666 | 667 | Almost. Let's see what happens when the second `term` in `add` is 668 | changed to the general expression parser. 669 | 670 | > term6 :: Parser SimpleExpr 671 | > term6 = term simpleExpr6 672 | 673 | > addE6 :: Parser SimpleExpr 674 | > addE6 = do 675 | > e0 <- term6 676 | > void $ lexeme $ char '+' 677 | > e1 <- simpleExpr6 678 | > return $ Add e0 e1 679 | 680 | > simpleExpr6 :: Parser SimpleExpr 681 | > simpleExpr6 = try addE6 <|> term6 682 | 683 | ``` 684 | *Main> parseWithWhitespace simpleExpr6 "a + b + c" 685 | Right (Add (Var "a") (Add (Var "b") (Var "c"))) 686 | ``` 687 | 688 | Maybe it looks like we've made it. But there is a problem. We've 689 | parsed the + operator as if it has right associativity: 690 | 691 | ``` 692 | a + b + c -> a + (b + c) 693 | ``` 694 | 695 | But it should be left associative: 696 | 697 | ``` 698 | a + b + c -> (a + b) + c 699 | ``` 700 | 701 | Let's left factor the parsing and fix this: 702 | 703 | > term7 :: Parser SimpleExpr 704 | > term7 = term simpleExpr7 705 | 706 | > simpleExpr7 :: Parser SimpleExpr 707 | > simpleExpr7 = do 708 | > -- first parse a term 709 | > e <- term7 710 | > -- then see if it is followed by an '+ expr' suffix 711 | > maybeAddSuffix e 712 | > where 713 | > -- this function takes an expression, and parses a 714 | > -- '+ expr' suffix, returning an Add expression 715 | > -- it recursively calls itself via the maybeAddSuffix function 716 | > addSuffix e0 = do 717 | > void $ lexeme $ char '+' 718 | > e1 <- term7 719 | > maybeAddSuffix (Add e0 e1) 720 | > -- this is the wrapper for addSuffix, which adapts it so that if 721 | > -- addSuffix fails, it returns just the original expression 722 | > maybeAddSuffix e = addSuffix e <|> return e 723 | 724 | ``` 725 | *Main> parseWithWhitespace simpleExpr7 "a + b + c" 726 | Right (Add (Add (Var "a") (Var "b")) (Var "c")) 727 | ``` 728 | 729 | Now the parser seems to work for everything it should. 730 | 731 | There is a combinator function in Parsec we can use which abstracts 732 | this sort of pattern, `chainl1`. 733 | 734 | > simpleExpr8 :: Parser SimpleExpr 735 | > simpleExpr8 = chainl1 term8 op 736 | > where 737 | > op = do 738 | > void $ lexeme $ char '+' 739 | > return Add 740 | > term8 = term simpleExpr8 741 | 742 | How does this work? Here is the type of `chainl1`: 743 | 744 | ```haskell 745 | chainl1 :: Parser a -> Parser (a -> a -> a) -> Parser a 746 | chainl1 term op = ... 747 | ``` 748 | 749 | The type of the Add constructor in pseudocode is: 750 | 751 | ```haskell 752 | Add :: SimpleExpr -> SimpleExpr -> SimpleExpr 753 | ``` 754 | 755 | The `op` parser here now just parses the operator itself, i.e. '+' 756 | (and not the second expression like our simpleExpr7 parser). The 757 | return from the `op` function is a function which accepts two elements 758 | and combines them using the appropriate operator representation. In 759 | this case, the represenation is the `Add` constructor. 760 | 761 | You can look at the source 762 | 763 | and see if you can understand how it works. If you can't work it out, 764 | you could come back to it later when you have more experience writing 765 | parsing code. 766 | 767 | == Testing with the examples 768 | 769 | TODO: write a little manual tester that accepts a parser and a list of 770 | examples, and checks they all parse correctly. 771 | 772 | == Testing with quickcheck 773 | 774 | Let's see if we can check with quickcheck. It's a bit tricky testing 775 | parsers in this way, but one way to do something useful is to generate 776 | random asts, convert them to concrete syntax, parse them, and check 777 | the result. We can write a simple 'pretty printer' to convert an ast 778 | to concrete syntax. 779 | 780 | === a pretty printer 781 | 782 | TODO: a really simple pretty printer just pasting strings together, no 783 | layout. 784 | 785 | === the quick check code 786 | 787 | TODO: write a quickcheck property and arbitary instance and show 788 | running it at the ghci prompt 789 | 790 | -------------------------------------------------------------------------------- /main.css: -------------------------------------------------------------------------------- 1 | h1, h2 { 2 | display:block; 3 | background-color: #f0f0f0; 4 | border-top: thin #c0c0c0 solid; 5 | /*position:relative;*/ 6 | padding-left:1ex; 7 | /*z-index: -10;*/ 8 | } 9 | h1 { 10 | font-size: x-large; 11 | /*left: -3ex;*/ 12 | margin-top: 3ex; 13 | /*width: 100%;*/ 14 | } 15 | h2 { 16 | font-size: large; 17 | /*left: -1.5ex;*/ 18 | margin-top: 1.5ex; 19 | /*width: 100%;*/ 20 | } 21 | body { 22 | margin-left: 5em; 23 | margin-right: 5em; 24 | margin-bottom: 5em; 25 | margin-top: 2em; 26 | } 27 | #TOC { 28 | float:right; 29 | z-index:10; 30 | background-color: #f0f0f0; 31 | border: thin #c0c0c0 solid; 32 | padding:2ex; 33 | } 34 | .header { 35 | /*position:relative; 36 | left: -4ex;*/ 37 | border-top: thin #c0c0c0 solid; 38 | border-bottom: thin #c0c0c0 solid; 39 | display:inline; 40 | padding: 1ex; 41 | background-color: #f0f0f0; 42 | font-weight: bold; 43 | } 44 | .footer { 45 | text-align: center; 46 | font-size: small; 47 | } 48 | pre { 49 | padding: 0.5ex; 50 | } 51 | 52 | 53 | pre { 54 | background-color: #f0f0f0; 55 | } 56 | 57 | /* 58 | .SqlPostgresql pre.sourceCode { 59 | padding: 0.5em; 60 | background-color: #f0f6f6; 61 | } 62 | .sql pre.sourceCode { 63 | padding: 0.5em; 64 | background-color: #f0f6f6; 65 | } 66 | .GeneratedSql .SqlPostgresql pre.sourceCode, .SqlPostgresql .GeneratedSql pre.sourceCode { 67 | padding: 0.5em; 68 | background-color: #f0f6e0; 69 | } 70 | .UnusedSql .SqlPostgresql pre.sourceCode, .SqlPostgresql .UnusedSql pre.sourceCode { 71 | padding: 0.5em; 72 | background-color: #e9e9e9; 73 | } 74 | .haskell,.Haskell pre.sourceCode { 75 | background-color: #f5f5d9; 76 | } 77 | 78 | .sh pre.sourceCode { 79 | padding: 0.5em; 80 | background-color: #f0f0f0; 81 | } 82 | */ 83 | table, tr, td { 84 | border-collapse:collapse; 85 | cell-padding:2px; 86 | cell-spacing:2px; 87 | /* padding:2px 88 | spacing:2px 89 | margin:2px*/ 90 | vertical-align:top; 91 | } 92 | td pre { 93 | width: 98%; 94 | height: 98%; 95 | vertical-align:top; 96 | } 97 | table { 98 | width:100%; 99 | table-layout:fixed; 100 | } 101 | td { 102 | width: 50%; 103 | vertical-align:top; 104 | overflow:auto; 105 | } 106 | hr { 107 | border: 0; 108 | color: black; 109 | background-color: black; 110 | height: 1px; 111 | width: 75%; 112 | } 113 | 114 | .tablediv { 115 | width:100%; 116 | } 117 | 118 | /* higlighting kate */ 119 | 120 | table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode, table.sourceCode pre 121 | { /*margin: 2; padding: 2; border: 2; */ vertical-align: baseline; border: none; } 122 | td.lineNumbers { border-right: 1px solid #AAAAAA; text-align: right; color: #AAAAAA; padding-right: 5px; padding-left: 5px; } 123 | td.sourceCode { padding-left: 5px; } 124 | pre.sourceCode { } 125 | pre.sourceCode span.Normal { } 126 | pre.sourceCode span.Keyword { color: #007020; font-weight: bold; } 127 | pre.sourceCode span.DataType { color: #902000; } 128 | pre.sourceCode span.DecVal { color: #40a070; } 129 | pre.sourceCode span.BaseN { color: #40a070; } 130 | pre.sourceCode span.Float { color: #40a070; } 131 | pre.sourceCode span.Char { color: #4070a0; } 132 | pre.sourceCode span.String { color: #4070a0; } 133 | pre.sourceCode span.Comment { color: #60a0b0; font-style: italic; } 134 | pre.sourceCode span.Others { color: #007020; } 135 | pre.sourceCode span.Alert { color: red; font-weight: bold; } 136 | pre.sourceCode span.Function { color: #06287e; } 137 | pre.sourceCode span.RegionMarker { } 138 | pre.sourceCode span.Error { color: red; font-weight: bold; } 139 | -------------------------------------------------------------------------------- /make_website.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # this file is used to render the html via asciidoc from the source 6 | # files and the README.asciidoc 7 | 8 | mkdir -p build 9 | 10 | set +e 11 | rm build/IntroToParsing.lhs 12 | set -e 13 | 14 | cat GettingStarted.lhs VerySimpleExpressions.lhs ApplicativeStyle.lhs CombinatorReview.lhs FunctionsAndTypesForParsing.lhs TextParsecExpr.lhs AnIssueWithTokenParsers.lhs TextParsecPerm.lhs TextParsecToken.lhs ValueExpressions.lhs QueryExpressions.lhs FromClause.lhs SimpleSQLQueryParser0.lhs PrettyPrinting0.lhs ErrorMessages.lhs > build/IntroToParsing.lhs 15 | 16 | echo > build/intro_to_parsing.asciidoc 17 | echo :toc: right >> build/intro_to_parsing.asciidoc 18 | echo :sectnums: >> build/intro_to_parsing.asciidoc 19 | echo :toclevels: 10 >> build/intro_to_parsing.asciidoc 20 | echo :source-highlighter: pygments >> build/intro_to_parsing.asciidoc 21 | echo >> build/intro_to_parsing.asciidoc 22 | 23 | cat README.asciidoc | runhaskell render/FixReadme.lhs >> build/intro_to_parsing.asciidoc 24 | cat build/IntroToParsing.lhs | runhaskell render/Render.lhs >> build/intro_to_parsing.asciidoc 25 | 26 | cat build/intro_to_parsing.asciidoc | asciidoctor -d book - | runhaskell AddLinks.lhs > build/index.html 27 | -------------------------------------------------------------------------------- /render/FixReadme.lhs: -------------------------------------------------------------------------------- 1 | 2 | Fix the README for the html genereation 3 | 4 | > import Data.List 5 | 6 | > main :: IO () 7 | > main = do 8 | > ls <- getContents 9 | > putStrLn $ unlines $ map fixLinks $ lines ls 10 | 11 | > fixLinks x | "link:" `isPrefixOf` x = [] 12 | > fixLinks x | "// <<" `isPrefixOf` x = drop 3 x 13 | > fixLinks x = x 14 | -------------------------------------------------------------------------------- /render/Render.lhs: -------------------------------------------------------------------------------- 1 | 2 | This code is to take the .lhs and add some formatting so it renders 3 | nicely with asciidoctor. 4 | 5 | TODO 6 | 7 | concat docs then render as multiple pages 8 | get the inter/intra doc links working 9 | 10 | find a better fix for the * issue 11 | 12 | > import System.Environment 13 | > import Data.Char 14 | > import System.IO 15 | > import System.FilePath 16 | > import Data.List 17 | 18 | > main :: IO () 19 | > main = do 20 | > --[fn] <- getArgs 21 | > src <- getContents -- readFile fn 22 | > --let title = makeTitle $ takeBaseName fn 23 | > --putStrLn $ ":toc: right\n:toclevels: 8\n:sectnums:\n" 24 | > --putStrLn "= Introduction to parsing in Haskell with Parsec\n" 25 | > putStrLn . unlines . map fixHeadings 26 | > . addBlocks False . lines $ src 27 | > where 28 | > makeTitle (c:c1:cs) | isUpper c && isUpper c1 = 29 | > let (rest,cs') = span isUpper cs 30 | > in case (rest,cs') of 31 | > ([],_) -> ' ' : c : ' ' : c1 : makeTitle cs' 32 | > (_, d:_) | isDigit d -> ' ' : c : c1 : 33 | > (rest ++ " " ++ makeTitle cs') 34 | > _ -> ' ' : c : c1 : (init rest ++ " " ++ [last rest] 35 | > ++ makeTitle cs') 36 | > makeTitle (c:cs) | isUpper c = ' ' : toUpper c : makeTitle cs 37 | > makeTitle (c:cs) = c : makeTitle cs 38 | > makeTitle [] = [] 39 | > fixHeadings ('=':xs) = '=':'=':xs 40 | > fixHeadings xs = xs 41 | > addBlocks False (l:ls) 42 | > | isLiterate l = 43 | > "[source,haskell]\n----" : stripLiterate l 44 | > : addBlocks True ls 45 | > | otherwise = l : addBlocks False ls 46 | 47 | > addBlocks True (l:ls) 48 | > | isLiterate l = 49 | > stripLiterate l : addBlocks True ls 50 | > | otherwise = "----" : l : addBlocks False ls 51 | > addBlocks _ [] = [] 52 | > stripLiterate ('>':' ':l) = l 53 | > stripLiterate (">") = "" 54 | > stripLiterate l = l 55 | > isLiterate ('>':_) = True 56 | > isLiterate _ = False 57 | 58 | escape asterisk: rules worked out by experiment. If there is an 59 | asterisk on a non code block line, and there are more asterisks on the 60 | line, then put a backslash before the first asterisk only to stop it 61 | from being interpreted as bold formatting. Not sure why only the first 62 | one should have a backslash, or why if you put a backslash in front of 63 | subsequence asterisks, then the backslash is rendered ... 64 | 65 | Doesn't work, need to parse paragraphs not lines, todo, and need to 66 | make sure it doesn't replace inside blocks 67 | 68 | > {-escapeAsterisk l 69 | > | length (elemIndices '*' l) > 1 = 70 | > let (a,b) = break (=='*') l 71 | > in a ++ ('\\':b) 72 | > | otherwise = l-} 73 | -------------------------------------------------------------------------------- /source_text: -------------------------------------------------------------------------------- 1 | correct 2 | --------------------------------------------------------------------------------