├── .gitignore
├── AddLinks.lhs
├── AnIssueWithTokenParsers.lhs
├── ApplicativeStyle.lhs
├── CombinatorReview.lhs
├── ErrorMessages.lhs
├── FromClause.lhs
├── FunctionsAndTypesForParsing.lhs
├── GettingStarted.lhs
├── IntroductionToCombinators.lhs
├── LICENSE
├── ParseFile.lhs
├── ParseSQLQueries.lhs
├── ParseString.lhs
├── ParsingTpch.lhs
├── PrettyPrinting0.lhs
├── QueryExpressions.lhs
├── README.asciidoc
├── Setup.hs
├── SimpleSQLQueryParser0.lhs
├── Text
    └── Parsec
    │   └── String
    │       ├── Char.hs
    │       ├── Combinator.hs
    │       ├── Expr.hs
    │       └── Parsec.hs
├── TextParsecExpr.lhs
├── TextParsecPerm.lhs
├── TextParsecToken.lhs
├── ValueExpressions.lhs
├── VerySimpleExpressions.lhs
├── main.css
├── make_website.sh
├── render
    ├── FixReadme.lhs
    └── Render.lhs
└── source_text


/.gitignore:
--------------------------------------------------------------------------------
1 | /build/
2 | /nohup.out
3 | /cabal.sandbox.config
4 | /.cabal-sandbox
5 | /.ghc.environment.x86_64-linux-8.6.5
6 | /dist-newstyle/
7 | /dist/


--------------------------------------------------------------------------------
/AddLinks.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | Little hack to add links to the navigation bars when generating the
 3 | html website.
 4 | 
 5 | > main :: IO ()
 6 | > main = interact addLinks
 7 | 
 8 | 
 9 | > addLinks :: String -> String
10 | > addLinks [] = error "not found"
11 | > addLinks ('<':'/':'u':'l':'>':'\n':'<':'/':'d':'i':'v':'>':xs) =
12 | >     "</ul>" ++ linkSection ++ "\n</div>" ++ xs
13 | > addLinks (x:xs) = x : addLinks xs
14 | 
15 | > linkSection :: String
16 | > linkSection =
17 | >   "<hr />\n\
18 | >   \<ul class=\"sectlevel1\">\n\
19 | >   \<li><a href=\"http://jakewheat.github.io/intro_to_parsing\" class=\"bare\">Homepage</a></li>\n\
20 | >   \<li><a href=\"https://github.com/JakeWheat/intro_to_parsing\" class=\"bare\">Repository</a></li>\n\
21 | >   \<li><a href=\"https://github.com/JakeWheat/intro_to_parsing/issues\" class=\"bare\">Bug tracker</a></li>\n\
22 | >   \<li><a href=\"http://jakewheat.github.io/\" class=\"bare\">Parent project</a>\n\
23 | >   \</li><li>jakewheatmail@gmail.com</li>\n\
24 | >   \</ul>\n"
25 | 


--------------------------------------------------------------------------------
/AnIssueWithTokenParsers.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[an-issue-with-token-parsers]]
  3 | = An issue with token parsers
  4 | 
  5 | This is a tutorial about an issue with the token parsing we have so
  6 | far.
  7 | 
  8 | > import Text.Parsec.String (Parser)
  9 | > import Text.Parsec.String.Parsec (try)
 10 | > import Text.Parsec.String.Combinator (many1, notFollowedBy)
 11 | > import Text.Parsec.String.Char (digit, string, oneOf, satisfy, char, letter)
 12 | >
 13 | > import Control.Applicative ((<$>), (<*>), (<*), many, (<$), (<|>))
 14 | > import Control.Monad (void, guard)
 15 | >
 16 | > import qualified Text.Parsec.String.Expr as E
 17 | > import FunctionsAndTypesForParsing
 18 | 
 19 | Here is a simplified expression type and parser:
 20 | 
 21 | > data SimpleExpr = Num Integer
 22 | >                 | BinaryOp SimpleExpr String SimpleExpr
 23 | >                    deriving (Eq,Show)
 24 | >
 25 | > simpleExpr :: Parser SimpleExpr
 26 | > simpleExpr = E.buildExpressionParser table num
 27 | >
 28 | > table :: [[E.Operator SimpleExpr]]
 29 | > table = [[binary "<=" E.AssocNone
 30 | >          ,binary ">=" E.AssocNone]
 31 | >         ,[binary "<" E.AssocNone
 32 | >          ,binary ">" E.AssocNone]
 33 | >         ]
 34 | >   where
 35 | >     binary name assoc =
 36 | >         E.Infix (mkBinOp name <$ symbol name) assoc
 37 | >     mkBinOp nm a b = BinaryOp a nm b
 38 | >
 39 | > num :: Parser SimpleExpr
 40 | > num = Num <$> integer
 41 | >
 42 | > whitespace :: Parser ()
 43 | > whitespace = void $ many $ oneOf " \n\t"
 44 | >
 45 | > lexeme :: Parser a -> Parser a
 46 | > lexeme p = p <* whitespace
 47 | >
 48 | > integer :: Parser Integer
 49 | > integer = read <$> lexeme (many1 digit)
 50 | >
 51 | > symbol :: String -> Parser String
 52 | > symbol s = lexeme $ string s
 53 | 
 54 | Let's try it out:
 55 | 
 56 | ```
 57 | *Main> regularParse simpleExpr "1=2"
 58 | Right (Num 1)
 59 | 
 60 | *Main> regularParse simpleExpr "1>=2"
 61 | Right (BinaryOp (Num 1) ">=" (Num 2))
 62 | 
 63 | *Main> regularParse simpleExpr "1>2"
 64 | Left (line 1, column 2):
 65 | unexpected "2"
 66 | expecting ">="
 67 | ```
 68 | 
 69 | What happened? The parser tried to parse > as >=, failed, and since
 70 | the failure consumed some input (the first >), it failed completely.
 71 | 
 72 | We are going to change the symbol parser to fix this. Here is a
 73 | parameterized version of the simpleExpr parser so we can try a few
 74 | variations out.
 75 | 
 76 | > simpleExprP :: (String -> Parser String) -> Parser SimpleExpr
 77 | > simpleExprP sym = E.buildExpressionParser (tableP sym) num
 78 | >
 79 | > tableP :: (String -> Parser String) -> [[E.Operator SimpleExpr]]
 80 | > tableP sym = [[binary "<=" E.AssocNone
 81 | >               ,binary ">=" E.AssocNone]
 82 | >              ,[binary "<" E.AssocNone
 83 | >               ,binary ">" E.AssocNone]]
 84 | >   where
 85 | >     binary name assoc =
 86 | >         E.Infix (mkBinOp name <$ sym name) assoc
 87 | >     mkBinOp nm a b = BinaryOp a nm b
 88 | 
 89 | Let's reproduce the failure:
 90 | 
 91 | ```
 92 | *Main> regularParse (simpleExprP symbol) "1>=2"
 93 | Right (BinaryOp (Num 1) ">=" (Num 2))
 94 | 
 95 | *Main> regularParse (simpleExprP symbol) "1>2"
 96 | Left (line 1, column 2):
 97 | unexpected "2"
 98 | expecting ">="
 99 | ```
100 | 
101 | We are going to look at two possible solutions.
102 | 
103 | 1. Let's use `try`:
104 | 
105 | ```
106 | *Main> regularParse (simpleExprP (try . symbol)) "1>=2"
107 | Right (BinaryOp (Num 1) ">=" (Num 2))
108 | 
109 | *Main> regularParse (simpleExprP (try . symbol)) "1>2"
110 | Right (BinaryOp (Num 1) ">" (Num 2))
111 | ```
112 | 
113 | This seems to have done the job. There is still a problem
114 | though. Consider a case when the precedence is the other way round -
115 | the `<` and `>` are higher precedence than `<=` and `>=`,
116 | 
117 | > simpleExprP1 :: (String -> Parser String) -> Parser SimpleExpr
118 | > simpleExprP1 sym = E.buildExpressionParser (tableP1 sym) num
119 | >
120 | > tableP1 :: (String -> Parser String) -> [[E.Operator SimpleExpr]]
121 | > tableP1 sym = [[binary "<" E.AssocNone
122 | >               ,binary ">" E.AssocNone]
123 | >              ,[binary "<=" E.AssocNone
124 | >               ,binary ">=" E.AssocNone]]
125 | >   where
126 | >     binary name assoc =
127 | >         E.Infix (mkBinOp name <$ sym name) assoc
128 | >     mkBinOp nm a b = BinaryOp a nm b
129 | 
130 | ```
131 | *Main> regularParse (simpleExprP1 (try . symbol)) "1>2"
132 | Right (BinaryOp (Num 1) ">" (Num 2))
133 | 
134 | *Main> regularParse (simpleExprP1 (try . symbol)) "1>=2"
135 | Left (line 1, column 3):
136 | unexpected "="
137 | expecting digit
138 | ```
139 | 
140 | Although the precendence order is a little contrived in this case,
141 | this issue could easily crop up for real when we start adding more
142 | operators. Let's fix it now.
143 | 
144 | This could be solved by adding a `try` at a earlier place in the
145 | parsing. Because of how the `buildExpressionParser` function works,
146 | it's not obvious where the `try` could go.
147 | 
148 | Let's try tackling the problem in a different way. One way of looking
149 | at this is to consider that the symbol parser stops parsing too soon:
150 | 
151 | ```
152 | *Main> parseWithLeftOver (symbol ">") ">="
153 | Right (">","=")
154 | ```
155 | 
156 | What it should do is keep parsing symbol characters until it gets a
157 | result string which can't be a symbol, and stop one character before
158 | this..
159 | 
160 | Here is a slightly naive way of doing it, which will be good enough
161 | for quite a while:
162 | 
163 | > symbol1 :: String -> Parser String
164 | > symbol1 s = try $ lexeme $ do
165 | >     u <- many1 (oneOf "<>=+-^%/*")
166 | >     guard (s == u)
167 | >     return s
168 | 
169 | Here is a similar alternative:
170 | 
171 | > symbol2 :: String -> Parser String
172 | > symbol2 s = try $ lexeme $ do
173 | >     void $ string s
174 | >     notFollowedBy (oneOf "<>=+-^%/*")
175 | >     return s
176 | 
177 | Let's try them out:
178 | 
179 | ```
180 | *Main> parseWithLeftOver (symbol1 ">") ">="
181 | Left (line 1, column 3):
182 | unexpected end of input
183 | 
184 | *Main> parseWithLeftOver (symbol1 ">") ">"
185 | Right (">","")
186 | 
187 | *Main> parseWithLeftOver (symbol1 ">") ">= 3"
188 | Left (line 1, column 3):
189 | unexpected " "
190 | 
191 | *Main> parseWithLeftOver (symbol1 ">=") ">= 3"
192 | Right (">="," 3")
193 | 
194 | 
195 | ```
196 | The error messages don't seem very good, but it parses and fails to
197 | parse correctly.
198 | 
199 | ```
200 | *Main> parseWithLeftOver (symbol2 ">") ">="
201 | Left (line 1, column 3):
202 | unexpected '='
203 | 
204 | *Main> parseWithLeftOver (symbol2 ">") ">"
205 | Right (">","")
206 | 
207 | *Main> parseWithLeftOver (symbol2 ">") ">= 3"
208 | Left (line 1, column 3):
209 | unexpected '='
210 | 
211 | *Main> parseWithLeftOver (symbol2 ">=") ">= 3"
212 | Right (">="," 3")
213 | ```
214 | 
215 | This one appears to give better error messages in this limited
216 | scenario, apart from that they both work the same.
217 | 
218 | Let's try them out in the full expression parser:
219 | 
220 | ```
221 | *Main> regularParse (simpleExprP symbol1) "1>=2"
222 | Right (BinaryOp (Num 1) ">=" (Num 2))
223 | 
224 | *Main> regularParse (simpleExprP symbol1) "1>2"
225 | Right (BinaryOp (Num 1) ">" (Num 2))
226 | 
227 | *Main> regularParse (simpleExprP symbol2) "1>=2"
228 | Right (BinaryOp (Num 1) ">=" (Num 2))
229 | 
230 | *Main> regularParse (simpleExprP symbol2) "1>2"
231 | Right (BinaryOp (Num 1) ">" (Num 2))
232 | ```
233 | 
234 | They both work fine here. Let's see some error messages in this
235 | context.
236 | 
237 | ```
238 | *Main> parseWithEof (simpleExprP symbol1) "1>*2"
239 | Left (line 1, column 4):
240 | unexpected "2"
241 | expecting operator
242 | 
243 | *Main> parseWithEof (simpleExprP symbol2) "1>*2"
244 | Left (line 1, column 4):
245 | unexpected '*'
246 | expecting operator
247 | ```
248 | 
249 | Both error messages are a bit crap. So much for the second variation
250 | producing better error messages.
251 | 
252 | Let's look at the equivalent issue with respect to keyword parsing. We
253 | can get a similar problem here.
254 | 
255 | > keyword :: String -> Parser String
256 | > keyword s = try $ string s
257 | 
258 | ```
259 | *Main> parseWithEof (keyword "not") "not"
260 | Right "not"
261 | 
262 | *Main> parseWithEof (keyword "not") "nothing"
263 | Left (line 1, column 4):
264 | unexpected 'h'
265 | expecting end of input
266 | 
267 | *Main> parseWithEof (keyword "not" <|> keyword "nothing") "nothing"
268 | Left (line 1, column 4):
269 | unexpected 'h'
270 | expecting end of input
271 | 
272 | *Main> parseWithEof (keyword "nothing" <|> keyword "not") "nothing"
273 | Right "nothing"
274 | ```
275 | 
276 | We can fix this overlapping prefix issue by reordering the
277 | choices. But let's fix the `keyword` parser in a similar way to the
278 | symbol parser.
279 | 
280 | TODO: I don't know if symbol is the right name, I don't think Parsec
281 | usually uses symbol in this way. Maybe it should be called operator.
282 | 
283 | > identifier :: Parser String
284 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
285 | >   where
286 | >     firstChar = letter <|> char '_'
287 | >     nonFirstChar = digit <|> firstChar
288 | 
289 | > keyword1 :: String -> Parser String
290 | > keyword1 k = try $ do
291 | >     i <- identifier
292 | >     guard (i == k)
293 | >     return k
294 | 
295 | TODO: later note in error messages about choosing identifier here
296 | instead of e.g. many1 letter.
297 | 
298 | ```
299 | *Main> parseWithEof (keyword1 "not") "not"
300 | Right "not"
301 | 
302 | *Main> parseWithEof (keyword1 "not") "nothing"
303 | Left (line 1, column 8):
304 | unexpected end of input
305 | expecting digit, letter or "_"
306 | 
307 | *Main> parseWithEof (keyword1 "not" <|> keyword1 "nothing") "nothing"
308 | Right "nothing"
309 | 
310 | *Main> parseWithEof (keyword1 "nothing" <|> keyword1 "not") "nothing"
311 | Right "nothing"
312 | 
313 | *Main> parseWithEof (keyword1 "not" <|> keyword1 "nothing") "not"
314 | Right "not"
315 | 
316 | *Main> parseWithEof (keyword1 "nothing" <|> keyword1 "not") "not"
317 | Right "not"
318 | ```
319 | 
320 | Try implementing the `keyword2` parser which uses `notFollowedBy`
321 | instead of `guard`, using something analogous to the change from
322 | `symbol1` to `symbol2` above.
323 | 
324 | After this, you can try reimplementing the expression parser from the
325 | Text.Parsec.Expr tutorial using the new symbol and keyword parsers.
326 | 


--------------------------------------------------------------------------------
/ApplicativeStyle.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[applicative-style-parsing-code]]
  3 | = Applicative style parsing code
  4 | 
  5 | Now we can go back over the expression parsing code written in the
  6 | last tutorial, and make it much more concise, and also make it more
  7 | readable. We are going to do this mainly by using functions from the
  8 | typeclass Applicative.
  9 | 
 10 | Remember you can (and should) use the functions `regularParse` and its
 11 | variations (TODO list them here) to try out the all these parsers in
 12 | ghci, and you can write your own variations to experiment with if you
 13 | are unsure about anything.
 14 | 
 15 | > import Text.Parsec.String (Parser)
 16 | > import Text.Parsec.String.Char (oneOf, char, digit, letter, satisfy)
 17 | > import Text.Parsec.String.Combinator (many1, chainl1)
 18 | > import Control.Applicative ((<$>), (<*>), (<*), (*>), (<|>), many, (<$))
 19 | > import Control.Monad (void, ap)
 20 | > import Data.Char (isLetter, isDigit)
 21 | > import FunctionsAndTypesForParsing
 22 | 
 23 | Here is the SimpleExpr type again:
 24 | 
 25 | > data SimpleExpr = Num Integer
 26 | >                 | Var String
 27 | >                 | Add SimpleExpr SimpleExpr
 28 | >                 | Parens SimpleExpr
 29 | >                   deriving (Eq,Show)
 30 | 
 31 | Here is the basic pattern behind most of the rewrites we are going to
 32 | cover. Here is a function which takes a constructor and two parsers
 33 | for the two arguments for the constructor. It parses the two
 34 | arguments, then applies the constructor to the results:
 35 | 
 36 | > myParser1 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c
 37 | > myParser1 ctor pa pb = do
 38 | >     a <- pa
 39 | >     b <- pb
 40 | >     return $ ctor a b
 41 | 
 42 | TODO: concrete example, plus examples at the bottom of this section
 43 | (for ctor <$> a, ctor <$> a <*> b, ctor <$> a <*> b <*> c).
 44 | 
 45 | This can be rewritten without the do syntactic sugar like this:
 46 | 
 47 | > myParser2 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c
 48 | > myParser2 ctor pa pb =
 49 | >     pa >>= \a -> pb >>= \b -> return $ ctor a b
 50 | 
 51 | And can also be rewritten like this:
 52 | 
 53 | > myParser3 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c
 54 | > myParser3 ctor pa pb = ctor `fmap` pa `ap` pb
 55 | 
 56 | (This uses functions from Applicative instead of Monad.) We replace
 57 | the use of `>>=` with `fmap` and `ap`. This isn't always possible, but
 58 | it often is.
 59 | 
 60 | Here is the version using the operators for `fmap` and `ap` (`fmap`
 61 | changed to `<$>`, and `ap` changed to `<*>`).
 62 | 
 63 | > myParser4 :: (a -> b -> c) -> Parser a -> Parser b -> Parser c
 64 | > myParser4 ctor pa pb = ctor <$> pa <*> pb
 65 | 
 66 | This style takes less typing, and is often much simpler to write and
 67 | read.
 68 | 
 69 | This pattern 'scales', you can use:
 70 | 
 71 | ```
 72 | Ctor <$> pa
 73 | ```
 74 | 
 75 | for a single argument constructor. This might also be familiar to you
 76 | as
 77 | 
 78 | ```
 79 | fmap Ctor pa
 80 | ```
 81 | 
 82 | or
 83 | 
 84 | ```
 85 | Ctor `fmap` pa
 86 | ```
 87 | 
 88 | All of which mean the same thing, just slightly different spellings.
 89 | 
 90 | This can also be written using Monad operators:
 91 | 
 92 | ```
 93 | pa >>= liftM ctor
 94 | ```
 95 | 
 96 | or
 97 | 
 98 | ```
 99 | liftM ctor =<< pa
100 | ```
101 | 
102 | (`liftM` is in the module `Control.Monad`)
103 | 
104 | These `liftM` versions effectively mean the same thing as the previous
105 | versions with `fmap` and `<$>`.
106 | 
107 | You can use
108 | 
109 | ```
110 | Ctor <$> pa <*> pb <*> pc
111 | ```
112 | 
113 | for three args, and so on. So you use `<$>` between the pure
114 | constructor and the first arg, then `<*>` between each subsequent arg.
115 | 
116 | Let's go over the simple expression parsers and try to rewrite them
117 | using this style. We will see a few other new functions. I will break
118 | things down into a lot of steps.
119 | 
120 | == lexeme
121 | 
122 | Here is the old lexeme parser, 'D' suffix for 'do notation'.
123 | 
124 | > lexemeD :: Parser a -> Parser a
125 | > lexemeD p = do
126 | >            x <- p
127 | >            whitespace
128 | >            return x
129 | 
130 | > whitespace :: Parser ()
131 | > whitespace = void $ many $ oneOf " \n\t"
132 | 
133 | First, we can move the `whitespace` from its own separate line.
134 | 
135 | > lexemeA0 :: Parser a -> Parser a
136 | > lexemeA0 p = do
137 | >            x <- p <* whitespace
138 | >            return x
139 | 
140 | The expression `pa <* pb` means run `pa`, then run `pb`, and return
141 | the result of `pa`. It is sort of equivalent to this code:
142 | 
143 | ```haskell
144 | (<*) :: Parser a -> Parser b -> Parser a
145 | (<*) pa pb = do
146 |     a <- pa
147 |     void pb
148 |     return a
149 | ```
150 | 
151 | (It isn't implemented this way, since `(<*)` only needs Applicative
152 | and not Monad.)
153 | 
154 | Now we can use the usual monad syntax rewrites, first eliminate the
155 | name `x`.
156 | 
157 | > lexemeA1 :: Parser a -> Parser a
158 | > lexemeA1 p = do
159 | >              p <* whitespace
160 | 
161 | Now remove the redundant do:
162 | 
163 | > lexemeA :: Parser a -> Parser a
164 | > lexemeA p = p <* whitespace
165 | 
166 | == num
167 | 
168 | Now let's tackle the `num` parser.
169 | 
170 | > numD :: Parser SimpleExpr
171 | > numD = do
172 | >     n <- lexemeD $ many1 digit
173 | >     return $ Num $ read n
174 | 
175 | Let's move 'read' to the first line.
176 | 
177 | > numA0 :: Parser SimpleExpr
178 | > numA0 = do
179 | >     n <- read <$> lexemeA (many1 digit)
180 | >     return $ Num n
181 | 
182 | This uses `(<$>)` which we saw above. You may have done code rewrites
183 | like this using `fmap` with IO in other Haskell code.
184 | 
185 | Now let's move the `Num` ctor as well:
186 | 
187 | > numA1 :: Parser SimpleExpr
188 | > numA1 = do
189 | >     n <- (Num . read) <$> lexemeA (many1 digit)
190 | >     return n
191 | 
192 | You can also write it in this way:
193 | 
194 | > numA2 :: Parser SimpleExpr
195 | > numA2 = do
196 | >     n <- Num <$> read <$> lexemeA (many1 digit)
197 | >     return n
198 | 
199 | Why does this work? It it equivalent to the previous version partly
200 | because of the applicative laws.
201 | 
202 | Let's break it down:
203 | 
204 | > numA2'' :: Parser SimpleExpr
205 | > numA2'' = do
206 | >     n <- numb
207 | >     return n
208 | >   where
209 | >     numb :: Parser SimpleExpr
210 | >     numb = Num <$> int
211 | >     int :: Parser Integer
212 | >     int = read <$> lexemeA (many1 digit)
213 | 
214 | In terms of style, which do you think looks better:
215 | `(a . b) <$> p` or `a <$> b <$> p`.
216 | 
217 | The next step for num, we can eliminate the temporary name `n` and the
218 | `do`:
219 | 
220 | > numA3 :: Parser SimpleExpr
221 | > numA3 = (Num . read) <$> lexemeA (many1 digit)
222 | 
223 | In more 'industrial' parser code, I would usually write some
224 | tokenization parsers separately like this:
225 | 
226 | > integerA4 :: Parser Integer
227 | > integerA4 = read <$> lexemeA (many1 digit)
228 | 
229 | Then the num expression parser looks like this:
230 | 
231 | > numA4 :: Parser SimpleExpr
232 | > numA4 = Num <$> integerA4
233 | 
234 | and we also get a integer parser which we can reuse if we need to
235 | parse an integer in another context.
236 | 
237 | == var
238 | 
239 | Here is the previous var parser:
240 | 
241 | > varD :: Parser SimpleExpr
242 | > varD = lexemeA $ do
243 | >     fc <- firstChar
244 | >     rest <- many nonFirstChar
245 | >     return $ Var (fc:rest)
246 | >   where
247 | >     firstChar = satisfy (\a -> isLetter a || a == '_')
248 | >     nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_')
249 | 
250 | The first thing we can do is to make the `firstChar` and
251 | `nonFirstChar` a little easier to read, using `(<|>)`, `char`,
252 | `letter` and `digit`:
253 | 
254 | > varA0 :: Parser SimpleExpr
255 | > varA0 = lexemeA $ do
256 | >     fl <- firstChar
257 | >     rest <- many nonFirstChar
258 | >     return $ Var (fl:rest)
259 | >   where
260 | >     firstChar = letter <|> char '_'
261 | >     nonFirstChar = digit <|> firstChar
262 | 
263 | Here is another way of making the function a little better:
264 | 
265 | > varA0' :: Parser SimpleExpr
266 | > varA0' = lexemeA $ do
267 | >     fl <- satisfy validFirstChar
268 | >     rest <- many (satisfy validNonFirstChar)
269 | >     return $ Var (fl:rest)
270 | >   where
271 | >     validFirstChar a = isLetter a || a == '_'
272 | >     validNonFirstChar a = validFirstChar a || isDigit a
273 | 
274 | We can lift the `(:)` using the Applicative operators.
275 | 
276 | > varA1 :: Parser SimpleExpr
277 | > varA1 = do
278 | >     i <- iden
279 | >     return $ Var i
280 | >   where
281 | >     iden = lexemeA ((:) <$> firstChar <*> many nonFirstChar)
282 | >     firstChar = letter <|> char '_'
283 | >     nonFirstChar = digit <|> firstChar
284 | 
285 | We used the prefix version of `(:)` to use it with `(<$>)` and
286 | `(<*>)`. The lexemeA call was moved to the `iden` helper function.
287 | 
288 | Now tidy it up using `(<$>)` with the `Var` constructor:
289 | 
290 | > varA2 :: Parser SimpleExpr
291 | > varA2 = Var <$> iden
292 | >   where
293 | >     iden = lexemeA ((:) <$> firstChar <*> many nonFirstChar)
294 | >     firstChar = letter <|> char '_'
295 | >     nonFirstChar = digit <|> firstChar
296 | 
297 | We could also split the `iden` into a separate top level function,
298 | with the same idea as with splitting the `integer` parser.
299 | 
300 | == parens
301 | 
302 | Here is the starting point:
303 | 
304 | > parensD :: Parser SimpleExpr
305 | > parensD = do
306 | >     void $ lexemeA $ char '('
307 | >     e <- simpleExprD
308 | >     void $ lexemeA $ char ')'
309 | >     return $ Parens e
310 | 
311 | Here is the rewrite in one step:
312 | 
313 | > parensA0 :: Parser SimpleExpr
314 | > parensA0 =
315 | >     Parens <$> (lexemeA (char '(')
316 | >                 *> simpleExprD
317 | >                 <* lexemeA (char ')'))
318 | 
319 | // f'in asciidoc, the first operator following is (*>)
320 | 
321 | Here you can see that there is a `(\*>)` which works in the opposite
322 | direction to `(<*)`. The precendence of these operators means that we
323 | have to use some extra parentheses (!) here.
324 | 
325 | TODO: lost the chained <*. Put something below about this so there is
326 | a concrete example.
327 | 
328 | == simple expr
329 | 
330 | Here is the old version:
331 | 
332 | > termD :: Parser SimpleExpr
333 | > termD = numD <|> varD <|> parensD
334 | 
335 | > simpleExprD :: Parser SimpleExpr
336 | > simpleExprD = chainl1 termD op
337 | >   where
338 | >     op = do
339 | >          void $ lexemeA $ char '+'
340 | >          return Add
341 | 
342 | We can simplify the `op` function using the techniques we've already
343 | seen:
344 | 
345 | > simpleExprA0 :: Parser SimpleExpr
346 | > simpleExprA0 = chainl1 termD op
347 | >   where op = lexemeA (char '+') *> return Add
348 | 
349 | The pattern `p *> return f` can use a different operator like this:
350 | `f <$ p`. Here it is in the expression parser:
351 | 
352 | > simpleExprA1 :: Parser SimpleExpr
353 | > simpleExprA1 = chainl1 termD op
354 | >   where op = Add <$ lexemeA (char '+')
355 | 
356 | You could also write the `op` parser inline:
357 | 
358 | > simpleExprA2 :: Parser SimpleExpr
359 | > simpleExprA2 = chainl1 termD (Add <$ lexemeA (char '+'))
360 | 
361 | Maybe this last step makes it less readable?
362 | 
363 | == summary
364 | 
365 | Here is the finished job for all the simple expression code without
366 | separate token parsers:
367 | 
368 | > num' :: Parser SimpleExpr
369 | > num' = (Num . read) <$> lexemeA (many1 digit)
370 | 
371 | > var' :: Parser SimpleExpr
372 | > var' = Var <$> iden
373 | >   where
374 | >     iden = lexemeA ((:) <$> firstChar <*> many nonFirstChar)
375 | >     firstChar = letter <|> char '_'
376 | >     nonFirstChar = digit <|> firstChar
377 | 
378 | > parens' :: Parser SimpleExpr
379 | > parens' =
380 | >     Parens <$> (lexemeA (char '(')
381 | >                 *> simpleExpr'
382 | >                 <* lexemeA (char ')'))
383 | 
384 | > term' :: Parser SimpleExpr
385 | > term' = num' <|> var' <|> parens'
386 | 
387 | > simpleExpr' :: Parser SimpleExpr
388 | > simpleExpr' = chainl1 term' op
389 | >   where op = Add <$ lexemeA (char '+')
390 | 
391 | Here they are with separate token parsers and a helper function:
392 | 
393 | > lexeme :: Parser a -> Parser a
394 | > lexeme p = p <* whitespace
395 | 
396 | > identifier :: Parser String
397 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
398 | >   where
399 | >     firstChar = letter <|> char '_'
400 | >     nonFirstChar = digit <|> firstChar
401 | 
402 | > integer :: Parser Integer
403 | > integer = read <$> lexeme (many1 digit)
404 | 
405 | Here is a lexeme wrapper for parsing single character symbols.
406 | 
407 | > symbol :: Char -> Parser ()
408 | > symbol c = void $ lexeme $ char c
409 | 
410 | Here is another little helper function. It barely pays its way in this
411 | short example, but even though it is only used once, I think it is
412 | worth it to make the code clearer.
413 | 
414 | > betweenParens :: Parser a -> Parser a
415 | > betweenParens p = symbol '(' *> p <* symbol ')'
416 | 
417 | Now the expression parsers:
418 | 
419 | > num :: Parser SimpleExpr
420 | > num = Num <$> integer
421 | 
422 | > var :: Parser SimpleExpr
423 | > var = Var <$> identifier
424 | 
425 | > parens :: Parser SimpleExpr
426 | > parens = Parens <$> betweenParens simpleExpr
427 | 
428 | > term :: Parser SimpleExpr
429 | > term = num <|> var <|> parens
430 | 
431 | > simpleExpr :: Parser SimpleExpr
432 | > simpleExpr = chainl1 term' op
433 | >   where op = Add <$ lexemeA (char '+')
434 | 
435 | Splitting the lexer parser layer out means that we have one place
436 | where we have to remember to add `lexeme` wrappers, and also I think
437 | makes the code easier to follow.
438 | 


--------------------------------------------------------------------------------
/CombinatorReview.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[combinator-review]]
  3 | = Combinator review
  4 | 
  5 | In this tutorial we will go through all the functions in
  6 | Text.Parsec.Combinator, and some useful ones in Control.Applicative
  7 | and Control.Monad as well.
  8 | 
  9 | > import Text.Parsec (ParseError)
 10 | > import Text.Parsec.String (Parser)
 11 | > import Text.Parsec.String.Parsec (try)
 12 | > import Text.Parsec.String.Char (oneOf, char, digit
 13 | >                                ,string, letter, satisfy)
 14 | > import Text.Parsec.String.Combinator (many1, choice, chainl1, between
 15 | >                                      ,count, option, optionMaybe, optional)
 16 | > import Control.Applicative ((<$>), (<*>), (<$), (<*), (*>), (<|>), many)
 17 | > import Control.Monad (void, ap, mzero)
 18 | > import Data.Char (isLetter, isDigit)
 19 | > import FunctionsAndTypesForParsing
 20 | 
 21 | == Text.Parsec.Combinator
 22 | 
 23 | You should look at the source for these functions and try to
 24 | understand how they are implemented.
 25 | 
 26 | <http://hackage.haskell.org/package/parsec-3.1.3/docs/src/Text-Parsec-Combinator.html>
 27 | 
 28 | The style of the source code in the Parsec library sources is a little
 29 | different to what we used at the end of the last tutorial. You can try
 30 | reimplementing each of the Text.Parsec.Combinator module functions
 31 | using the Applicative style. See if you can find a way to reassure
 32 | yourself that the rewritten versions you make are correct, perhaps via
 33 | writing automated tests, or perhaps some other method.
 34 | 
 35 | You should be able to easily understand the implementation of all the
 36 | functions in Text.Parsec.Combinator except possibly `anyToken` and
 37 | `notFollowedBy`.
 38 | 
 39 | === choice
 40 | 
 41 | ```haskell
 42 | choice :: [Parser a] -> Parser a
 43 | ```
 44 | 
 45 | `choice ps` tries to apply the parsers in the list `ps` in order,
 46 | until one of them succeeds. It returns the value of the succeeding
 47 | parser.
 48 | 
 49 | > a :: Parser Char
 50 | > a = char 'a'
 51 | >
 52 | > b :: Parser Char
 53 | > b = char 'b'
 54 | >
 55 | > aOrB :: Parser Char
 56 | > aOrB = choice [a,b]
 57 | 
 58 | ```
 59 | *Main> regularParse aOrB "a"
 60 | Right 'a'
 61 | 
 62 | *Main> regularParse aOrB "b"
 63 | Right 'b'
 64 | 
 65 | *Main> regularParse aOrB "c"
 66 | Left (line 1, column 1):
 67 | unexpected "c"
 68 | expecting "a" or "b"
 69 | ```
 70 | 
 71 | ==== using with try
 72 | 
 73 | If a parser fails with `(<|>)` or `choice`, then it will only try the
 74 | next parser if the last parser consumed no input.
 75 | 
 76 | TODO: make the parsers return the keyword and update the examples
 77 | 
 78 | > byKeyword :: Parser ()
 79 | > byKeyword = void $ string "by"
 80 | >
 81 | > betweenKeyword :: Parser ()
 82 | > betweenKeyword = void $ string "between"
 83 | 
 84 | Since both of these have the same prefix - b - if we combine them
 85 | using `choice` then it doesn't work correctly:
 86 | 
 87 | ```
 88 | *Main> regularParse byKeyword "by"
 89 | Right ()
 90 | 
 91 | *Main> regularParse byKeyword "between"
 92 | Left (line 1, column 1):
 93 | unexpected "e"
 94 | expecting "by"
 95 | 
 96 | *Main> regularParse betweenKeyword "between"
 97 | Right ()
 98 | 
 99 | *Main> regularParse betweenKeyword "by"
100 | Left (line 1, column 1):
101 | unexpected "y"
102 | expecting "between"
103 | 
104 | *Main> regularParse (choice [betweenKeyword,byKeyword]) "between"
105 | Right ()
106 | 
107 | *Main> regularParse (choice [betweenKeyword,byKeyword]) "by"
108 | Left (line 1, column 1):
109 | unexpected "y"
110 | expecting "between"
111 | 
112 | *Main> regularParse (choice [byKeyword,betweenKeyword]) "between"
113 | Left (line 1, column 1):
114 | unexpected "e"
115 | expecting "by"
116 | 
117 | *Main> regularParse (choice [byKeyword,betweenKeyword]) "by"
118 | Right ()
119 | ```
120 | 
121 | If we use `try` on the first option, then it all works fine.
122 | 
123 | ```
124 | *Main> regularParse (choice [try byKeyword,betweenKeyword]) "by"
125 | Right ()
126 | 
127 | *Main> regularParse (choice [try byKeyword,betweenKeyword]) "between"
128 | Right ()
129 | ```
130 | 
131 | === count
132 | 
133 | ```haskell
134 | count :: Int -> Parser a -> Parser [a]
135 | ```
136 | 
137 | `count n p` parses `n` occurrences of `p`. If `n` is smaller or equal
138 | to zero, the parser is equivalent to `return []`. It returns a list of
139 | the n values returned by `p`.
140 | 
141 | ```
142 | *Main> regularParse (count 5 a) "aaaaa"
143 | Right "aaaaa"
144 | 
145 | *Main> regularParse (count 5 a) "aaaa"
146 | Left (line 1, column 5):
147 | unexpected end of input
148 | expecting "a"
149 | 
150 | *Main> regularParse (count 5 a) "aaaab"
151 | Left (line 1, column 5):
152 | unexpected "b"
153 | expecting "a"
154 | 
155 | *Main> regularParse (count 5 aOrB) "aabaa"
156 | Right "aabaa"
157 | ```
158 | 
159 | === between
160 | 
161 | ```haskell
162 | between :: Parser open -> Parser close -> Parser a -> Parser a
163 | ```
164 | 
165 | `between open close p` parses `open`, followed by `p` and
166 | `close`. It returns the value returned by `p`.
167 | 
168 | We can replace the betweenParens from the previous tutorial using
169 | this:
170 | 
171 | > betweenParens :: Parser a -> Parser a
172 | > betweenParens p = between (symbol '(') (symbol ')') p
173 | 
174 | It hardly seems worth it to make this change, but it might be slightly
175 | quicker to read and understand if you aren't already familiar with
176 | some code or haven't viewed it for a while. This is good for 'code
177 | maintenance', where we need to fix bugs or add new features quickly to
178 | code we haven't looked at for two years or something.
179 | 
180 | Here are the support functions for this parser.
181 | 
182 | > symbol :: Char -> Parser Char
183 | > symbol c = lexeme $ char c
184 | 
185 | > lexeme :: Parser a -> Parser a
186 | > lexeme p = p <* whitespace
187 | 
188 | > whitespace :: Parser ()
189 | > whitespace = void $ oneOf " \n\t"
190 | 
191 | === option
192 | 
193 | ```haskell
194 | option :: a -> Parser a -> Parser a
195 | ```
196 | 
197 | `option x p` tries to apply parser `p`. If `p` fails without
198 | consuming input, it returns the value `x`, otherwise the value returned
199 | by `p`.
200 | 
201 | ```
202 | *Main> regularParse (option "" (count 5 aOrB)) "aaaaa"
203 | Right "aaaaa"
204 | 
205 | *Main> regularParse (option "" (count 5 aOrB)) "caaaa"
206 | Right ""
207 | 
208 | *Main> regularParse (option "" (count 5 aOrB)) "aaaa"
209 | Left (line 1, column 5):
210 | unexpected end of input
211 | expecting "a" or "b"
212 | 
213 | *Main> regularParse (option "" (count 5 aOrB)) "aaaac"
214 | Left (line 1, column 5):
215 | unexpected "c"
216 | expecting "a" or "b"
217 | 
218 | *Main> regularParse (option "" (try (count 5 aOrB))) "aaaa"
219 | Right ""
220 | ```
221 | 
222 | === optionMaybe
223 | 
224 | ```haskell
225 | optionMaybe :: Parser a -> Parser (Maybe a)
226 | ```
227 | 
228 | `optionMaybe p` tries to apply parser `p`. If `p` fails without consuming
229 | input, it returns `Nothing`, otherwise it returns `Just` the value returned
230 | by `p`.
231 | 
232 | ```
233 | *Main> regularParse (optionMaybe (count 5 aOrB)) "aaaaa"
234 | Right (Just "aaaaa")
235 | 
236 | *Main> regularParse (optionMaybe (count 5 aOrB)) "caaaa"
237 | Right Nothing
238 | 
239 | *Main> regularParse (optionMaybe (count 5 aOrB)) "caaa"
240 | Right Nothing
241 | 
242 | *Main> regularParse (optionMaybe (count 5 aOrB)) "aaaa"
243 | Left (line 1, column 5):
244 | unexpected end of input
245 | expecting "a" or "b"
246 | 
247 | *Main> regularParse (optionMaybe (count 5 aOrB)) "aaaac"
248 | Left (line 1, column 5):
249 | unexpected "c"
250 | expecting "a" or "b"
251 | 
252 | *Main> regularParse (optionMaybe (try $ count 5 aOrB)) "aaaac"
253 | Right Nothing
254 | ```
255 | 
256 | === optional
257 | 
258 | ```haskell
259 | optional :: Parser a -> Parser ()
260 | ```
261 | 
262 | `optional p` tries to apply parser `p`. It will parse `p` or
263 | nothing. It only fails if `p` fails after consuming input. It discards
264 | the result of `p`.
265 | 
266 | ```
267 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "aaaaa"
268 | Right ((),"")
269 | 
270 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "caaaa"
271 | Right ((),"caaaa")
272 | 
273 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "caaa"
274 | Right ((),"caaa")
275 | 
276 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "aaaa"
277 | Left (line 1, column 5):
278 | unexpected end of input
279 | expecting "a" or "b"
280 | 
281 | *Main> parseWithLeftOver (optional (count 5 aOrB)) "aaaac"
282 | Left (line 1, column 5):
283 | unexpected "c"
284 | expecting "a" or "b"
285 | 
286 | *Main> parseWithLeftOver (optional (try $ count 5 aOrB)) "aaaac"
287 | Right ((),"aaaac")
288 | ```
289 | 
290 | === skipMany1
291 | 
292 | ```haskell
293 | skipMany1 :: Parser a -> Parser ()
294 | ```
295 | 
296 | `skipMany1 p` applies the parser `p` one or more times, skipping its result.
297 | 
298 | === many1
299 | 
300 | ```haskell
301 | many1 :: Parser a -> Parser [a]
302 | ```
303 | 
304 | many1 p applies the parser p one or more times. Returns a list of the
305 | returned values of p.
306 | 
307 | ```haskell
308 |   word  = many1 letter
309 | ```
310 | 
311 | === sepBy
312 | 
313 | ```haskell
314 | sepBy :: Parser a -> Parser sep -> Parser [a]
315 | ```
316 | 
317 | sepBy p sep parses zero or more occurrences of p, separated by
318 | sep. Returns a list of values returned by p.
319 | 
320 | ```haskell
321 |   commaSep p  = p `sepBy` (symbol ",")
322 | ```
323 | 
324 | === sepBy1
325 | 
326 | ```haskell
327 | sepBy1 :: Parser a -> Parser sep -> Parser [a]
328 | ```
329 | 
330 | sepBy1 p sep parses one or more occurrences of p, separated by
331 | sep. Returns a list of values returned by p.
332 | 
333 | === endBy
334 | 
335 | ```haskell
336 | endBy :: Parser a -> Parser sep -> Parser [a]
337 | ```
338 | 
339 | endBy p sep parses zero or more occurrences of p, seperated and ended
340 | by sep. Returns a list of values returned by p.
341 | 
342 | ```haskell
343 |    cStatements  = cStatement `endBy` semi
344 | ```
345 | 
346 | === endBy1
347 | 
348 | ```haskell
349 | endBy1 :: Parser a -> Parser sep -> Parser [a]
350 | ```
351 | 
352 | endBy1 p sep parses one or more occurrences of p, seperated and ended
353 | by sep. Returns a list of values returned by p.
354 | 
355 | === sepEndBy
356 | 
357 | ```haskell
358 | sepEndBy :: Parser a -> Parser sep -> Parser [a]
359 | ```
360 | 
361 | sepEndBy p sep parses zero or more occurrences of p, separated and
362 | optionally ended by sep, ie. haskell style statements. Returns a list
363 | of values returned by p.
364 | 
365 | ```haskell
366 |   haskellStatements  = haskellStatement `sepEndBy` semi
367 | ```
368 | 
369 | === sepEndBy1
370 | 
371 | ```haskell
372 | sepEndBy1 :: Parser a -> Parser sep -> Parser [a]
373 | ```
374 | 
375 | sepEndBy1 p sep parses one or more occurrences of p, separated and
376 | optionally ended by sep. Returns a list of values returned by p.
377 | 
378 | === chainl
379 | 
380 | ```haskell
381 | chainl :: Parser a -> Parser (a -> a -> a) -> a -> Parser a
382 | ```
383 | 
384 | chainl p op x parser zero or more occurrences of p, separated by
385 | op. Returns a value obtained by a left associative application of all
386 | functions returned by op to the values returned by p. If there are
387 | zero occurrences of p, the value x is returned.
388 | 
389 | === chainl1
390 | 
391 | ```haskell
392 | chainl1 :: Parser a -> Parser (a -> a -> a) -> Parser a
393 | ```
394 | 
395 | chainl1 p op x parser one or more occurrences of p, separated by op
396 | Returns a value obtained by a left associative application of all
397 | functions returned by op to the values returned by p. . This parser
398 | can for example be used to eliminate left recursion which typically
399 | occurs in expression grammars.
400 | 
401 | === chainr
402 | 
403 | ```haskell
404 | chainr :: Parser a -> Parser (a -> a -> a) -> a -> Parser a
405 | ```
406 | 
407 | chainr p op x parser zero or more occurrences of p, separated by op
408 | Returns a value obtained by a right associative application of all
409 | functions returned by op to the values returned by p. If there are no
410 | occurrences of p, the value x is returned.
411 | 
412 | === chainr1
413 | 
414 | ```haskell
415 | chainr1 :: Parser a -> Parser (a -> a -> a) -> Parser a
416 | ```
417 | 
418 | chainr1 p op x parser one or more occurrences of `p`, separated by op
419 | Returns a value obtained by a right associative application of all
420 | functions returned by op to the values returned by p.
421 | 
422 | === eof
423 | 
424 | ```haskell
425 | eof :: Parser ()
426 | ```
427 | 
428 | This parser only succeeds at the end of the input. This is not a
429 | primitive parser but it is defined using notFollowedBy.
430 | 
431 | ```haskell
432 |   eof  = notFollowedBy anyToken <?> "end of input"
433 | ```
434 | 
435 | The (<?>) operator is used for error messages. We will come back to
436 | error messages after writing the basic SQL parser.
437 | 
438 | === notFollowedBy
439 | 
440 | ```haskell
441 | notFollowedBy :: Show a => Parser a -> Parser ()
442 | ```
443 | 
444 | notFollowedBy p only succeeds when parser p fails. This parser does
445 | not consume any input. This parser can be used to implement the
446 | 'longest match' rule. For example, when recognizing keywords (for
447 | example let), we want to make sure that a keyword is not followed by a
448 | legal identifier character, in which case the keyword is actually an
449 | identifier (for example lets). We can program this behaviour as
450 | follows:
451 | 
452 | ```haskell
453 |   keywordLet  = try (do{ string "let"
454 |                        ; notFollowedBy alphaNum
455 |                        })
456 | ```
457 | 
458 | === manyTill
459 | 
460 | ```haskell
461 | manyTill :: Parser a -> Parser end -> Parser [a]
462 | ```
463 | 
464 | manyTill p end applies parser p zero or more times until parser end
465 | succeeds. Returns the list of values returned by p. This parser can be
466 | used to scan comments:
467 | 
468 | ```haskell
469 |   simpleComment   = do{ string "<!--"
470 |                       ; manyTill anyChar (try (string "-->"))
471 |                       }
472 | ```
473 | // asciidoc: the string should be "-->"
474 | 
475 | Note the overlapping parsers anyChar and string "-\->", and therefore
476 | the use of the try combinator.
477 | 
478 | === lookAhead
479 | 
480 | ```haskell
481 | lookAhead :: Parser a -> Parser a
482 | ```
483 | 
484 | lookAhead p parses p without consuming any input.
485 | 
486 | If p fails and consumes some input, so does lookAhead. Combine with
487 | try if this is undesirable.
488 | 
489 | === anyToken
490 | 
491 | ```haskell
492 | anyToken :: Parser Char
493 | ```
494 | 
495 | The parser anyToken accepts any kind of token. It is for example used
496 | to implement eof. Returns the accepted token.
497 | 
498 | == Control.Applicative
499 | 
500 | Here are the functions from Applicative that are used:
501 | 
502 | `(<$>)`, `(<*>)`, `(<$)`, `(<*)`, `(*>)`, `(<|>)`, `many`
503 | 
504 | TODO: examples for all of these
505 | 
506 | We've already seen all of these, except `(<$)`. This is often used to
507 | parse a keyword and return a no argument constructor:
508 | 
509 | > data Something = Type1 | Type2 | Type3
510 | 
511 | > something :: Parser Something
512 | > something = choice [Type1 <$ string "type1"
513 | >                    ,Type2 <$ string "type2"
514 | >                    ,Type3 <$ string "type3"]
515 | 
516 | // the first symbol following should be (<**>)
517 | // the backslashes are for asciidoc
518 | // todo: do this in the render.lhs?
519 | // I wish I understood why you need backslashes in some places
520 | // and not others
521 | 
522 | There is also `(<\**>)` which is `(<*>)` with the arguments flipped.
523 | 
524 | TODO: double check using these from Parsec instead of
525 | Control.Applicative: possible performance implictions?
526 | 
527 | == Control.Monad
528 | 
529 | === return
530 | 
531 | One use of return is to always succeed, and return a value:
532 | 
533 | > alwaysX :: Parser Char
534 | > alwaysX = return 'x'
535 | 
536 | ```
537 | *Main> parseWithLeftOver (a <|> alwaysX) "a"
538 | Right ('a',"")
539 | 
540 | *Main> parseWithLeftOver (a <|> alwaysX) "b"
541 | Right ('x',"b")
542 | ```
543 | 
544 | === mzero
545 | 
546 | This function is used in the implementation of `choice`:
547 | 
548 | > choice' :: [Parser a] -> Parser a
549 | > choice' ps = foldr (<|>) mzero ps
550 | 
551 | 
552 | 
553 | TODO: go through a bunch of functions + do notation examples
554 | 
555 |  >>=
556 |  =<<
557 |  >>
558 |  void
559 |  mapM, mapM_
560 |  sequence,sequence_
561 |  guard
562 |  return
563 | mzero
564 | mplus
565 | when, unless
566 | liftMN
567 | ap
568 | quick note about fail, will return to this in the error messages stage
569 | 
570 | TODO: using trace
571 | 


--------------------------------------------------------------------------------
/ErrorMessages.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | [[error-messages]]
 3 | = Error messages
 4 | 
 5 | In this tutorial, we will start looking at the error messages
 6 | generated by Parsec and how we can influence to improve them.
 7 | 
 8 | TODO:
 9 | 
10 | review the different methods of generating failure in parsec and what
11 | they look like, including choice.
12 | 
13 | examine how rearranging parsers can change the error messages,
14 | committing to a parsing branch to improve the messages
15 | 
16 | also possibly look at adding additional parsing to detect common
17 | errors to provide a nicer error messages
18 | 
19 | maybe talk about the idea of being permissive in the parsing proper
20 | stage, then doing additional syntax-type checks on the ast after
21 | 
22 | 
23 | == ways of generating errors in parsec
24 | 
25 | == how parsec combines errors or drops them into the void
26 | 
27 | 
28 | == Parsing tpch
29 | 
30 | Let's try the parser out on the TPC-H queries.
31 | 
32 | ```
33 | ```
34 | 
35 | Summary of errors so far:
36 | q1: typed literal: type_name 'literal value'
37 | q2: scalar subquery
38 | q3: typed literal
39 | q4: typed literal
40 | q5: typed literal
41 | q6: typed literal
42 | q7: ??
43 | q8: extract??
44 | q9: extract ??
45 | q10: typed literal
46 | q11: scalar sub query
47 | q12: ??
48 | q13: not like??
49 | q14: decimal literal
50 | q15: cte
51 | q16: count distinct
52 | q17: decimal literal
53 | q18: in subquery
54 | q19: in literal list
55 | q20: in subquery
56 | q21: exists subquery
57 | q22: substring
58 | 
59 | TODO: try to figure out each issue. Not very easy: the error messages
60 | are not very good.
61 | 
62 | Do some ad hoc stuff to try to improve each error message? Keep
63 | simplifying expressions then try to understand specific principles.
64 | Then make some generalizations that can be made across the parser
65 | 
66 | 
67 | == value expressions in isolation
68 | 
69 | We can start with the real error code now by considering the
70 | valueexpression parser in isolation.
71 | 
72 | ideas:
73 | error inside function parameter list
74 | bad identifier
75 | unrecognised operator
76 | unrecognised chars
77 | unmatching parens
78 | keyword errors in case
79 | unterminated string lit
80 | 3 dot components
81 | binary op with missing second arg
82 | associativity errors with binary ops?
83 | blacklist errors
84 | 
85 | == query expressions
86 | 
87 | mispelling keywords at this level
88 | clauses in wrong order
89 | adding extra tokens or removing them at clause/value expression
90 | boundaries
91 | 
92 | == summarizing?
93 | 
94 | == updated parsing code with improved error message behaviour
95 | 
96 | 


--------------------------------------------------------------------------------
/FromClause.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[from-clause]]
  3 | = From clause
  4 | 
  5 | In this tutorial, we extend the from clause support to the following:
  6 | we will support implicit and explicit joins, including keywords
  7 | natural, inner, outer, left, right, full, cross, on and using, plus
  8 | parens and simple aliases (e.g. select a from t u, but not select a
  9 | from t(a,b)). We don't support oracle outer join syntax (+) or the
 10 | other 'pre-ANSI' variations on this theme. No lateral keyword or apply
 11 | or pivot.
 12 | 
 13 | > {-# LANGUAGE TupleSections #-}
 14 | >
 15 | > --import Text.Groom (groom)
 16 | > --import qualified Text.Parsec as P
 17 | > import Text.Parsec.String (Parser)
 18 | > import Text.Parsec.String.Parsec (try)
 19 | > import Text.Parsec.String.Combinator
 20 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>))
 21 | > import Control.Monad
 22 | > import Data.Maybe ()
 23 | > import qualified Test.HUnit as H
 24 | > import FunctionsAndTypesForParsing
 25 | > import Debug.Trace
 26 | 
 27 | TODO: should explicitly import from these two modules (and same in
 28 | QueryExpressions.lhs)
 29 | 
 30 | > import ValueExpressions (ValueExpr(..), valueExpr, identifier, makeTest, parens)
 31 | > import QueryExpressions (selectList,whereClause,groupByClause,having,orderBy
 32 | >                         ,commaSep1, keyword_,blackListIdentifier)
 33 | 
 34 | == Abstract syntax
 35 | 
 36 | Here are is the updated `QueryExpr` and the new `TableRef` abstract
 37 | syntax types.
 38 | 
 39 | > data QueryExpr
 40 | >     = Select
 41 | >       {qeSelectList :: [(ValueExpr,Maybe String)]
 42 | >       ,qeFrom :: [TableRef]
 43 | >       ,qeWhere :: Maybe ValueExpr
 44 | >       ,qeGroupBy :: [ValueExpr]
 45 | >       ,qeHaving :: Maybe ValueExpr
 46 | >       ,qeOrderBy :: [ValueExpr]
 47 | >       } deriving (Eq,Show)
 48 | >
 49 | > makeSelect :: QueryExpr
 50 | > makeSelect = Select {qeSelectList = []
 51 | >                     ,qeFrom = []
 52 | >                     ,qeWhere = Nothing
 53 | >                     ,qeGroupBy = []
 54 | >                     ,qeHaving = Nothing
 55 | >                     ,qeOrderBy = []}
 56 | >
 57 | > data TableRef = TRSimple String
 58 | >               | TRJoin TableRef JoinType TableRef (Maybe JoinCondition)
 59 | >               | TRParens TableRef
 60 | >               | TRAlias TableRef String
 61 | >               | TRQueryExpr QueryExpr
 62 | >                 deriving (Eq,Show)
 63 | 
 64 | This syntax for table references can represent invalid syntax, for
 65 | instance two nested aliases. The justification for this is that
 66 | sometimes trying to accurately represent only exactly what is valid
 67 | creates something much more complex. Maybe this is a good tradeoff in
 68 | this situation, and maybe not.
 69 | 
 70 | > data JoinType = JoinInner | JoinLeft | JoinRight | JoinFull | JoinCross
 71 | >                 deriving (Eq,Show)
 72 | >
 73 | > data JoinCondition = JoinOn ValueExpr
 74 | >                    | JoinUsing [String]
 75 | >                    | JoinNatural
 76 | >                    deriving (Eq,Show)
 77 | 
 78 | With the join condition, we've done the opposite to TableRef - we've
 79 | combined `natural` and `on`/`using`, since only one of these can be
 80 | present, even though this departs a little from the concrete
 81 | syntax.
 82 | 
 83 | First we will develop the standalone from clause parser, then we will
 84 | update the query expression syntax and parsing to incorporate our new
 85 | from clause parser.
 86 | 
 87 | == simple table name
 88 | 
 89 | Let's start with something simple: a from clause can be multiple comma
 90 | separated tablerefs, aka an implicit join.
 91 | 
 92 | > multipleTRSimpleTests :: [(String, [TableRef])]
 93 | > multipleTRSimpleTests = [("from a,b", [TRSimple "a", TRSimple "b"])]
 94 | >
 95 | > from0 :: Parser [TableRef]
 96 | > from0 = keyword_ "from" *> commaSep1 (TRSimple <$> identifier)
 97 | 
 98 | ```
 99 | *Main> H.runTestTT $ H.TestList $ map (makeTest from0) multipleTRSimpleTests
100 | Cases: 1  Tried: 1  Errors: 0  Failures: 0
101 | Counts {cases = 1, tried = 1, errors = 0, failures = 0}
102 | ```
103 | 
104 | Let's do the query expression, parens and alias first, before tackling
105 | joins.
106 | 
107 | == subquery
108 | 
109 | Here is the example:
110 | 
111 | > trQueryExprTests :: [(String, [TableRef])]
112 | > trQueryExprTests =
113 | >     [("from (select a from t)"
114 | >     ,[TRQueryExpr $ makeSelect {qeSelectList = [(Iden "a", Nothing)]
115 | >                                ,qeFrom = [TRSimple "t"]}])]
116 | 
117 | Here is the query expression parser we can use:
118 | 
119 | > queryExpr1 :: Parser [TableRef] -> Parser QueryExpr
120 | > queryExpr1 from' = Select
121 | >                    <$> selectList
122 | >                    <*> option [] from'
123 | >                    <*> optionMaybe whereClause
124 | >                    <*> option [] groupByClause
125 | >                    <*> optionMaybe having
126 | >                    <*> option [] orderBy
127 | >
128 | > from1 :: Parser [TableRef]
129 | > from1 =
130 | >     keyword_ "from" *> commaSep1 trefTerm
131 | >   where
132 | >     trefTerm = choice [TRSimple <$> identifier
133 | >                       ,TRQueryExpr <$> parens (queryExpr1 from1)]
134 | 
135 | == parens
136 | 
137 | We can't do a sensible example for these right now - we need explicit
138 | joins and then the parens can be used to override the associativity of
139 | a three way join, or to specify over what part of the expression to
140 | apply an alias.
141 | 
142 | > trParensTests :: [(String, [TableRef])]
143 | > trParensTests = [("from (a)", [TRParens $ TRSimple "a"])]
144 | 
145 | We can write some more tests for parens after we've done the explicit
146 | joins.
147 | 
148 | > from2 :: Parser [TableRef]
149 | > from2 =
150 | >     keyword_ "from" *> commaSep1 trefTerm
151 | >   where
152 | >     trefTerm = choice [TRSimple <$> identifier
153 | >                       ,try (TRQueryExpr <$> parens (queryExpr1 from2))
154 | >                       ,TRParens <$> parens trefTerm]
155 | 
156 | == alias
157 | 
158 | > trAliasTests :: [(String,[TableRef])]
159 | > trAliasTests = [("from a as b", [TRAlias (TRSimple "a") "b"])
160 | >                ,("from a b", [TRAlias (TRSimple "a") "b"])]
161 | 
162 | The alias can be treated like a postfix operator.
163 | 
164 | > suffixWrapper :: (a -> Parser a) -> a -> Parser a
165 | > suffixWrapper p a = p a <|> return a
166 | 
167 | TODO: ?? not sure about this
168 | 
169 | > from3 :: Parser [TableRef]
170 | > from3 =
171 | >     keyword_ "from" *> commaSep1 trefTerm
172 | >   where
173 | >     trefTerm = choice [TRSimple <$> identifier
174 | >                       ,try (TRQueryExpr <$> parens (queryExpr1 from3))
175 | >                       ,TRParens <$> parens trefTerm]
176 | >                >>= suffixWrapper alias
177 | >     alias tr = TRAlias tr <$> (optional (keyword_ "as") *> identifier)
178 | 
179 | How to make it keep nesting?
180 | 
181 | == joins
182 | 
183 | Here is a casual sketch of the target grammar:
184 | 
185 | ```
186 | tref
187 | (cross | [natural]
188 |          ([inner]
189 |           | left [outer]
190 |           | right [outer]
191 |           | full [outer]
192 |          )
193 | join tref
194 | [on expr | using (...)]
195 | ```
196 | 
197 | Let's start with parsers for the 'join operator' in the middle and for
198 | the join condition:
199 | 
200 | === join type
201 | 
202 | > joinType :: Parser JoinType
203 | > joinType = choice
204 | >     [JoinCross <$ keyword_ "cross" <* keyword_ "join"
205 | >     ,JoinInner <$ keyword_ "inner" <* keyword_ "join"
206 | >     ,JoinLeft <$ keyword_ "left"
207 | >            <* optional (keyword_ "outer")
208 | >            <* keyword_ "join"
209 | >     ,JoinRight <$ keyword_ "right"
210 | >             <* optional (keyword_ "outer")
211 | >             <* keyword_ "join"
212 | >     ,JoinFull <$ keyword_ "full"
213 | >            <* optional (keyword_ "outer")
214 | >            <* keyword_ "join"
215 | >     ,JoinInner <$ keyword_ "join"]
216 | 
217 | ```
218 | *Main> parseWithEof joinType "cross join"
219 | Right JoinCross
220 | 
221 | *Main> parseWithEof joinType "inner join"
222 | Right JoinInner
223 | 
224 | *Main> parseWithEof joinType "left outer join"
225 | Right JoinLeft
226 | 
227 | *Main> parseWithEof joinType "left join"
228 | Right JoinLeft
229 | 
230 | *Main> parseWithEof joinType "right outer join"
231 | Right JoinRight
232 | 
233 | *Main> parseWithEof joinType "right join"
234 | Right JoinRight
235 | 
236 | *Main> parseWithEof joinType "full outer join"
237 | Right JFull
238 | 
239 | *Main> parseWithEof joinType "full join"
240 | Right JoinFull
241 | 
242 | *Main> parseWithEof joinType "join"
243 | Right JoinInner
244 | ```
245 | 
246 | I thought about factoring out the common bits with the joinType parser:
247 | 
248 | > joinType0 :: Parser JoinType
249 | > joinType0 = choice
250 | >     [choice
251 | >      [JoinCross <$ try (keyword_ "cross")
252 | >      ,JoinInner <$ try (keyword_ "inner")
253 | >      ,choice [JoinLeft <$ try (keyword_ "left")
254 | >              ,JoinRight <$ try (keyword_ "right")
255 | >              ,JoinFull <$ try (keyword_ "full")]
256 | >       <* optional (try $ keyword_ "outer")]
257 | >      <* keyword_ "join"
258 | >     ,JoinInner <$ keyword_ "join"]
259 | 
260 | But I think the longer version is much easier to follow, even if it is
261 | a little more boring.
262 | 
263 | === join condition
264 | 
265 | The idea with the join condition is that we pass a bool to say whether
266 | we've already seen the 'natural' keyword. If so, then we don't try to
267 | parse 'on' or 'using'.
268 | 
269 | > joinCondition :: Bool -> Parser JoinCondition
270 | > joinCondition nat =
271 | >     choice [guard nat >> return JoinNatural
272 | >            ,keyword_ "on" >> JoinOn <$> valueExpr []
273 | >            ,keyword_ "using" >> JoinUsing <$> parens (commaSep1 identifier)
274 | >            ]
275 | 
276 | ```
277 | *Main> parseWithEof (joinCondition False) "on a"
278 | Right (JoinOn (Iden "a"))
279 | 
280 | *Main> parseWithEof (joinCondition False) "on a + b"
281 | Right (JoinOn (BinOp (Iden "a") "+" (Iden "b")))
282 | 
283 | *Main> parseWithEof (joinCondition False) "using (a,b)"
284 | Right (JoinUsing ["a","b"])
285 | 
286 | *Main> parseWithEof (joinCondition True) "using (a,b)"
287 | Left (line 1, column 1):
288 | unexpected 'u'
289 | expecting end of input
290 | 
291 | *Main> parseWithEof (joinCondition True) ""
292 | Right JoinNatural
293 | ```
294 | 
295 | === simple binary join
296 | 
297 | Let's try some simple binary joins:
298 | 
299 | > simpleBinaryJoinTests :: [(String,[TableRef])]
300 | > simpleBinaryJoinTests =
301 | >     [("from a join b"
302 | >      ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b") Nothing])
303 | >
304 | >     ,("from a natural join b"
305 | >      ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b") (Just JoinNatural)])
306 | >
307 | >     ,("from a join b on a.x = b.y"
308 | >      ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b")
309 | >        (Just $ JoinOn $ BinOp (DIden "a" "x") "="(DIden "b" "y"))])
310 | >
311 | >     ,("from a join b using(x,y)"
312 | >      ,[TRJoin (TRSimple "a") JoinInner (TRSimple "b")
313 | >        (Just $ JoinUsing ["x","y"])])
314 | >
315 | >     ,("from a cross join b"
316 | >      ,[TRJoin (TRSimple "a") JoinCross (TRSimple "b") Nothing])
317 | >
318 | >     ]
319 | 
320 | We want to parse the first table, then optionally parse the 'natural'
321 | keyword, then the join type, then the second table, then optionally
322 | parse the join condition.
323 | 
324 | > from4 :: Parser [TableRef]
325 | > from4 = keyword_ "from" >> (:[]) <$> do
326 | >      t0 <- simpleTref
327 | >      nat <- option False (True <$ keyword_ "natural")
328 | >      jt <- joinType
329 | >      t1 <- simpleTref
330 | >      jc <- optionMaybe (joinCondition nat)
331 | >      return $ TRJoin t0 jt t1 jc
332 | >   where
333 | >     simpleTref = TRSimple <$> identifier
334 | 
335 | Let's start extending this into the full target parser. In this next
336 | version, I've tried to combine all the versions we've seen so far.
337 | 
338 | > from5 :: Parser [TableRef]
339 | > from5 = keyword_ "from" >> commaSep1 tref
340 | >   where
341 | >     tref = nonJoinTref >>= suffixWrapper joinTrefSuffix
342 | >     joinTrefSuffix t0 = do
343 | >         nat <- option False (True <$ keyword_ "natural")
344 | >         jt <- joinType
345 | >         t1 <- nonJoinTref
346 | >         jc <- optionMaybe (joinCondition nat)
347 | >         return $ TRJoin t0 jt t1 jc
348 | >     nonJoinTref = choice [TRSimple <$> identifier
349 | >                          ,try (TRQueryExpr <$> parens (queryExpr1 from5))
350 | >                          ,TRParens <$> parens tref]
351 | >                   >>= suffixWrapper alias
352 | >     alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> identifier))
353 | 
354 | ```
355 | *Main> H.runTestTT $ H.TestList $ map (makeTest from5) (multipleTRSimpleTests ++ trQueryExprTests ++ trParensTests ++ trAliasTests ++ simpleBinaryJoinTests)
356 |  ### Failure in: 5:from a join b
357 | (line 1, column 14):
358 | unexpected end of input
359 | expecting digit, letter, "_", "--" or "/*"
360 |  ### Failure in: 6:from a natural join b
361 | from a natural join b
362 | expected: [TRJoin (TRSimple "a") JoinInner (TRSimple "b") (Just JoinNatural)]
363 |  but got: [TRJoin (TRAlias (TRSimple "a") "natural") JoinInner (TRSimple "b") Nothing]
364 |  ### Failure in: 7:from a join b on a.x = b.y
365 | (line 1, column 15):
366 | unexpected "o"
367 | expecting "--" or "/*"
368 |  ### Failure in: 8:from a join b using(x,y)
369 | (line 1, column 15):
370 | unexpected "u"
371 | expecting "--" or "/*"
372 |  ### Failure in: 9:from a cross join b
373 | from a cross join b
374 | expected: [TRJoin (TRSimple "a") JoinCross (TRSimple "b") Nothing]
375 |  but got: [TRJoin (TRAlias (TRSimple "a") "cross") JoinInner (TRSimple "b") Nothing]
376 | Cases: 10  Tried: 10  Errors: 0  Failures: 5
377 | Counts {cases = 10, tried = 10, errors = 0, failures = 5}
378 | ```
379 | 
380 | What's going wrong? If you look at some of the issues, it looks like
381 | we are getting keywords parsed as aliases. Let's fix that first:
382 | 
383 | > from6 :: Parser [TableRef]
384 | > from6 = keyword_ "from" >> commaSep1 tref
385 | >   where
386 | >     tref = nonJoinTref >>= suffixWrapper joinTrefSuffix
387 | >     joinTrefSuffix t0 = do
388 | >         nat <- option False (True <$ keyword_ "natural")
389 | >         jt <- joinType
390 | >         t1 <- nonJoinTref
391 | >         jc <- optionMaybe (joinCondition nat)
392 | >         return $ TRJoin t0 jt t1 jc
393 | >     nonJoinTref = choice [TRSimple <$> identifier
394 | >                          ,try (TRQueryExpr <$> parens (queryExpr1 from6))
395 | >                          ,TRParens <$> parens tref]
396 | >                   >>= suffixWrapper alias
397 | >     alias tr = TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier)
398 | >     aliasIdentifier = blackListIdentifier
399 | >                       ["natural"
400 | >                       ,"inner"
401 | >                       ,"outer"
402 | >                       ,"cross"
403 | >                       ,"left"
404 | >                       ,"right"
405 | >                       ,"full"
406 | >                       ,"join"
407 | >                       ,"on"
408 | >                       ,"using"]
409 | 
410 | That didn't solve the problem. I think we also have a problem since
411 | the `alias` can now fail after consuming input, we need to use `try`.
412 | 
413 | > from7 :: Parser [TableRef]
414 | > from7 = keyword_ "from" >> commaSep1 tref
415 | >   where
416 | >     tref = nonJoinTref >>= suffixWrapper joinTrefSuffix
417 | >     joinTrefSuffix t0 = do
418 | >         nat <- option False (True <$ keyword_ "natural")
419 | >         jt <- joinType
420 | >         t1 <- nonJoinTref
421 | >         jc <- optionMaybe (joinCondition nat)
422 | >         return $ TRJoin t0 jt t1 jc
423 | >     nonJoinTref = choice [TRSimple <$> identifier
424 | >                          ,try (TRQueryExpr <$> parens (queryExpr1 from7))
425 | >                          ,TRParens <$> parens tref]
426 | >                   >>= suffixWrapper (try . alias)
427 | >     alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier))
428 | >     aliasIdentifier = blackListIdentifier
429 | >                       ["natural"
430 | >                       ,"inner"
431 | >                       ,"outer"
432 | >                       ,"cross"
433 | >                       ,"left"
434 | >                       ,"right"
435 | >                       ,"full"
436 | >                       ,"join"
437 | >                       ,"on"
438 | >                       ,"using"]
439 | 
440 | The final step is to make it parse n-way explicit joins.
441 | 
442 | > threeWayJoinTests :: [(String,[TableRef])]
443 | > threeWayJoinTests =
444 | >     [("from a join b join c"
445 | >      ,[TRJoin
446 | >        (TRJoin (TRSimple "a") JoinInner (TRSimple "b") Nothing)
447 | >        JoinInner (TRSimple "c") Nothing])]
448 | 
449 | 
450 | > from8 :: Parser [TableRef]
451 | > from8 = keyword_ "from" >> commaSep1 tref
452 | >   where
453 | >     tref = nonJoinTref >>= suffixWrapper joinTrefSuffix
454 | >     joinTrefSuffix t0 = (do
455 | >          nat <- option False (True <$ keyword_ "natural")
456 | >          TRJoin t0 <$> joinType
457 | >                    <*> nonJoinTref
458 | >                    <*> optionMaybe (joinCondition nat))
459 | >         >>= suffixWrapper joinTrefSuffix
460 | >     nonJoinTref = choice [TRSimple <$> identifier
461 | >                          ,try (TRQueryExpr <$> parens (queryExpr1 from8))
462 | >                          ,TRParens <$> parens tref]
463 | >                   >>= suffixWrapper (try . alias)
464 | >     alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier))
465 | >     aliasIdentifier = blackListIdentifier
466 | >                       ["natural"
467 | >                       ,"inner"
468 | >                       ,"outer"
469 | >                       ,"cross"
470 | >                       ,"left"
471 | >                       ,"right"
472 | >                       ,"full"
473 | >                       ,"join"
474 | >                       ,"on"
475 | >                       ,"using"]
476 | 
477 | We get left associative with this code. I don't know if this is correct.
478 | 
479 | We should do some more testing to make sure this code is good. TODO
480 | 
481 | == query expressions
482 | 
483 | Let's create the full query expression parser now:
484 | 
485 | > queryExprJoinTests :: [(String,QueryExpr)]
486 | > queryExprJoinTests =
487 | >     [("select a from t"
488 | >      ,ms [TRSimple "t"])
489 | >
490 | >     ,("select a from t,u"
491 | >      ,ms [TRSimple "t", TRSimple "u"])
492 | >
493 | >     ,("select a from t inner join u on expr"
494 | >      ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u")
495 | >           (Just $ JoinOn $ Iden "expr")])
496 | >
497 | >     ,("select a from t left join u on expr"
498 | >      ,ms [TRJoin (TRSimple "t") JoinLeft (TRSimple "u")
499 | >           (Just $ JoinOn $ Iden "expr")])
500 | >
501 | >     ,("select a from t right join u on expr"
502 | >      ,ms [TRJoin (TRSimple "t") JoinRight (TRSimple "u")
503 | >           (Just $ JoinOn $ Iden "expr")])
504 | >
505 | >     ,("select a from t full join u on expr"
506 | >      ,ms [TRJoin (TRSimple "t") JoinFull (TRSimple "u")
507 | >           (Just $ JoinOn $ Iden "expr")])
508 | >
509 | >     ,("select a from t cross join u"
510 | >      ,ms [TRJoin (TRSimple "t") JoinCross (TRSimple "u") Nothing])
511 | >
512 | >     ,("select a from t natural inner join u"
513 | >      ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u")
514 | >           (Just JoinNatural)])
515 | >
516 | >     ,("select a from t inner join u using(a,b)"
517 | >      ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u")
518 | >           (Just $ JoinUsing ["a", "b"])])
519 | >
520 | >     ,("select a from (select a from t)"
521 | >      ,ms [TRQueryExpr $ ms [TRSimple "t"]])
522 | >
523 | >     ,("select a from t as u"
524 | >      ,ms [TRAlias (TRSimple "t") "u"])
525 | >
526 | >     ,("select a from t u"
527 | >      ,ms [TRAlias (TRSimple "t") "u"])
528 | >
529 | >     ,("select a from (t cross join u) as u"
530 | >      ,ms [TRAlias (TRParens $ TRJoin (TRSimple "t") JoinCross
531 | >                                      (TRSimple "u") Nothing) "u"])
532 | >     ]
533 | >   where
534 | >     ms f = makeSelect {qeSelectList = [(Iden "a",Nothing)]
535 | >                       ,qeFrom = f}
536 | 
537 | Here are all the other query expression tests updated with the new
538 | QueryExpr type.
539 | 
540 | > singleSelectItemTests :: [(String,QueryExpr)]
541 | > singleSelectItemTests =
542 | >     [("select 1", makeSelect {qeSelectList = [(NumLit 1,Nothing)]})]
543 | 
544 | > multipleSelectItemsTests :: [(String,QueryExpr)]
545 | > multipleSelectItemsTests =
546 | >     [("select a"
547 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]})
548 | >     ,("select a,b"
549 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
550 | >                                  ,(Iden "b",Nothing)]})
551 | >     ,("select 1+2,3+4"
552 | >      ,makeSelect {qeSelectList =
553 | >                      [(BinOp (NumLit 1) "+" (NumLit 2),Nothing)
554 | >                      ,(BinOp (NumLit 3) "+" (NumLit 4),Nothing)]})
555 | >     ]
556 | 
557 | > selectListTests :: [(String,QueryExpr)]
558 | > selectListTests =
559 | >     [("select a as a, b as b"
560 | >      ,makeSelect {qeSelectList = [(Iden "a", Just "a")
561 | >                                  ,(Iden "b", Just "b")]})
562 | >
563 | >     ,("select a a, b b"
564 | >      ,makeSelect {qeSelectList = [(Iden "a", Just "a")
565 | >                                  ,(Iden "b", Just "b")]})
566 | >     ] ++ multipleSelectItemsTests
567 | >       ++ singleSelectItemTests
568 | 
569 | > fromTests :: [(String,QueryExpr)]
570 | > fromTests =
571 | >     [("select a from t"
572 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]
573 | >                  ,qeFrom = [TRSimple "t"]})]
574 | 
575 | > whereTests :: [(String,QueryExpr)]
576 | > whereTests =
577 | >     [("select a from t where a = 5"
578 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]
579 | >                  ,qeFrom = [TRSimple "t"]
580 | >                  ,qeWhere = Just $ BinOp (Iden "a") "=" (NumLit 5)})
581 | >     ]
582 | 
583 | > groupByTests :: [(String,QueryExpr)]
584 | > groupByTests =
585 | >     [("select a,sum(b) from t group by a"
586 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
587 | >                                  ,(App "sum" [Iden "b"],Nothing)]
588 | >                  ,qeFrom = [TRSimple "t"]
589 | >                  ,qeGroupBy = [Iden "a"]
590 | >                  })
591 | >
592 | >     ,("select a,b,sum(c) from t group by a,b"
593 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
594 | >                                  ,(Iden "b",Nothing)
595 | >                                  ,(App "sum" [Iden "c"],Nothing)]
596 | >                  ,qeFrom = [TRSimple "t"]
597 | >                  ,qeGroupBy = [Iden "a",Iden "b"]
598 | >                  })
599 | >     ]
600 | 
601 | > havingTests :: [(String,QueryExpr)]
602 | > havingTests =
603 | >   [("select a,sum(b) from t group by a having sum(b) > 5"
604 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
605 | >                                  ,(App "sum" [Iden "b"],Nothing)]
606 | >                  ,qeFrom = [TRSimple "t"]
607 | >                  ,qeGroupBy = [Iden "a"]
608 | >                  ,qeHaving = Just $ BinOp (App "sum" [Iden "b"]) ">" (NumLit 5)
609 | >                  })
610 | >   ]
611 | 
612 | > orderByTests :: [(String,QueryExpr)]
613 | > orderByTests =
614 | >     [("select a from t order by a"
615 | >      ,ms [Iden "a"])
616 | >
617 | >     ,("select a from t order by a, b"
618 | >      ,ms [Iden "a", Iden "b"])
619 | >     ]
620 | >   where
621 | >     ms o = makeSelect {qeSelectList = [(Iden "a",Nothing)]
622 | >                       ,qeFrom = [TRSimple "t"]
623 | >                       ,qeOrderBy = o}
624 | 
625 | ```
626 | *Main> H.runTestTT $ H.TestList $ map (makeTest (queryExpr1 from8)) (selectListTests ++ fromTests ++ whereTests ++ groupByTests ++ havingTests ++ orderByTests ++ queryExprJoinTests)
627 |  ### Failure in: 7:select a from t where a = 5
628 | (line 1, column 25):
629 | unexpected "="
630 | expecting "--" or "/*"
631 |  ### Failure in: 8:select a,sum(b) from t group by a
632 | (line 1, column 33):
633 | unexpected "a"
634 | expecting "--" or "/*"
635 |  ### Failure in: 9:select a,b,sum(c) from t group by a,b
636 | (line 1, column 35):
637 | unexpected "a"
638 | expecting "--" or "/*"
639 |  ### Failure in: 10:select a,sum(b) from t group by a having sum(b) > 5
640 | (line 1, column 33):
641 | unexpected "a"
642 | expecting "--" or "/*"
643 |  ### Failure in: 11:select a from t order by a
644 | (line 1, column 26):
645 | unexpected "a"
646 | expecting "--" or "/*"
647 |  ### Failure in: 12:select a from t order by a, b
648 | (line 1, column 26):
649 | unexpected "a"
650 | expecting "--" or "/*"
651 | Cases: 26  Tried: 26  Errors: 0  Failures: 6
652 | Counts {cases = 26, tried = 26, errors = 0, failures = 6}
653 | ```
654 | 
655 | The problem is the table alias parser is trying to parse keywords
656 | again. Here is the `from` parser with the alias name blacklist
657 | expanded.
658 | 
659 | > from :: Parser [TableRef]
660 | > from = keyword_ "from" >> commaSep1 tref
661 | >   where
662 | >     tref = nonJoinTref >>= suffixWrapper joinTrefSuffix
663 | >     joinTrefSuffix t0 = (do
664 | >          nat <- option False (True <$ keyword_ "natural")
665 | >          TRJoin t0 <$> joinType
666 | >                    <*> nonJoinTref
667 | >                    <*> optionMaybe (joinCondition nat))
668 | >         >>= suffixWrapper joinTrefSuffix
669 | >     nonJoinTref = choice [TRSimple <$> identifier
670 | >                          ,try (TRQueryExpr <$> parens queryExpr)
671 | >                          ,TRParens <$> parens tref]
672 | >                   >>= suffixWrapper (try . alias)
673 | >     alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier))
674 | >     aliasIdentifier = blackListIdentifier
675 | >                       [-- join keywords
676 | >                        "natural"
677 | >                       ,"inner"
678 | >                       ,"outer"
679 | >                       ,"cross"
680 | >                       ,"left"
681 | >                       ,"right"
682 | >                       ,"full"
683 | >                       ,"join"
684 | >                       ,"on"
685 | >                       ,"using"
686 | >                        -- subsequent clause keywords
687 | >                       ,"where"
688 | >                       ,"group"
689 | >                       ,"having"
690 | >                       ,"order"
691 | >                       ]
692 | 
693 | Here is the final query expression parser:
694 | 
695 | > queryExpr :: Parser QueryExpr
696 | > queryExpr = Select
697 | >             <$> selectList
698 | >             <*> option [] from
699 | >             <*> optionMaybe whereClause
700 | >             <*> option [] groupByClause
701 | >             <*> optionMaybe having
702 | >             <*> option [] orderBy
703 | 


--------------------------------------------------------------------------------
/FunctionsAndTypesForParsing.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[functions-and-types-for-parsing]]
  3 | = Functions and types for parsing
  4 | 
  5 | In this file is the source and explanation for the parsing functions
  6 | which we've been using, and some limited notes about the wrappers and
  7 | full types in Parsec.
  8 | 
  9 | > module FunctionsAndTypesForParsing where
 10 | >
 11 | > import Text.Parsec (ParseError)
 12 | > import Text.Parsec.String (Parser)
 13 | > import Text.Parsec.String.Parsec (parse)
 14 | > import Text.Parsec.String.Char (oneOf)
 15 | > import Text.Parsec.String.Combinator (eof,manyTill,anyToken)
 16 | > import Control.Applicative ((<$>), (<*>), (<*), (*>), many)
 17 | > import Control.Monad (void)
 18 | 
 19 | 
 20 | == Functions for parsing
 21 | 
 22 | Here are the testing functions which were used earlier:
 23 | 
 24 | The basic parse function: this is a pretty simple wrapper. The parse
 25 | function from parsec just adds a filename to use in parse errors,
 26 | which is set as the empty string here.
 27 | 
 28 | > regularParse :: Parser a -> String -> Either ParseError a
 29 | > regularParse p = parse p ""
 30 | 
 31 | 'parse' is a basic function in the family of functions for running
 32 | parsers in Parsec. You can compose the parser functions in the Parser
 33 | monad, then run the top level function using 'parse' and get back an
 34 | 'Either ParserError a' as the result. There are a few alternatives to
 35 | 'parse' in Parsec, mostly when you are using a more general parser
 36 | type instead of 'Parser a' (which is an alias for 'ParsecT String ()
 37 | Identity a'). Have a look in the Text.Parsec.Prim module for these
 38 | <http://hackage.haskell.org/package/parsec-3.1.3/docs/Text-Parsec-Prim.html>.
 39 | 
 40 | This function will run the parser, but additionally fail if it doesn't
 41 | consume all the input.
 42 | 
 43 | > parseWithEof :: Parser a -> String -> Either ParseError a
 44 | > parseWithEof p = parse (p <* eof) ""
 45 | 
 46 | This function will apply the parser, then also return any left over
 47 | input which wasn't parsed.
 48 | 
 49 | > parseWithLeftOver :: Parser a -> String -> Either ParseError (a,String)
 50 | > parseWithLeftOver p = parse ((,) <$> p <*> leftOver) ""
 51 | >   where leftOver = manyTill anyToken eof
 52 | 
 53 | TODO: what happens when you use 'many anyToken <* eof' variations
 54 | instead? Maybe should talk about greediness? Or talk about it in a
 55 | better place in the tutorial.
 56 | 
 57 | > parseWithWSEof :: Parser a -> String -> Either ParseError a
 58 | > parseWithWSEof p = parseWithEof (whiteSpace *> p)
 59 | >   where whiteSpace = void $ many $ oneOf " \n\t"
 60 | 
 61 | You should have a look at the two helper executables, and see if you
 62 | can understand the code now. You can see them online here:
 63 | 
 64 | <https://github.com/JakeWheat/intro_to_parsing/blob/master/ParseFile.lhs>
 65 | 
 66 | <https://github.com/JakeWheat/intro_to_parsing/blob/master/ParseString.lhs>
 67 | 
 68 | == type signatures revisited
 69 | 
 70 | todo: update this to refer to real parsec instead of the string
 71 | wrappers here.
 72 | 
 73 | I think you should always use type signatures with Parsec. Because the
 74 | Parsec code is really generalized, without the type GHC will refuse to
 75 | compile this code. Try commenting out the type signature above and
 76 | loading into ghci to see the error message.
 77 | 
 78 | There is an alternative: you can get this code to compile without a
 79 | type signature by using the NoMonomorphismRestriction language
 80 | pragma. You can also see the type signature that GHC will choose for
 81 | this function by commenting the type signature and using -Wall and
 82 | -XNoMonomorphismRestriction together. Using NoMonomorphismRestriction
 83 | is a popular solution to these sorts of problems in haskell.
 84 | 
 85 | It's up to you whether you prefer to always write type signatures when
 86 | you are developing parsing code, or use the NoMonomorphismRestriction
 87 | pragma. Even if you can use NoMonomorphismRestriction, when using
 88 | explicit type signatures you usually get much simpler compiler error
 89 | messages.
 90 | 
 91 | 
 92 | == Parser
 93 | 
 94 | The definition of Parser and a partial explanation of the full type
 95 | signature.
 96 | 
 97 | ```
 98 | type Parser = Parsec String ()
 99 | ```
100 | 
101 | This means that a function returning Parser a parses from a String
102 | with () as the initial state.
103 | 
104 | The Parsec type is defined like this:
105 | 
106 | ```
107 | type Parsec s u = ParsecT s u Identity
108 | ```
109 | 
110 | ParsecT is a monad transformer, I think it is the primitive one in the
111 | Parsec library, and the 'Parsec' type is a type alias which sets the
112 | base monad to be Identity.
113 | 
114 | Here is the haddock for the ParsecT type:
115 | 
116 | `ParsecT s u m a` is a parser with stream type `s`, user state type `u`,
117 | underlying monad `m` and return type `a`.
118 | 
119 | The full types that you see like this:
120 | 
121 | ```
122 | satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
123 | ```
124 | 
125 | refer to the same things (stream type s, user state type u, underlying
126 | monad m).
127 | 
128 | We are using String as the stream type (i.e. the input type), () as
129 | the user state type (this effectively means no user state, since ()
130 | only has one value), and the underlying monad is Identity: we are
131 | using no other underlying monad, so `Parser a` expands to `ParsecT
132 | String () Identity a`.
133 | 
134 | I.e. the source is String, the user state is (), and the underlying monad
135 | is Identity.
136 | 
137 | == Other information
138 | 
139 | TODO: Here is some other information on Parsec and Haskell:
140 | links, tutorials on fp, section in rwh, lyah?, old parsec docs,
141 | parsec docs on hackage, other parser combinator libs (uu, polyparse,
142 | trifecta?)
143 | 


--------------------------------------------------------------------------------
/GettingStarted.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[getting-started]]
  3 | = Getting started
  4 | 
  5 | This is an introduction to parsing with Haskell and Parsec.
  6 | 
  7 | Prerequisites: you should know some basic Haskell and have GHC and
  8 | cabal-install installed (installing the Haskell Platform will give you
  9 | this).
 10 | 
 11 | This tutorial was originally written using GHC 7.6.3 and Parsec 3.1.3,
 12 | which are the versions which come with the Haskell Platform
 13 | 2013.2.0.0. It should also work fine with GHC 7.8.4 and GHC 7.10.2 and
 14 | through to at least the latest release of Parsec 3.1.x.
 15 | 
 16 | This tutorial was written using Literate Haskell files available here:
 17 | link:https://github.com/JakeWheat/intro_to_parsing[].
 18 | 
 19 | I recommend you download them all, and follow along in your favourite
 20 | editor, and use GHCi to experiment. To download the intro_to_parsing
 21 | files, use:
 22 | 
 23 | ```
 24 | git clone https://github.com/JakeWheat/intro_to_parsing.git
 25 | ```
 26 | 
 27 | Here are the imports.
 28 | 
 29 | > import Text.Parsec.String (Parser)
 30 | > import Text.Parsec.String.Char (anyChar)
 31 | > import Text.Parsec.String.Char
 32 | > import FunctionsAndTypesForParsing (regularParse, parseWithEof, parseWithLeftOver)
 33 | > import Data.Char
 34 | > import Text.Parsec.String.Combinator (many1)
 35 | 
 36 | == First parser
 37 | 
 38 | The first parser:
 39 | 
 40 | ```
 41 | anyChar :: Parser Char
 42 | ```
 43 | 
 44 | This parser is in the module `Text.Parsec.Char`. There is a wrapper in
 45 | this tutorial's project, `Text.Parsec.String.Char`, which gives this
 46 | function a simplified type.
 47 | 
 48 | Whenever we write a parser which parses to a value of type `a`, we give
 49 | it the return type of `Parser a`. In this case, we parse a character
 50 | so the return type is `Parser Char`. The `Parser` type itself is in
 51 | the module `Text.Parsec.String`. We will cover this in more detail
 52 | later.
 53 | 
 54 | Let's use this parser. I will assume you have GHC and cabal-install
 55 | installed (which provides the 'cabal' executable) and both are in your
 56 | PATH. The Haskell Platform is one way that provides this.
 57 | 
 58 | Change to the directory where you downloaded the intro_to_parsing
 59 | source files (which will contain the GettingStarted.lhs file). Then
 60 | you can set up a cabal sandbox and be ready to work with the code by
 61 | running the following commands in that directory:
 62 | 
 63 | ```
 64 | cabal v1-update
 65 | cabal v1-sandbox init
 66 | cabal v1-install parsec HUnit
 67 | cabal v1-repl
 68 | ```
 69 | 
 70 | Now you will get the ghci prompt. Type in ':l GettingStarted.lhs'. You
 71 | can run the parser using a wrapper, enter the following at the ghci
 72 | prompt: `regularParse anyChar "a"`.
 73 | 
 74 | Here is a transcript of running ghci via 'cabal repl':
 75 | 
 76 | ```
 77 | $ cabal repl
 78 | Warning: The repl command is a part of the legacy v1 style of cabal usage.
 79 | 
 80 | Please switch to using either the new project style and the new-repl command
 81 | or the legacy v1-repl alias as new-style projects will become the default in
 82 | the next version of cabal-install. Please file a bug if you cannot replicate a
 83 | working v1- use case with the new-style commands.
 84 | 
 85 | For more information, see: https://wiki.haskell.org/Cabal/NewBuild
 86 | 
 87 | GHCi, version 8.6.5: http://www.haskell.org/ghc/  :? for help
 88 | Prelude> :l GettingStarted.lhs
 89 | [1 of 5] Compiling Text.Parsec.String.Char ( Text/Parsec/String/Char.hs, interpreted )
 90 | [2 of 5] Compiling Text.Parsec.String.Combinator ( Text/Parsec/String/Combinator.hs, interpreted )
 91 | [3 of 5] Compiling Text.Parsec.String.Parsec ( Text/Parsec/String/Parsec.hs, interpreted )
 92 | [4 of 5] Compiling FunctionsAndTypesForParsing ( FunctionsAndTypesForParsing.lhs, interpreted )
 93 | [5 of 5] Compiling Main             ( GettingStarted.lhs, interpreted )
 94 | Ok, five modules loaded.
 95 | *Main> regularParse anyChar "a"
 96 | Right 'a'
 97 | *Main> 
 98 | ```
 99 | 
100 | You can exit ghci by entering ':quit' or using Ctrl-d. From now on, to
101 | start ghci again, you can just change to the directory with
102 | GettingStarted.lhs and run 'cabal repl'. ghci should have readline
103 | support so you can browse through your command history using up and
104 | down arrow, etc.
105 | 
106 | This is the type of `regularParse`. It is wrapper which takes a parser
107 | function such as anyChar, and wraps it so you can parse a string to
108 | either a parse error, or the return value from your parser function:
109 | 
110 | ```
111 | regularParse :: Parser a -> String -> Either ParseError a
112 | ```
113 | 
114 | 
115 | Here are some examples of running this parser on various input:
116 | 
117 | ```
118 | *Main> regularParse anyChar "a"
119 | Right 'a'
120 | 
121 | *Main> regularParse anyChar "b"
122 | Right 'b'
123 | 
124 | *Main> regularParse anyChar "0"
125 | Right '0'
126 | 
127 | *Main> regularParse anyChar " "
128 | Right ' '
129 | 
130 | *Main> regularParse anyChar "\n"
131 | Right '\n'
132 | 
133 | *Main> regularParse anyChar "aa"
134 | Right 'a'
135 | 
136 | *Main> regularParse anyChar ""
137 | Left (line 1, column 1):
138 | unexpected end of input
139 | 
140 | *Main> regularParse anyChar " a"
141 | Right ' '
142 | ```
143 | 
144 | 
145 | You can see that if there are no characters, we get an error.
146 | Otherwise, it takes the first character and returns it, and throws
147 | away any trailing characters. The details of the helper function
148 | `regularParse` will come later.
149 | 
150 | Here are two alternatives to `regularParse` you can also use for
151 | experimenting for the time being:
152 | 
153 | ```
154 | parseWithEof :: Parser a -> String -> Either ParseError a
155 | ```
156 | 
157 | ```
158 | parseWithLeftOver :: Parser a -> String -> Either ParseError (a,String)
159 | ```
160 | 
161 | These can be useful when you are not sure if your parser is consuming
162 | all your input string or not. The eof parser will error if you haven't
163 | consumed all the input, and the leftover parser can instead tell you
164 | what was not consumed from the input.
165 | 
166 | ```
167 | *Main> regularParse anyChar "a"
168 | Right 'a'
169 | 
170 | *Main> parseWithEof anyChar "a"
171 | Right 'a'
172 | 
173 | *Main> parseWithLeftOver anyChar "a"
174 | Right ('a',"")
175 | 
176 | *Main> *Main> regularParse anyChar ""
177 | Left (line 1, column 1):
178 | unexpected end of input
179 | 
180 | *Main> parseWithEof anyChar ""
181 | Left (line 1, column 1):
182 | unexpected end of input
183 | 
184 | *Main> parseWithLeftOver anyChar ""
185 | Left (line 1, column 1):
186 | unexpected end of input
187 | 
188 | *Main> regularParse anyChar "aa"
189 | Right 'a'
190 | 
191 | *Main> parseWithEof anyChar "aa"
192 | Left (line 1, column 2):
193 | unexpected 'a'
194 | expecting end of input
195 | 
196 | *Main> parseWithLeftOver anyChar "aa"
197 | Right ('a',"a")
198 | 
199 | *Main> parseWithLeftOver anyChar "abc"
200 | Right ('a',"bc")
201 | 
202 | ```
203 | 
204 | You can use these functions and ghci to experiment. Try running all
205 | the parsers in ghci on various input strings as you work through the
206 | document to get a good feel for all the different features. Tip: you
207 | can also write the parsers inline in the function call, for example:
208 | 
209 | ```
210 | *Main> regularParse (many1 digit) "1"
211 | Right "1"
212 | 
213 | *Main> regularParse (many1 digit) "122"
214 | Right "122"
215 | ```
216 | 
217 | This can be used to quickly try out new ad hoc parsing code.
218 | 
219 | == Type signatures
220 | 
221 | The real Parsec functions have quite complex type signatures. This
222 | makes a lot of things very tricky before you understand them, and can
223 | make some of the error messages you'll see really difficult to
224 | understand. I've created some wrapper modules, which set the types of
225 | all the functions from Parsec we use to be much more restricted. This
226 | will make the types easy to understand, and reduce the amount of
227 | tricky to understand compiler errors you get. You can use this
228 | approach when writing your own parser code with Parsec. These wrapper
229 | modules are created with the following name pattern:
230 | `Text.Parsec.Char` -> `Text.Parsec.String.Char`.
231 | 
232 | Later on, we will look at the general types in more detail.
233 | 
234 | == Text.Parsec.Char
235 | 
236 | Let's go through some of the functions in `Text.Parsec.Char` module from
237 | the Parsec package. The haddock is available here:
238 | <http://hackage.haskell.org/package/parsec-3.1.3/docs/Text-Parsec-Char.html>.
239 | 
240 | Here is the `satisfy` function, with its full type signature.
241 | 
242 | ```
243 | satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
244 | ```
245 | 
246 | This is one of the main primitive functions in Parsec. This looks at
247 | the next character from the current input, and if the function (`Char
248 | -> Bool`) returns true for this character, it 'pops' it from the input
249 | and returns it. In this way, the current position in the input string
250 | is tracked behind the scenes.
251 | 
252 | In the simplified type wrappers, the `satisfy` function's type is this:
253 | 
254 | ```
255 | satisfy :: (Char -> Bool) -> Parser Char
256 | ```
257 | 
258 | This makes it a bit clearer what it is doing. All the functions in
259 | `Text.Parsec.Char` are reproduced in the local `Text.Parsec.String.Char`
260 | module with simplified types
261 | (<https://github.com/JakeWheat/intro_to_parsing/blob/master/Text/Parsec/String/Char.hs>).
262 | 
263 | Here are some examples of satisfy in action.
264 | 
265 | ```
266 | *Main> parseWithEof (satisfy (=='a')) "a"
267 | Right 'a'
268 | 
269 | *Main> parseWithEof (satisfy (=='b')) "a"
270 | Left (line 1, column 1):
271 | unexpected "a"
272 | 
273 | *Main> parseWithEof (satisfy (`elem` "abc")) "a"
274 | Right 'a'
275 | 
276 | *Main> parseWithEof (satisfy (`elem` "abc")) "d"
277 | Left (line 1, column 1):
278 | unexpected "d"
279 | 
280 | *Main> parseWithEof (satisfy isDigit) "d"
281 | Left (line 1, column 1):
282 | unexpected "d"
283 | 
284 | *Main> parseWithEof (satisfy isDigit) "1"
285 | Right '1'
286 | ```
287 | 
288 | You can see that it is easy to use `==`, or `elem` or one of the
289 | functions from the Data.Char module.
290 | 
291 | If you look at the docs on hackage
292 | <http://hackage.haskell.org/package/parsec-3.1.3/docs/Text-Parsec-Char.html>,
293 | you can view the source. The implementations of most of the functions
294 | in `Text.Parsec.Char` are straightforward. I recommend you look at the
295 | source for all of these functions.
296 | 
297 | You can see in the source that the `satisfy` function is a little more
298 | primitive than the other functions.
299 | 
300 | Here is the parser we used above in the `anyChar` parser:
301 | 
302 | ```
303 | anyChar :: Parser Char
304 | ```
305 | 
306 | If you look at the source via the haddock link above, you can see it
307 | uses `satisfy`.
308 | 
309 | Here are some other simple wrappers of `satisfy` from
310 | `Text.Parsec.Char` which use different validation functions.
311 | 
312 | The `char` parser parses a specific character which you supply:
313 | 
314 | ```
315 | char :: Char -> Parser Char
316 | ```
317 | 
318 | ```
319 | *Main> regularParse (char 'a') "a"
320 | Right 'a'
321 | 
322 | *Main> regularParse (char 'a') "b"
323 | Left (line 1, column 1):
324 | unexpected "b"
325 | expecting "a"
326 | ```
327 | 
328 | These parsers all parse single hardcoded characters
329 | 
330 | ```
331 | space :: Parser Char
332 | newline :: Parser Char
333 | tab :: Parser Char
334 | ```
335 | 
336 | They all return a `Char`. You might be able to guess what `Char` each
337 | of them returns, you can double check your intuition using ghci.
338 | 
339 | These parser all parse one character from a hardcoded set of
340 | characters:
341 | 
342 | ```
343 | upper :: Parser Char
344 | lower :: Parser Char
345 | alphaNum :: Parser Char
346 | letter :: Parser Char
347 | digit :: Parser Char
348 | hexDigit :: Parser Char
349 | octDigit :: Parser Char
350 | ```
351 | 
352 | In these cases, the return value is less redundant.
353 | 
354 | `oneOf` and `noneOf` parse any of the characters in the given list
355 | 
356 | ```
357 | oneOf :: [Char] -> Parser Char
358 | noneOf :: [Char] -> Parser Char
359 | ```
360 | 
361 | These are just simple wrappers of satisfy using `elem`.
362 | 
363 | 
364 | You should try all these parsers out in ghci, e.g.:
365 | 
366 | ```
367 | 
368 | regularParse space " "
369 | 
370 | regularParse upper "A"
371 | 
372 | regularParse (char 'b') "B"
373 | 
374 | regularParse (oneOf "abc") "c"
375 | 
376 | ```
377 | 
378 | Here are the final functions in `Text.Parsec.Char`:
379 | 
380 | `string` matches a complete string, one character at a time. I think
381 | the implementation of this function is like it is for efficiency when
382 | parsing from, e.g., `Data.Text.Text`, instead of `String`, but I'm not
383 | sure. We will skip the detailed explanation of the implementation for
384 | now.
385 | 
386 | ```
387 | string :: String -> Parser String
388 | ```
389 | 
390 | ```
391 | *Main> regularParse (string "one") "one"
392 | Right "one"
393 | 
394 | *Main> regularParse (string "one") "two"
395 | Left (line 1, column 1):
396 | unexpected "t"
397 | expecting "one"
398 | ```
399 | 
400 | Here is the `spaces` parser, which, if you look at the source, you can
401 | see uses a combinator (`skipMany`). We will cover this combinator
402 | shortly.
403 | 
404 | ```
405 | spaces :: Parser ()
406 | ```
407 | 
408 | ```
409 | *Main> regularParse spaces ""
410 | Right ()
411 | 
412 | *Main> regularParse spaces " "
413 | Right ()
414 | 
415 | *Main> regularParse spaces "   "
416 | Right ()
417 | 
418 | *Main> regularParse spaces " a  "
419 | Right ()
420 | 
421 | *Main> regularParse spaces "a a  "
422 | Right ()
423 | ```
424 | 
425 | It always succeeds.
426 | 
427 | == A couple of helper executables
428 | 
429 | Here are two exes which you can use to parse either a string or a file
430 | to help you experiment. This will save you having to figure out how to
431 | write this boilerplate until later.
432 | 
433 | <https://github.com/JakeWheat/intro_to_parsing/blob/master/ParseFile.lhs>
434 | 
435 | <https://github.com/JakeWheat/intro_to_parsing/blob/master/ParseString.lhs>
436 | 
437 | Now you can easily experiment using ghci, or with a string on the
438 | command line, or by putting the text to parse into a file and parsing
439 | that.
440 | 


--------------------------------------------------------------------------------
/IntroductionToCombinators.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | = Introduction to combinators
  3 | 
  4 | In this tutorial we will develop a parser for a very simple expression
  5 | language, and start learning about the set of combinators which comes
  6 | with Parsec.
  7 | 
  8 | > import Text.Parsec (ParseError)
  9 | > import Text.Parsec.String (Parser)
 10 | > import Text.Parsec.String.Parsec (try)
 11 | > import Text.Parsec.String.Char (oneOf, char, digit
 12 | >                                ,string, letter, satisfy)
 13 | > import Text.Parsec.String.Combinator (many1, choice, chainl1, between)
 14 | > import Control.Applicative ((<$>), (<*>), (<$), (<*), (*>), (<|>), many)
 15 | > import Control.Monad (void, ap)
 16 | > import Data.Char (isLetter, isDigit)
 17 | > import FunctionsAndTypesForParsing
 18 | 
 19 | Let's create a very simple expression language:
 20 | 
 21 | > data SimpleExpr = Num Integer
 22 | >                 | Var String
 23 | >                 | Add SimpleExpr SimpleExpr
 24 | >                 | Parens SimpleExpr
 25 | >                   deriving (Eq,Show)
 26 | 
 27 | It's a bit simple and almost useless at the moment, but we will expand
 28 | on this a lot in later tutorials.
 29 | 
 30 | Here are some examples:
 31 | 
 32 | > simpleExprExamples :: [(String,SimpleExpr)]
 33 | > simpleExprExamples =
 34 | >     [("a", Var "a")
 35 | >     ,("1", Num 1)
 36 | >     ,("2 + 3", Add (Num 2) (Num 3))
 37 | >     ,("(42)", Parens (Num 42))]
 38 | 
 39 | TODO: some more complex examples
 40 | 
 41 | Let's write a simple parser for these, and introduce a few things
 42 | along the way.
 43 | 
 44 | We will write a parser for each constructor separately, then look at
 45 | how we can write a parser for all of them together.
 46 | 
 47 | ==== Num
 48 | 
 49 | To parse a number, we need to parse one or more digits, and then read
 50 | the resulting string. We can use the combinator 'many1' to help with
 51 | this. We will also use do notation.
 52 | 
 53 | > num :: Parser SimpleExpr
 54 | > num = do
 55 | >     n <- many1 digit
 56 | >     return (Num (read n))
 57 | 
 58 | Let's try it out. TODO: examples
 59 | 
 60 | How does it work? First, we parse one or more (many1) digits (digit),
 61 | and give the result the name 'n'. Then we convert the string to an
 62 | integer using read, and wrap it in a Num constructor.
 63 | 
 64 | The many1 function's type looks like this:
 65 | 
 66 | ```
 67 | many1 :: Parser a -> Parser [a]
 68 | ```
 69 | 
 70 | It applies the parser given one or more times, returning the result.
 71 | 
 72 | TODO: example show what happens when you use 'many' instead of 'many1'
 73 | 
 74 | ==== Var
 75 | 
 76 | For var, we have to decide on a syntax for the identifiers. Let's go
 77 | for a common choice: identifiers must start with a letter or
 78 | underscore, and then they can be followed by zero or more letters,
 79 | underscores or digits in any combination.
 80 | 
 81 | > var :: Parser SimpleExpr
 82 | > var = do
 83 | >     fc <- firstChar
 84 | >     rest <- many nonFirstChar
 85 | >     return (Var (fc:rest))
 86 | >   where
 87 | >     firstChar = satisfy (\a -> isLetter a || a == '_')
 88 | >     nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_')
 89 | 
 90 | This time, we create two helper parsers: firstChar - which parses a
 91 | letter or underscore, and nonFirstChar = which parses a digit, letter
 92 | or underscore. This time, we use the 'many' function instead of
 93 | 'many1': TODO - demonstrate why using examples only.
 94 | 
 95 | > add :: Parser SimpleExpr
 96 | > add = do
 97 | >     e0 <- num
 98 | >     void $ char '+'
 99 | >     e1 <- num
100 | >     return (Add e0 e1)
101 | 
102 | There are two simplifications here. We don't have a general simple
103 | expression parser yet, so it only supports parsing adding two number
104 | literals, and we don't deal with whitespace yet, so you must write
105 | "1+2" and not "1 + 2". We will deal with both of these issues below.
106 | 
107 | We used void to ignore the return of "char '+'". This is not required,
108 | but supresses a warning which you should get (since you are using
109 | -Wall, right?) and I think it is also good style to explicitly say
110 | that the result is being ignored.
111 | 
112 | To use -Wall in ghci, enter the following at the prompt:
113 | 
114 | ```
115 | *Main> :set -Wall
116 | ```
117 | 
118 | Try it out, then replace the line
119 | 
120 | ```
121 |     void $ char '+'
122 | ```
123 | 
124 | with
125 | 
126 | ```
127 |     char '+'
128 | ```
129 | 
130 | And check you see the warning. Another way of avoiding the warning is
131 | to write this:
132 | 
133 | ```
134 |     _ <- char '+'
135 | ```
136 | 
137 | ==== parens
138 | 
139 | > parens :: Parser SimpleExpr
140 | > parens = do
141 | >     void $ char '('
142 | >     e <- num
143 | >     void $ char ')'
144 | >     return (Parens e)
145 | 
146 | The same two issues from the 'add' parser apply here: whitespace and
147 | lack of a general simple expression parser, so it just uses the num
148 | parser again instead. Now we will tackle the whitespace issue.
149 | 
150 | === whitespace and lexeme parsing
151 | 
152 | Here is a parser which will skip zero or more whitespace characters.
153 | 
154 | > whiteSpace :: Parser ()
155 | > whiteSpace = void $ many $ oneOf " \n\t"
156 | 
157 | In the original parsec documentation, one of the concepts mentioned is
158 | the idea of 'lexeme' parsing. This is a style in which every token
159 | parser should also consume and ignore any trailing whitespace. This is
160 | a simple convention which with a bit of care allows skipping
161 | whitespace exactly once wherever it needs to be skipped. To complete
162 | the lexeme style, we should also always skip leading whitespace at the
163 | top level only.
164 | 
165 | > parseWithWhitespace :: Parser a -> String -> Either ParseError a
166 | > parseWithWhitespace p = parseWithEof wrapper
167 | >   where
168 | >     wrapper = do
169 | >         whiteSpace
170 | >         p
171 | 
172 | the wrapper function can also use (>>) to make it a bit shorter:
173 | 
174 | ```
175 | wrapper = whiteSpace >> p
176 | ```
177 | 
178 | Here is a shorter version of this function using (>>):
179 | 
180 | > parseWithWhitespace' :: Parser a -> String -> Either ParseError a
181 | > parseWithWhitespace' p = parseWithEof (whiteSpace >> p)
182 | 
183 | Here is the num parser rewritten in the lexeme style:
184 | 
185 | > lexeme :: Parser a -> Parser a
186 | > lexeme p = do
187 | >            x <- p
188 | >            whiteSpace
189 | >            return x
190 | 
191 | TODO: review the placement of the function 'lexeme' in all the code
192 | below. Maybe something could be said about all the places that it can
193 | be put.
194 | 
195 | > num' :: Parser SimpleExpr
196 | > num' = lexeme $ do
197 | >     n <- many1 digit
198 | >     return (Num (read n))
199 | 
200 | Here it is in action:
201 | 
202 | ```
203 | *Main Data.List> parseWithEof num "1"
204 | Right (Num 1)
205 | 
206 | *Main Data.List> parseWithEof num " 1"
207 | Left (line 1, column 1):
208 | unexpected " "
209 | expecting digit
210 | 
211 | *Main Data.List> parseWithEof num "1 "
212 | Left (line 1, column 2):
213 | unexpected ' '
214 | expecting digit or end of input
215 | 
216 | *Main Data.List> parseWithEof num " 1 "
217 | Left (line 1, column 1):
218 | unexpected " "
219 | expecting digit
220 | 
221 | *Main Data.List> parseWithWhitespace num' "1"
222 | Right (Num 1)
223 | 
224 | *Main Data.List> parseWithWhitespace num' " 1"
225 | Right (Num 1)
226 | 
227 | *Main Data.List> parseWithWhitespace num' "1 "
228 | Right (Num 1)
229 | 
230 | *Main Data.Lst> parseWithWhitespace num' " 1 "
231 | Right (Num 1)
232 | ```
233 | 
234 | Here are the other functions in lexeme style.
235 | 
236 | > var' :: Parser SimpleExpr
237 | > var' = lexeme $ do
238 | >     fl <- firstChar
239 | >     rest <- many nonFirstChar
240 | >     return (Var (fl:rest))
241 | >   where
242 | >     firstChar = satisfy (\a -> isLetter a || a == '_')
243 | >     nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_')
244 | 
245 | > add' :: Parser SimpleExpr
246 | > add' = do
247 | >     e0 <- num'
248 | >     void $ lexeme $ char '+'
249 | >     e1 <- num'
250 | >     return (Add e0 e1)
251 | 
252 | > parens' :: Parser SimpleExpr
253 | > parens' = do
254 | >     void $ lexeme $ char '('
255 | >     e <- num'
256 | >     void $ lexeme $ char ')'
257 | >     return (Parens e)
258 | 
259 | In this style, you have to be slightly careful to make sure you call
260 | whitespace at the right points.
261 | 
262 | Let's try and implement the simpleExpr parser. We can use a function
263 | called 'choice':
264 | 
265 | > numOrVar :: Parser SimpleExpr
266 | > numOrVar = choice [num', var']
267 | 
268 | It tries each parser one at a time, finishing with the first one that
269 | succeeds. There are some more details about this later on.
270 | 
271 | Here is another way to write the numOrVar parser:
272 | 
273 | > numOrVar' :: Parser SimpleExpr
274 | > numOrVar' = num' <|> var'
275 | 
276 | In general, you can write 'choice [p0, p1, p2, ...]' as 'p0 <|> p1 <|>
277 | p2 <|> ...'.
278 | 
279 | TODO: a bunch of examples
280 | 
281 | Here is the first version of the simpleExpr parser:
282 | 
283 | > simpleExpr :: Parser SimpleExpr
284 | > simpleExpr = choice [num', var', add', parens']
285 | 
286 | TODO: a bunch of examples
287 | 
288 | It works well for some of the parsers, but fails with add': the num'
289 | always partially succeeds first, then fails, so the add' is never
290 | tried. We can rearrange this parser like this:
291 | 
292 | > simpleExpr' :: Parser SimpleExpr
293 | > simpleExpr' = choice [add', num', var', parens']
294 | 
295 | TODO: examples again
296 | 
297 | We have another problem now: when we start parsing the add', see a
298 | num, then fail, it gives up completely. This is because the 'choice'
299 | function (and <|>) will only try the next parser if the parser that
300 | failed consumed no input before failing.
301 | 
302 | TODO: show examples with satify and choice on 1 char match/no match,
303 | and two char match/no match.
304 | 
305 | Here is one way to fix it:
306 | 
307 | > simpleExpr'' :: Parser SimpleExpr
308 | > simpleExpr'' = choice [try add', num', var', parens']
309 | 
310 | The try function implements backtracking. When this is used in a
311 | choice like this, it means that if the add' parser fails, it will undo
312 | the consumed input and carry on with the next option, instead of
313 | failing completely. If there is another place using try higher up in
314 | the call stack, then we will continue there, otherwise the whole parse
315 | will fail immediately.
316 | 
317 | The same happens with <|>, we can implement the simpleExpr parser like
318 | this also:
319 | 
320 | > simpleExpr''' :: Parser SimpleExpr
321 | > simpleExpr''' = try add' <|> num' <|> var' <|> parens'
322 | 
323 | TODO: show the examples all working
324 | 
325 | Now we can make 'parens' and 'add' use a general simple expression
326 | parser. Parens is simple:
327 | 
328 | > parens'' :: Parser SimpleExpr
329 | > parens'' = do
330 | >     void $ lexeme $ char '('
331 | >     e <- simpleExpr'''
332 | >     void $ lexeme $ char ')'
333 | >     return (Parens e)
334 | 
335 | There is a problem implementing 'add' in the same way:
336 | 
337 | > add'' :: Parser SimpleExpr
338 | > add'' = do
339 | >     e0 <- simpleExpr'''
340 | >     void $ lexeme $ char '+'
341 | >     e1 <- simpleExpr'''
342 | >     return (Add e0 e1)
343 | 
344 | It will never return since it calls simpleExpr''' which calls add''
345 | again.
346 | 
347 | Let's look at another problem:
348 | 
349 | ```
350 | *Main> parseWithWhitespace simpleExpr''' " 1 + 1 + 1"
351 | Left (line 1, column 8):
352 | unexpected '+'
353 | expecting end of input
354 | ```
355 | 
356 | Our parser will only parse one operator, and not a chain of them.
357 | 
358 | Here is one way to solve it:
359 | 
360 | > simpleExpr4 :: Parser SimpleExpr
361 | > simpleExpr4 = do
362 | >     e <- term
363 | >     maybeAddSuffix e
364 | >   where
365 | >     maybeAddSuffix e = addSuffix e <|> return e
366 | >     addSuffix e0 = do
367 | >         void $ lexeme $ char '+'
368 | >         e1 <- term
369 | >         maybeAddSuffix (Add e0 e1)
370 | >     term = num' <|> var' <|> parens'
371 | 
372 | TODO: explain how this works in much more detail
373 | 
374 | This forces our Add operator to be left associative. This also solves
375 | the previous problem with add calling simpleExpr recursively. There is
376 | lots of discussion about these issues in Parsing theory documents you
377 | can find online, etc..
378 | 
379 | == general simple expression parser
380 | 
381 | Here is the all the parser code written out again for clarity.
382 | 
383 | > numD :: Parser SimpleExpr
384 | > numD = lexeme $ do
385 | >     n <- many1 digit
386 | >     return $ Num $ read n
387 | 
388 | > varD :: Parser SimpleExpr
389 | > varD = lexeme $ do
390 | >     fl <- firstChar
391 | >     rest <- many nonFirstChar
392 | >     return $ Var (fl:rest)
393 | >   where
394 | >     firstChar = satisfy (\a -> isLetter a || a == '_')
395 | >     nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_')
396 | 
397 | > parensD :: Parser SimpleExpr
398 | > parensD = do
399 | >     void $ lexeme $ char '('
400 | >     e <- simpleExprD
401 | >     void $ lexeme $ char ')'
402 | >     return $ Parens e
403 | 
404 | > simpleExprD :: Parser SimpleExpr
405 | > simpleExprD = do
406 | >     e <- term
407 | >     maybeAddSuffix e
408 | >   where
409 | >     maybeAddSuffix e =
410 | >         choice [addSuffix e
411 | >                ,return e]
412 | >     addSuffix e0 = do
413 | >         void $ lexeme $ char '+'
414 | >         e1 <- term
415 | >         maybeAddSuffix (Add e0 e1)
416 | >     term = numD <|> varD <|> parensD
417 | 
418 | 
419 | === Testing with the examples
420 | 
421 | TODO: write a little manual tester that accepts a parser and a list of
422 | examples, and checks they all parse correctly.
423 | 
424 | === Testing with quickcheck
425 | 
426 | Let's see if we can check with quickcheck. It's a bit tricky testing
427 | parsers in this way, but one way to do something useful is to generate
428 | random asts, convert them to concrete syntax, parse them, and check
429 | the result. We can write a simple 'pretty printer' to convert an ast
430 | to concrete syntax.
431 | 
432 | ==== a pretty printer
433 | 
434 | TODO: a really simple pretty printer just pasting strings together, no
435 | layout.
436 | 
437 | ==== the quick check code
438 | 
439 | TODO: write a quickcheck property and arbitary instance and show
440 | running it at the ghci prompt
441 | 
442 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013-2019, Jake Wheat
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of Jake Wheat nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/ParseFile.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | = Parsing a file executable
 3 | 
 4 | This is the example exe file which will parse a string from the file
 5 | given on the command line. You can substitute your own parser for the
 6 | parser given below.
 7 | 
 8 | 
 9 | > import System.Environment
10 | > import Text.Parsec
11 | > import Text.Parsec.String
12 | > import Control.Monad
13 | 
14 | > main :: IO ()
15 | > main = do
16 | >     a <- getArgs
17 | >     case a of
18 | >       [str] -> parseFromFile myParser str >>= either print print
19 | >       _ -> error "please pass one argument with the file containing the text to parse"
20 | 
21 | This is the parser which you can replace with your own code:
22 | 
23 | > myParser :: Parser ()
24 | > myParser = void $ string "correct"
25 | 
26 | Here is an example of running this program:
27 | 
28 | $ echo x > source_text && runhaskell ParseFile.lhs source_text
29 | "source_text" (line 1, column 1):
30 | unexpected "x"
31 | expecting "correct"
32 | $ echo correct > source_text && runhaskell ParseFile.lhs source_text
33 | ()
34 | 


--------------------------------------------------------------------------------
/ParseSQLQueries.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | = Parsing a file with SQL queries
 3 | 
 4 | This is an example exe which parses a list of SQL queries in a file
 5 | separated by semi colons.
 6 | 
 7 | > import System.Environment
 8 | > import Text.Parsec (ParseError, errorPos, sourceName, sourceLine, sourceColumn)
 9 | > import Text.Parsec.String
10 | > import Text.Parsec.String.Char
11 | > import Text.Parsec.String.Combinator
12 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>), many)
13 | > import Control.Monad
14 | > import SimpleSQLQueryParser0
15 | > import Data.List
16 | > import qualified PrettyPrinting0 as P
17 | 
18 | > main :: IO ()
19 | > main = do
20 | >     a <- getArgs
21 | >     case a of
22 | >       [str] -> parseFromFile myParser str
23 | >                >>= either (putStrLn . showError)
24 | >                           (putStrLn . intercalate "\n" . map P.prettyQueryExpr)
25 | >       _ -> error "please pass one argument with the file containing the queries to parse"
26 | 
27 | > myParser :: Parser [QueryExpr]
28 | > myParser = whitespace
29 | >            *> sepBy1 queryExpr semi
30 | >            <* optional semi
31 | >            <* eof
32 | >   where semi = void $ lexeme $ char ';'
33 | 
34 | > showError :: ParseError -> String
35 | > showError e =
36 | >     let p = errorPos e
37 | >     in sourceName p ++ ":" ++ show (sourceLine p) ++ ":"
38 | >        ++ show (sourceColumn p) ++ ":\n"
39 | >        ++ show e
40 | 


--------------------------------------------------------------------------------
/ParseString.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | = Parsing a string executable
 3 | 
 4 | This is the example exe file which will parse a string given on the
 5 | command line. You can substitute your own parser for the parser given
 6 | below.
 7 | 
 8 | 
 9 | > import System.Environment
10 | > import Text.Parsec
11 | > import Text.Parsec.String
12 | > import Control.Monad
13 | 
14 | > main :: IO ()
15 | > main = do
16 | >     a <- getArgs
17 | >     case a of
18 | >       [str] -> either print print $ parse myParser "" str
19 | >       _ -> error "please pass one argument with the string to parse"
20 | 
21 | This is the parser which you can replace with your own code:
22 | 
23 | > myParser :: Parser ()
24 | > myParser = void $ string "correct"
25 | 
26 | Here is an example of running this program:
27 | 
28 | ```
29 | $ runhaskell ParseString.lhs x
30 | (line 1, column 1):
31 | unexpected "x"
32 | expecting "correct"
33 | 
34 | $ runhaskell ParseString.lhs correct
35 | ()
36 | ```
37 | 


--------------------------------------------------------------------------------
/ParsingTpch.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | = Parsing TPC-H queries
 3 | 
 4 | In this file we are going to get the TPC-H queries parsing. This can
 5 | be an example of taking a parser, identifying some missing features,
 6 | and adding them. The TPC-H queries are slightly more complex than
 7 | trivial, and will also serve to exercise the human readable layout
 8 | feature of the pretty printer.
 9 | 
10 | Let's try the parser out on the TPC-H queries.
11 | 
12 | ```
13 | ```
14 | 
15 | Summary of errors so far:
16 | q1: typed literal: type_name 'literal value'
17 | q2: scalar subquery
18 | q3: typed literal
19 | q4: typed literal
20 | q5: typed literal
21 | q6: typed literal
22 | q7: ??
23 | q8: extract??
24 | q9: extract ??
25 | q10: typed literal
26 | q11: scalar sub query
27 | q12: ??
28 | q13: not like??
29 | q14: decimal literal
30 | q15: cte
31 | q16: count distinct
32 | q17: decimal literal
33 | q18: in subquery
34 | q19: in literal list
35 | q20: in subquery
36 | q21: exists subquery
37 | q22: substring
38 | 


--------------------------------------------------------------------------------
/PrettyPrinting0.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[pretty-printing]]
  3 | = Pretty printing
  4 | 
  5 | Here is a pretty printer for the parser version 0.
  6 | 
  7 | It uses Text.PrettyPrint module. I think this module is a bit out of
  8 | date these days, it would be nice to update this code to a more
  9 | fashionable pretty printing library. I haven't put much
 10 | comments/explanation here.
 11 | 
 12 | > module PrettyPrinting0 where
 13 | 
 14 | > import Prelude hiding ((<>))
 15 | > import SimpleSQLQueryParser0 (ValueExpr(..), QueryExpr(..), TableRef(..)
 16 | >                              ,JoinType(..), JoinCondition(..))
 17 | > import qualified SimpleSQLQueryParser0 as S
 18 | > import Text.PrettyPrint (render, vcat, text, (<>), (<+>), empty, parens,
 19 | >                          nest, Doc, punctuate, comma, sep, quotes,
 20 | >                          doubleQuotes, hsep)
 21 | > import Data.Maybe (maybeToList, catMaybes)
 22 | > import FunctionsAndTypesForParsing
 23 | > --import Text.Parsec.String.Combinator (eof)
 24 | > import qualified Test.HUnit as H
 25 | > import Text.Parsec (parse, ParseError)
 26 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>), many)
 27 | > --import Text.Parsec.String (Parser)
 28 | 
 29 | The basic concept in this pretty printer is we convert the ast into
 30 | `Doc` values, then convert these into a string. We have a bunch of
 31 | functions to convert `String` to `Doc`s, and combine these docs with
 32 | different layouts.
 33 | 
 34 | By using a pretty printer library, we can get human readable source
 35 | very easily compared with trying to convert directly to strings
 36 | ourselves.
 37 | 
 38 | == api
 39 | 
 40 | > prettyQueryExpr :: QueryExpr -> String
 41 | > prettyQueryExpr = render . queryExpr
 42 | 
 43 | == value expressions
 44 | 
 45 | > valueExpr :: ValueExpr -> Doc
 46 | > valueExpr (StringLit s) = quotes $ text s
 47 | > valueExpr (NumLit i) = text $ show i
 48 | > valueExpr (Iden s) = text s
 49 | > valueExpr (DIden q i) = text q <> text "." <> text i
 50 | > valueExpr Star = text "*"
 51 | > valueExpr (DStar q) = text q <> text "." <> text "*"
 52 | > valueExpr (App f es) = text f <> parens (commaSep $ map valueExpr es)
 53 | > valueExpr (PrefOp op e) = sep[text op, valueExpr e]
 54 | > valueExpr (BinOp e0 op e1) = sep [valueExpr e0, text op, valueExpr e1]
 55 | > valueExpr (Case test whens els) =
 56 | >     sep [text "case" <+> maybe empty valueExpr test
 57 | >         ,nest 5 $ sep (map wh whens
 58 | >                        ++ [maybe empty
 59 | >                            (\e -> text "else" <+> valueExpr e)
 60 | >                            els])
 61 | >         ,text "end"]
 62 | >   where wh (w,t) = sep [text "when" <+> valueExpr w
 63 | >                        ,text "then" <+> valueExpr t]
 64 | > valueExpr (Parens e) = parens $ valueExpr e
 65 | 
 66 | == query expressions
 67 | 
 68 | > queryExpr :: QueryExpr -> Doc
 69 | > queryExpr (Select sl fr wh gb hv ob) = sep
 70 | >     [text "select" <+> nest 7 (commaSep $ map selectItem sl)
 71 | >      -- from
 72 | >     ,ml fr $ \f -> text "from" <+> nest 7 (commaSep $ map tref f)
 73 | >      -- where
 74 | >     ,me wh $ \w -> text "where" <+> nest 6 (valueExpr w)
 75 | >      -- group by
 76 | >     ,ml gb $ \g -> text "group by" <+> nest 9 (commaSep $ map valueExpr g)
 77 | >      -- having
 78 | >     ,me hv $ \h -> text "having" <+> nest 6 (valueExpr h)
 79 | >      -- order by
 80 | >     ,ml ob $ \o -> text "order by" <+> nest 9 (commaSep $ map valueExpr o)
 81 | >     ]
 82 | >   where
 83 | >     selectItem (e,a) = valueExpr e <+> me a (\a' -> text "as" <+> text a')
 84 | >
 85 | >     tref (TRSimple t) = text t
 86 | >     tref (TRParens t) = parens $ tref t
 87 | >     tref (TRAlias t a) = tref t <+> text "as" <+> text a
 88 | >     tref (TRQueryExpr q) = parens $ queryExpr q
 89 | >     tref (TRJoin t0 jt t1 jc) = sep
 90 | >         [tref t0
 91 | >         ,joinName jt jc <+> tref t1
 92 | >         ,case jc of
 93 | >              Just (JoinOn e) -> text "on" <+> valueExpr e
 94 | >              Just (JoinUsing is) -> text "using" <+> parens (commaSep $ map text is)
 95 | >              Just JoinNatural -> empty
 96 | >              Nothing -> empty]
 97 | >     joinName jt jc =
 98 | >       hsep [case jc of
 99 | >                Just JoinNatural -> text "natural"
100 | >                _ -> empty
101 | >            ,case jt of
102 | >                JoinInner -> text "inner join"
103 | >                JoinCross -> text "cross join"
104 | >                JoinLeft -> text "left join"
105 | >                JoinRight -> text "right join"
106 | >                JoinFull -> text "full join"]
107 | >     me e r = maybe empty r e
108 | >     ml [] _ = empty
109 | >     ml l r = r l
110 | 
111 | == helpers
112 | 
113 | > commaSep :: [Doc] -> Doc
114 | > commaSep = sep . punctuate comma
115 | 
116 | Have a look at the haddock for this module and see if you can work out
117 | how the code above works.
118 | 
119 | ```
120 | *PrettyPrinting0> either (error . show) valueExpr (parseWithEof (S.valueExpr []) "a and b")
121 | a and b
122 | 
123 | *PrettyPrinting0> either (error . show) queryExpr (parseWithEof S.queryExpr "select a from t inner join u using(a,b)")
124 | select a from t inner join u using (a, b)
125 | ```
126 | 
127 | == tests
128 | 
129 | Now we can do some tests: we take the previous test data, and for each
130 | test add an additional test which pretty prints then parses the
131 | results to see that it is unchanged.
132 | 
133 | > makeTest :: (Eq a, Show a) =>
134 | >             (String -> Either ParseError a)
135 | >          -> (a -> String)
136 | >          -> (String,a)
137 | >          -> H.Test
138 | > makeTest parser pretty (src,expected) = H.TestLabel src $ H.TestCase $ do
139 | >     let gote = parser src
140 | >     case gote of
141 | >       Left e -> H.assertFailure $ show e
142 | >       Right got -> do
143 | >         H.assertEqual src expected got
144 | >         let prsql = pretty got
145 | >             gotpretty = parser prsql
146 | >         case gotpretty of
147 | >           Left e -> H.assertFailure $ "pretty: " ++ prsql ++ "\n" ++ show e
148 | >           Right gotp -> H.assertEqual ("pretty: " ++ prsql) expected gotp
149 | 
150 | TODO: fix parsing issue
151 | 
152 | ```
153 | *PrettyPrinting0> H.runTestTT $ H.TestList $ map (makeTest S.parseQueryExpr (render . queryExpr)) S.allQueryExprTests
154 | Cases: 26  Tried: 26  Errors: 0  Failures: 0
155 | Counts {cases = 26, tried = 26, errors = 0, failures = 0}
156 | *PrettyPrinting0> H.runTestTT $ H.TestList $ map (makeTest S.parseValueExpr (render . valueExpr)) S.allValueExprTests
157 |  ### Failure in: 34:case when a=1 then 2 when a=3 then 4 else 5 end
158 | (line 1, column 11):
159 | unexpected "a"
160 | expecting "--" or "/*"
161 | Cases: 35  Tried: 35  Errors: 0  Failures: 1
162 | Counts {cases = 35, tried = 35, errors = 0, failures = 1}
163 | ```
164 | 


--------------------------------------------------------------------------------
/QueryExpressions.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[query-expressions]]
  3 | = Query expressions
  4 | 
  5 | We can now start on the 'select' parser. In the SQL standard, it
  6 | refers to these things as 'query expressions' to distinguish then from
  7 | 'value expressions', so we will reuse this language here.
  8 | 
  9 | The subset of SQL we will support is this:
 10 | 
 11 | TODO: write lots of examples here
 12 | 
 13 | select queries only, no union, intersect or except. No common table
 14 | expressions.
 15 | 
 16 | we will support all the value expressions that the value expression
 17 | parser above supports
 18 | 
 19 | we will support select lists with optional aliases, with the 'as'
 20 | optional in the alias
 21 | 
 22 | we will support the * as in 'select * from t', and the variation
 23 | 'select t.* from t', but not the alias version select * as (a,b,c)
 24 | from t.
 25 | 
 26 | we support two part dotted identifiers in value expressions, but no
 27 | other sort (such as 3-part dotted value expression identifiers, or
 28 | schema qualified table names or function names).
 29 | 
 30 | for the from clause, we will only support optional 'from table_name'.
 31 | 
 32 | supports where
 33 | 
 34 | we will support regular group by lists, but not the new group by
 35 | options in SQL2003 (group by (), grouping sets, cube, rollup).
 36 | 
 37 | we will support having
 38 | 
 39 | we support order by, with multiple columns, but not explicit asc or
 40 | desc , and no 'nulls first' or 'nulls last' syntax.
 41 | 
 42 | 
 43 | No support for offset and fetch first, or variations.
 44 | 
 45 | > {-# LANGUAGE TupleSections #-}
 46 | > module QueryExpressions where
 47 | >
 48 | > --import Text.Groom (groom)
 49 | > --import qualified Text.Parsec as P
 50 | > import Text.Parsec.String (Parser)
 51 | > import Text.Parsec (try,optionMaybe, optional, sepBy1,option)
 52 | > import Control.Applicative ((<$>),(*>),(<*>))
 53 | > import Control.Monad (void,guard)
 54 | > --import Debug.Trace
 55 | > --import Data.List (intercalate)
 56 | > import Data.Maybe ()
 57 | > import qualified Test.HUnit as H
 58 | > import FunctionsAndTypesForParsing
 59 | 
 60 | > import ValueExpressions (ValueExpr(..), valueExpr, identifier, symbol, keyword, comma)
 61 | 
 62 | Here is the datatype for query expressions to get started with. In
 63 | this tutorial, we will only support an optional single table in the
 64 | from clause, and this will be expanded in the next tutorial.
 65 | 
 66 | TODO: rearrange the from/where/groupby/having/orderby to a separate
 67 | datatype which is optional in a select, and the from part is mandatory
 68 | in this new type. This follows the standard and is more accurate
 69 | syntax.
 70 | 
 71 | > data QueryExpr
 72 | >     = Select
 73 | >       {qeSelectList :: [(ValueExpr,Maybe String)]
 74 | >       ,qeFrom :: Maybe String
 75 | >       ,qeWhere :: Maybe ValueExpr
 76 | >       ,qeGroupBy :: [ValueExpr]
 77 | >       ,qeHaving :: Maybe ValueExpr
 78 | >       ,qeOrderBy :: [ValueExpr]
 79 | >       } deriving (Eq,Show)
 80 | 
 81 | Here is a default value which can be used to easily construct query
 82 | expression values.
 83 | 
 84 | > makeSelect :: QueryExpr
 85 | > makeSelect = Select {qeSelectList = []
 86 | >                     ,qeFrom = Nothing
 87 | >                     ,qeWhere = Nothing
 88 | >                     ,qeGroupBy = []
 89 | >                     ,qeHaving = Nothing
 90 | >                     ,qeOrderBy = []}
 91 | 
 92 | == select lists
 93 | 
 94 | Let's start with something simple:
 95 | 
 96 | ```
 97 | select [value expr]
 98 | ```
 99 | 
100 | TODO: shorten the names of these examples
101 | 
102 | > singleSelectItemTests :: [(String,QueryExpr)]
103 | > singleSelectItemTests =
104 | >     [("select 1", makeSelect {qeSelectList = [(NumLit 1,Nothing)]})]
105 | 
106 | Here are a couple of wrappers for `symbol` and `keyword` which wrap
107 | them with void.
108 | 
109 | > keyword_ :: String -> Parser ()
110 | > keyword_ = void . keyword
111 | 
112 | > symbol_ :: String -> Parser ()
113 | > symbol_ = void . symbol
114 | 
115 | > singleSelectItem :: Parser QueryExpr
116 | > singleSelectItem = do
117 | >     keyword_ "select"
118 | >     e <- valueExpr []
119 | >     return $ makeSelect {qeSelectList = [(e,Nothing)]}
120 | 
121 | You can use the old test runner to check these:
122 | 
123 | ```
124 | *QueryExpressions> H.runTestTT $ H.TestList $ map (makeTest singleSelectItem) parseSingleSelectItemTestData
125 | Cases: 1  Tried: 1  Errors: 0  Failures: 0
126 | Counts {cases = 1, tried = 1, errors = 0, failures = 0}
127 | ```
128 | 
129 | Let's rewrite it in the Applicative and mostly point free style.
130 | 
131 | > singleSelectItemApplicative :: Parser QueryExpr
132 | > singleSelectItemApplicative =
133 | >     (\sl -> makeSelect {qeSelectList = sl})
134 | >     <$> (keyword_ "select" *> (((:[]) . (,Nothing)) <$> valueExpr []))
135 | 
136 | That didn't go so well. Using a wrapper:
137 | 
138 | > singleSelectItemApplicative' :: Parser QueryExpr
139 | > singleSelectItemApplicative' =
140 | >     ms <$> (keyword_ "select" *> valueExpr [])
141 | >   where
142 | >     ms e = makeSelect {qeSelectList = [(e,Nothing)]}
143 | 
144 | Now let's write something that supports multiple value expressions,
145 | e.g.
146 | 
147 | ```
148 | select 1+2, 3+4;
149 | ```
150 | 
151 | > multipleSelectItemsTests :: [(String,QueryExpr)]
152 | > multipleSelectItemsTests =
153 | >     [("select a"
154 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]})
155 | >     ,("select a,b"
156 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
157 | >                                  ,(Iden "b",Nothing)]})
158 | >     ,("select 1+2,3+4"
159 | >      ,makeSelect {qeSelectList =
160 | >                      [(BinOp (NumLit 1) "+" (NumLit 2),Nothing)
161 | >                      ,(BinOp (NumLit 3) "+" (NumLit 4),Nothing)]})
162 | >     ]
163 | 
164 | > selectMultipleItems :: Parser QueryExpr
165 | > selectMultipleItems = do
166 | >     keyword_ "select"
167 | >     es <- commaSep1 (valueExpr [])
168 | >     return $ makeSelect {qeSelectList = map (,Nothing) es}
169 | 
170 | > commaSep1 :: Parser a -> Parser [a]
171 | > commaSep1 = (`sepBy1` comma)
172 | 
173 | === aliases
174 | 
175 | We can write names for the columns produced from a select list using
176 | the keyword `as`, and we can miss out the `as`:
177 | 
178 | ```sql
179 | select a as a1, b as b1, f(c) as c1;
180 | 
181 | -- no as
182 | select a a1, b b1;
183 | ```
184 | 
185 | > selectListTests :: [(String,QueryExpr)]
186 | > selectListTests =
187 | >     [("select a as a1, b as b1"
188 | >      ,makeSelect {qeSelectList = [(Iden "a", Just "a1")
189 | >                                  ,(Iden "b", Just "b1")]})
190 | >     ,("select a a1, b b1"
191 | >      ,makeSelect {qeSelectList = [(Iden "a", Just "a1")
192 | >                                  ,(Iden "b", Just "b1")]})
193 | >     ] ++ multipleSelectItemsTests
194 | >       ++ singleSelectItemTests
195 | 
196 | Finally, here is the select list parser and the helper for select
197 | items:
198 | 
199 | > selectItem0 :: Parser (ValueExpr, Maybe String)
200 | > selectItem0 = (,) <$> valueExpr [] <*> optionMaybe (try alias)
201 | >   where alias = optional (keyword_ "as") *> identifier
202 | 
203 | > selectList0 :: Parser [(ValueExpr, Maybe String)]
204 | > selectList0 = keyword_ "select" *> commaSep1 selectItem0
205 | 
206 | > queryExpr0 :: Parser QueryExpr
207 | > queryExpr0 = mkSelect <$> selectList0
208 | >   where mkSelect sl = makeSelect {qeSelectList = sl}
209 | 
210 | == from clause
211 | 
212 | > from :: Parser String
213 | > from = keyword_ "from" *> identifier
214 | 
215 | > fromTests :: [(String,QueryExpr)]
216 | > fromTests =
217 | >     [("select a from t"
218 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]
219 | >                  ,qeFrom = (Just "t")})]
220 | 
221 | > queryExpr1 :: Parser QueryExpr
222 | > queryExpr1 = mkSelect
223 | >              <$> selectList0
224 | >              <*> optionMaybe from
225 | >   where mkSelect sl fr = makeSelect {qeSelectList = sl
226 | >                                     ,qeFrom = fr}
227 | 
228 | ```
229 | *QueryExpressions> H.runTestTT $ H.TestList $ map (makeTest queryExpr1) (selectListTests ++ fromTests)
230 |  ### Failure in: 6:select a from t
231 | (line 1, column 16):
232 | unexpected end of input
233 | expecting digit, letter, "_", "--" or "/*"
234 | Cases: 7  Tried: 7  Errors: 0  Failures: 1
235 | Counts {cases = 7, tried = 7, errors = 0, failures = 1}
236 | ```
237 | 
238 | This is a keyword issue again. We are parsing the `from` as if it was
239 | a column alias and then getting stuck.
240 | 
241 | > blackListIdentifier :: [String] -> Parser String
242 | > blackListIdentifier bl = do
243 | >     i <- identifier
244 | >     guard (i `notElem` bl)
245 | >     return i
246 | 
247 | > selectItem :: Parser (ValueExpr, Maybe String)
248 | > selectItem = (,) <$> valueExpr [] <*> optionMaybe (try alias)
249 | >   where alias = optional (keyword_ "as") *> blackListIdentifier ["from"]
250 | 
251 | > selectList :: Parser [(ValueExpr, Maybe String)]
252 | > selectList = keyword_ "select" *> commaSep1 selectItem
253 | 
254 | > queryExpr2 :: Parser QueryExpr
255 | > queryExpr2 = mkSelect
256 | >              <$> selectList
257 | >              <*> optionMaybe from
258 | >   where mkSelect sl fr = makeSelect {qeSelectList = sl
259 | >                                     ,qeFrom = fr}
260 | 
261 | That did the job for now.
262 | 
263 | == where
264 | 
265 | The where, group by, having, and order by parsers are simple.
266 | 
267 | > whereTests :: [(String,QueryExpr)]
268 | > whereTests =
269 | >     [("select a from t where a = 5"
270 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]
271 | >                  ,qeFrom = Just "t"
272 | >                  ,qeWhere = Just $ BinOp (Iden "a") "=" (NumLit 5)})
273 | >     ]
274 | 
275 | > whereClause :: Parser ValueExpr
276 | > whereClause = keyword_ "where" *> valueExpr []
277 | 
278 | > queryExpr3 :: Parser QueryExpr
279 | > queryExpr3 = mkSelect
280 | >              <$> selectList
281 | >              <*> optionMaybe from
282 | >              <*> optionMaybe whereClause
283 | >   where mkSelect sl fr wh =
284 | >             makeSelect {qeSelectList = sl
285 | >                        ,qeFrom = fr
286 | >                        ,qeWhere = wh}
287 | 
288 | == group by
289 | 
290 | > groupByTests :: [(String,QueryExpr)]
291 | > groupByTests =
292 | >     [("select a,sum(b) from t group by a"
293 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
294 | >                                  ,(App "sum" [Iden "b"],Nothing)]
295 | >                  ,qeFrom = Just "t"
296 | >                  ,qeGroupBy = [Iden "a"]
297 | >                  })
298 | >     ,("select a,b,sum(c) from t group by a,b"
299 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
300 | >                                  ,(Iden "b",Nothing)
301 | >                                  ,(App "sum" [Iden "c"],Nothing)]
302 | >                  ,qeFrom = Just "t"
303 | >                  ,qeGroupBy = [Iden "a",Iden "b"]
304 | >                  })
305 | >     ]
306 | 
307 | > groupByClause :: Parser [ValueExpr]
308 | > groupByClause = keyword_ "group" *> keyword_ "by"
309 | >                 *> commaSep1 (valueExpr [])
310 | 
311 | > queryExpr4 :: Parser QueryExpr
312 | > queryExpr4 = mkSelect
313 | >              <$> selectList
314 | >              <*> optionMaybe from
315 | >              <*> optionMaybe whereClause
316 | >              <*> option [] groupByClause
317 | >   where mkSelect sl fr wh gr =
318 | >             makeSelect {qeSelectList = sl
319 | >                        ,qeFrom = fr
320 | >                        ,qeWhere = wh
321 | >                        ,qeGroupBy = gr}
322 | 
323 | == having
324 | 
325 | > havingTests :: [(String,QueryExpr)]
326 | > havingTests =
327 | >   [("select a,sum(b) from t group by a having sum(b) > 5"
328 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
329 | >                                  ,(App "sum" [Iden "b"],Nothing)]
330 | >                  ,qeFrom = (Just "t")
331 | >                  ,qeGroupBy = [Iden "a"]
332 | >                  ,qeHaving = Just $ BinOp (App "sum" [Iden "b"]) ">" (NumLit 5)
333 | >                  })
334 | >   ]
335 | 
336 | > having :: Parser ValueExpr
337 | > having = keyword_ "having" *> (valueExpr [])
338 | 
339 | > queryExpr5 :: Parser QueryExpr
340 | > queryExpr5 = mkSelect
341 | >              <$> selectList
342 | >              <*> optionMaybe from
343 | >              <*> optionMaybe whereClause
344 | >              <*> option [] groupByClause
345 | >              <*> optionMaybe having
346 | >   where mkSelect sl fr wh gr hv =
347 | >             makeSelect {qeSelectList = sl
348 | >                        ,qeFrom = fr
349 | >                        ,qeWhere = wh
350 | >                        ,qeGroupBy = gr
351 | >                        ,qeHaving = hv}
352 | 
353 | Looking nice so far. Did you run the tests for each stage?
354 | 
355 | 
356 | == order by
357 | 
358 | > orderByTests :: [(String,QueryExpr)]
359 | > orderByTests =
360 | >     [("select a from t order by a"
361 | >      ,ms [Iden "a"])
362 | >     ,("select a from t order by a, b"
363 | >      ,ms [Iden "a", Iden "b"])
364 | >     ]
365 | >   where
366 | >     ms o = makeSelect {qeSelectList = [(Iden "a",Nothing)]
367 | >                       ,qeFrom = (Just "t")
368 | >                       ,qeOrderBy = o}
369 | 
370 | > orderBy :: Parser [ValueExpr]
371 | > orderBy = keyword_ "order" *> keyword_ "by"
372 | >           *> commaSep1 (valueExpr [])
373 | 
374 | > queryExpr6 :: Parser QueryExpr
375 | > queryExpr6 = Select
376 | >              <$> selectList
377 | >              <*> optionMaybe from
378 | >              <*> optionMaybe whereClause
379 | >              <*> option [] groupByClause
380 | >              <*> optionMaybe having
381 | >              <*> option [] orderBy
382 | 
383 | ```
384 | *QueryExpressions> H.runTestTT $ H.TestList $ map (makeTest queryExpr6) (selectListTests ++ fromTests ++ whereTests ++ groupByTests ++ havingTests ++ orderByTests)
385 | Cases: 13  Tried: 13  Errors: 0  Failures: 0
386 | Counts {cases = 13, tried = 13, errors = 0, failures = 0}
387 | ```
388 | 
389 | TODO: talk about putting the maybes/default values inside from, where, etc??
390 | 


--------------------------------------------------------------------------------
/README.asciidoc:
--------------------------------------------------------------------------------
  1 | = Intro to Parsing with Parsec in Haskell
  2 | 
  3 | == Overview
  4 | 
  5 | WIP, a tutorial which demonstrates the basics of Parsec and goes on to
  6 | build a SQL query parser.
  7 | 
  8 | You can view this tutorial as HTML online here:
  9 | 
 10 | http://jakewheat.github.io/intro_to_parsing/
 11 | 
 12 | and you can view the files directly in the github repository here:
 13 | 
 14 | https://github.com/JakeWheat/intro_to_parsing
 15 | 
 16 | === Summary of sections
 17 | 
 18 | // the first link for each section will work in the readme on github,
 19 | // the second link is for the rendered html and doesn't work here
 20 | 
 21 | link:GettingStarted.lhs[]
 22 | 
 23 | // <<getting-started, Getting Started>>
 24 | 
 25 | Introduction to parsing with Parsec, including a review of
 26 | Text.Parsec.Char functions.
 27 | 
 28 | link:VerySimpleExpressions.lhs[]
 29 | 
 30 | // <<very-simple-expression-parsing, Very simple expression parsing>>
 31 | 
 32 | Creating a very simple expression language parser, and introducing
 33 | some functions from Text.Parsec.Combinator.
 34 | 
 35 | link:ApplicativeStyle.lhs[]
 36 | 
 37 | // <<applicative-style-parsing-code, Applicative style parsing code>>
 38 | 
 39 | Rewriting the simple expression parser code in a more succinct style.
 40 | 
 41 | link:CombinatorReview.lhs[]
 42 | 
 43 | // <<combinator-review,Combinator review>>
 44 | 
 45 | Review and examples of all functions from Text.Parsec.Combinator, and
 46 | some from Control.Applicative and Control.Monad.
 47 | 
 48 | link:FunctionsAndTypesForParsing.lhs[]
 49 | 
 50 | // <<functions-and-types-for-parsing,Functions and types for parsing>>
 51 | 
 52 | The utility functions used in the previous tutorials, plus some notes
 53 | on types in Parsec.
 54 | 
 55 | link:TextParsecExpr.lhs[]
 56 | 
 57 | // <<parsing-expressions-with-fixity, Parsing expressions with fixity>>
 58 | 
 59 | This covers using the Text.Parsec.Expr for expression parsing with
 60 | prefix, postfix and infix operators with fixity.
 61 | 
 62 | link:AnIssueWithTokenParsers.lhs[]
 63 | 
 64 | // <<an-issue-with-token-parsers, An issue with token parsers>>
 65 | 
 66 | Looks at an issue we have with the way the symbol parser in the
 67 | Text.Parsec.Expr tutorial was used, and some possible fixes.
 68 | 
 69 | link:TextParsecPerm.lhs[]
 70 | 
 71 | // <<permutation-parsing, Permutation parsing>>
 72 | 
 73 | This covers the Text.Parsec.Perm module which is used for parsing
 74 | different things in flexible order.
 75 | 
 76 | link:TextParsecToken.lhs[]
 77 | 
 78 | // <<token-parsing, Token parsing>>
 79 | 
 80 | This covers Text.Parsec.Token which can be used to create token
 81 | parsers easily.
 82 | 
 83 | link:ValueExpressions.lhs[]
 84 | 
 85 | // <<value-expressions, Value expressions>>
 86 | 
 87 | This covers building a parser a subset of value expressions from SQL,
 88 | which are an extension of the simple expression types and parsers
 89 | covered in previous tutorials.
 90 | 
 91 | link:QueryExpressions.lhs[]
 92 | 
 93 | // <<query-expressions, Query expressions>>
 94 | 
 95 | This covers building a parser to parse query expressions with select
 96 | lists, simple from, where, group by, having and order by.
 97 | 
 98 | link:FromClause.lhs[]
 99 | 
100 | // <<from-clause, From clause>>
101 | 
102 | This extend the parser for query expressions to support a from clause
103 | with much more features including joins.
104 | 
105 | link:SimpleSQLQueryParser0.lhs[]
106 | 
107 | // <<simple-sql-query-parser, Simple SQL query parser>>
108 | 
109 | Here is the code from ValueExpressions, QueryExpressions and
110 | FromClause plus tests put together and rearranged as a coherent
111 | standalone module.
112 | 
113 | link:PrettyPrinting0.lhs[]
114 | 
115 | // <<pretty-printing, Pretty printing>>
116 | 
117 | This quick module covers a simple pretty printer for our SQL ast.
118 | 
119 | link:ErrorMessages.lhs[]
120 | 
121 | // <<error-messages, Error messages>>
122 | 
123 | In this document, we will explore error messages with parsec and how
124 | restructuring parser code can lead to better or worse error messages.
125 | 
126 | === Going further
127 | 
128 | If you are interested in SQL parsing, check out the project to build
129 | a complete SQL parser here:
130 | http://jakewheat.github.io/simple-sql-parser/latest. The parsing code in the
131 | simple-sql-parser project is based on this tutorial code.
132 | 
133 | ////
134 | 
135 | Later documents
136 | 
137 | Additional provisional documents not yet started:
138 | 
139 | Parsing TPC-H queries
140 | 
141 | We will use the tpch queries as examples to help improve the pretty
142 | printer. First there are a few extra bits of syntax to be able to
143 | parse these queries
144 | 
145 | Pretty printing part 2
146 | 
147 | some tweaks to the pretty printer to improve the layout for the tpch
148 | queries
149 | 
150 | Writing tests
151 | 
152 | Here we will take the ad hoc tests and build an organised test suite
153 | with a wrapper for hunit, wrapper for test.framework wrapper and maybe
154 | tasty
155 | 
156 | Refactored project + cabal package
157 | 
158 | In this tutorial, we will take the sql parser, pretty printer and
159 | tests, and create a complete cabal package.
160 | 
161 | TODO: talk about robustness and the casual way the parser has been put
162 | together and the casual way issues have been tackled.
163 | 
164 | Writing a command line sql interface
165 | 
166 | quick experiment to try to implement the front end for a multiline sql
167 | command line using fake incremental parsing which parsec doesn't
168 | support directly.
169 | 
170 | Position annotation
171 | 
172 | In this tutorial, we will add position annotation to the parsing, so
173 | that a later stage could, e.g., provide type error messages with the
174 | correct line and column numbers.
175 | 
176 | Dialects
177 | 
178 | In this tutorial, we will discuss how we can support other SQL dialects
179 | 
180 | Separate lexer
181 | 
182 | In this tutorial, we will look at creating a proper separate lexer to
183 | see how it is done, and remark on what the tradeoffs seem to be.
184 | 
185 | Fixing fixity
186 | 
187 | Parsing full SQL expressions is a mess, and trying to do the fixity at
188 | parse time has many downsides. Here is another approach, to ignore
189 | fixity at parse time and fix it in a pass on the ast after parsing.
190 | 
191 | Quasiquotes
192 | 
193 | In this tutorial, we will create quasiquoters for sql query
194 | expressions and value expressions, and see how powerful this can be
195 | 
196 | 
197 | Something about syntax highlighting, generating documentation + links?
198 | 
199 | ////
200 | 
201 | === Extras
202 | 
203 | * link:ParseString.lhs[]
204 | 
205 | an executable which contains the boilerplate to run a parsec parser on
206 | a string passed as an argument
207 | 
208 | * link:ParseFile.lhs[]
209 | 
210 | an executable which contains the boilerplate to run a parsec parser on
211 | a file passed as an argument
212 | 
213 | Contact: +++jakewheatmail@gmail.com+++
214 | 
215 | License: BSD3
216 | 


--------------------------------------------------------------------------------
/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 | 


--------------------------------------------------------------------------------
/SimpleSQLQueryParser0.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[simple-sql-query-parser]]
  3 | = Simple SQL query parser
  4 | 
  5 | Here is the complete syntax, parser and tests for the value and query
  6 | expressions so far as a self contained module
  7 | 
  8 | > module SimpleSQLQueryParser0 where
  9 | >
 10 | > import Text.Parsec.String (Parser)
 11 | > import Text.Parsec.String.Parsec (try)
 12 | > import Text.Parsec.String.Char
 13 | > import Text.Parsec.String.Combinator
 14 | > import Text.Parsec (parse,ParseError)
 15 | > import Control.Applicative ((<$>),(<*), (*>),(<*>), (<$), (<|>), many)
 16 | > import qualified Text.Parsec.String.Expr as E
 17 | > import Control.Monad
 18 | > --import Data.List (intercalate)
 19 | > import Data.Maybe ()
 20 | > import qualified Test.HUnit as H
 21 | > import FunctionsAndTypesForParsing
 22 | > import Debug.Trace
 23 | 
 24 | == Supported SQL
 25 | 
 26 | === comments
 27 | 
 28 | ```sql
 29 | -- single line comment
 30 | /*
 31 | multiline
 32 | comment
 33 | */
 34 | ```
 35 | 
 36 | The `/* */` comments do not nest.
 37 | 
 38 | === value expressions
 39 | 
 40 | ==== literals
 41 | 
 42 | postive integral literals and string literals with single quote,
 43 | without escaping of single quote within string (so there is no way to
 44 | create a string literal with a single quote in it).
 45 | 
 46 | ```sql
 47 | 1
 48 | 500
 49 | 'string literal'
 50 | ```
 51 | ==== identifiers
 52 | 
 53 | Unquoted identifiers only, an identifier may start with a letter or
 54 | underscore, and contain letters, underscores and digits.
 55 | 
 56 | ```sql
 57 | a
 58 | something
 59 | _test_
 60 | a123
 61 | ```
 62 | 
 63 | ==== dotted identifiers
 64 | 
 65 | Supports two part dotted identifiers only. Both parts must parse
 66 | according to the rules for regular identifiers.
 67 | 
 68 | ```sql
 69 | t.a
 70 | something.something_else
 71 | ```
 72 | 
 73 | ==== star
 74 | 
 75 | Star, plus dotted star using the identifier rules for the first part.
 76 | 
 77 | ```sql
 78 | *
 79 | t.*
 80 | ```
 81 | 
 82 | ==== function application
 83 | 
 84 | The function name must parse as a valid identifier.
 85 | 
 86 | ```sql
 87 | f()
 88 | g(1)
 89 | h(2,'something')
 90 | ```
 91 | 
 92 | ==== operators
 93 | 
 94 | Here is the range of operators supported.
 95 | 
 96 | ```sql
 97 | a = b
 98 | a > b
 99 | a < b
100 | a >= b
101 | a <= b
102 | a != b
103 | a <> b
104 | a and b
105 | a or b
106 | 1 + 2
107 | 1 - 2
108 | 1 * 2
109 | 1 / 2
110 | 'some' || 'thing'
111 | a like b
112 | not a
113 | ```
114 | 
115 | ==== case expression
116 | 
117 | ```sql
118 | case a
119 | when 3 then 'got three'
120 | when 5 then 'got five'
121 | else 'neither'
122 | end
123 | ```
124 | 
125 | ```sql
126 | case
127 | when a = 3 then 'a is three'
128 | when b = 4 then 'b is four'
129 | else 'neither'
130 | end
131 | ```
132 | 
133 | The else branch is optional in both cases.
134 | 
135 | ==== parentheses
136 | 
137 | ```sql
138 | (1 + 2) * 3
139 | ```
140 | 
141 | Parentheses are explicit in the abstract syntax.
142 | 
143 | === query expressions
144 | 
145 | TODO: examples
146 | 
147 | select queries only, no union, intersect or except. No common table
148 | expressions.
149 | 
150 | select list aliases, with the 'as' optional in the alias
151 | 
152 | 'select * from t'
153 | 'select t.* from t'
154 | but not the alias version 'select * as (a,b,c) from t'.
155 | 
156 | from clause
157 | 
158 | implicit and explicit joins, including keywords
159 | natural, inner, outer, left, right, full, cross, on and using, plus
160 | parens and simple aliases (e.g. select a from t u, but not select a
161 | from t(a,b)).
162 | 
163 | where
164 | 
165 | group by lists
166 | but not the new group by options in SQL2003 (group by (), grouping
167 | sets, cube, rollup).
168 | 
169 | having
170 | 
171 | order by, with multiple columns, but not explicit asc or
172 | desc , and no 'nulls first' or 'nulls last' syntax.
173 | 
174 | No support for offset and fetch first, or variations.
175 | 
176 | 
177 | == Abstract syntax
178 | 
179 | > data ValueExpr = StringLit String
180 | >                | NumLit Integer
181 | >                | Iden String
182 | >                | DIden String String -- a.b
183 | >                | Star
184 | >                | DStar String -- t.*
185 | >                | App String [ValueExpr]
186 | >                | PrefOp String ValueExpr
187 | >                | BinOp ValueExpr String ValueExpr
188 | >                | Case (Maybe ValueExpr) -- test value
189 | >                       [(ValueExpr,ValueExpr)] -- when branches
190 | >                       (Maybe ValueExpr) -- else value
191 | >                | Parens ValueExpr
192 | >                  deriving (Eq,Show)
193 | 
194 | > data QueryExpr
195 | >     = Select
196 | >       {qeSelectList :: [(ValueExpr,Maybe String)]
197 | >       ,qeFrom :: [TableRef]
198 | >       ,qeWhere :: Maybe ValueExpr
199 | >       ,qeGroupBy :: [ValueExpr]
200 | >       ,qeHaving :: Maybe ValueExpr
201 | >       ,qeOrderBy :: [ValueExpr]
202 | >       } deriving (Eq,Show)
203 | 
204 | > makeSelect :: QueryExpr
205 | > makeSelect = Select {qeSelectList = []
206 | >                     ,qeFrom = []
207 | >                     ,qeWhere = Nothing
208 | >                     ,qeGroupBy = []
209 | >                     ,qeHaving = Nothing
210 | >                     ,qeOrderBy = []}
211 | 
212 | > data TableRef = TRSimple String
213 | >               | TRJoin TableRef JoinType TableRef (Maybe JoinCondition)
214 | >               | TRParens TableRef
215 | >               | TRAlias TableRef String
216 | >               | TRQueryExpr QueryExpr
217 | >                 deriving (Eq,Show)
218 | 
219 | > data JoinType = JoinInner | JoinLeft | JoinRight | JoinFull | JoinCross
220 | >                 deriving (Eq,Show)
221 | 
222 | > data JoinCondition = JoinOn ValueExpr
223 | >                    | JoinUsing [String]
224 | >                    | JoinNatural
225 | >                    deriving (Eq,Show)
226 | 
227 | == Value expression parsing
228 | 
229 | === term components
230 | 
231 | > num :: Parser ValueExpr
232 | > num = NumLit <$> integer
233 | 
234 | > iden :: [String] -> Parser ValueExpr
235 | > iden blacklist = Iden <$> identifierBlacklist blacklist
236 | 
237 | > parensValue :: Parser ValueExpr
238 | > parensValue = Parens <$> parens (valueExpr [])
239 | 
240 | > stringLit :: Parser ValueExpr
241 | > stringLit = StringLit <$> stringToken
242 | 
243 | > dIden :: Parser ValueExpr
244 | > dIden = DIden <$> identifier <*> (dot *> identifier)
245 | 
246 | > star :: Parser ValueExpr
247 | > star = Star <$ symbol "*"
248 | 
249 | > dstar :: Parser ValueExpr
250 | > dstar = DStar <$> (identifier <* dot <* symbol "*")
251 | 
252 | > app :: Parser ValueExpr
253 | > app = App <$> identifier <*> parens (commaSep $ valueExpr [])
254 | 
255 | === case
256 | 
257 | > caseValue :: Parser ValueExpr
258 | > caseValue =
259 | >     Case
260 | >     <$> (keyword "case" *> optionMaybe caseVal)
261 | >     <*> many1 whenClause
262 | >     <*> optionMaybe elseClause
263 | >     <* keyword "end"
264 | >  where
265 | >    whenClause = (,) <$> (keyword "when" *> caseVal)
266 | >                     <*> (keyword "then" *> caseVal)
267 | >    elseClause = keyword "else" *> caseVal
268 | >    caseVal = valueExpr blackList
269 | >    blackList = ["case", "when", "then", "else", "end"]
270 | 
271 | === term
272 | 
273 | > term :: [String] -> Parser ValueExpr
274 | > term blackList = choice [caseValue
275 | >                         ,try app
276 | >                         ,try dstar
277 | >                         ,try dIden
278 | >                         ,iden blackList
279 | >                         ,num
280 | >                         ,parensValue
281 | >                         ,stringLit
282 | >                         ,star]
283 | 
284 | === operators
285 | 
286 | > table :: [[E.Operator ValueExpr]]
287 | > table = [[prefix "-", prefix "+"]
288 | >          ,[binary "^" E.AssocLeft]
289 | >          ,[binary "*" E.AssocLeft
290 | >           ,binary "/" E.AssocLeft
291 | >           ,binary "%" E.AssocLeft]
292 | >          ,[binary "+" E.AssocLeft
293 | >           ,binary "-" E.AssocLeft]
294 | >          ,[binary "<=" E.AssocRight
295 | >           ,binary ">=" E.AssocRight
296 | >           ,binaryK "like" E.AssocNone
297 | >           ,binary "!=" E.AssocRight
298 | >           ,binary "<>" E.AssocRight
299 | >           ,binary "||" E.AssocRight]
300 | >          ,[binary "<" E.AssocNone
301 | >           ,binary ">" E.AssocNone]
302 | >          ,[binary "=" E.AssocRight]
303 | >          ,[prefixK "not"]
304 | >          ,[binaryK "and" E.AssocLeft]
305 | >          ,[binaryK "or" E.AssocLeft]]
306 | >   where
307 | >     binary name assoc =
308 | >         E.Infix (mkBinOp name <$ symbol name) assoc
309 | >     mkBinOp nm a b = BinOp a nm b
310 | >     prefix name = E.Prefix (PrefOp name <$ symbol name)
311 | >     binaryK name assoc =
312 | >         E.Infix (mkBinOp name <$ keyword name) assoc
313 | >     prefixK name = E.Prefix (PrefOp name <$ keyword name)
314 | 
315 | === valueExpr
316 | 
317 | > valueExpr :: [String] -> Parser ValueExpr
318 | > valueExpr blackList = E.buildExpressionParser table (term blackList)
319 | 
320 | == Query expression parsing
321 | 
322 | > selectList :: Parser [(ValueExpr, Maybe String)]
323 | > selectList = keyword_ "select" *> commaSep1 selectItem
324 | 
325 | > selectItem :: Parser (ValueExpr, Maybe String)
326 | > selectItem = (,) <$> valueExpr [] <*> optionMaybe (try alias)
327 | >   where alias = optional (keyword_ "as") *> identifierBlacklist ["from"]
328 | 
329 | > whereClause :: Parser ValueExpr
330 | > whereClause = keyword_ "where" *> valueExpr []
331 | 
332 | > groupByClause :: Parser [ValueExpr]
333 | > groupByClause = keyword_ "group" *> keyword_ "by"
334 | >                 *> commaSep1 (valueExpr [])
335 | 
336 | > having :: Parser ValueExpr
337 | > having = keyword_ "having" *> (valueExpr [])
338 | 
339 | > orderBy :: Parser [ValueExpr]
340 | > orderBy = keyword_ "order" *> keyword_ "by"
341 | >           *> commaSep1 (valueExpr [])
342 | 
343 | === from clause
344 | 
345 | > from :: Parser [TableRef]
346 | > from = keyword_ "from" >> commaSep1 tref
347 | >   where
348 | >     tref = nonJoinTref >>= suffixWrapper joinTrefSuffix
349 | >     joinTrefSuffix t0 = (do
350 | >          nat <- option False (True <$ keyword_ "natural")
351 | >          TRJoin t0 <$> joinType
352 | >                    <*> nonJoinTref
353 | >                    <*> optionMaybe (joinCondition nat))
354 | >         >>= suffixWrapper joinTrefSuffix
355 | >     nonJoinTref = choice [TRSimple <$> identifier
356 | >                          ,try (TRQueryExpr <$> parens queryExpr)
357 | >                          ,TRParens <$> parens tref]
358 | >                   >>= suffixWrapper (try . alias)
359 | >     alias tr = try (TRAlias tr <$> (optional (keyword_ "as") *> aliasIdentifier))
360 | >     aliasIdentifier = identifierBlacklist
361 | >                       [-- join keywords
362 | >                        "natural"
363 | >                       ,"inner"
364 | >                       ,"outer"
365 | >                       ,"cross"
366 | >                       ,"left"
367 | >                       ,"right"
368 | >                       ,"full"
369 | >                       ,"join"
370 | >                       ,"on"
371 | >                       ,"using"
372 | >                        -- subsequent clause keywords
373 | >                       ,"where"
374 | >                       ,"group"
375 | >                       ,"having"
376 | >                       ,"order"
377 | >                       ]
378 | 
379 | > joinType :: Parser JoinType
380 | > joinType = choice
381 | >     [JoinCross <$ keyword_ "cross" <* keyword_ "join"
382 | >     ,JoinInner <$ keyword_ "inner" <* keyword_ "join"
383 | >     ,JoinLeft <$ keyword_ "left"
384 | >            <* optional (keyword_ "outer")
385 | >            <* keyword_ "join"
386 | >     ,JoinRight <$ keyword_ "right"
387 | >             <* optional (keyword_ "outer")
388 | >             <* keyword_ "join"
389 | >     ,JoinFull <$ keyword_ "full"
390 | >            <* optional (keyword_ "outer")
391 | >            <* keyword_ "join"
392 | >     ,JoinInner <$ keyword_ "join"]
393 | 
394 | > joinCondition :: Bool -> Parser JoinCondition
395 | > joinCondition nat =
396 | >     choice [guard nat >> return JoinNatural
397 | >            ,keyword_ "on" >> JoinOn <$> valueExpr []
398 | >            ,keyword_ "using" >> JoinUsing <$> parens (commaSep1 identifier)
399 | >            ]
400 | 
401 | === queryExpr
402 | 
403 | > queryExpr :: Parser QueryExpr
404 | > queryExpr = Select
405 | >             <$> selectList
406 | >             <*> option [] from
407 | >             <*> optionMaybe whereClause
408 | >             <*> option [] groupByClause
409 | >             <*> optionMaybe having
410 | >             <*> option [] orderBy
411 | 
412 | == tokens
413 | 
414 | > whitespace :: Parser ()
415 | > whitespace =
416 | >     choice [simpleWhitespace *> whitespace
417 | >            ,lineComment *> whitespace
418 | >            ,blockComment *> whitespace
419 | >            ,return ()]
420 | >   where
421 | >     lineComment = try (string "--")
422 | >                   *> manyTill anyChar (void (char '\n') <|> eof)
423 | >     blockComment = try (string "/*")
424 | >                    *> manyTill anyChar (try $ string "*/")
425 | >     simpleWhitespace = void $ many1 (oneOf " \t\n")
426 | 
427 | > lexeme :: Parser a -> Parser a
428 | > lexeme p = p <* whitespace
429 | 
430 | > integer :: Parser Integer
431 | > integer = read <$> lexeme (many1 digit)
432 | 
433 | > identifier :: Parser String
434 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
435 | >   where
436 | >     firstChar = letter <|> char '_'
437 | >     nonFirstChar = digit <|> firstChar
438 | 
439 | > symbol :: String -> Parser String
440 | > symbol s = try $ lexeme $ do
441 | >     u <- many1 (oneOf "<>=+-^%/*!|")
442 | >     guard (s == u)
443 | >     return s
444 | 
445 | > openParen :: Parser Char
446 | > openParen = lexeme $ char '('
447 | 
448 | > closeParen :: Parser Char
449 | > closeParen = lexeme $ char ')'
450 | 
451 | > stringToken :: Parser String
452 | > stringToken = lexeme (char '\'' *> manyTill anyChar (char '\''))
453 | 
454 | > dot :: Parser Char
455 | > dot = lexeme $ char '.'
456 | 
457 | > comma :: Parser Char
458 | > comma = lexeme $ char ','
459 | 
460 | == helper functions
461 | 
462 | > keyword :: String -> Parser String
463 | > keyword k = try $ do
464 | >     i <- identifier
465 | >     guard (i == k)
466 | >     return k
467 | 
468 | > parens :: Parser a -> Parser a
469 | > parens = between openParen closeParen
470 | 
471 | > commaSep :: Parser a -> Parser [a]
472 | > commaSep = (`sepBy` comma)
473 | 
474 | > keyword_ :: String -> Parser ()
475 | > keyword_ = void . keyword
476 | 
477 | > symbol_ :: String -> Parser ()
478 | > symbol_ = void . symbol
479 | 
480 | > commaSep1 :: Parser a -> Parser [a]
481 | > commaSep1 = (`sepBy1` comma)
482 | 
483 | > identifierBlacklist :: [String] -> Parser String
484 | > identifierBlacklist bl = do
485 | >     i <- identifier
486 | >     guard (i `notElem` bl)
487 | >     return i
488 | 
489 | > suffixWrapper :: (a -> Parser a) -> a -> Parser a
490 | > suffixWrapper p a = p a <|> return a
491 | 
492 | == the parser api
493 | 
494 | > parseQueryExpr :: String -> Either ParseError QueryExpr
495 | > parseQueryExpr = parse (whitespace *> queryExpr <* eof) ""
496 | 
497 | > parseValueExpr :: String -> Either ParseError ValueExpr
498 | > parseValueExpr = parse (whitespace *> valueExpr [] <* eof) ""
499 | 
500 | 
501 | == tests
502 | 
503 | > data TestItem = Group String [TestItem]
504 | >               | ValueExpressionTest String ValueExpr
505 | >               | QueryExpressionTest String QueryExpr
506 | 
507 | > numLitTests :: [(String,ValueExpr)]
508 | > numLitTests =
509 | >     [("1", NumLit 1)
510 | >     ,("54321", NumLit 54321)]
511 | >
512 | > idenTests :: [(String,ValueExpr)]
513 | > idenTests =
514 | >     [("test", Iden "test")
515 | >     ,("_something3", Iden "_something3")]
516 | >
517 | > operatorTests :: [(String,ValueExpr)]
518 | > operatorTests =
519 | >    map (\o -> (o ++ " a", PrefOp o (Iden "a"))) ["not", "+", "-"]
520 | >    ++ map (\o -> ("a " ++ o ++ " b", BinOp (Iden "a") o (Iden "b")))
521 | >           ["=",">","<", ">=", "<=", "!=", "<>"
522 | >           ,"and", "or", "+", "-", "*", "/", "||", "like"]
523 | >
524 | > parensTests :: [(String,ValueExpr)]
525 | > parensTests = [("(1)", Parens (NumLit 1))]
526 | 
527 | > basicTests :: [(String,ValueExpr)]
528 | > basicTests = numLitTests ++ idenTests ++ operatorTests ++ parensTests
529 | 
530 | > stringLiteralTests :: [(String,ValueExpr)]
531 | > stringLiteralTests =
532 | >     [("''", StringLit "")
533 | >     ,("'test'", StringLit "test")]
534 | 
535 | > dIdenTests :: [(String,ValueExpr)]
536 | > dIdenTests =
537 | >     [("t.a", DIden "t" "a")]
538 | 
539 | > starTests :: [(String,ValueExpr)]
540 | > starTests = [("*", Star)]
541 | 
542 | 
543 | > dStarTests :: [(String,ValueExpr)]
544 | > dStarTests = [("t.*", DStar "t")]
545 | 
546 | > appTests :: [(String,ValueExpr)]
547 | > appTests = [("f()", App "f" [])
548 | >            ,("f(1)", App "f" [NumLit 1])
549 | >            ,("f(1,a)", App "f" [NumLit 1, Iden "a"])]
550 | 
551 | > caseTests :: [(String,ValueExpr)]
552 | > caseTests =
553 | >     [("case a when 1 then 2 end"
554 | >      ,Case (Just $ Iden "a") [(NumLit 1,NumLit 2)] Nothing)
555 | >
556 | >     ,("case a when 1 then 2 when 3 then 4 end"
557 | >      ,Case (Just $ Iden "a")
558 | >            [(NumLit 1, NumLit 2)
559 | >            ,(NumLit 3, NumLit 4)]
560 | >            Nothing)
561 | >
562 | >     ,("case a when 1 then 2 when 3 then 4 else 5 end"
563 | >      ,Case (Just $ Iden "a")
564 | >            [(NumLit 1, NumLit 2)
565 | >            ,(NumLit 3, NumLit 4)]
566 | >            (Just $ NumLit 5))
567 | >
568 | >     ,("case when a=1 then 2 when a=3 then 4 else 5 end"
569 | >      ,Case Nothing
570 | >            [(BinOp (Iden "a") "=" (NumLit 1), NumLit 2)
571 | >            ,(BinOp (Iden "a") "=" (NumLit 3), NumLit 4)]
572 | >            (Just $ NumLit 5))
573 | >     ]
574 | 
575 | > allValueExprTests :: [(String,ValueExpr)]
576 | > allValueExprTests = concat [basicTests
577 | >                             ,stringLiteralTests
578 | >                             ,dIdenTests
579 | >                             ,starTests
580 | >                             ,dStarTests
581 | >                             ,appTests
582 | >                             ,caseTests]
583 | 
584 | > singleSelectItemTests :: [(String,QueryExpr)]
585 | > singleSelectItemTests =
586 | >     [("select 1", makeSelect {qeSelectList = [(NumLit 1,Nothing)]})]
587 | 
588 | > multipleSelectItemsTests :: [(String,QueryExpr)]
589 | > multipleSelectItemsTests =
590 | >     [("select a"
591 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]})
592 | >     ,("select a,b"
593 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
594 | >                                  ,(Iden "b",Nothing)]})
595 | >     ,("select 1+2,3+4"
596 | >      ,makeSelect {qeSelectList =
597 | >                      [(BinOp (NumLit 1) "+" (NumLit 2),Nothing)
598 | >                      ,(BinOp (NumLit 3) "+" (NumLit 4),Nothing)]})
599 | >     ]
600 | 
601 | > selectListTests :: [(String,QueryExpr)]
602 | > selectListTests =
603 | >     [("select a as a, b as b"
604 | >      ,makeSelect {qeSelectList = [(Iden "a", Just "a")
605 | >                                  ,(Iden "b", Just "b")]})
606 | >     ,("select a a, b b"
607 | >      ,makeSelect {qeSelectList = [(Iden "a", Just "a")
608 | >                                  ,(Iden "b", Just "b")]})
609 | >     ] ++ multipleSelectItemsTests
610 | >       ++ singleSelectItemTests
611 | 
612 | > fromTests :: [(String,QueryExpr)]
613 | > fromTests =
614 | >     [("select a from t"
615 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]
616 | >                  ,qeFrom = [TRSimple "t"]})]
617 | 
618 | > whereTests :: [(String,QueryExpr)]
619 | > whereTests =
620 | >     [("select a from t where a = 5"
621 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)]
622 | >                  ,qeFrom = [TRSimple "t"]
623 | >                  ,qeWhere = Just $ BinOp (Iden "a") "=" (NumLit 5)})
624 | >     ]
625 | 
626 | > groupByTests :: [(String,QueryExpr)]
627 | > groupByTests =
628 | >     [("select a,sum(b) from t group by a"
629 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
630 | >                                  ,(App "sum" [Iden "b"],Nothing)]
631 | >                  ,qeFrom = [TRSimple "t"]
632 | >                  ,qeGroupBy = [Iden "a"]
633 | >                  })
634 | >     ,("select a,b,sum(c) from t group by a,b"
635 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
636 | >                                  ,(Iden "b",Nothing)
637 | >                                  ,(App "sum" [Iden "c"],Nothing)]
638 | >                  ,qeFrom = [TRSimple "t"]
639 | >                  ,qeGroupBy = [Iden "a",Iden "b"]
640 | >                  })
641 | >     ]
642 | 
643 | > havingTests :: [(String,QueryExpr)]
644 | > havingTests =
645 | >   [("select a,sum(b) from t group by a having sum(b) > 5"
646 | >      ,makeSelect {qeSelectList = [(Iden "a",Nothing)
647 | >                                  ,(App "sum" [Iden "b"],Nothing)]
648 | >                  ,qeFrom = [TRSimple "t"]
649 | >                  ,qeGroupBy = [Iden "a"]
650 | >                  ,qeHaving = Just $ BinOp (App "sum" [Iden "b"]) ">" (NumLit 5)
651 | >                  })
652 | >   ]
653 | 
654 | > orderByTests :: [(String,QueryExpr)]
655 | > orderByTests =
656 | >     [("select a from t order by a"
657 | >      ,ms [Iden "a"])
658 | >     ,("select a from t order by a, b"
659 | >      ,ms [Iden "a", Iden "b"])
660 | >     ]
661 | >   where
662 | >     ms o = makeSelect {qeSelectList = [(Iden "a",Nothing)]
663 | >                       ,qeFrom = [TRSimple "t"]
664 | >                       ,qeOrderBy = o}
665 | 
666 | > queryExprJoinTests :: [(String,QueryExpr)]
667 | > queryExprJoinTests =
668 | >     [("select a from t"
669 | >      ,ms [TRSimple "t"])
670 | >     ,("select a from t,u"
671 | >      ,ms [TRSimple "t", TRSimple "u"])
672 | >
673 | >     ,("select a from t inner join u on expr"
674 | >      ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u")
675 | >           (Just $ JoinOn $ Iden "expr")])
676 | >
677 | >     ,("select a from t left join u on expr"
678 | >      ,ms [TRJoin (TRSimple "t") JoinLeft (TRSimple "u")
679 | >           (Just $ JoinOn $ Iden "expr")])
680 | >
681 | >     ,("select a from t right join u on expr"
682 | >      ,ms [TRJoin (TRSimple "t") JoinRight (TRSimple "u")
683 | >           (Just $ JoinOn $ Iden "expr")])
684 | >
685 | >     ,("select a from t full join u on expr"
686 | >      ,ms [TRJoin (TRSimple "t") JoinFull (TRSimple "u")
687 | >           (Just $ JoinOn $ Iden "expr")])
688 | >
689 | >     ,("select a from t cross join u"
690 | >      ,ms [TRJoin (TRSimple "t") JoinCross (TRSimple "u") Nothing])
691 | >
692 | >     ,("select a from t natural inner join u"
693 | >      ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u")
694 | >           (Just JoinNatural)])
695 | >
696 | >     ,("select a from t inner join u using(a,b)"
697 | >      ,ms [TRJoin (TRSimple "t") JoinInner (TRSimple "u")
698 | >           (Just $ JoinUsing ["a", "b"])])
699 | >
700 | >     ,("select a from (select a from t)"
701 | >      ,ms [TRQueryExpr $ ms [TRSimple "t"]])
702 | >
703 | >     ,("select a from t as u"
704 | >      ,ms [TRAlias (TRSimple "t") "u"])
705 | >
706 | >     ,("select a from t u"
707 | >      ,ms [TRAlias (TRSimple "t") "u"])
708 | >
709 | >     ,("select a from (t cross join u) as u"
710 | >      ,ms [TRAlias (TRParens $ TRJoin (TRSimple "t") JoinCross
711 | >                                      (TRSimple "u") Nothing) "u"])
712 | >     ]
713 | >   where
714 | >     ms f = makeSelect {qeSelectList = [(Iden "a",Nothing)]
715 | >                       ,qeFrom = f}
716 | 
717 | > allQueryExprTests :: [(String,QueryExpr)]
718 | > allQueryExprTests = concat [selectListTests ++ fromTests ++ whereTests ++ groupByTests ++ havingTests ++ orderByTests ++ queryExprJoinTests]
719 | 
720 | todo: use external api parsing code
721 | 
722 | > makeTest :: (Eq a, Show a) => Parser a -> (String,a) -> H.Test
723 | > makeTest parser (src,expected) = H.TestLabel src $ H.TestCase $ do
724 | >     let gote = parse (whitespace *> parser <* eof) "" src
725 | >     case gote of
726 | >       Left e -> H.assertFailure $ show e
727 | >       Right got -> H.assertEqual src expected got
728 | 


--------------------------------------------------------------------------------
/Text/Parsec/String/Char.hs:
--------------------------------------------------------------------------------
 1 | module Text.Parsec.String.Char where
 2 | 
 3 | {-
 4 | 
 5 | Wrappers for the Text.Parsec.Char module with the types fixed to
 6 | 'Text.Parsec.String.Parser a', i.e. the stream is String, no user
 7 | state, Identity monad.
 8 | 
 9 | -}
10 | 
11 | import qualified Text.Parsec.Char as C
12 | import Text.Parsec.String (Parser)
13 | 
14 | spaces :: Parser ()
15 | spaces = C.spaces
16 | 
17 | space :: Parser Char
18 | space = C.space
19 | 
20 | newline :: Parser Char
21 | newline = C.newline
22 | 
23 | tab :: Parser Char
24 | tab = C.tab
25 | 
26 | upper :: Parser Char
27 | upper = C.upper
28 | 
29 | lower :: Parser Char
30 | lower = C.lower
31 | 
32 | alphaNum :: Parser Char
33 | alphaNum = C.alphaNum
34 | 
35 | letter :: Parser Char
36 | letter = C.letter
37 | 
38 | digit :: Parser Char
39 | digit = C.digit
40 | 
41 | hexDigit :: Parser Char
42 | hexDigit = C.hexDigit
43 | 
44 | octDigit :: Parser Char
45 | octDigit = C.octDigit
46 | 
47 | char :: Char -> Parser Char
48 | char = C.char
49 | 
50 | string :: String -> Parser String
51 | string = C.string
52 | 
53 | anyChar :: Parser Char
54 | anyChar = C.anyChar
55 | 
56 | oneOf :: [Char] -> Parser Char
57 | oneOf = C.oneOf
58 | 
59 | noneOf :: [Char] -> Parser Char
60 | noneOf = C.noneOf
61 | 
62 | satisfy :: (Char -> Bool) -> Parser Char
63 | satisfy = C.satisfy
64 | 


--------------------------------------------------------------------------------
/Text/Parsec/String/Combinator.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module Text.Parsec.String.Combinator where
 3 | 
 4 | {-
 5 | 
 6 | Wrappers for the Text.Parsec.Combinator module with the types fixed to
 7 | 'Text.Parsec.String.Parser a', i.e. the stream is String, no user
 8 | state, Identity monad.
 9 | 
10 | -}
11 | 
12 | import qualified Text.Parsec.Combinator as C
13 | import Text.Parsec.String (Parser)
14 | 
15 | choice :: [Parser a] -> Parser a
16 | choice = C.choice
17 | 
18 | 
19 | count :: Int -> Parser a -> Parser [a]
20 | count = C.count
21 | 
22 | between :: Parser open -> Parser close -> Parser a -> Parser a
23 | between = C.between
24 | 
25 | 
26 | option :: a -> Parser a -> Parser a
27 | option = C.option
28 | 
29 | optionMaybe :: Parser a -> Parser (Maybe a)
30 | optionMaybe = C.optionMaybe
31 | 
32 | optional :: Parser a -> Parser ()
33 | optional = C.optional
34 | 
35 | skipMany1 :: Parser a -> Parser ()
36 | skipMany1 = C.skipMany1
37 | 
38 | many1 :: Parser a -> Parser [a]
39 | many1 = C.many1
40 | 
41 | sepBy :: Parser a -> Parser sep -> Parser [a]
42 | sepBy = C.sepBy
43 | 
44 | sepBy1 :: Parser a -> Parser sep -> Parser [a]
45 | sepBy1 = C.sepBy1
46 | 
47 | endBy :: Parser a -> Parser sep -> Parser [a]
48 | endBy = C.endBy
49 | 
50 | endBy1 :: Parser a -> Parser sep -> Parser [a]
51 | endBy1 = C.endBy1
52 | 
53 | sepEndBy :: Parser a -> Parser sep -> Parser [a]
54 | sepEndBy = C.sepEndBy
55 | 
56 | sepEndBy1 :: Parser a -> Parser sep -> Parser [a]
57 | sepEndBy1 = C.sepEndBy1
58 | 
59 | chainl :: Parser a -> Parser (a -> a -> a) -> a -> Parser a
60 | chainl = C.chainl
61 | 
62 | chainl1 :: Parser a -> Parser (a -> a -> a) -> Parser a
63 | chainl1 = C.chainl1
64 | 
65 | chainr :: Parser a -> Parser (a -> a -> a) -> a -> Parser a
66 | chainr = C.chainr
67 | 
68 | chainr1 :: Parser a -> Parser (a -> a -> a) -> Parser a
69 | chainr1 = C.chainr1
70 | 
71 | eof :: Parser ()
72 | eof = C.eof
73 | 
74 | notFollowedBy :: Show a => Parser a -> Parser ()
75 | notFollowedBy = C.notFollowedBy
76 | 
77 | manyTill :: Parser a -> Parser end -> Parser [a]
78 | manyTill = C.manyTill
79 | 
80 | lookAhead :: Parser a -> Parser a
81 | lookAhead = C.lookAhead
82 | 
83 | anyToken :: Parser Char
84 | anyToken = C.anyToken
85 | 


--------------------------------------------------------------------------------
/Text/Parsec/String/Expr.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module Text.Parsec.String.Expr (buildExpressionParser
 3 |                                ,Operator(..)
 4 |                                ,OperatorTable
 5 |                                ,E.Assoc(..)
 6 |                                )where
 7 | 
 8 | {-
 9 | 
10 | Wrappers for the Text.Parsec.Expr module with simplified types.
11 | 
12 | -}
13 | 
14 | import Text.Parsec.String (Parser)
15 | import qualified Text.Parsec.Expr as E
16 | 
17 | -- not sure if this is neccessary, or a type alias would be good
18 | -- enough
19 | data Operator a = Infix (Parser (a -> a -> a)) E.Assoc
20 |                 | Prefix (Parser (a -> a))
21 |                 | Postfix (Parser (a -> a))
22 | 
23 | type OperatorTable a = [[Operator a]]
24 | 
25 | buildExpressionParser :: OperatorTable a -> Parser a -> Parser a
26 | buildExpressionParser t = E.buildExpressionParser (map (map f) t)
27 |   where
28 |     f (Infix p a) = E.Infix p a
29 |     f (Prefix p) = E.Prefix p
30 |     f (Postfix p) = E.Postfix p
31 | 


--------------------------------------------------------------------------------
/Text/Parsec/String/Parsec.hs:
--------------------------------------------------------------------------------
 1 | 
 2 | module Text.Parsec.String.Parsec where
 3 | 
 4 | {-
 5 | 
 6 | wrapper for some Text.Parsec functions which use a simplified type
 7 | 
 8 | -}
 9 | 
10 | import Text.Parsec.String (Parser)
11 | import qualified Text.Parsec as P
12 | 
13 | 
14 | try :: Parser a -> Parser a
15 | try = P.try
16 | 
17 | parse :: Parser a -> P.SourceName -> String -> Either P.ParseError a
18 | parse = P.parse
19 | 


--------------------------------------------------------------------------------
/TextParsecExpr.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[parsing-expressions-with-fixity]]
  3 | = Parsing expressions with fixity
  4 | 
  5 | Text.Parsec.Expr allows building expression parsers with a range of
  6 | operators with different precedences and associativities
  7 | easily. Fixity is the (not completely standard) term for precendence
  8 | and associativity together.
  9 | 
 10 | `Text.Parsec.Expr` can be great to quickly get a parser for simple
 11 | expressions or a simple programming language with simple expressions
 12 | up and running.
 13 | 
 14 | > import Text.Parsec.String (Parser)
 15 | > import Text.Parsec.String.Combinator (many1, between)
 16 | > import Text.Parsec.String.Char (letter, char, digit, string, oneOf)
 17 | >
 18 | > import Control.Applicative ((<$>), (<*>), (<*), (<|>), many, (<$))
 19 | > import Control.Monad (void)
 20 | >
 21 | > import qualified Text.Parsec.String.Expr as E
 22 | > import FunctionsAndTypesForParsing
 23 | 
 24 | Let's extend the SimpleExpression type and parsers to cover a range of
 25 | operators with different precedences and associativity.
 26 | 
 27 | == expressions with plus and times
 28 | 
 29 | Let's start with a simple case: + and * with the usual fixity. Here is
 30 | the abstract syntax:
 31 | 
 32 | > data PlusTimesExpr = PteVar String
 33 | >                    | PteNum Integer
 34 | >                    | PteParens PlusTimesExpr
 35 | >                    | Plus PlusTimesExpr PlusTimesExpr
 36 | >                    | Times PlusTimesExpr PlusTimesExpr
 37 | >                          deriving (Eq,Show)
 38 | 
 39 | > plusTimesExpr :: Parser PlusTimesExpr
 40 | > plusTimesExpr = E.buildExpressionParser pteTable pteTerm
 41 | 
 42 | > pteTable :: [[E.Operator PlusTimesExpr]]
 43 | > pteTable = [[E.Infix (Times <$ symbol "*") E.AssocLeft]
 44 | >            ,[E.Infix (Plus <$ symbol "+") E.AssocLeft]
 45 | >            ]
 46 | 
 47 | Here you can see the operator parsers are the same as the previous
 48 | `SimpleExpr` parser which used `chainl1`: `Times <$ symbol "*"` and
 49 | `Plus <$ symbol "+"`. We just wrapped these up in the `E.Infix`
 50 | constructor with the associativity, and put them in a list of lists
 51 | which represents the precendence classes.
 52 | 
 53 | Here is the term parser and components. All this is just the same as
 54 | the `SimpleExpr` parser a previous tutorial.
 55 | 
 56 | > pteTerm :: Parser PlusTimesExpr
 57 | > pteTerm = pteVar <|> pteNum <|> pteParens
 58 | 
 59 | > pteNum :: Parser PlusTimesExpr
 60 | > pteNum = PteNum <$> integer
 61 | 
 62 | > pteVar :: Parser PlusTimesExpr
 63 | > pteVar = PteVar <$> identifier
 64 | 
 65 | > pteParens :: Parser PlusTimesExpr
 66 | > pteParens = PteParens <$> between (symbol "(") (symbol ")") plusTimesExpr
 67 | 
 68 | support functions:
 69 | 
 70 | > whitespace :: Parser ()
 71 | > whitespace = void $ many $ oneOf " \n\t"
 72 | 
 73 | > lexeme :: Parser a -> Parser a
 74 | > lexeme p = p <* whitespace
 75 | 
 76 | > integer :: Parser Integer
 77 | > integer = read <$> lexeme (many1 digit)
 78 | 
 79 | > identifier :: Parser String
 80 | > identifier = lexeme ((:) <$> firstChar <*> many nonFirstChar)
 81 | >   where
 82 | >     firstChar = letter <|> char '_'
 83 | >     nonFirstChar = digit <|> firstChar
 84 | 
 85 | > symbol :: String -> Parser String
 86 | > symbol s = lexeme $ string s
 87 | 
 88 | Here you can see the precendence in action:
 89 | 
 90 | ```
 91 | *Main> regularParse plusTimesExpr "a + b * c"
 92 | Right (Plus (PteVar "a") (Times (PteVar "b") (PteVar "c")))
 93 | 
 94 | *Main> regularParse plusTimesExpr "a * b + c"
 95 | Right (Plus (Times (PteVar "a") (PteVar "b")) (PteVar "c"))
 96 | ```
 97 | 
 98 | == a full featured expression type
 99 | 
100 | Now let's try a much bigger example with lots more operators. Now we
101 | are thinking ahead to the first version of the SQL query parser, and
102 | preparing for this.
103 | 
104 | Here are our new operators in precedence order:
105 | 
106 | === unary + -
107 | 
108 | ```
109 | +a
110 | -3
111 | ```
112 | 
113 | === exponentiation
114 | 
115 | ```
116 | a ^ 3
117 | ```
118 | 
119 | associativity: left
120 | 
121 | === multiplication, division, modulo
122 | 
123 | ```
124 | a * 3
125 | 3 / b
126 | a % 5
127 | ```
128 | 
129 | associativity: left
130 | 
131 | === addition, subtraction
132 | 
133 | ```
134 | a + b
135 | a - b
136 | ```
137 | 
138 | associativity: left
139 | 
140 | === less than, greater than
141 | 
142 | ```
143 | a < b
144 | a > b
145 | ```
146 | 
147 | associativity: none
148 | 
149 | === equals
150 | 
151 | ```
152 | a = 3
153 | ```
154 | 
155 | associativity: right
156 | 
157 | === not
158 | 
159 | ```
160 | not a
161 | ```
162 | 
163 | === and
164 | 
165 | ```
166 | a and b
167 | ```
168 | 
169 | associativity: left
170 | 
171 | === or
172 | 
173 | ```
174 | a or b
175 | ```
176 | 
177 | associativity: left
178 | 
179 | Here is the abstract syntax type:
180 | 
181 | > data SimpleExpr = Num Integer
182 | >                 | Var String
183 | >                 | Parens SimpleExpr
184 | >                 | PrefixOp String SimpleExpr
185 | >                 | BinaryOp SimpleExpr String SimpleExpr
186 | >                    deriving (Eq,Show)
187 | 
188 | Here is the new expression parser:
189 | 
190 | > simpleExpr :: Parser SimpleExpr
191 | > simpleExpr = E.buildExpressionParser table term
192 | 
193 | > table :: [[E.Operator SimpleExpr]]
194 | > table = [[prefix "-", prefix "+"]
195 | >         ,[binary "^" E.AssocLeft]
196 | >         ,[binary "*" E.AssocLeft
197 | >          ,binary "/" E.AssocLeft
198 | >          ,binary "%" E.AssocLeft]
199 | >         ,[binary "+" E.AssocLeft
200 | >          ,binary "-" E.AssocLeft]
201 | >         ,[binary "<" E.AssocNone
202 | >          ,binary ">" E.AssocNone]
203 | >         ,[binary "=" E.AssocRight]
204 | >         ,[prefix "not"]
205 | >         ,[binary "and" E.AssocLeft]
206 | >         ,[binary "or" E.AssocLeft]
207 | >         ]
208 | >   where
209 | >     binary name assoc =
210 | >         E.Infix (mkBinOp name <$ symbol name) assoc
211 | >     mkBinOp nm a b = BinaryOp a nm b
212 | >     prefix name = E.Prefix (PrefixOp name <$ symbol name)
213 | 
214 | TODO: expand and explain the bits.
215 | 
216 | Here is the term parser.
217 | 
218 | > term :: Parser SimpleExpr
219 | > term = var <|> num <|> parens
220 | 
221 | > num :: Parser SimpleExpr
222 | > num = Num <$> integer
223 | 
224 | > var :: Parser SimpleExpr
225 | > var = Var <$> identifier
226 | 
227 | > parens :: Parser SimpleExpr
228 | > parens = between (symbol "(") (symbol ")") simpleExpr
229 | 
230 | TODO: write lots of parsing examples, including parse failures with
231 | ambiguity.
232 | 
233 | issue: double prefix op, link to bug on parsec bug tracker.
234 | 
235 | The source in Text.Parsec.Expr is not too big. You can have a look and
236 | try to understand it. There are several standard approaches in parsing
237 | theory to parse expressions with data driven precendences and
238 | associativity. I don't know which one Text.Parsec.Expr uses, but if
239 | you find these and read about them, then the source of
240 | Text.Parsec.Expr might be a bit more understandable.
241 | 


--------------------------------------------------------------------------------
/TextParsecPerm.lhs:
--------------------------------------------------------------------------------
1 | 
2 | [[permutation-parsing]]
3 | = Permutation parsing
4 | 
5 | This can parse a bunch of different things in any order. TODO:
6 | examples.
7 | 


--------------------------------------------------------------------------------
/TextParsecToken.lhs:
--------------------------------------------------------------------------------
1 | 
2 | [[token-parsing]]
3 | = Token parsing
4 | 
5 | This can be used quickly create a set of token parsers handling lots
6 | of little issues which you otherwise have to deal with manually.
7 | 
8 | 


--------------------------------------------------------------------------------
/VerySimpleExpressions.lhs:
--------------------------------------------------------------------------------
  1 | 
  2 | [[very-simple-expression-parsing]]
  3 | = Very simple expression parsing
  4 | 
  5 | In this tutorial we will develop a parser for a very simple expression
  6 | language, and start learning about the set of combinators which comes
  7 | with Parsec.
  8 | 
  9 | > import Text.Parsec (ParseError)
 10 | > import Text.Parsec.String (Parser)
 11 | > import Text.Parsec.String.Parsec (try)
 12 | > import Text.Parsec.String.Char (oneOf, char, digit, satisfy)
 13 | > import Text.Parsec.String.Combinator (many1, choice, chainl1)
 14 | > import Control.Applicative ((<|>), many)
 15 | > import Control.Monad (void)
 16 | > import Data.Char (isLetter, isDigit)
 17 | > import FunctionsAndTypesForParsing
 18 | 
 19 | == num
 20 | 
 21 | The first element we will have in this expression language is positive
 22 | integral numbers:
 23 | 
 24 | > numberExamples :: [(String,Integer)]
 25 | > numberExamples = [("1", 1)
 26 | >                  ,("23", 23)]
 27 | 
 28 | TODO: make examples with parsing failures for all of the example
 29 | scripts below?
 30 | 
 31 | To parse a number, we need to parse one or more digits, and then read
 32 | the resulting string. We can use the combinator `many1` to help with
 33 | this. We will also use do notation.
 34 | 
 35 | > num :: Parser Integer
 36 | > num = do
 37 | >     n <- many1 digit
 38 | >     return (read n)
 39 | 
 40 | Let's try it out.
 41 | 
 42 | ```
 43 | *Main> regularParse num "1"
 44 | Right 1
 45 | 
 46 | *Main> regularParse num "123456"
 47 | Right 123456
 48 | 
 49 | *Main> regularParse num "aa"
 50 | Left (line 1, column 1):
 51 | unexpected "a"
 52 | expecting digit
 53 | 
 54 | ```
 55 | 
 56 | How does it work? First, we parse one or more (`many1`) digits (`digit`),
 57 | and give the result the name 'n'. Then we convert the string to an
 58 | integer using `read`.
 59 | 
 60 | The `many1` function's type looks like this:
 61 | 
 62 | ```
 63 | many1 :: Parser a -> Parser [a]
 64 | ```
 65 | 
 66 | It applies the parser given one or more times, returning the result.
 67 | 
 68 | Let's see what happens when we use the `many` combinator which parses
 69 | zero or more items instead of one or more.
 70 | 
 71 | > num1 :: Parser Integer
 72 | > num1 = do
 73 | >     n <- many digit
 74 | >     return (read n)
 75 | 
 76 | ```
 77 | *Main> regularParse num1 "1"
 78 | Right 1
 79 | 
 80 | *Main> regularParse num1 "123456"
 81 | Right 123456
 82 | 
 83 | *Main> regularParse num1 "aa"
 84 | Right *** Exception: Prelude.read: no parse
 85 | ```
 86 | 
 87 | == var
 88 | 
 89 | For var, we have to decide on a syntax for the identifiers. Let's go
 90 | for a common choice: identifiers must start with a letter or
 91 | underscore, and then they can be followed by zero or more letters,
 92 | underscores or digits in any combination.
 93 | 
 94 | > varExamples :: [(String,String)]
 95 | > varExamples = [("test", "test")
 96 | >               ,("_stuff", "_stuff")
 97 | >               ,("_1234", "_1234")]
 98 | 
 99 | > var :: Parser String
100 | > var = do
101 | >     fc <- firstChar
102 | >     rest <- many nonFirstChar
103 | >     return (fc:rest)
104 | >   where
105 | >     firstChar = satisfy (\a -> isLetter a || a == '_')
106 | >     nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_')
107 | 
108 | This time, we create two helper parsers: `firstChar`, which parses a
109 | letter or underscore, and `nonFirstChar` which parses a digit, letter
110 | or underscore. This time, we use the `many` function instead of
111 | `many1`.
112 | 
113 | Try it out in ghci. I like to try things which you expect to work, and
114 | also to try things which you expect to not work and make sure you get
115 | an error.
116 | 
117 | == parens
118 | 
119 | The parens parser will eventually parse any expression inside
120 | parentheses. First it will just parse integers inside parentheses.
121 | 
122 | > data Parentheses = Parentheses Integer
123 | >                    deriving (Eq,Show)
124 | >
125 | > parensExamples :: [(String, Parentheses)]
126 | > parensExamples = [("(1)", Parentheses 1)
127 | >                  ,("(17)", Parentheses 17)]
128 | 
129 | > parens :: Parser Parentheses
130 | > parens = do
131 | >     void $ char '('
132 | >     e <- many1 digit
133 | >     void $ char ')'
134 | >     return (Parentheses (read e))
135 | 
136 | There is a new function: `void`. This might be familiar to you
137 | already. This is used to ignore the result of the `char` parser, since
138 | we are not interested in this value. You can also write the function
139 | without `void`, but ghc will give you a warning if you have warnings
140 | turned on.
141 | 
142 | One way of turning warnings on in ghci is to enter `:set -Wall` at the
143 | ghci prompt.
144 | 
145 | > parens' :: Parser Parentheses
146 | > parens' = do
147 | >     char '('
148 | >     e <- many1 digit
149 | >     char ')'
150 | >     return (Parentheses (read e))
151 | 
152 | ```
153 | *Main> :set -Wall
154 | *Main> :l "VerySimpleExpressions.lhs"
155 | 
156 | ...
157 | 
158 | FirstRealParsing.lhs:140:7: Warning:
159 |     A do-notation statement discarded a result of type Char.
160 |     Suppress this warning by saying "_ <- char '('",
161 |     or by using the flag -fno-warn-unused-do-bind
162 | 
163 | FirstRealParsing.lhs:142:7: Warning:
164 |     A do-notation statement discarded a result of type Char.
165 |     Suppress this warning by saying "_ <- char ')'",
166 |     or by using the flag -fno-warn-unused-do-bind
167 | 
168 | ...
169 | 
170 | ```
171 | 
172 | As you can see, another way to suppress the warning is to use
173 | `_ <- char '('`.
174 | 
175 | One issue with this parser is that it doesn't handle whitespace:
176 | 
177 | ```
178 | *Main> regularParse parens "(1)"
179 | Right (Parentheses 1)
180 | 
181 | *Main> regularParse parens "( 1)"
182 | Left (line 1, column 2):
183 | unexpected " "
184 | expecting digit
185 | 
186 | *Main> regularParse parens "(1 )"
187 | Left (line 1, column 3):
188 | unexpected " "
189 | expecting digit or ")"
190 | ```
191 | 
192 | We will look at this issue below.
193 | 
194 | == add
195 | 
196 | Now we will write a little parser to parse strings like 'a+b' where a
197 | and b are numbers.
198 | 
199 | > data SingleAdd = SingleAdd Integer Integer
200 | >                  deriving (Eq,Show)
201 | >
202 | > singleAddExamples :: [(String, SingleAdd)]
203 | > singleAddExamples = [("1+2", SingleAdd 1 2)
204 | >                     ,("101+202", SingleAdd 101 202)]
205 | 
206 | > add :: Parser SingleAdd
207 | > add = do
208 | >     e0 <- many1 digit
209 | >     void $ char '+'
210 | >     e1 <- many1 digit
211 | >     return (SingleAdd (read e0) (read e1))
212 | 
213 | It has the same whitespace issues as the parens parser.
214 | 
215 | ```
216 | *Main> regularParse add "1+2"
217 | Right (SingleAdd 1 2)
218 | 
219 | *Main> regularParse add "1 +2"
220 | Left (line 1, column 2):
221 | unexpected " "
222 | expecting digit or "+"
223 | 
224 | ```
225 | 
226 | == whitespace
227 | 
228 | Here is a parser which will skip zero or more whitespace characters.
229 | 
230 | > whitespace :: Parser ()
231 | > whitespace = void $ many $ oneOf " \n\t"
232 | 
233 | We can use this to make our parsers handle whitespace better.
234 | 
235 | ```
236 | *Main> regularParse whitespace " "
237 | Right ()
238 | *Main> regularParse whitespace "  "
239 | Right ()
240 | *Main> regularParse whitespace "\t"
241 | Right ()
242 | *Main> regularParse whitespace " \n "
243 | Right ()
244 | *Main> regularParse whitespace ""
245 | Right ()
246 | ```
247 | 
248 | Notice that it always succeeds.
249 | 
250 | Here is the parens parser rewritten with a common approach to
251 | whitespace handling:
252 | 
253 | > parensW :: Parser Parentheses
254 | > parensW = do
255 | >     whitespace
256 | >     void $ char '('
257 | >     whitespace
258 | >     e <- many1 digit
259 | >     whitespace
260 | >     void $ char ')'
261 | >     whitespace
262 | >     return (Parentheses (read e))
263 | 
264 | ```
265 | *Main> regularParse parensW "(1)"
266 | Right (Parentheses 1)
267 | 
268 | *Main> regularParse parensW " (1)"
269 | Right (Parentheses 1)
270 | 
271 | *Main> regularParse parensW " (1 )"
272 | Right (Parentheses 1)
273 | 
274 | *Main> regularParse parensW " ( 1 ) "
275 | Right (Parentheses 1)
276 | ```
277 | 
278 | Looks good.
279 | 
280 | In the original parsec documentation, one of the concepts mentioned is
281 | the idea of 'lexeme' parsing. This is a style in which every token
282 | parser should also consume and ignore any trailing whitespace.
283 | 
284 | This is a simple convention which with a bit of care allows skipping
285 | whitespace exactly once wherever it needs to be skipped. To complete
286 | the lexeme style, we should also always skip leading whitespace at the
287 | top level only. This feels more elegant than spamming all the parsing
288 | code with many calls to `whitespace`.
289 | 
290 | > lexeme :: Parser a -> Parser a
291 | > lexeme p = do
292 | >            x <- p
293 | >            whitespace
294 | >            return x
295 | 
296 | > parseWithWhitespace :: Parser a -> String -> Either ParseError a
297 | > parseWithWhitespace p = parseWithEof wrapper
298 | >   where
299 | >     wrapper = do
300 | >         whitespace
301 | >         p
302 | 
303 | 
304 | Here is the parens parser rewritten to use lexeme:
305 | 
306 | > parensL :: Parser Parentheses
307 | > parensL = do
308 | >     void $ lexeme $ char '('
309 | >     e <- lexeme $ many1 digit
310 | >     void $ lexeme $ char ')'
311 | >     return (Parentheses (read e))
312 | 
313 | ```
314 | *Main> parseWithWhitespace parensL "(1)"
315 | Right (Parentheses 1)
316 | 
317 | *Main> parseWithWhitespace parensL " (1)"
318 | Right (Parentheses 1)
319 | 
320 | *Main> parseWithWhitespace parensL " ( 1)"
321 | Right (Parentheses 1)
322 | 
323 | *Main> parseWithWhitespace parensL " ( 1 ) "
324 | Right (Parentheses 1)
325 | ```
326 | 
327 | The `parseWithWhitespace` function can also use `(>>)` to make it a
328 | bit shorter, `wrapper = whiteSpace >> p`.
329 | 
330 | Here is the shorter version of this function using `(>>)`:
331 | 
332 | > parseWithWhitespace' :: Parser a -> String -> Either ParseError a
333 | > parseWithWhitespace' p = parseWithEof (whitespace >> p)
334 | 
335 | Try rewriting the SingleAdd parser to use `lexeme`, and test it out to
336 | convince yourself that it skips whitespace correctly.
337 | 
338 | == simple expr
339 | 
340 | Now we are ready to write a parser which parses simple expressions
341 | made from these components. Here is the data type for these
342 | expressions.
343 | 
344 | > data SimpleExpr = Num Integer
345 | >                 | Var String
346 | >                 | Add SimpleExpr SimpleExpr
347 | >                 | Parens SimpleExpr
348 | >                   deriving (Eq,Show)
349 | 
350 | It's so simple that it is almost useless at the moment.
351 | 
352 | > simpleExprExamples :: [(String,SimpleExpr)]
353 | > simpleExprExamples =
354 | >     [("a", Var "a")
355 | >     ,("1", Num 1)
356 | >     ,("2 + 3", Add (Num 2) (Num 3))
357 | >     ,("(42)", Parens (Num 42))]
358 | 
359 | TODO: some more complex examples
360 | 
361 | Here are all our component parsers with `lexeme`, and with the
362 | `SimpleExpr` constructors:
363 | 
364 | > numE :: Parser SimpleExpr
365 | > numE = do
366 | >     n <- lexeme $ many1 digit
367 | >     return $ Num $ read n
368 | 
369 | There doesn't seem to be a unique obviously correct place to put the
370 | lexeme call in the var parser:
371 | 
372 | > varE :: Parser SimpleExpr
373 | > varE = lexeme $ do
374 | >     fc <- firstChar
375 | >     rest <- many nonFirstChar
376 | >     return $ Var (fc:rest)
377 | >   where
378 | >     firstChar = satisfy (\a -> isLetter a || a == '_')
379 | >     nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_')
380 | 
381 | Here is an alternative, with the call to lexeme in a different place,
382 | but gives effectively the same function.
383 | 
384 | > varE' :: Parser SimpleExpr
385 | > varE' = do
386 | >     fc <- firstChar
387 | >     rest <- lexeme $ many nonFirstChar
388 | >     return $ Var (fc:rest)
389 | >   where
390 | >     firstChar = satisfy (\a -> isLetter a || a == '_')
391 | >     nonFirstChar = satisfy (\a -> isDigit a || isLetter a || a == '_')
392 | 
393 | > parensE :: Parser SimpleExpr
394 | > parensE = do
395 | >     void $ lexeme $ char '('
396 | >     e <- lexeme $ many1 digit
397 | >     void $ lexeme $ char ')'
398 | >     return $ Parens $ Num $ read e
399 | 
400 | In the parens parser, we can reuse the `numE` parser like this:
401 | 
402 | > parensE' :: Parser SimpleExpr
403 | > parensE' = do
404 | >     void $ lexeme $ char '('
405 | >     e <- numE
406 | >     void $ lexeme $ char ')'
407 | >     return $ Parens e
408 | 
409 | Here is the add parser using `numE` also.
410 | 
411 | > addE :: Parser SimpleExpr
412 | > addE = do
413 | >     e0 <- numE
414 | >     void $ lexeme $ char '+'
415 | >     e1 <- numE
416 | >     return $ Add e0 e1
417 | 
418 | === choice
419 | 
420 | To combine these, we can use an operator called `(<|>)`:
421 | 
422 | > numOrVar :: Parser SimpleExpr
423 | > numOrVar = numE <|> varE
424 | 
425 | It tries the first parser, and it if fails (without consuming any
426 | input), it tries the second parser. More about the 'consuming input'
427 | concept later.
428 | 
429 | Here is another way to write the numOrVar parser:
430 | 
431 | > numOrVar' :: Parser SimpleExpr
432 | > numOrVar' = choice [numE,varE]
433 | 
434 | `choice` is just wrapper around `(<|>)`. You can choose which one to
435 | use based on which is more readable in each particular case.
436 | 
437 | ```
438 | *Main> parseWithWhitespace numOrVar "a"
439 | Right (Var "a")
440 | 
441 | *Main> parseWithWhitespace numOrVar "1"
442 | Right (Num 1)
443 | 
444 | *Main> parseWithWhitespace numOrVar "!"
445 | Left (line 1, column 1):
446 | unexpected "!"
447 | expecting digit
448 | ```
449 | 
450 | Here is the first version of the simpleExpr parser:
451 | 
452 | > simpleExpr :: Parser SimpleExpr
453 | > simpleExpr = numE <|> varE <|> addE <|> parensE
454 | 
455 | ```
456 | *Main> parseWithWhitespace simpleExpr "12"
457 | Right (Num 12)
458 | 
459 | *Main> parseWithWhitespace simpleExpr "aa"
460 | Right (Var "aa")
461 | 
462 | *Main> parseWithWhitespace simpleExpr "1+2"
463 | Left (line 1, column 2):
464 | unexpected '+'
465 | expecting digit or end of input
466 | 
467 | *Main> parseWithWhitespace simpleExpr "(1)"
468 | Right (Parens (Num 1))
469 | 
470 | *Main> parseWithWhitespace simpleExpr "(aa)"
471 | Left (line 1, column 2):
472 | unexpected "a"
473 | expecting digit
474 | 
475 | ```
476 | 
477 | It works well for some of the parsers. One problem is that the `addE`
478 | and `parensE` parsers don't parse general expressions as the
479 | components, but just `numE`. Another problem is that the `addE`
480 | doesn't work at all: the `numE` parser parses the first number, and
481 | the `addE` parser is never tried. This is an example of `(<|>)` not
482 | trying the second parser if the first parser succeeds, even if a later
483 | alternative would consume more input or successfully parse the whole
484 | input.
485 | 
486 | Let's try and rearrange the order:
487 | 
488 | > simpleExpr1 :: Parser SimpleExpr
489 | > simpleExpr1 = addE <|> numE <|> varE <|> parensE
490 | 
491 | ```
492 | *Main> parseWithWhitespace simpleExpr1 "12"
493 | Left (line 1, column 3):
494 | unexpected end of input
495 | expecting digit or "+"
496 | 
497 | *Main> parseWithWhitespace simpleExpr1 "aa"
498 | Right (Var "aa")
499 | 
500 | *Main> parseWithWhitespace simpleExpr1 "1+2"
501 | Right (Add (Num 1) (Num 2))
502 | 
503 | *Main> parseWithWhitespace simpleExpr1 "(1)"
504 | Right (Parens (Num 1))
505 | 
506 | ```
507 | 
508 | We swapped one problem for another. Let's fix this using the `try`
509 | function.
510 | 
511 | 
512 | > simpleExpr2 :: Parser SimpleExpr
513 | > simpleExpr2 = try addE <|> numE <|> varE <|> parensE
514 | 
515 | ```
516 | *Main> parseWithWhitespace simpleExpr2 "12"
517 | Right (Num 12)
518 | 
519 | *Main> parseWithWhitespace simpleExpr2 "aa"
520 | Right (Var "aa")
521 | 
522 | *Main> parseWithWhitespace simpleExpr2 "1+2"
523 | Right (Add (Num 1) (Num 2))
524 | 
525 | *Main> parseWithWhitespace simpleExpr2 "(1)"
526 | Right (Parens (Num 1))
527 | ```
528 | 
529 | Now everything seems to work fine. The `try` function is very powerful
530 | and easy to use, and can be used where in a more traditional parsing
531 | approach you would have to use left factoring or something else.
532 | 
533 | The `try` function implements backtracking. When this is used with
534 | `(<|>)`, it means that if the first parser fails, it will undo the
535 | consumed input and carry on with the next option, instead of failing
536 | completely. This works even if the `try` is nested deeply within the
537 | first parser given to `(<|>)`.
538 | 
539 | `try` has its downsides (some of which we will see later), and I
540 | usually try to minimise its use or eliminate it completely. I found I
541 | often got into a complete mess when I used `try` too much when writing
542 | parsers for something a little tricky like SQL, and that although
543 | doing some left-factoring appeared at first to be tedious and appeared
544 | to make the code less readable, I eventually decided that for me it
545 | made the code more readable since what was happening was more
546 | transparent.
547 | 
548 | Now we are going to fix this parser to parse arbitrarily nested
549 | expressions. In a way, the method used will roughly mean we are left
550 | factoring the `numE` and `addE` common prefix.
551 | 
552 | Here is the naive implementation:
553 | 
554 | > parensE3 :: Parser SimpleExpr
555 | > parensE3 = do
556 | >     void $ lexeme $ char '('
557 | >     e <- simpleExpr3
558 | >     void $ lexeme $ char ')'
559 | >     return $ Parens e
560 | 
561 | 
562 | > addE3 :: Parser SimpleExpr
563 | > addE3 = do
564 | >     e0 <- simpleExpr3
565 | >     void $ lexeme $ char '+'
566 | >     e1 <- simpleExpr3
567 | >     return $ Add e0 e1
568 | 
569 | 
570 | > simpleExpr3 :: Parser SimpleExpr
571 | > simpleExpr3 = try addE3 <|> numE <|> varE <|> parensE3
572 | 
573 | If you run this parser, it will enter an infinite loop, since
574 | `simpleExpr3` and `addE3` will keep calling each other recursively
575 | without making any progress.
576 | 
577 | ```
578 | *Main> parseWithWhitespace simpleExpr3 "a+b"
579 |   C-c Interrupted.
580 | ```
581 | Let's try without `add`.
582 | 
583 | > parensE4 :: Parser SimpleExpr
584 | > parensE4 = do
585 | >     void $ lexeme $ char '('
586 | >     e <- simpleExpr4
587 | >     void $ lexeme $ char ')'
588 | >     return $ Parens e
589 | 
590 | > simpleExpr4 :: Parser SimpleExpr
591 | > simpleExpr4 = numE <|> varE <|> parensE4
592 | 
593 | ```
594 | *Main> parseWithWhitespace simpleExpr4 "a"
595 | Right (Var "a")
596 | 
597 | *Main> parseWithWhitespace simpleExpr4 "1"
598 | Right (Num 1)
599 | 
600 | *Main> parseWithWhitespace simpleExpr4 "(1)"
601 | Right (Parens (Num 1))
602 | 
603 | *Main> parseWithWhitespace simpleExpr4 "((a))"
604 | Right (Parens (Parens (Var "a")))
605 | 
606 | ```
607 | 
608 | At least this part seems to work OK.
609 | 
610 | Let's try to stop the add parser from calling itself indirectly:
611 | 
612 | Here is a parameterized parens parser where we supply the nested
613 | expression parser as an argument. This is used here to try to make the
614 | code easier to follow and avoid rewriting this parser out again and
615 | again in full.
616 | 
617 | > parensEN :: Parser SimpleExpr -> Parser SimpleExpr
618 | > parensEN simpleExprImpl = do
619 | >     void $ lexeme $ char '('
620 | >     e <- simpleExprImpl
621 | >     void $ lexeme $ char ')'
622 | >     return $ Parens e
623 | 
624 | Here is a new parser, which parses expressions except add.
625 | 
626 | > term :: Parser SimpleExpr -> Parser SimpleExpr
627 | > term simpleExprImpl = numE <|> varE <|> parensEN simpleExprImpl
628 | 
629 | > term5 :: Parser SimpleExpr
630 | > term5 = term simpleExpr5
631 | 
632 | > addE5 :: Parser SimpleExpr
633 | > addE5 = do
634 | >     e0 <- term5
635 | >     void $ lexeme $ char '+'
636 | >     e1 <- term5
637 | >     return $ Add e0 e1
638 | 
639 | > simpleExpr5 :: Parser SimpleExpr
640 | > simpleExpr5 = try addE5 <|> term5
641 | 
642 | ```
643 | *Main> parseWithWhitespace simpleExpr5 "1"
644 | Right (Num 1)
645 | 
646 | *Main> parseWithWhitespace simpleExpr5 "a"
647 | Right (Var "a")
648 | 
649 | *Main> parseWithWhitespace simpleExpr5 "(a)"
650 | Right (Parens (Var "a"))
651 | 
652 | *Main> parseWithWhitespace simpleExpr5 "1+2"
653 | Right (Add (Num 1) (Num 2))
654 | 
655 | *Main> parseWithWhitespace simpleExpr5 "1+a"
656 | Right (Add (Num 1) (Var "a"))
657 | 
658 | *Main> parseWithWhitespace simpleExpr5 "(1+a)"
659 | Right (Parens (Add (Num 1) (Var "a")))
660 | 
661 | *Main> parseWithWhitespace simpleExpr5 "1+a+b"
662 | Left (line 1, column 4):
663 | unexpected '+'
664 | expecting end of input
665 | ```
666 | 
667 | Almost. Let's see what happens when the second `term` in `add` is
668 | changed to the general expression parser.
669 | 
670 | > term6 :: Parser SimpleExpr
671 | > term6 = term simpleExpr6
672 | 
673 | > addE6 :: Parser SimpleExpr
674 | > addE6 = do
675 | >     e0 <- term6
676 | >     void $ lexeme $ char '+'
677 | >     e1 <- simpleExpr6
678 | >     return $ Add e0 e1
679 | 
680 | > simpleExpr6 :: Parser SimpleExpr
681 | > simpleExpr6 = try addE6 <|> term6
682 | 
683 | ```
684 | *Main> parseWithWhitespace simpleExpr6 "a + b + c"
685 | Right (Add (Var "a") (Add (Var "b") (Var "c")))
686 | ```
687 | 
688 | Maybe it looks like we've made it. But there is a problem. We've
689 | parsed the + operator as if it has right associativity:
690 | 
691 | ```
692 | a + b + c -> a + (b + c)
693 | ```
694 | 
695 | But it should be left associative:
696 | 
697 | ```
698 | a + b + c -> (a + b) + c
699 | ```
700 | 
701 | Let's left factor the parsing and fix this:
702 | 
703 | > term7 :: Parser SimpleExpr
704 | > term7 = term simpleExpr7
705 | 
706 | > simpleExpr7 :: Parser SimpleExpr
707 | > simpleExpr7 = do
708 | >     -- first parse a term
709 | >     e <- term7
710 | >     -- then see if it is followed by an '+ expr' suffix
711 | >     maybeAddSuffix e
712 | >   where
713 | >     -- this function takes an expression, and parses a
714 | >     -- '+ expr' suffix, returning an Add expression
715 | >     -- it recursively calls itself via the maybeAddSuffix function
716 | >     addSuffix e0 = do
717 | >         void $ lexeme $ char '+'
718 | >         e1 <- term7
719 | >         maybeAddSuffix (Add e0 e1)
720 | >     -- this is the wrapper for addSuffix, which adapts it so that if
721 | >     -- addSuffix fails, it returns just the original expression
722 | >     maybeAddSuffix e = addSuffix e <|> return e
723 | 
724 | ```
725 | *Main> parseWithWhitespace simpleExpr7 "a + b + c"
726 | Right (Add (Add (Var "a") (Var "b")) (Var "c"))
727 | ```
728 | 
729 | Now the parser seems to work for everything it should.
730 | 
731 | There is a combinator function in Parsec we can use which abstracts
732 | this sort of pattern, `chainl1`.
733 | 
734 | > simpleExpr8 :: Parser SimpleExpr
735 | > simpleExpr8 = chainl1 term8 op
736 | >   where
737 | >     op = do
738 | >         void $ lexeme $ char '+'
739 | >         return Add
740 | >     term8 = term simpleExpr8
741 | 
742 | How does this work? Here is the type of `chainl1`:
743 | 
744 | ```haskell
745 | chainl1 :: Parser a -> Parser (a -> a -> a) -> Parser a
746 | chainl1 term op = ...
747 | ```
748 | 
749 | The type of the Add constructor in pseudocode is:
750 | 
751 | ```haskell
752 | Add :: SimpleExpr -> SimpleExpr -> SimpleExpr
753 | ```
754 | 
755 | The `op` parser here now just parses the operator itself, i.e. '+'
756 | (and not the second expression like our simpleExpr7 parser). The
757 | return from the `op` function is a function which accepts two elements
758 | and combines them using the appropriate operator representation. In
759 | this case, the represenation is the `Add` constructor.
760 | 
761 | You can look at the source
762 | <http://hackage.haskell.org/package/parsec-3.1.3/docs/src/Text-Parsec-Combinator.html#chainl1>
763 | and see if you can understand how it works. If you can't work it out,
764 | you could come back to it later when you have more experience writing
765 | parsing code.
766 | 
767 | == Testing with the examples
768 | 
769 | TODO: write a little manual tester that accepts a parser and a list of
770 | examples, and checks they all parse correctly.
771 | 
772 | == Testing with quickcheck
773 | 
774 | Let's see if we can check with quickcheck. It's a bit tricky testing
775 | parsers in this way, but one way to do something useful is to generate
776 | random asts, convert them to concrete syntax, parse them, and check
777 | the result. We can write a simple 'pretty printer' to convert an ast
778 | to concrete syntax.
779 | 
780 | === a pretty printer
781 | 
782 | TODO: a really simple pretty printer just pasting strings together, no
783 | layout.
784 | 
785 | === the quick check code
786 | 
787 | TODO: write a quickcheck property and arbitary instance and show
788 | running it at the ghci prompt
789 | 
790 | 


--------------------------------------------------------------------------------
/main.css:
--------------------------------------------------------------------------------
  1 | h1, h2 {
  2 |   display:block;
  3 |   background-color: #f0f0f0;
  4 |   border-top: thin #c0c0c0 solid;
  5 |   /*position:relative;*/
  6 |   padding-left:1ex;
  7 |   /*z-index: -10;*/
  8 | }
  9 | h1 {
 10 |   font-size: x-large;
 11 |   /*left: -3ex;*/
 12 |   margin-top: 3ex;
 13 |   /*width: 100%;*/
 14 | }
 15 | h2 {
 16 |   font-size: large;
 17 |   /*left: -1.5ex;*/
 18 |   margin-top: 1.5ex;
 19 |   /*width: 100%;*/
 20 | }
 21 | body {
 22 |   margin-left: 5em;
 23 |   margin-right: 5em;
 24 |   margin-bottom: 5em;
 25 |   margin-top: 2em;
 26 | }
 27 | #TOC {
 28 |   float:right;
 29 |   z-index:10;
 30 |   background-color: #f0f0f0;
 31 |   border: thin #c0c0c0 solid;
 32 |   padding:2ex;
 33 | }
 34 | .header {
 35 |   /*position:relative;
 36 |   left: -4ex;*/
 37 |   border-top: thin #c0c0c0 solid;
 38 |   border-bottom: thin #c0c0c0 solid;
 39 |   display:inline;
 40 |   padding: 1ex;
 41 |   background-color: #f0f0f0;
 42 |   font-weight: bold;
 43 | }
 44 | .footer {
 45 |   text-align: center;
 46 |   font-size: small;
 47 | }
 48 | pre {
 49 |   padding: 0.5ex;
 50 | }
 51 | 
 52 | 
 53 | pre {
 54 |   background-color: #f0f0f0;
 55 | }
 56 | 
 57 | /*
 58 | .SqlPostgresql pre.sourceCode {
 59 |   padding: 0.5em;
 60 |   background-color: #f0f6f6;
 61 | }
 62 | .sql pre.sourceCode {
 63 |   padding: 0.5em;
 64 |   background-color: #f0f6f6;
 65 | }
 66 | .GeneratedSql .SqlPostgresql pre.sourceCode, .SqlPostgresql .GeneratedSql pre.sourceCode {
 67 |   padding: 0.5em;
 68 |   background-color: #f0f6e0;
 69 | }
 70 | .UnusedSql .SqlPostgresql pre.sourceCode, .SqlPostgresql .UnusedSql pre.sourceCode {
 71 |   padding: 0.5em;
 72 |   background-color: #e9e9e9;
 73 | }
 74 | .haskell,.Haskell pre.sourceCode {
 75 |   background-color: #f5f5d9;
 76 | }
 77 | 
 78 | .sh pre.sourceCode {
 79 |   padding: 0.5em;
 80 |   background-color: #f0f0f0;
 81 | }
 82 | */
 83 | table, tr, td {
 84 |   border-collapse:collapse;
 85 |   cell-padding:2px;
 86 |   cell-spacing:2px;
 87 | /*  padding:2px
 88 |   spacing:2px
 89 |   margin:2px*/
 90 |   vertical-align:top;
 91 | }
 92 | td pre {
 93 |   width: 98%;
 94 |   height: 98%;
 95 |   vertical-align:top;
 96 | }
 97 | table {
 98 |   width:100%;
 99 |   table-layout:fixed;
100 | }
101 | td {
102 |   width: 50%;
103 |   vertical-align:top;
104 |   overflow:auto;
105 | }
106 | hr {
107 |   border: 0;
108 |   color: black;
109 |   background-color: black;
110 |   height: 1px;
111 |   width: 75%;
112 | }
113 | 
114 | .tablediv {
115 |   width:100%;
116 | }
117 | 
118 | /* higlighting kate */
119 | 
120 | table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode, table.sourceCode pre
121 |    { /*margin: 2; padding: 2; border: 2; */ vertical-align: baseline; border: none; }
122 | td.lineNumbers { border-right: 1px solid #AAAAAA; text-align: right; color: #AAAAAA; padding-right: 5px; padding-left: 5px; }
123 | td.sourceCode { padding-left: 5px; }
124 | pre.sourceCode { }
125 | pre.sourceCode span.Normal { }
126 | pre.sourceCode span.Keyword { color: #007020; font-weight: bold; }
127 | pre.sourceCode span.DataType { color: #902000; }
128 | pre.sourceCode span.DecVal { color: #40a070; }
129 | pre.sourceCode span.BaseN { color: #40a070; }
130 | pre.sourceCode span.Float { color: #40a070; }
131 | pre.sourceCode span.Char { color: #4070a0; }
132 | pre.sourceCode span.String { color: #4070a0; }
133 | pre.sourceCode span.Comment { color: #60a0b0; font-style: italic; }
134 | pre.sourceCode span.Others { color: #007020; }
135 | pre.sourceCode span.Alert { color: red; font-weight: bold; }
136 | pre.sourceCode span.Function { color: #06287e; }
137 | pre.sourceCode span.RegionMarker { }
138 | pre.sourceCode span.Error { color: red; font-weight: bold; }
139 | 


--------------------------------------------------------------------------------
/make_website.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | # this file is used to render the html via asciidoc from the source
 6 | # files and the README.asciidoc
 7 | 
 8 | mkdir -p build
 9 | 
10 | set +e
11 | rm build/IntroToParsing.lhs
12 | set -e
13 | 
14 | cat GettingStarted.lhs VerySimpleExpressions.lhs ApplicativeStyle.lhs CombinatorReview.lhs FunctionsAndTypesForParsing.lhs TextParsecExpr.lhs AnIssueWithTokenParsers.lhs TextParsecPerm.lhs TextParsecToken.lhs ValueExpressions.lhs QueryExpressions.lhs FromClause.lhs SimpleSQLQueryParser0.lhs PrettyPrinting0.lhs ErrorMessages.lhs > build/IntroToParsing.lhs
15 | 
16 | echo > build/intro_to_parsing.asciidoc
17 | echo :toc: right >> build/intro_to_parsing.asciidoc
18 | echo :sectnums: >> build/intro_to_parsing.asciidoc
19 | echo :toclevels: 10 >> build/intro_to_parsing.asciidoc
20 | echo :source-highlighter: pygments >> build/intro_to_parsing.asciidoc
21 | echo >> build/intro_to_parsing.asciidoc
22 | 
23 | cat README.asciidoc | runhaskell render/FixReadme.lhs >> build/intro_to_parsing.asciidoc
24 | cat build/IntroToParsing.lhs | runhaskell render/Render.lhs >> build/intro_to_parsing.asciidoc
25 | 
26 | cat build/intro_to_parsing.asciidoc | asciidoctor -d book - | runhaskell AddLinks.lhs > build/index.html
27 | 


--------------------------------------------------------------------------------
/render/FixReadme.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | Fix the README for the html genereation
 3 | 
 4 | > import Data.List
 5 | 
 6 | > main :: IO ()
 7 | > main = do
 8 | >     ls <- getContents
 9 | >     putStrLn $ unlines $ map fixLinks $ lines ls
10 | 
11 | > fixLinks x | "link:" `isPrefixOf` x = []
12 | > fixLinks x | "// <<" `isPrefixOf` x = drop 3 x
13 | > fixLinks x = x
14 | 


--------------------------------------------------------------------------------
/render/Render.lhs:
--------------------------------------------------------------------------------
 1 | 
 2 | This code is to take the .lhs and add some formatting so it renders
 3 | nicely with asciidoctor.
 4 | 
 5 | TODO
 6 | 
 7 | concat docs then render as multiple pages
 8 | get the inter/intra doc links working
 9 | 
10 | find a better fix for the * issue
11 | 
12 | > import System.Environment
13 | > import Data.Char
14 | > import System.IO
15 | > import System.FilePath
16 | > import Data.List
17 | 
18 | > main :: IO ()
19 | > main = do
20 | >     --[fn] <- getArgs
21 | >     src <- getContents -- readFile fn
22 | >     --let title = makeTitle $ takeBaseName fn
23 | >     --putStrLn $ ":toc: right\n:toclevels: 8\n:sectnums:\n"
24 | >     --putStrLn "= Introduction to parsing in Haskell with Parsec\n"
25 | >     putStrLn . unlines . map fixHeadings
26 | >       . addBlocks False . lines $ src
27 | >   where
28 | >     makeTitle (c:c1:cs) | isUpper c && isUpper c1 =
29 | >       let (rest,cs') = span isUpper cs
30 | >       in case (rest,cs') of
31 | >             ([],_) -> ' ' : c : ' ' : c1 : makeTitle cs'
32 | >             (_, d:_) | isDigit d -> ' ' : c : c1 :
33 | >                                     (rest ++ " " ++  makeTitle cs')
34 | >             _ -> ' ' : c : c1 : (init rest ++  " " ++ [last rest]
35 | >                                    ++ makeTitle cs')
36 | >     makeTitle (c:cs) | isUpper c = ' ' : toUpper c : makeTitle cs
37 | >     makeTitle (c:cs) = c : makeTitle cs
38 | >     makeTitle [] = []
39 | >     fixHeadings ('=':xs) = '=':'=':xs
40 | >     fixHeadings xs = xs
41 | >     addBlocks False (l:ls)
42 | >         | isLiterate l =
43 | >             "[source,haskell]\n----" : stripLiterate l
44 | >             : addBlocks True ls
45 | >         | otherwise = l : addBlocks False ls
46 | 
47 | >     addBlocks True (l:ls)
48 | >         | isLiterate l =
49 | >             stripLiterate l : addBlocks True ls
50 | >         | otherwise = "----" : l : addBlocks False ls
51 | >     addBlocks _ [] = []
52 | >     stripLiterate ('>':' ':l) = l
53 | >     stripLiterate (">") = ""
54 | >     stripLiterate l = l
55 | >     isLiterate ('>':_) = True
56 | >     isLiterate _ = False
57 | 
58 | escape asterisk: rules worked out by experiment. If there is an
59 | asterisk on a non code block line, and there are more asterisks on the
60 | line, then put a backslash before the first asterisk only to stop it
61 | from being interpreted as bold formatting. Not sure why only the first
62 | one should have a backslash, or why if you put a backslash in front of
63 | subsequence asterisks, then the backslash is rendered ...
64 | 
65 | Doesn't work, need to parse paragraphs not lines, todo, and need to
66 | make sure it doesn't replace inside blocks
67 | 
68 | >     {-escapeAsterisk l
69 | >         | length (elemIndices '*' l) > 1 =
70 | >               let (a,b) = break (=='*') l
71 | >               in a ++ ('\\':b)
72 | >         | otherwise = l-}
73 | 


--------------------------------------------------------------------------------
/source_text:
--------------------------------------------------------------------------------
1 | correct
2 | 


--------------------------------------------------------------------------------