├── .travis.yml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── Setup.hs
├── lexer-applicative.cabal
├── src
│   └── Language
│       └── Lexer
│           └── Applicative.hs
├── stack.yaml
└── tests
    └── test.hs

/.travis.yml:
--------------------------------------------------------------------------------
# This is the complex Travis configuration, which is intended for use
# on open source libraries which need compatibility across multiple GHC
# versions, must work with cabal-install, and should be
# cross-platform. For more information and other options, see:
#
# https://docs.haskellstack.org/en/stable/travis_ci/
#
# Copy these contents into the root directory of your GitHub project in a file
# named .travis.yml

# Use new container infrastructure to enable caching
sudo: false

# Do not choose a language; we provide our own build tools.
language: generic

# Caching so the next build will be fast too.
cache:
  directories:
  - $HOME/.ghc
  - $HOME/.cabal
  - $HOME/.stack

# The different configurations we want to test. We have BUILD=cabal which uses
# cabal-install, and BUILD=stack which uses Stack. More documentation on each
# of those below.
#
# We set the compiler values here to tell Travis to use a different
# cache file per set of arguments.
#
# If you need to have different apt packages for each combination in the
# matrix, you can use a line such as:
# addons: {apt: {packages: [libfcgi-dev,libgmp-dev]}}
matrix:
  include:
  # We grab the appropriate GHC and cabal-install versions from hvr's PPA. See:
  # https://github.com/hvr/multi-ghc-travis
  - env: BUILD=cabal GHCVER=8.0.2 CABALVER=1.24 HAPPYVER=1.19.5 ALEXVER=3.1.7
    compiler: ": #GHC 8.0.2"
    addons: {apt: {packages: [cabal-install-1.24,ghc-8.0.2], sources: [hvr-ghc]}}
  - env: BUILD=cabal GHCVER=8.4.3 CABALVER=1.24
    compiler: ": #GHC 8.4.3"
    addons: {apt: {packages: [cabal-install-1.24,ghc-8.4.3], sources: [hvr-ghc]}}

  # The Stack builds. We can pass in arbitrary Stack arguments via the ARGS
  # variable, such as using --stack-yaml to point to a different file.
  - env: BUILD=stack ARGS="--resolver lts-8"
    compiler: ": #stack 8.0.2"
    addons: {apt: {packages: [libgmp-dev]}}

  - env: BUILD=stack ARGS="--resolver lts-10"
    compiler: ": #stack 8.2.2"
    addons: {apt: {packages: [libgmp-dev]}}

  allow_failures:
  - env: BUILD=cabal GHCVER=head CABALVER=head HAPPYVER=1.19.5 ALEXVER=3.1.7
  - env: BUILD=stack ARGS="--resolver nightly"

before_install:
  # Using compiler above sets CC to an invalid value, so unset it
  - unset CC

  # We want to always allow newer versions of packages when building on GHC HEAD
  - CABALARGS=""
  - if [ "x$GHCVER" = "xhead" ]; then CABALARGS=--allow-newer; fi

  # Download and unpack the stack executable
  - export PATH=/opt/ghc/$GHCVER/bin:/opt/cabal/$CABALVER/bin:$HOME/.local/bin:/opt/alex/$ALEXVER/bin:/opt/happy/$HAPPYVER/bin:$HOME/.cabal/bin:$PATH
  - mkdir -p ~/.local/bin
  - |
    if [ `uname` = "Darwin" ]
    then
      travis_retry curl --insecure -L https://www.stackage.org/stack/osx-x86_64 | tar xz --strip-components=1 --include '*/stack' -C ~/.local/bin
    else
      travis_retry curl -L https://www.stackage.org/stack/linux-x86_64 | tar xz --wildcards --strip-components=1 -C ~/.local/bin '*/stack'
    fi

    # Use the more reliable S3 mirror of Hackage
    mkdir -p $HOME/.cabal
    echo 'remote-repo: hackage.haskell.org:http://hackage.fpcomplete.com/' > $HOME/.cabal/config
    echo 'remote-repo-cache: $HOME/.cabal/packages' >> $HOME/.cabal/config

    if [ "$CABALVER" != "1.16" ]
    then
      echo 'jobs: $ncpus' >> $HOME/.cabal/config
    fi

install:
  - echo "$(ghc --version) [$(ghc --print-project-git-commit-id 2> /dev/null || echo '?')]"
  - if [ -f configure.ac ]; then autoreconf -i; fi
  - |
    set -ex
    case "$BUILD" in
      stack)
        stack --no-terminal --install-ghc $ARGS test --bench --only-dependencies
        ;;
      cabal)
        cabal --version
        travis_retry cabal update

        # Get the list of packages from the stack.yaml file
        PACKAGES=$(stack --install-ghc query locals | grep '^ *path' | sed 's@^ *path:@@')

        cabal install --only-dependencies --enable-tests --enable-benchmarks --force-reinstalls --ghc-options=-O0 --reorder-goals --max-backjumps=-1 $CABALARGS $PACKAGES
        ;;
    esac
    set +ex

script:
  - |
    set -ex
    case "$BUILD" in
      stack)
        stack --no-terminal $ARGS test --bench --no-run-benchmarks --haddock --no-haddock-deps
        ;;
      cabal)
        cabal install --enable-tests --enable-benchmarks --force-reinstalls --ghc-options=-O0 --reorder-goals --max-backjumps=-1 $CABALARGS $PACKAGES

        ORIGDIR=$(pwd)
        for dir in $PACKAGES
        do
          cd $dir
          cabal check || [ "$CABALVER" == "1.16" ]
          cabal sdist
          PKGVER=$(cabal info . | awk '{print $2;exit}')
          SRC_TGZ=$PKGVER.tar.gz
          cd dist
          tar zxfv "$SRC_TGZ"
          cd "$PKGVER"
          cabal configure --enable-tests
          cabal build
          cabal test
          cd $ORIGDIR
        done
        ;;
    esac
    set +ex
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
Changelog
=========

2.1.0.2
-------

Fix compatibility with GHC 8.4 and drop support for GHC 7.x

2.1.0.1
-------

Fix a link in the README

2.1
---

* Restore compatibility with older GHCs
* Change the type of `longestShortest`

2.0
---

This is a major redesign of the API. Notable changes:

- The lexer now supports parsing the longest prefix/shortest suffix
  (see `longestShortest`)
- Instead of throwing an exception, we return a stream. The stream can be
  consumed directly, converted to a plain list of tokens, or converted to an
  `Either`-wrapped list that reports the first lexical error.

1.1.1
-----

Add `tokensEither`

1.1
---

Upgrade to srcloc 0.5

1.0.0.1
-------

Signal a lexical error (instead of looping) when a regex does not consume any
characters
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015 Roman Cheplyaka

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A simple applicative lexer based on the article
[Lexical analysis with parser combinators][1]
and the [regex-applicative][2] library.
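
A minimal usage sketch (the token type and regular expressions below are
illustrative, not part of the package):

```haskell
import Language.Lexer.Applicative
import Text.Regex.Applicative
import Data.Char (isAlpha, isDigit, isSpace)
import Data.Loc (L(..))

data MyToken = Ident String | Number Int
  deriving Show

myLexer :: Lexer MyToken
myLexer = mconcat
  [ token      (longest $ Ident <$> some (psym isAlpha))
  , token      (longest $ Number . read <$> some (psym isDigit))
  , whitespace (longest $ some (psym isSpace))
  ]

-- prints each token together with its source location
main :: IO ()
main = mapM_ (\(L loc t) -> print (loc, t)) . streamToList $
  runLexer myLexer "input" "foo 42 bar"
```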

[1]: https://ro-che.info/articles/2015-01-02-lexical-analysis
[2]: http://hackage.haskell.org/package/regex-applicative-0.3.1/docs/Text-Regex-Applicative.html
--------------------------------------------------------------------------------
/Setup.hs:
--------------------------------------------------------------------------------
import Distribution.Simple
main = defaultMain
--------------------------------------------------------------------------------
/lexer-applicative.cabal:
--------------------------------------------------------------------------------
-- Initial lexer-applicative.cabal generated by cabal init. For further
-- documentation, see http://haskell.org/cabal/users-guide/

name:                lexer-applicative
version:             2.1.0.2
synopsis:            Simple lexer based on applicative regular expressions
description:         Simple lexer based on applicative regular expressions
homepage:            https://github.com/feuerbach/lexer-applicative
license:             MIT
license-file:        LICENSE
author:              Roman Cheplyaka
maintainer:          Roman Cheplyaka
-- copyright:
category:            Language
build-type:          Simple
extra-source-files:
  README.md
  CHANGELOG.md
cabal-version:       >=1.10

source-repository head
  type:     git
  location: git://github.com/feuerbach/lexer-applicative.git

library
  exposed-modules:
    Language.Lexer.Applicative
  -- other-modules:
  -- other-extensions:
  build-depends:
    base >=4.9 && < 5,
    srcloc >= 0.5,
    regex-applicative >= 0.3.1
  hs-source-dirs:      src
  default-language:    Haskell2010
  ghc-options:         -Wall

test-suite test
  default-language:
    Haskell2010
  type:
    exitcode-stdio-1.0
  hs-source-dirs:
    tests
  main-is:
    test.hs
  build-depends:
      base >= 4 && < 5
    , tasty >= 0.10
    , tasty-hunit >= 0.9
    , regex-applicative
    , lexer-applicative
    , srcloc
    , deepseq
--------------------------------------------------------------------------------
/src/Language/Lexer/Applicative.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ScopedTypeVariables, DeriveDataTypeable, DeriveFunctor #-}
-- | For some background, see
-- <https://ro-che.info/articles/2015-01-02-lexical-analysis>
module Language.Lexer.Applicative
  (
    -- * Building a Lexer
    Lexer(..)
  , token
  , whitespace
    -- ** Building Recognizers
  , Recognizer
  , longest
  , longestShortest
    -- * Running a Lexer
  , runLexer
    -- ** Working with a token stream
  , TokenStream(..)
  , streamToList
  , streamToEitherList
  , LexicalError(..)
  ) where

import Text.Regex.Applicative
import Data.Loc
import Data.List
import Data.Typeable (Typeable)
import Data.Semigroup (Semigroup(..))
import Data.Function
import Control.Exception

----------------------------------------------------------------------
-- Lexer
----------------------------------------------------------------------

-- | A 'Lexer' specification consists of two recognizers: one for
-- meaningful tokens and one for whitespace and comments.
--
-- Although you can construct 'Lexer's directly, it is more convenient to
-- build them with 'token', 'whitespace', and the 'Monoid' instance like this:
--
-- @
-- myLexer :: 'Lexer' MyToken
-- myLexer = 'mconcat'
--   [ 'token'      ('longest' myToken)
--   , 'whitespace' ('longest' myWhiteSpace)
--   , 'whitespace' ('longestShortest' myCommentPrefix myCommentSuffix)
--   ]
-- @
data Lexer tok = Lexer
  { lexerTokenRE      :: Recognizer tok
  , lexerWhitespaceRE :: Recognizer ()
  }
  deriving Functor

instance Semigroup (Lexer tok) where
  Lexer t1 w1 <> Lexer t2 w2 = Lexer (t1 <> t2) (w1 <> w2)

instance Monoid (Lexer tok) where
  mempty = Lexer mempty mempty
  mappend = (<>)

-- | Build a lexer with the given token recognizer and no (i.e. 'mempty')
-- whitespace recognizer.
--
-- 'token' is a monoid homomorphism:
--
-- @'token' a '<>' 'token' b = 'token' (a '<>' b)@
token :: Recognizer tok -> Lexer tok
token r = Lexer r mempty

-- | Build a lexer with the given whitespace recognizer and no (i.e. 'mempty')
-- token recognizer.
--
-- 'whitespace' is a monoid homomorphism:
--
-- @'whitespace' a '<>' 'whitespace' b = 'whitespace' (a '<>' b)@
whitespace :: Recognizer a -> Lexer tok
whitespace r = Lexer mempty (() <$ r)
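
-- An illustrative consequence of the two homomorphism laws above (the
-- names below are hypothetical, not part of this module): recognizers
-- may be merged either before or after being wrapped into a 'Lexer',
--
-- @
-- 'token' recA '<>' 'token' recB = 'token' (recA '<>' recB)
-- @
--
-- which is what makes the 'mconcat' style shown in the 'Lexer'
-- documentation equivalent to combining everything inside a single
-- 'token' call and a single 'whitespace' call.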

----------------------------------------------------------------------
-- Recognizer
----------------------------------------------------------------------

-- | A token recognizer
--
-- 'Recognizer' values are constructed by functions like 'longest' and
-- 'longestShortest', combined with 'mappend', and used by 'token' and
-- 'whitespace'.
--
-- When a recognizer returns without consuming any characters, a lexical
-- error is signaled.
newtype Recognizer tok = Recognizer (RE Char (RE Char tok))
  deriving Functor

instance Semigroup (Recognizer tok) where
  Recognizer r1 <> Recognizer r2 = Recognizer (r1 <|> r2)

instance Monoid (Recognizer tok) where
  mempty = Recognizer empty
  mappend = (<>)

-- | When scanning the next token, the regular expression will compete with
-- the other 'Recognizer's of its 'Lexer'. If it wins, its result
-- will become the next token.
--
-- 'longest' has the following properties:
--
-- * @'longest' (r1 '<|>' r2) = 'longest' r1 '<>' 'longest' r2@
--
-- * @'longest' r = 'longestShortest' r 'pure'@
longest
  :: RE Char tok
  -> Recognizer tok
longest re = longestShortest re pure
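
-- An illustrative competition between two 'longest' recognizers (the
-- names and constructors are hypothetical):
--
-- @
-- kw    = 'longest' (If    '<$'  'string' "if")
-- ident = 'longest' (Ident '<$>' 'some' ('psym' isAlpha))
-- @
--
-- On the input @"iffy"@, the lexer @'token' kw '<>' 'token' ident@
-- produces an @Ident@ token: the identifier regex matches the longer
-- prefix (all four characters), so it beats the two-character keyword.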

-- | This is a more sophisticated recognizer than 'longest'.
--
-- It recognizes a token consisting of a prefix and a suffix, where the
-- prefix is matched as long as possible and the suffix as short as possible.
--
-- An example would be a C block comment
--
-- >/* comment text */
--
-- The naive
--
-- @'longest' ('string' "\/*" '*>' 'many' 'anySym' '*>' 'string' "*\/")@
--
-- doesn't work because it consumes too much: in
--
-- >/* xxx */ yyy /* zzz */
--
-- it will treat the whole line as a comment.
--
-- This is where 'longestShortest' comes in handy:
--
-- @
-- 'longestShortest'
--   ('string' "\/*")
--   (\\_ -> 'many' 'anySym' '*>' 'string' "*\/")
-- @
--
-- Operationally, the prefix regex first competes with other 'Recognizer's
-- for the longest match. If it wins, the suffix regex is computed from the
-- prefix's result, its shortest match is found, and that match's result
-- becomes the token.
--
-- The two regular expressions combined must consume some input, or else a
-- 'LexicalError' is thrown. However, either one of them individually may
-- match without consuming input.
--
-- \* * *
--
-- Once the prefix regex wins, the choice is committed; the suffix regex
-- must match or else a 'LexicalError' is thrown. Therefore,
--
-- @
-- 'longestShortest' pref suff1
--    '<>'
-- 'longestShortest' pref suff2
--    =
-- 'longestShortest' pref suff1
-- @
--
-- and is not the same as
--
-- @'longestShortest' pref (suff1 '<|>' suff2)@
--
-- The following holds, however:
--
-- @
-- 'longestShortest' pref1 suff
--    '<>'
-- 'longestShortest' pref2 suff
--    =
-- 'longestShortest' (pref1 '<|>' pref2) suff
-- @
longestShortest
  :: RE Char pref          -- ^ regex for the longest prefix
  -> (pref -> RE Char tok) -- ^ regex for the shortest suffix
  -> Recognizer tok
longestShortest prefRE suffRE =
  Recognizer $
    suffRE <$> prefRE

----------------------------------------------------------------------
-- Running a Lexer
----------------------------------------------------------------------

-- | The lexical error exception
data LexicalError = LexicalError !Pos
  deriving (Eq, Typeable)

instance Show LexicalError where
  show (LexicalError pos) = "Lexical error at " ++ displayPos pos
instance Exception LexicalError

-- | A stream of tokens
data TokenStream tok
  = TsToken tok (TokenStream tok)
  | TsEof
  | TsError LexicalError
  deriving (Eq, Functor, Show)

-- | Convert a 'TokenStream' to a list of tokens. Turn 'TsError' into
-- a runtime 'LexicalError' exception.
streamToList :: TokenStream tok -> [tok]
streamToList stream =
  case stream of
    TsToken t stream' -> t : streamToList stream'
    TsEof             -> []
    TsError e         -> throw e

-- | Convert a 'TokenStream' into either a token list or a 'LexicalError'.
-- This function may be occasionally useful, but in general its use is
-- discouraged because it needs to force the whole stream before returning
-- a result.
streamToEitherList :: TokenStream tok -> Either LexicalError [tok]
streamToEitherList =
  sequence .
  fix (\rec stream ->
    case stream of
      TsToken t stream' -> Right t : rec stream'
      TsEof             -> []
      TsError e         -> [Left e]
    )
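
-- A small illustration of the difference (@err@ stands for an arbitrary
-- 'LexicalError'; this sketch is not part of the original module):
--
-- @
-- 'streamToList' (TsToken \'x\' TsEof)                  = "x"
-- take 1 ('streamToList' (TsToken \'x\' (TsError err))) = "x"       -- lazy
-- 'streamToEitherList' (TsToken \'x\' (TsError err))    = Left err  -- forces everything
-- @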

-- | Run a lexer on a string and produce a lazy stream of tokens
runLexer
  :: forall tok.
     Lexer tok -- ^ lexer specification
  -> String    -- ^ source file name (used in locations)
  -> String    -- ^ source text
  -> TokenStream (L tok)
runLexer (Lexer (Recognizer pToken) (Recognizer pJunk)) src = go . annotate src
  where
    go l = case l of
      [] -> TsEof
      s@((_, pos1, _):_) ->
        let
          -- last position in the stream
          -- in this branch s is non-empty, so this is safe
          last_pos :: Pos
          last_pos = case last s of (_, p, _) -> p
        in
          case findLongestPrefix re s of

            Nothing -> TsError (LexicalError pos1)

            Just (shortest_re, rest1) ->

              case findShortestPrefix shortest_re rest1 of
                Nothing -> TsError . LexicalError $
                  case rest1 of
                    (_, _, p):_ -> p
                    []          -> last_pos

                -- If the combined match is empty, we have a lexical error
                Just (_, (_, pos1', _):_) | pos1' == pos1 ->
                  TsError $ LexicalError pos1

                Just (Just tok, rest) ->
                  let
                    pos2 =
                      case rest of
                        (_, _, p):_ -> p
                        []          -> last_pos

                  in TsToken (L (Loc pos1 pos2) tok) (go rest)

                Just (Nothing, rest) -> go rest

    extend :: RE Char a -> RE (Char, Pos, Pos) a
    extend = comap (\(c, _, _) -> c)

    re :: RE (Char, Pos, Pos) (RE (Char, Pos, Pos) (Maybe tok))
    re = extend . fmap extend $
      ((Just <$>) <$> pToken) <|> ((Nothing <$) <$> pJunk)

annotate
  :: String             -- ^ source file name
  -> String             -- ^ contents
  -> [(Char, Pos, Pos)] -- ^ the character, its position, and the previous position
annotate src s = snd $ mapAccumL f (startPos src, startPos src) s
  where
    f (pos, prev_pos) ch =
      let pos' = advancePos pos ch
      in pos' `seq` ((pos', pos), (ch, pos, prev_pos))
--------------------------------------------------------------------------------
/stack.yaml:
--------------------------------------------------------------------------------
extra-deps: []
resolver: lts-10.6
packages:
- '.'
--------------------------------------------------------------------------------
/tests/test.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ScopedTypeVariables #-}

import Test.Tasty
import Test.Tasty.HUnit

import Language.Lexer.Applicative
import Text.Regex.Applicative
import Text.Regex.Applicative.Common
import Data.Char
import Data.Monoid
import Data.Loc
import Control.Exception
import Control.DeepSeq

ws = whitespace $ longest $ some (psym isSpace)
-- this is bad, because it accepts an empty string
badWhitespace = whitespace $ longest $ many (psym isSpace)

word = longestToken $ many $ psym isAlpha

longestToken = token . longest

tokens l n s = streamToList $ runLexer l n s
tokensEither l n s = streamToEitherList $ runLexer l n s

unloc (L l a) = (a, l)

-- This recognizes C-style block comments like /* ... */,
-- but also matching delimiters like /*** ... ***/ (to make it more fun)
blockComment = token $
  longestShortest
    ((++) <$> string "/" <*> many (sym '*'))
    (\start -> (,) start <$> ((++) <$> many anySym <*> string (reverse start)))
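
-- Illustrative walk-through (mirrors the "longestShortest (success)" case
-- below): on "/*** abc ***/", the prefix regex takes its longest match,
-- "/***"; the suffix regex built from that prefix then takes its shortest
-- match, " abc ***/", which ends in reverse "/***" == "***/".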

main = defaultMain $ testGroup "Tests"
  [ testCase "Empty string" $
      tokens (longestToken (empty :: RE Char Int) <> whitespace mempty) "-" "" @?= []
  , testCase "Space- and newline-separated numbers" $
      unloc <$> tokens (longestToken decimal <> ws) "-" "1\n 23 456" @?=
        [ (1,  Loc (Pos "-" 1 1 0) (Pos "-" 1 1 0))
        , (23, Loc (Pos "-" 2 2 3) (Pos "-" 2 3 4))
        , (456,Loc (Pos "-" 2 6 7) (Pos "-" 2 8 9))
        ]
  , testCase "Nullable parser, no error" $ do
      let r = tokensEither (longestToken decimal <> badWhitespace) "-" "31 45"
      case r of
        Right (_ :: [L Int]) -> return ()
        Left e -> assertFailure $ show e
  , testCase "Nullable parser, error" $ do
      let r = tokensEither (longestToken decimal <> badWhitespace) "-" "31? 45"
      case r of
        Right (_ :: [L Int]) -> assertFailure "No error?"
        Left (LexicalError p) -> p @?= Pos "-" 1 3 2
  , testCase "No matches, error" $ do
      tokensEither (longestToken decimal) "-" " " @?= Left (LexicalError (Pos "-" 1 1 0))
  , testCase "No matches after a recognized token" $ do
      fmap unloc (runLexer (longestToken decimal <> ws) "-" "2 x") @?=
        TsToken (2 :: Int, Loc (Pos "-" 1 1 0) (Pos "-" 1 1 0)) (TsError $ LexicalError (Pos "-" 1 3 2))
  , testCase "streamToList throws an exception upon failure" $ do
      r :: Either LexicalError [L ()] <- try . evaluate . force $ tokens mempty "-" " "
      r @?= Left (LexicalError (Pos "-" 1 1 0))
  , testCase "longestShortest (success)" $
      fmap (map unLoc)
        (tokensEither ((Left <$> blockComment) <> (Right <$> word) <> ws)
          "-"
          "/* xxx */ yyy /*** abc ***/ ef")
      @?=
      Right [Left ("/*"," xxx */"),Right "yyy",Left ("/***"," abc ***/"),Right "ef"]
  , testCase "longestShortest (failure of shortest; end of stream)" $
      (tokensEither (whitespace $ longestShortest (string "abc") (const empty))
        "-"
        "abc" :: Either LexicalError [L ()])
      @?=
      Left (LexicalError (Pos "-" 1 3 2))
  , testCase "longestShortest (failure of shortest; not end of stream)" $
      (tokensEither (whitespace $ longestShortest (string "abc") (const empty))
        "-"
        "abc " :: Either LexicalError [L ()])
      @?=
      Left (LexicalError (Pos "-" 1 3 2))
  ]

-- orphan
instance NFData a => NFData (L a) where
  rnf (L loc a) = loc `seq` rnf a
--------------------------------------------------------------------------------