├── test ├── Spec.hs ├── RegExp │ ├── Internal │ │ └── DFASpec.hs │ ├── OperationsSpec.hs │ └── RegExpSpec.hs ├── Helpers.hs └── Data │ └── GSetSpec.hs ├── Setup.hs ├── paper ├── mathpartir.sty ├── Makefile ├── bibliography-extra.bib ├── bibliography.bibdb ├── macro │ ├── generic.sty │ └── code.sty ├── .gitignore └── report.tex ├── .gitignore ├── stack.yaml ├── submission.sh ├── .travis.yml ├── src ├── Data │ ├── KleeneAlgebra.hs │ ├── BooleanAlgebra.hs │ ├── Semiring.hs │ └── GSet.hs ├── RegExp │ ├── Language.hs │ ├── Operations.hs │ ├── Equation.hs │ ├── Derivative.hs │ ├── Internal │ │ └── DFA.hs │ └── RegExp.hs ├── SparseVector.hs └── SparseMatrix.hs ├── LICENSE ├── regexp.cabal └── README.md /test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /paper/mathpartir.sty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cacay/regexp/HEAD/paper/mathpartir.sty -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Haskell 2 | cabal.config 3 | .stack-work 4 | 5 | # JetBrains 6 | .idea 7 | 8 | # Distribution 9 | *.tar 10 | *.tar.gz 11 | *.pdf 12 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | 3 | packages: 4 | - '.' 5 | 6 | resolver: lts-12.7 7 | 8 | extra-deps: 9 | - semiring-num-1.6.0.1 10 | 11 | nix: 12 | enable: false 13 | packages: 14 | -------------------------------------------------------------------------------- /paper/Makefile: -------------------------------------------------------------------------------- 1 | LATEXMK = latexmk -pdf 2 | 3 | all: report.pdf 4 | 5 | report.pdf: report.tex bibliography.bib bibliography-extra.bib *.sty macro/*.sty 6 | $(LATEXMK) report 7 | 8 | bibliography.bib: bibliography.bibdb 9 | bibdb bibliography.bibdb 10 | 11 | clean: 12 | $(LATEXMK) -C report 13 | rm bibliography.bib 14 | 15 | .PHONY: all clean 16 | -------------------------------------------------------------------------------- /paper/bibliography-extra.bib: -------------------------------------------------------------------------------- 1 | % Entries that [bibdb](https://github.com/cacay/bibdb) cannot track. 
2 | @techreport{CheneyH03, 3 | title={First-class phantom types}, 4 | author={Cheney, James and Hinze, Ralf}, 5 | year={2003}, 6 | institution={Cornell University} 7 | } 8 | 9 | @techreport{HopcroftK71, 10 | title={A linear algorithm for testing equivalence of finite automata}, 11 | author={Hopcroft, John E and Karp, Richard M}, 12 | year={1971}, 13 | institution={Cornell University} 14 | } 15 | -------------------------------------------------------------------------------- /submission.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Create a new temporary branch 6 | git checkout -b dist 7 | 8 | # Generate report.pdf 9 | (cd paper && make && rm -f ../report.pdf && cp report.pdf ../) 10 | 11 | # Commit report.pdf 12 | git add -f report.pdf 13 | git commit -m "Add paper" 14 | 15 | # Create an archive containing the source code and the report 16 | git archive --prefix=regexp/ -o dist.tar.gz HEAD 17 | 18 | # Delete the temporary branch 19 | git checkout master && git branch -D dist 20 | -------------------------------------------------------------------------------- /paper/bibliography.bibdb: -------------------------------------------------------------------------------- 1 | -- Regular Expressions 2 | DBLP:journals/jacm/Brzozowski64 as Brzozowski64 3 | DBLP:journals/ijfcs/CampeanuSY03 as CampeanuSY03 4 | DBLP:conf/lata/GruberG10 as GruberG10 5 | DBLP:conf/cpp/DoczkalKS13 as DoczkalKS13 6 | DBLP:journals/iandc/Kozen94 as Kozen94 7 | DBLP:conf/fsttcs/KeilT14 as KeilT14 8 | HAL:hal-00639716v2 as BonchiP11 9 | 10 | -- Haskell Libraries 11 | DBLP:conf/haskell/EisenbergW12 as Singletons 12 | DBLP:conf/haskell/RuncimanNL08 as SmallCheck 13 | DBLP:conf/icfp/ClaessenH00 as QuickCheck 14 | 15 | -- Complexity 16 | DBLP:conf/focs/Kozen77 as Kozen77 17 | DBLP:journals/tocl/GeladeN12 as GeladeN12 18 | DBLP:conf/popl/FosterKM0T15 as FosterKM0T15 19 | -------------------------------------------------------------------------------- /test/RegExp/Internal/DFASpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE OverloadedLists #-} 3 | 4 | module RegExp.Internal.DFASpec where 5 | 6 | import Test.Hspec 7 | import Test.QuickCheck 8 | import Helpers 9 | 10 | import Data.Either(isRight) 11 | 12 | import RegExp.RegExp 13 | import RegExp.Internal.DFA 14 | import RegExp.Derivative(equivalent) 15 | 16 | import Data.GSet() 17 | 18 | 19 | spec :: Spec 20 | spec = do 21 | describe "regexp" $ do 22 | it "is the inverse of fromRegExp" $ do 23 | mapSize (`div` 4) $ 24 | property $ \(r :: RegExp Helpers.Small) -> 25 | regexp (fromRegExp r) `shouldSatisfy` (isRight . equivalent r) 26 | -------------------------------------------------------------------------------- /test/Helpers.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE UndecidableInstances #-} 7 | 8 | 9 | -- | Helper functions for testing. 10 | module Helpers where 11 | 12 | import GHC.Generics 13 | 14 | import Test.QuickCheck.Arbitrary 15 | import Test.QuickCheck.Gen 16 | import Test.SmallCheck.Series 17 | 18 | 19 | -- | A finite data type with a few constructors. Useful with SmallCheck. 
20 | data Small 21 | = A 22 | | B 23 | | C 24 | deriving (Eq, Ord, Bounded, Enum, Show, Read, Generic) 25 | 26 | 27 | instance Monad m => Serial m Small 28 | 29 | instance Arbitrary Small where 30 | arbitrary = 31 | elements [minBound.. ] 32 | 33 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use new container infrastructure to enable caching 2 | sudo: false 3 | 4 | # Do not choose a language; we provide our own build tools. 5 | language: generic 6 | 7 | # Caching so the next build will be fast too. 8 | cache: 9 | directories: 10 | - $HOME/.stack 11 | 12 | # Ensure necessary system libraries are present 13 | addons: 14 | apt: 15 | packages: 16 | - libgmp-dev 17 | 18 | before_install: 19 | # Download and unpack the stack executable 20 | - mkdir -p ~/.local/bin 21 | - export PATH=$HOME/.local/bin:$PATH 22 | - travis_retry curl -L https://get.haskellstack.org/stable/linux-x86_64.tar.gz | tar xz --wildcards --strip-components=1 -C ~/.local/bin '*/stack' 23 | 24 | install: 25 | # Build dependencies 26 | - stack --no-terminal --install-ghc test --only-dependencies 27 | 28 | script: 29 | # Build the package, its tests, and its docs and run the tests 30 | - stack --no-terminal test --haddock --no-haddock-deps 31 | -------------------------------------------------------------------------------- /src/Data/KleeneAlgebra.hs: -------------------------------------------------------------------------------- 1 | -- | Definition of Kleene algebras. 2 | module Data.KleeneAlgebra 3 | ( KleeneAlgebra(..) 4 | ) where 5 | 6 | import Data.Semiring (Semiring (..)) 7 | 8 | 9 | -- | A Kleene algebra is an /idempotent/ semiring with an additional operation 10 | -- called the Kleene star. In addition to the semiring axioms, a Kleene algebra 11 | -- needs to satisfy the following properties: 12 | -- 13 | -- == Idempotence of '<+>' 14 | -- @a '<+>' a = a@ 15 | -- 16 | -- == Properties of 'star' 17 | -- @'one' '<+>' (a '<.>' 'star' a) <= 'star' a@ 18 | -- 19 | -- @'one' '<+>' ('star' a '<.>' a) <= 'star' a@ 20 | -- 21 | -- @b '<+>' (a '<.>' x) <= x ==> ('star' a) '<.>' b <= x@ 22 | -- 23 | -- @b '<+>' (x '<.>' a) <= x ==> b '<.>' ('star' a) <= x@ 24 | -- 25 | -- Here, @a <= b@ is defined as @a '<+>' b = b@. 26 | class Semiring a => KleeneAlgebra a where 27 | -- | Kleene star. Captures the notion of /iteration/. 28 | star :: a -> a 29 | 30 | 31 | -- | Booleans form a (trivial) Kleene algebra. 32 | instance KleeneAlgebra Bool where 33 | star _ = 34 | True 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Cosku Acay 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /test/RegExp/OperationsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | module RegExp.OperationsSpec where 4 | 5 | import Test.Hspec 6 | import Test.QuickCheck 7 | import Helpers 8 | 9 | import RegExp.RegExp 10 | import RegExp.Operations 11 | import qualified RegExp.Derivative as RegExp 12 | 13 | import Data.GSet(GSet) 14 | 15 | 16 | spec :: Spec 17 | spec = do 18 | describe "intersection with equation solving" $ do 19 | it "is equivalent to intersection with DFAs" $ do 20 | mapSize (`div` 4) $ 21 | property $ \(r1 :: RegExp Helpers.Small) r2 -> 22 | (r1 `intersectionEquation` r2) `shouldMatch` (r1 `intersection` r2) 23 | 24 | describe "complement with equation solving" $ do 25 | it "is equivalent to complement with DFAs" $ do 26 | mapSize (`div` 4) $ 27 | property $ \(r :: RegExp Helpers.Small) -> 28 | complement r `shouldMatch` complementEquation r 29 | 30 | 31 | -- | Set the expectation that the two given regular expressions 32 | -- are equivalent. 33 | shouldMatch :: (GSet c, Show c, Eq c) => RegExp c -> RegExp c -> Expectation 34 | shouldMatch r1 r2 = 35 | RegExp.equivalent r1 r2 `shouldBe` Right () 36 | -------------------------------------------------------------------------------- /test/Data/GSetSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | module Data.GSetSpec where 4 | 5 | import Test.Hspec 6 | import Test.Hspec.SmallCheck 7 | import Test.SmallCheck 8 | import Helpers 9 | 10 | import GHC.Exts (IsList(..)) 11 | 12 | import Data.GSet 13 | import Data.BooleanAlgebra 14 | 15 | 16 | spec :: Spec 17 | spec = do 18 | describe "Eq.(==)" $ do 19 | it "is reflexive" $ do 20 | property $ \(s :: Set Small) -> s `shouldBe` s 21 | 22 | it "agrees with Ord.compare" $ do 23 | property $ \(s :: Set Small) t -> compare s t == EQ ==> s `shouldBe` t 24 | 25 | it "handles complements" $ do 26 | property $ \(s :: Set Small) -> 27 | s `shouldBe` complement (fromList [a | a <- [A .. 
], not (a `member` s)]) 28 | 29 | 30 | 31 | describe "Ord.compare" $ do 32 | it "is reflexive" $ do 33 | property $ \(s :: Set Small) -> compare s s `shouldBe` EQ 34 | 35 | it "agrees with Eq.(==)" $ do 36 | property $ \(s :: Set Small) t -> s == t ==> compare s t `shouldBe` EQ 37 | 38 | it "is flippable" $ do 39 | property $ \(s :: Set Small) t -> compare s t == LT ==> compare t s`shouldBe` GT 40 | it "is flippable" $ do 41 | property $ \(s :: Set Small) t -> compare s t == GT ==> compare t s`shouldBe` LT 42 | 43 | it "is transitive" $ do 44 | property $ \(s :: Set Small) t u -> 45 | s <= t ==> t <= u ==> s <= u `shouldBe` True 46 | 47 | 48 | -------------------------------------------------------------------------------- /paper/macro/generic.sty: -------------------------------------------------------------------------------- 1 | \NeedsTeXFormat{LaTeX2e} 2 | \ProvidesPackage{macro/generic} 3 | [2016/01/29 version 1.0.0 Generic Macros] 4 | 5 | \RequirePackage{mathtools} 6 | 7 | %%% Paired delimiters 8 | \DeclarePairedDelimiter\parens{(}{)} % parenthesis 9 | \DeclarePairedDelimiter\bracks{\lbrack}{\rbrack} % brackets 10 | \DeclarePairedDelimiter\braces{\lbrace}{\rbrace} % braces 11 | \DeclarePairedDelimiter\abs{\lvert}{\rvert} % absolute value 12 | \DeclarePairedDelimiter\norm{\lVert}{\rVert} % double verts 13 | \DeclarePairedDelimiter\angled{\langle}{\rangle} % angle brackets 14 | \DeclarePairedDelimiter\set{\lbrace}{\rbrace} % braces 15 | \DeclarePairedDelimiter\denot{\llbracket}{\rrbracket} 16 | \DeclarePairedDelimiterX\setdef[2]{\{}{\}} 17 | {#1 \mathrel{}\mathclose{}\delimsize|\mathopen{}\mathrel{} #2} 18 | 19 | 20 | %%% Quotation marks for code 21 | \newcommand{\sq}{\text{\ttfamily{\char'15}}} % Single quote 22 | \newcommand{\qq}{\text{\ttfamily"}} % Double quote 23 | \newcommand{\qquote}[1]{\qq #1\qq{}} % Strings 24 | 25 | 26 | %%% Logic 27 | \newcommand{\DD}{\mathcal{D}} 28 | \newcommand{\EE}{\mathcal{E}} 29 | \newcommand{\FF}{\mathcal{F}} 30 | 31 | \newcommand\defined{\mathrel{\triangleq}} 32 | 33 | 34 | %%% Math 35 | \DeclareMathOperator{\dom}{dom} 36 | \DeclareMathOperator{\cod}{cod} 37 | \DeclareMathOperator{\im}{im} 38 | 39 | 40 | %%% Induction 41 | \newcommand\centerLine[1]{ 42 | \begingroup\setbox0=\hbox{#1}% 43 | \parbox{\wd0}{\box0}\endgroup} 44 | \newcommand\pred[1]{\mathcal{P}\parens*{#1}} 45 | \newcommand\predC[1]{\pred{\centerLine{#1}}} 46 | \newcommand\predQ[1]{\mathcal{Q}\parens*{#1}} 47 | \newcommand\predQC[1]{\predQ{\centerLine{#1}}} 48 | 49 | -------------------------------------------------------------------------------- /src/Data/BooleanAlgebra.hs: -------------------------------------------------------------------------------- 1 | -- | Definition of Boolean algebras. We base the definition on semirings 2 | -- so it works nicely with "Data.KleeneAlgebra". 3 | module Data.BooleanAlgebra 4 | ( BooleanAlgebra(..) 5 | -- * Operations 6 | , implies 7 | , butNot 8 | , ands 9 | , ors 10 | ) where 11 | 12 | import Data.Semiring (Semiring (..)) 13 | 14 | 15 | -- | A Boolean algebra is a distributive commutative idempotent semiring with 16 | -- complement satisfying some extra equations. 
More concretely, on top of the 17 | -- semiring axioms, the following axioms need to hold: 18 | -- 19 | -- == Distributivity of '<+>' over '<.>' 20 | -- @a '<+>' (b '<.>' c) = (a '<+>' b) '<.>' (a '<+>' c)@ 21 | -- 22 | -- == Commutativity of '<.>' 23 | -- @a '<.>' b = b '<.>' a@ 24 | -- 25 | -- == Idempotence of '<+>' and '<.>' 26 | -- @a '<+>' a = a@ 27 | -- 28 | -- @a '<.>' a = a@ 29 | -- 30 | -- == Upper bounded by 'one' 31 | -- @a '<+>' 'one' = 'one'@ 32 | -- 33 | -- == Properties of 'complement' 34 | -- @a '<+>' 'complement' a = 'one'@ 35 | -- 36 | -- @a '<.>' 'complement' a = 'zero'@ 37 | class Semiring a => BooleanAlgebra a where 38 | -- | Complement or negation. 39 | complement :: a -> a 40 | 41 | 42 | -- | Booleans form a boolean algebra. 43 | instance BooleanAlgebra Bool where 44 | complement = not 45 | 46 | 47 | -- * Functions over Boolean algebras 48 | 49 | -- | @p `implies` q@ holds if @p@ holds implies @q@ holds. 50 | implies :: BooleanAlgebra a => a -> a -> a 51 | implies p q = 52 | complement p <+> q 53 | 54 | 55 | -- | @p `butNot` q@ holds when @p@ holds but @q@ doesn't. 56 | butNot :: BooleanAlgebra a => a -> a -> a 57 | butNot p q = 58 | p <.> complement q 59 | 60 | 61 | -- | Logical conjunction of all elements in a container. 62 | ands :: (BooleanAlgebra a, Foldable t) => t a -> a 63 | ands t = 64 | foldr (<.>) one t 65 | 66 | 67 | -- | Logical disjunction of all elements in a container. 68 | ors :: (BooleanAlgebra a, Foldable t) => t a -> a 69 | ors = 70 | foldr (<+>) zero 71 | -------------------------------------------------------------------------------- /src/RegExp/Language.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MonoLocalBinds #-} 2 | 3 | -- | The language of a regular expression is the set of all words 4 | -- matched by that expression. Here, we show that languages of regular 5 | -- expressions (also called regular languages) form a Kleene algebra. 6 | module RegExp.Language 7 | ( Language 8 | , language 9 | , regexp 10 | ) where 11 | 12 | import Data.Either (isRight) 13 | 14 | import RegExp.RegExp 15 | import RegExp.Derivative 16 | 17 | import Data.Semiring (Semiring(..), DetectableZero(..)) 18 | import Data.KleeneAlgebra (KleeneAlgebra(..)) 19 | import Data.GSet (GSet(..)) 20 | 21 | 22 | -- | Regular languages over the alphabet @c@, i.e. set of strings that 23 | -- are matched by some regular expression. 24 | newtype Language c = 25 | Language (RegExp c) 26 | 27 | 28 | -- | Compute the set of all strings given regular expression matches. 29 | language :: RegExp c -> Language c 30 | language = 31 | Language 32 | 33 | 34 | -- | Given a regular language, construct a regular expression that 35 | -- matches precisely that language. 36 | regexp :: Language c -> RegExp c 37 | regexp (Language r) = 38 | r 39 | 40 | 41 | -- | Equivalence of regular languages is decidable. 42 | instance GSet c => Eq (Language c) where 43 | l1 == l2 = 44 | isRight $ equivalent (regexp l1) (regexp l2) 45 | 46 | 47 | -- | Regular languages form a semiring. 48 | instance GSet c => Semiring (Language c) where 49 | zero = 50 | Language rZero 51 | 52 | one = 53 | Language rOne 54 | 55 | l1 <+> l2 = 56 | Language (regexp l1 `rPlus` regexp l2) 57 | 58 | l1 <.> l2 = 59 | Language (regexp l1 `rTimes` regexp l2) 60 | 61 | 62 | -- | We can tell when a regular language is empty. 63 | instance GSet c => DetectableZero (Language c) where 64 | -- | TODO: we can do this a lot more efficiently. 
65 | isZero l = 66 | empty (regexp l) 67 | 68 | 69 | -- | Regular languages form a Kleene algebra. 70 | instance GSet c => KleeneAlgebra (Language c) where 71 | star l = 72 | Language (rStar $ regexp l) 73 | 74 | 75 | -- TODO: regular languages form a 'GSet', but concatenation is not intersection. 76 | -- How do we reconcile that? -------------------------------------------------------------------------------- /paper/.gitignore: -------------------------------------------------------------------------------- 1 | bibliography.bib 2 | 3 | ## Core latex/pdflatex auxiliary files: 4 | *.aux 5 | *.lof 6 | *.log 7 | *.lot 8 | *.fls 9 | *.out 10 | *.toc 11 | *.vtc 12 | 13 | ## Intermediate documents: 14 | *.dvi 15 | *-converted-to.* 16 | # these rules might exclude image files for figures etc. 17 | *.ps 18 | # *.eps 19 | *.pdf 20 | 21 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 22 | *.bbl 23 | *.bcf 24 | *.blg 25 | *-blx.aux 26 | *-blx.bib 27 | *.brf 28 | *.run.xml 29 | 30 | ## Build tool auxiliary files: 31 | *.fdb_latexmk 32 | *.synctex 33 | *.synctex.gz 34 | *.synctex.gz(busy) 35 | *.pdfsync 36 | 37 | ## Auxiliary and intermediate files from other packages: 38 | 39 | 40 | # algorithms 41 | *.alg 42 | *.loa 43 | 44 | # achemso 45 | acs-*.bib 46 | 47 | # amsthm 48 | *.thm 49 | 50 | # beamer 51 | *.nav 52 | *.snm 53 | *.vrb 54 | 55 | #(e)ledmac/(e)ledpar 56 | *.end 57 | *.[1-9] 58 | *.[1-9][0-9] 59 | *.[1-9][0-9][0-9] 60 | *.[1-9]R 61 | *.[1-9][0-9]R 62 | *.[1-9][0-9][0-9]R 63 | *.eledsec[1-9] 64 | *.eledsec[1-9]R 65 | *.eledsec[1-9][0-9] 66 | *.eledsec[1-9][0-9]R 67 | *.eledsec[1-9][0-9][0-9] 68 | *.eledsec[1-9][0-9][0-9]R 69 | 70 | # glossaries 71 | *.acn 72 | *.acr 73 | *.glg 74 | *.glo 75 | *.gls 76 | 77 | # gnuplottex 78 | *-gnuplottex-* 79 | 80 | # hyperref 81 | *.brf 82 | 83 | # knitr 84 | *-concordance.tex 85 | *.tikz 86 | *-tikzDictionary 87 | 88 | # listings 89 | *.lol 90 | 91 | # makeidx 92 | *.idx 93 | *.ilg 94 | *.ind 95 | *.ist 96 | 97 | # minitoc 98 | *.maf 99 | *.mtc 100 | *.mtc[0-9] 101 | *.mtc[1-9][0-9] 102 | 103 | # minted 104 | _minted* 105 | *.pyg 106 | 107 | # morewrites 108 | *.mw 109 | 110 | # mylatexformat 111 | *.fmt 112 | 113 | # nomencl 114 | *.nlo 115 | 116 | # sagetex 117 | *.sagetex.sage 118 | *.sagetex.py 119 | *.sagetex.scmd 120 | 121 | # sympy 122 | *.sout 123 | *.sympy 124 | sympy-plots-for-*.tex/ 125 | 126 | #pythontex 127 | *.pytxcode 128 | pythontex-files-*/ 129 | 130 | # Texpad 131 | .texpadtmp 132 | 133 | # TikZ & PGF 134 | *.dpth 135 | *.md5 136 | *.auxlock 137 | 138 | # todonotes 139 | *.tdo 140 | 141 | # xindy 142 | *.xdy 143 | 144 | # WinEdt 145 | *.bak 146 | *.sav 147 | 148 | # endfloat 149 | *.ttt 150 | *.fff 151 | 152 | # Packages 153 | *.zip 154 | *.tar 155 | -------------------------------------------------------------------------------- /paper/macro/code.sty: -------------------------------------------------------------------------------- 1 | \NeedsTeXFormat{LaTeX2e} 2 | \ProvidesPackage{macro/code} 3 | [2016/01/29 version 1.0.0 Code] 4 | 5 | \RequirePackage{listings} 6 | \RequirePackage{color} 7 | 8 | % Types 9 | \newcommand\nat{\text{\lstinline{Nat}}} 10 | \newcommand\pos{\text{\lstinline{Pos}}} 11 | \newcommand\even{\text{\lstinline{Even}}} 12 | \newcommand\odd{\text{\lstinline{Odd}}} 13 | 14 | % Listings settings 15 | \lstdefinestyle{custom}{% 16 | commentstyle=\it, 17 | keywordstyle=\bf, 18 | % 19 | breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace 20 | breaklines=true, % sets automatic line 
breaking 21 | captionpos=b, % sets the caption-position to bottom 22 | frame=none, % adds a frame around the code 23 | keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible) 24 | numbers=none, % where to put the line-numbers; possible values are (none, left, right) 25 | numbersep=5pt, % how far the line-numbers are from the code 26 | numberstyle=\tiny\color{gray}, % the style that is used for the line-numbers 27 | stepnumber=2, % the step between two line-numbers. If it's 1, each line will be numbered 28 | rulecolor=\color{black}, % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here)) 29 | showspaces=false, % show spaces everywhere adding particular underscores; it overrides 'showstringspaces' 30 | showstringspaces=false, % underline spaces within strings only 31 | showtabs=false, % show tabs within strings adding particular underscores 32 | tabsize=2, % sets default tabsize to 2 spaces 33 | } 34 | 35 | \lstdefinelanguage{krill}{% 36 | % Language 37 | morekeywords={type, data}, 38 | morekeywords={close, wait, send, recv, case, of, 1}, 39 | sensitive=true, 40 | morecomment=[l]{--}, 41 | morecomment=[n]{\{-}{-\}}, 42 | % Comments 43 | mathescape=true, % Latex math mode using $...$ 44 | escapechar=@, % Latex inside code with @...@ 45 | escapeinside={\{*}{*\}}, % Latex comment with {* ... *} 46 | } 47 | 48 | \lstset{language=haskell, style=custom} 49 | \lstset{mathescape, keepspaces} 50 | 51 | -------------------------------------------------------------------------------- /regexp.cabal: -------------------------------------------------------------------------------- 1 | name: regexp 2 | version: 0.1.0.0 3 | category: Text 4 | synopsis: Derivatives, intersections, complement, and equivalence of regular expressions. 
5 | homepage: https://github.com/cacay/regexp 6 | 7 | license: MIT 8 | license-file: LICENSE 9 | 10 | author: Cosku Acay 11 | maintainer: coskuacay@gmail.com 12 | 13 | build-type: Simple 14 | cabal-version: >=1.10 15 | 16 | extra-source-files: README.md 17 | 18 | 19 | library 20 | hs-source-dirs: src 21 | 22 | default-language: Haskell2010 23 | 24 | exposed-modules: RegExp.RegExp 25 | RegExp.Language 26 | RegExp.Derivative 27 | RegExp.Equation 28 | RegExp.Operations 29 | Data.BooleanAlgebra 30 | Data.KleeneAlgebra 31 | Data.GSet 32 | Data.Semiring 33 | 34 | RegExp.Internal.DFA 35 | 36 | other-modules: SparseVector 37 | SparseMatrix 38 | 39 | build-depends: base >= 4.11, 40 | flow, 41 | mtl, 42 | containers, 43 | equivalence, 44 | -- semiring-num, 45 | singletons, 46 | finite-typelits, 47 | QuickCheck, 48 | smallcheck 49 | 50 | default-extensions: ScopedTypeVariables 51 | OverloadedStrings 52 | StandaloneDeriving 53 | DeriveFunctor 54 | 55 | other-extensions: GADTs 56 | DataKinds 57 | KindSignatures 58 | TypeFamilies 59 | FlexibleInstances 60 | FlexibleContexts 61 | UndecidableInstances 62 | OverloadedLists 63 | 64 | ghc-options: -fwarn-incomplete-patterns 65 | -fwarn-unused-imports 66 | 67 | 68 | test-suite spec 69 | type: exitcode-stdio-1.0 70 | 71 | hs-source-dirs: test 72 | 73 | default-language: Haskell2010 74 | 75 | main-is: Spec.hs 76 | 77 | other-modules: Data.GSetSpec 78 | RegExp.RegExpSpec 79 | RegExp.OperationsSpec 80 | RegExp.Internal.DFASpec 81 | Helpers 82 | 83 | build-depends: base >= 4.11, 84 | regexp, 85 | hspec, 86 | hspec-smallcheck, 87 | QuickCheck, 88 | smallcheck 89 | 90 | default-extensions: ScopedTypeVariables 91 | OverloadedStrings 92 | StandaloneDeriving 93 | DeriveFunctor 94 | 95 | ghc-options: -Wall -------------------------------------------------------------------------------- /src/RegExp/Operations.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | -- | Operations over regular expressions. 4 | module RegExp.Operations 5 | ( intersection 6 | , complement 7 | , difference 8 | , intersectionEquation 9 | , complementEquation 10 | ) where 11 | 12 | 13 | import qualified Data.Map as Map 14 | import qualified Data.Set as Set 15 | 16 | import qualified RegExp.Internal.DFA as DFA 17 | import RegExp.RegExp 18 | import RegExp.Derivative 19 | import RegExp.Equation 20 | 21 | import qualified Data.BooleanAlgebra as BooleanAlgebra 22 | import Data.Semiring(Semiring(..)) 23 | import Data.GSet hiding (Set) 24 | 25 | 26 | -- | Regular expression that accepts words both expressions accept. 27 | intersection :: forall c. GSet c => RegExp c -> RegExp c -> RegExp c 28 | intersection r1 r2 = 29 | DFA.regexp $ 30 | DFA.fromRegExp r1 <.> DFA.fromRegExp r2 31 | 32 | 33 | -- | Regular expression that accepts words given expression doesn't. 34 | complement :: GSet c => RegExp c -> RegExp c 35 | complement r = 36 | DFA.regexp $ 37 | BooleanAlgebra.complement $ DFA.fromRegExp r 38 | 39 | 40 | -- | Regular expression that accepts words the first expression does but 41 | -- the second doesn't. 42 | difference :: GSet c => RegExp c -> RegExp c -> RegExp c 43 | difference r1 r2 = 44 | DFA.regexp $ 45 | DFA.fromRegExp r1 <.> BooleanAlgebra.complement (DFA.fromRegExp r2) 46 | 47 | 48 | 49 | -- | Intersection of two regular expressions computed directly by solving 50 | -- linear equations instead of going through DFAs. 51 | intersectionEquation :: forall c. 
GSet c => RegExp c -> RegExp c -> RegExp c 52 | intersectionEquation r1 r2 = 53 | solve step (r1, r2) 54 | where 55 | step :: (RegExp c, RegExp c) -> RightHand c (RegExp c, RegExp c) 56 | step (r1, r2) = 57 | (hasEmpty, Map.fromListWith rPlus subTerms) 58 | where 59 | hasEmpty = 60 | if nullable r1 && nullable r2 then 61 | rOne 62 | else 63 | rZero 64 | 65 | subTerms = 66 | [ ((derivative c r1, derivative c r2), rLiteral p) 67 | | p <- Set.toList $ join (next r1) (next r2) 68 | , Just c <- [choose p] 69 | ] 70 | 71 | 72 | -- | Complement of a regular expression computed directly by solving 73 | -- linear equations instead of going through DFAs. 74 | complementEquation :: forall c. GSet c => RegExp c -> RegExp c 75 | complementEquation r = 76 | solve step r 77 | where 78 | step :: RegExp c -> RightHand c (RegExp c) 79 | step r = 80 | (hasEmpty `rPlus` (notNext `rTimes` any), Map.fromListWith rPlus subTerms) 81 | where 82 | hasEmpty = 83 | if nullable r then rZero else rOne 84 | 85 | notNext = 86 | rLiteral $ BooleanAlgebra.complement $ BooleanAlgebra.ors $ next r 87 | 88 | any :: RegExp c 89 | any = 90 | rStar $ rLiteral $ one 91 | 92 | subTerms = 93 | [ (derivative c r, rLiteral p) 94 | | p <- Set.toList $ next r 95 | , Just c <- [choose p] 96 | ] 97 | -------------------------------------------------------------------------------- /src/Data/Semiring.hs: -------------------------------------------------------------------------------- 1 | -- | Copied from @semiring-num-1.6.0.1@ because that doesn't work 2 | -- with Stack nightly build. TODO: delete when that works. 3 | module Data.Semiring 4 | ( Semiring(..) 5 | , DetectableZero(..) 6 | ) where 7 | 8 | -- $setup 9 | -- >>> import Data.Function 10 | 11 | -- | A is like the 12 | -- the combination of two 'Data.Monoid.Monoid's. The first 13 | -- is called '<+>'; it has the identity element 'zero', and it is 14 | -- commutative. The second is called '<.>'; it has identity element 'one', 15 | -- and it must distribute over '<+>'. 16 | -- 17 | -- = Laws 18 | -- == Normal 'Monoid' laws 19 | -- 20 | -- @(a '<+>' b) '<+>' c = a '<+>' (b '<+>' c) 21 | --'zero' '<+>' a = a '<+>' 'zero' = a 22 | --(a '<.>' b) '<.>' c = a '<.>' (b '<.>' c) 23 | --'one' '<.>' a = a '<.>' 'one' = a@ 24 | -- 25 | -- == Commutativity of '<+>' 26 | -- @a '<+>' b = b '<+>' a@ 27 | -- 28 | -- == Distribution of '<.>' over '<+>' 29 | -- @a '<.>' (b '<+>' c) = (a '<.>' b) '<+>' (a '<.>' c) 30 | --(a '<+>' b) '<.>' c = (a '<.>' c) '<+>' (b '<.>' c)@ 31 | -- 32 | -- == Annihilation 33 | -- @'zero' '<.>' a = a '<.>' 'zero' = 'zero'@ 34 | -- 35 | -- An ordered semiring follows the laws: 36 | -- 37 | -- @x '<=' y => x '<+>' z '<=' y '<+>' z 38 | --x '<=' y => x '<+>' z '<=' y '<+>' z 39 | --'zero' '<=' z '&&' x '<=' y => x '<.>' z '<=' y '<.>' z '&&' z '<.>' x '<=' z '<.>' y@ 40 | class Semiring a where 41 | {-# MINIMAL zero , one , (<.>) , (<+>) #-} 42 | -- | The identity of '<+>'. 43 | zero 44 | :: a 45 | -- | The identity of '<.>'. 46 | one 47 | :: a 48 | -- | An associative binary operation, which distributes over '<+>'. 49 | infixl 7 <.> 50 | (<.>) :: a -> a -> a 51 | -- | An associative, commutative binary operation. 52 | infixl 6 <+> 53 | (<+>) :: a -> a -> a 54 | 55 | 56 | -- | Useful for operations where zeroes may need to be discarded: for instance 57 | -- in sparse matrix calculations. 58 | class Semiring a => 59 | DetectableZero a where 60 | -- | 'True' if x is 'zero'. 
61 | isZero 62 | :: a -> Bool 63 | 64 | 65 | instance Semiring Bool where 66 | one = True 67 | zero = False 68 | (<+>) = (||) 69 | (<.>) = (&&) 70 | {-# INLINE zero #-} 71 | {-# INLINE one #-} 72 | {-# INLINE (<+>) #-} 73 | {-# INLINE (<.>) #-} 74 | 75 | 76 | instance DetectableZero Bool where 77 | isZero = not 78 | {-# INLINE isZero #-} 79 | 80 | 81 | instance Semiring () where 82 | one = () 83 | zero = () 84 | _ <+> _ = () 85 | _ <.> _ = () 86 | {-# INLINE zero #-} 87 | {-# INLINE one #-} 88 | {-# INLINE (<+>) #-} 89 | {-# INLINE (<.>) #-} 90 | 91 | instance DetectableZero () where 92 | isZero _ = True 93 | {-# INLINE isZero #-} 94 | 95 | 96 | instance (Semiring a, Semiring b) => Semiring (a, b) where 97 | one = 98 | (one, one) 99 | 100 | zero = 101 | (zero, zero) 102 | 103 | (a1, b1) <+> (a2, b2) = 104 | (a1 <+> a2, b1 <+> b2) 105 | 106 | (a1, b1) <.> (a2, b2) = 107 | (a1 <.> a2, b1 <.> b2) 108 | 109 | instance (DetectableZero a, DetectableZero b) => DetectableZero (a, b) where 110 | isZero (a, b) = 111 | isZero a && isZero b -------------------------------------------------------------------------------- /test/RegExp/RegExpSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE UndecidableInstances #-} 6 | 7 | module RegExp.RegExpSpec where 8 | 9 | import Test.Hspec 10 | import Test.QuickCheck 11 | 12 | import RegExp.RegExp 13 | import RegExp.Derivative (matches) 14 | 15 | import Data.GSet 16 | 17 | 18 | spec :: Spec 19 | spec = do 20 | describe "zero" $ do 21 | it "does not match the empty string" $ do 22 | (rZero `matches` "") `shouldBe` False 23 | it "does not match non-empty strings" $ do 24 | (rZero `matches` "a") `shouldBe` False 25 | (rZero `matches` "abc") `shouldBe` False 26 | it "does not match any string" $ do 27 | property $ \(w :: String) -> (rZero `matches` w) `shouldBe` False 28 | 29 | describe "one" $ do 30 | it "matches the empty string" $ do 31 | (rOne `matches` "") `shouldBe` True 32 | it "does not match non-empty strings" $ do 33 | property $ \c (w :: String) -> (rOne `matches` (c : w)) `shouldBe` False 34 | 35 | describe "plus" $ do 36 | it "matches when either subexpression matches" $ do 37 | let r1 = "abc" 38 | let r2 = "def" 39 | (rPlus r1 r2 `matches` "a") `shouldBe` True 40 | 41 | describe "times" $ do 42 | it "matches when both subexpressions match in order" $ do 43 | let r1 = "abc" 44 | let r2 = "def" 45 | (rTimes r1 r2 `matches` "af") `shouldBe` True 46 | 47 | describe "star" $ do 48 | it "matches the empty string" $ do 49 | (rStar "abc" `matches` "") `shouldBe` True 50 | it "matches once" $ do 51 | (rStar "abc" `matches` "c") `shouldBe` True 52 | it "matches multiple times" $ do 53 | (rStar "abc" `matches` "abcbabc") `shouldBe` True 54 | 55 | describe "literal" $ do 56 | it "matches one of the options" $ do 57 | (rLiteral "ab" `matches` "a") `shouldBe` True 58 | (rLiteral "ab" `matches` "b") `shouldBe` True 59 | it "does not match characters not in the class" $ do 60 | (rLiteral "ab" `matches` "c") `shouldBe` False 61 | 62 | 63 | describe "hide" $ do 64 | it "is the inverse of view" $ do 65 | property $ \(r :: RegExp Char) -> hide (view r) `shouldBe` r 66 | 67 | it "is the deep inverse of view" $ do 68 | property $ \(r :: RegExp Char) -> hideAll (viewAll r) `shouldBe` r 69 | 70 | 71 | describe "read" $ do 72 | it "is the inverse of show" $ do 73 | withMaxSuccess 5 $ -- Reading and/or printing is 
super slow! 74 | property $ \(r :: RegExp Char) -> read (show r) `shouldBe` r 75 | 76 | 77 | 78 | -- | Fixed point of a functor. 79 | data Fix f = Fix {unFix :: f (Fix f)} 80 | 81 | deriving instance Eq (f (Fix f)) => Eq (Fix f) 82 | deriving instance Ord (f (Fix f)) => Ord (Fix f) 83 | deriving instance Show (f (Fix f)) => Show (Fix f) 84 | 85 | 86 | -- | Like 'view', but fully unfolds all subtrees. 87 | viewAll :: GSet c => RegExp c -> Fix (RegExpView c) 88 | viewAll r = 89 | Fix $ fmap viewAll $ view r 90 | 91 | 92 | -- | Like 'hide', but works on a fully viewed regular expression. 93 | hideAll :: GSet c => Fix (RegExpView c) -> RegExp c 94 | hideAll r = 95 | hide (fmap hideAll $ unFix r) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regexp 2 | 3 | [![Build Status](https://travis-ci.com/cacay/regexp.svg?branch=master)](https://travis-ci.com/cacay/regexp) 4 | 5 | This is a regular expression library for Haskell that focuses on higher level 6 | operations like computing the intersection of regular expressions or deciding 7 | whether two regular expressions match the same set of strings. This is in stark 8 | contrast to pretty much every single regular expression library out there (including 9 | ones for other languages), which are only concerned with matching strings. Unfortunately, 10 | deprioritizing string matching means it isn't very efficient, so if that's all you need, 11 | you should use a different library. 12 | 13 | Here is a summary of supported features: 14 | * Intersection and complement 15 | * Derivatives à la [Brzozowski](https://en.wikipedia.org/wiki/Brzozowski_derivative) 16 | * Equivalence checking 17 | * Solving systems of linear equations with regular expression coefficients 18 | (which can be used to implement intersection, complement, and more) 19 | * Arbitrary alphabets, even infinite ones! 20 | 21 | 22 | ## Usage and Development 23 | 24 | We use [Stack](https://docs.haskellstack.org) so it's pretty much 25 | trivial to get started. If you don't have Stack already, 26 | [install](https://docs.haskellstack.org/en/stable/README/#how-to-install) 27 | it and set it up by running 28 | ```shell 29 | stack setup 30 | ``` 31 | in your shell. You only need to do this once. Then, you can run 32 | ```shell 33 | stack repl 34 | ``` 35 | to be dropped in GHCi where you can play around with the library. This will 36 | install all dependencies, build the library, and do whatever is necessary so 37 | everything "Just Works™". 38 | ```shell 39 | stack haddock --open regexp 40 | ``` 41 | will open the documentation in your browser and 42 | ```shell 43 | stack test 44 | ``` 45 | will run the test suite. 46 | 47 | Stack is all about reproducible builds, so you should not run into any issues. 48 | 49 | 50 | ## Examples 51 | 52 | Load up the library in GHCi: 53 | ```shell 54 | stack repl 55 | ``` 56 | 57 | ### Creating Regular Expressions 58 | 59 | The simplest regular expressions are `rZero`, which matches no strings, and 60 | `rOne`, which matches only the empty string. You can combine regular expressions 61 | using `rPlus` and `rTimes` (choice and sequencing). Kleene star is implemented 62 | by `rStar`. For example: 63 | ```haskell 64 | rOne 65 | rStar rZero 66 | rOne `rTimes` (rZero `rPlus` rOne) 67 | ``` 68 | are all valid expressions, though they are boring since they are all equivalent to 69 | `rOne`. 
More interesting expressions can be constructed using _character 70 | classes_. Standard Haskell string notation is interpreted as a character class 71 | containing all characters in the string. For example, 72 | ```haskell 73 | "abc" :: RegExp Char 74 | ``` 75 | is the (regular expression formed by) the character class containing the characters 76 | `a`, `b`, and `c`. This expression will match single character strings `"a"`, `"b"`, 77 | and `"c"` and nothing else. Note that the type annotation is required for Haskell 78 | to interpret the string as a regular expression. 79 | 80 | 81 | ### String Matching 82 | 83 | We can check that this is indeed how character classes behave by trying to match them 84 | against strings: 85 | ```haskell 86 | matches ("abc" :: RegExp Char) "a" 87 | ==> True 88 | 89 | matches ("abc" :: RegExp Char) "ab" 90 | ==> False 91 | 92 | matches (rStar "abc" :: RegExp Char) "ab" 93 | ==> True 94 | ``` 95 | 96 | 97 | ### Checking Equivalence 98 | 99 | We can check if two regular expressions are equivalent and get a counter example 100 | in the case they are not: 101 | ```haskell 102 | equivalent ("abc" :: RegExp Char) ("abc" `rPlus` rZero) 103 | ==> Right () 104 | 105 | equivalent ("abc" :: RegExp Char) ("ab") 106 | ==> Left "c" 107 | 108 | equivalent (rStar "abc" :: RegExp Char) ("abc" `rTimes` rStar "abc") 109 | ==> Left "" 110 | ``` 111 | 112 | 113 | ### Intersection and Complement 114 | 115 | We can compute intersections and complements: 116 | ```haskell 117 | intersection (rStar "ab" :: RegExp Char) (rStar "a") 118 | ==> rStar "a" 119 | 120 | intersection (rStar "a" :: RegExp Char) (RegExp.Operations.complement $ rStar "a") 121 | ==> rZero 122 | ``` 123 | -------------------------------------------------------------------------------- /src/RegExp/Equation.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | 5 | -- | Solving systems of linear equations with regular expression coefficients. 6 | module RegExp.Equation 7 | ( RightHand 8 | , solve 9 | , scale 10 | , combine 11 | ) where 12 | 13 | import Control.Exception.Base(assert) 14 | import Control.Monad.State 15 | 16 | import Data.Map.Strict(Map) 17 | import qualified Data.Map.Strict as Map 18 | 19 | import RegExp.RegExp 20 | import Data.GSet(GSet) 21 | 22 | 23 | -- | Right-hand side of an equation. A sum of terms where a term is 24 | -- either a constant, or a constant times a variable. These look 25 | -- like @r0 + r1 X1 + r2 X2 ...@ where @ri@ are regular expressions 26 | -- and @Xi@ are variables. 27 | type RightHand c v = 28 | (RegExp c, Map v (RegExp c)) 29 | 30 | 31 | -- | Solve a system of linear equations with regular expression coefficients. 32 | -- Equations are generated on demand using the given function. 33 | -- Coefficients in front of variables must be non-nullable to ensure the 34 | -- system has a unique solution. 35 | solve :: forall v c. (GSet c, Ord v) => (v -> RightHand c v) -> v -> RegExp c 36 | solve f v = 37 | evalState (go v) (Context Map.empty Map.empty) 38 | where 39 | go :: (MonadState (Context c v) m) => v -> m (RegExp c) 40 | go v = do 41 | context <- get 42 | case Map.lookup v (solved context) of 43 | Just r -> 44 | -- We are done if the result was computed beforehand. 45 | return r 46 | 47 | Nothing -> 48 | assert (not $ Map.member v (partial context)) $ do 49 | resolved@(c, l) <- resolve (f v) 50 | 51 | -- Eliminate @v@ in @resolved@. 
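-- The right-hand side for @v@ may mention @v@ itself, i.e. it has the
-- shape @v = cv v + rest@, where @rest@ collects the constant and the
-- terms over the other variables. By Arden's lemma, when @cv@ is not
-- nullable this equation has the unique solution @v = cv* rest@, which
-- is what the 'scale' call below computes.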
52 | let resolved'@(c', l') = 53 | case Map.lookup v l of 54 | -- Nothing to do if @v@ doesn't occur. 55 | Nothing -> 56 | resolved 57 | 58 | -- Otherwise, use Arden's lemma to rewrite @resolved@. 59 | Just cv -> 60 | assert (not $ nullable cv) 61 | scale (rStar cv) (c, Map.delete v l) 62 | 63 | -- Add a partial solution for @v@ in the context. This is 64 | -- essentially substituting the solution for @v@ in all 65 | -- "following" equations. 66 | put (context {partial = Map.insert v resolved' (partial context)}) 67 | 68 | -- Recursively solve all variables appearing in the 69 | -- equation for @v@. 70 | terms <- 71 | mapM 72 | (\(v, c) -> do {r <- go v; return (c `rTimes` r)}) 73 | (Map.toList l') 74 | 75 | let result = foldr rPlus c' terms 76 | 77 | put (context {solved = Map.insert v result (solved context)}) 78 | return result 79 | 80 | 81 | -- | Resolve variables in a 'RightHand' using the context. 82 | -- Variables not in the context are kept as is. 83 | resolve :: (MonadState (Context c v) m) 84 | => RightHand c v 85 | -> m (RightHand c v) 86 | resolve (c, l) = do 87 | resolved <- mapM resolveTerm (Map.toList l) 88 | return $ foldr combine (c, Map.empty) resolved 89 | 90 | 91 | -- | Resolve the variable in a single term using the context. 92 | -- If the variable is not in the context, it is returned as is. 93 | resolveTerm :: (MonadState (Context c v) m) 94 | => (v, RegExp c) 95 | -> m (RightHand c v) 96 | resolveTerm (v, r) = do 97 | context <- get 98 | case Map.lookup v (partial context) of 99 | Nothing -> 100 | return (rZero, Map.singleton v r) 101 | 102 | Just right -> do 103 | resolved <- resolve right 104 | return (scale r resolved) 105 | 106 | 107 | -- | Multiply a 'RightHand' on the left with the given coefficient. 108 | scale :: (GSet c, Ord v) => RegExp c -> RightHand c v -> RightHand c v 109 | scale r (c, l) = 110 | (r `rTimes` c, Map.map (r `rTimes`) l) 111 | 112 | 113 | -- | Merge two 'RightHand's using `rPlus`. 114 | combine :: (GSet c, Ord v) => RightHand c v -> RightHand c v -> RightHand c v 115 | combine (c1, l1) (c2, l2) = 116 | (c1 `rPlus` c2, Map.unionWith rPlus l1 l2) 117 | 118 | 119 | -- * Internals 120 | 121 | -- | Context used in the implementation of 'solve'. 122 | data Context c v = 123 | Context { 124 | -- | Variables that have a partial solution in terms of 125 | -- other variables. 126 | partial :: Map v (RightHand c v), 127 | 128 | -- | Variables that are fully solved. 129 | solved :: Map v (RegExp c) 130 | } 131 | 132 | 133 | -------------------------------------------------------------------------------- /src/SparseVector.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE KindSignatures #-} 3 | {-# LANGUAGE TypeApplications #-} 4 | {-# LANGUAGE TypeOperators #-} 5 | 6 | -- | A very cruddy implementation of sparse vectors. We define vectors 7 | -- with a known length @n@ so we can define a 'Semiring' instance. 8 | -- 9 | -- TODO: find a package or make nicer 10 | module SparseVector 11 | ( SparseVector 12 | , vector 13 | , (!) 
14 | , length 15 | 16 | , (++) 17 | , split 18 | 19 | , map 20 | , zipWith 21 | 22 | , sum 23 | , nonZero 24 | , toList 25 | ) where 26 | 27 | import Prelude hiding (length, map, sum, zipWith, (++)) 28 | import Control.Exception (assert) 29 | 30 | import Data.Finite 31 | import Data.Singletons 32 | import Data.Singletons.Prelude 33 | import Data.Singletons.TypeLits 34 | 35 | import qualified Data.IntMap.Strict as IntMap 36 | import Data.Semiring (Semiring(..), DetectableZero(..)) 37 | 38 | 39 | -- | Sparse vectors of length @n@ over elements of type @a@. 40 | newtype SparseVector (n :: Nat) a = 41 | UnsafeMakeSparseVector { 42 | elements :: IntMap.IntMap a 43 | } 44 | 45 | 46 | -- | Construct a sparse vector from a list of indexed elements. Indexes 47 | -- not in the list are all set to zero. Duplicate indexes are combined 48 | -- with '(<+>)'. We need to be able to tell when elements are zero so we 49 | -- can filter them out. 50 | vector :: (DetectableZero a, KnownNat n) => [(Finite n, a)] -> SparseVector n a 51 | vector l = 52 | UnsafeMakeSparseVector { 53 | elements = 54 | removeZeros $ 55 | IntMap.fromListWith (<+>) [(fromIntegral i, a) | (i, a) <- l] 56 | } 57 | 58 | 59 | -- | The value at a given index. 60 | (!) :: (Semiring a, KnownNat n) => SparseVector n a -> Finite n -> a 61 | v ! i = 62 | IntMap.findWithDefault zero (fromIntegral i) (elements v) 63 | 64 | 65 | -- | Length of a vector. 66 | length :: forall n a r. (KnownNat n, Integral r) => SparseVector n a -> r 67 | length _ = 68 | fromIntegral $ fromSing (sing :: SNat n) 69 | 70 | 71 | -- | Sum of all elements in a vector. 72 | sum :: Semiring a => SparseVector n a -> a 73 | sum v = 74 | IntMap.foldr (<+>) zero (elements v) 75 | 76 | 77 | -- | Concatenate two vectors. 78 | (++) :: KnownNat n 79 | => SparseVector n a 80 | -> SparseVector m a 81 | -> SparseVector (n + m) a 82 | v1 ++ v2 = 83 | UnsafeMakeSparseVector { 84 | elements = 85 | IntMap.union 86 | (elements v1) 87 | (IntMap.mapKeysMonotonic (length v1 +) (elements v2)) 88 | } 89 | 90 | 91 | -- | We can map from vectors with one type for elements to another given 92 | -- a semiring homomorphism. Note that this does not work for arbitrary 93 | -- functions. Specifically, this function must map zeros to zeros. 94 | map :: (DetectableZero a, DetectableZero b) 95 | => (a -> b) 96 | -> SparseVector n a 97 | -> SparseVector n b 98 | map f v = 99 | assert (isZero $ f zero) $ 100 | UnsafeMakeSparseVector { 101 | elements = 102 | removeZeros $ 103 | IntMap.map f (elements v) 104 | } 105 | 106 | 107 | -- | Combine two vectors of equal length with the given function. 108 | -- The function should return zero when /both/ its arguments are 109 | -- zero. 110 | zipWith :: (DetectableZero a, DetectableZero b, DetectableZero c) 111 | => (a -> b -> c) 112 | -> SparseVector n a 113 | -> SparseVector n b 114 | -> SparseVector n c 115 | zipWith f v1 v2 = 116 | assert (isZero $ f zero zero) $ 117 | UnsafeMakeSparseVector { 118 | elements = 119 | removeZeros $ 120 | IntMap.mergeWithKey 121 | (\_ a b -> Just (f a b)) 122 | (IntMap.map (`f` zero)) 123 | (IntMap.map (zero `f`)) 124 | (elements v1) 125 | (elements v2) 126 | } 127 | 128 | 129 | 130 | -- | Split a vector into two vectors. 131 | split :: forall n m a. 
(KnownNat n, KnownNat m) 132 | => SparseVector (n + m) a 133 | -> (SparseVector n a, SparseVector m a) 134 | split v = 135 | ( UnsafeMakeSparseVector { elements = v1 } 136 | , UnsafeMakeSparseVector { elements = v2 } 137 | ) 138 | where 139 | (v1, v2') = 140 | IntMap.partitionWithKey (\k _ -> k < n) (elements v) 141 | 142 | v2 = 143 | IntMap.mapKeysMonotonic (subtract n) v2' 144 | 145 | n = 146 | fromIntegral $ fromSing (sing :: SNat n) 147 | 148 | 149 | 150 | -- | Iterate over non-zero elements in a vector. 151 | nonZero :: KnownNat n => SparseVector n a -> [(Finite n, a)] 152 | nonZero v = 153 | fmap (\(i, x) -> (finite $ fromIntegral i, x)) (IntMap.toList $ elements v) 154 | 155 | 156 | -- | Convert a vector to a list. 157 | toList :: (Semiring a, KnownNat n) => SparseVector n a -> [a] 158 | toList v = 159 | [v ! i | i <- finites] 160 | 161 | 162 | -- | Vectors of length @n@ over elements drawn from a semiring also 163 | -- form a semiring. 164 | instance (DetectableZero a, KnownNat n) => Semiring (SparseVector n a) where 165 | 166 | -- | Vector where all entries are zero. 167 | zero = 168 | UnsafeMakeSparseVector { 169 | elements = 170 | IntMap.empty 171 | } 172 | 173 | 174 | -- | Vector where all entries are one. 175 | one = 176 | -- We need to treat the trivial semiring as a special case 177 | if isZero (one :: a) then 178 | zero 179 | else 180 | UnsafeMakeSparseVector { 181 | elements = 182 | IntMap.fromList [(i, one) | i <- [0 .. length - 1]] 183 | } 184 | where 185 | length = 186 | fromIntegral $ fromSing (sing :: SNat n) 187 | 188 | 189 | -- | Vector addition. 190 | v1 <+> v2 = 191 | UnsafeMakeSparseVector { 192 | elements = 193 | removeZeros $ 194 | IntMap.unionWith (<+>) (elements v1) (elements v2) 195 | } 196 | 197 | 198 | -- | Vector dot product. 199 | v1 <.> v2 = 200 | UnsafeMakeSparseVector { 201 | elements = 202 | removeZeros $ 203 | IntMap.intersectionWith (<.>) (elements v1) (elements v2) 204 | } 205 | 206 | 207 | -- | We can recognize the zero vector. 208 | instance (DetectableZero a, KnownNat n) => DetectableZero (SparseVector n a) where 209 | isZero v1 = 210 | IntMap.null (elements v1) 211 | 212 | 213 | -- | Equality of vectors is decidable. 214 | deriving instance Eq a => Eq (SparseVector n a) 215 | 216 | 217 | -- | We can totally order vectors. 218 | deriving instance Ord a => Ord (SparseVector n a) 219 | 220 | 221 | instance (Semiring a, KnownNat n, Show a) => Show (SparseVector n a) where 222 | show = 223 | show . toList 224 | 225 | 226 | 227 | -- * Helper functions 228 | 229 | -- | Remove zero elements. 230 | removeZeros :: DetectableZero a => IntMap.IntMap a -> IntMap.IntMap a 231 | removeZeros = 232 | IntMap.filter (not . isZero) 233 | 234 | -------------------------------------------------------------------------------- /src/RegExp/Derivative.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | 5 | -- | Derivatives of regular expressions that support character classes. 6 | -- The development follows 7 | -- [Symbolic Solving of Extended Regular Expression Inequalities](https://arxiv.org/abs/1410.3227). 
8 | module RegExp.Derivative 9 | ( Word 10 | 11 | -- * Derivatives 12 | , derivative 13 | , partialDerivative 14 | 15 | -- * Application of derivatives 16 | , matches 17 | , equivalent 18 | 19 | -- * Automata construction 20 | , allDerivatives 21 | , next 22 | , join 23 | ) where 24 | 25 | import Prelude hiding (Word) 26 | 27 | import Control.Exception.Base(assert) 28 | import Control.Monad(unless, when) 29 | import Control.Monad.Except(MonadError(..), runExceptT) 30 | import qualified Data.Equivalence.Monad as Equiv 31 | import qualified Data.Equivalence.STT as EquivSTT 32 | 33 | import Data.Set(Set) 34 | import qualified Data.Set as Set 35 | 36 | import RegExp.RegExp 37 | 38 | import Data.BooleanAlgebra 39 | import Data.Semiring (Semiring(..)) 40 | import Data.GSet hiding (Set) 41 | 42 | 43 | 44 | -- | String of characters from an alphabet @c@. 45 | type Word c = 46 | [c] 47 | 48 | 49 | 50 | -- * Derivatives 51 | 52 | -- | Brzozowski derivative of a regular expression with respect to a character. 53 | -- @derivative c r@ matches a word @w@ if and only if @r@ matches @cw@. 54 | derivative :: GSet c => c -> RegExp c -> RegExp c 55 | derivative c r = 56 | case view r of 57 | One -> 58 | rZero 59 | 60 | Plus r1 r2 -> 61 | rPlus (derivative c r1) (derivative c r2) 62 | 63 | Times r1 r2 | nullable r1 -> 64 | rPlus (derivative c r1 `rTimes` r2) (derivative c r2) 65 | 66 | Times r1 r2 | otherwise -> 67 | derivative c r1 `rTimes` r2 68 | 69 | Star r' -> 70 | derivative c r' `rTimes` r 71 | 72 | Literal p -> 73 | if c `member` p then rOne else rZero 74 | 75 | 76 | -- | Antimirov derivative of a regular expression with respect to a character. 77 | -- This is similar to 'derivative', but returns a set of regular expressions 78 | -- whose union is equivalent to the Brzozowski derivative. 79 | partialDerivative :: forall c. GSet c 80 | => c 81 | -> RegExp c 82 | -> Set (RegExp c) 83 | partialDerivative c r = 84 | case view r of 85 | One -> 86 | Set.empty 87 | 88 | Plus r1 r2 -> 89 | partialDerivative c r1 `Set.union` partialDerivative c r2 90 | 91 | Times r1 r2 | nullable r1 -> 92 | Set.union 93 | (partialDerivative c r1 `setTimes` r2) 94 | (partialDerivative c r2) 95 | 96 | Times r1 r2 | otherwise -> 97 | partialDerivative c r1 `setTimes` r2 98 | 99 | Star r' -> 100 | partialDerivative c r' `setTimes` r 101 | 102 | Literal p -> 103 | if c `member` p then Set.singleton rOne else Set.empty 104 | 105 | where 106 | setTimes :: Set (RegExp c) -> RegExp c -> Set (RegExp c) 107 | setTimes s r = 108 | Set.map (`rTimes` r) s 109 | 110 | 111 | 112 | -- * Applications 113 | 114 | -- | @r `matches` w@ if the regular expression @r@ accepts word @w@. 115 | matches :: GSet c => RegExp c -> Word c -> Bool 116 | matches r [] = 117 | nullable r 118 | matches r (c : w) = 119 | matches (derivative c r) w 120 | 121 | 122 | -- | Two regular expressions are equivalent if and only if they match 123 | -- the same set of strings. This function will check for equivalence, 124 | -- and return a witness in the case the expressions are different. 125 | -- One of the expressions will match the witness and the other won't. 126 | equivalent :: forall c. GSet c => RegExp c -> RegExp c -> Either (Word c) () 127 | equivalent r1 r2 = 128 | case Equiv.runEquivM' (runExceptT (check r1 r2)) of 129 | Left w -> 130 | assert (r1 `matches` w /= r2 `matches` w) $ 131 | Left w 132 | 133 | Right () -> 134 | Right () 135 | where 136 | -- | Hopcroft and Karp's bisimulation algorithm. 
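-- Two regular expressions are equivalent exactly when they agree on
-- nullability and their derivatives with respect to every character are
-- again equivalent. 'check' optimistically equates @r1@ and @r2@, then
-- verifies that assumption on the derivatives for each character class
-- in @next r1 `join` next r2@; a mismatch in nullability along the way
-- is reported as the word of characters read so far.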
137 | check :: (MonadError (Word c) m, Equiv.MonadEquiv (EquivSTT.Class s () (RegExp c)) (RegExp c) () m) 138 | => RegExp c 139 | -> RegExp c 140 | -> m () 141 | check r1 r2 = do 142 | weAlreadyChecked <- Equiv.equivalent r1 r2 143 | unless weAlreadyChecked $ do 144 | when (nullable r1 /= nullable r2) $ 145 | -- These expressions differ since one can match the empty 146 | -- word and the other cannot. The empty word is our witness. 147 | throwError [] 148 | 149 | -- Assume these "states" are equivalent, check following states. 150 | Equiv.equate r1 r2 151 | 152 | let derivatives = 153 | [ (c, derivative c r1, derivative c r2) 154 | | p <- Set.toList (next r1 `join` next r2) 155 | , Just c <- [choose p] 156 | ] 157 | 158 | mapM_ checkNext derivatives 159 | 160 | 161 | -- | Check states reached by a character. 162 | checkNext :: (MonadError (Word c) m, Equiv.MonadEquiv (EquivSTT.Class s () (RegExp c)) (RegExp c) () m) 163 | => (c, RegExp c, RegExp c) 164 | -> m () 165 | checkNext (c, r1, r2) = 166 | check r1 r2 `catchError` \w -> 167 | throwError (c : w) 168 | 169 | 170 | 171 | -- * Automata construction 172 | 173 | -- | Set of derivatives of a regular expression under all words. 174 | allDerivatives :: forall c. GSet c => RegExp c -> Set (RegExp c) 175 | allDerivatives r = 176 | Set.insert rZero (helper Set.empty [r]) 177 | where 178 | helper :: Set (RegExp c) -> [RegExp c] -> Set (RegExp c) 179 | helper context [] = 180 | context 181 | 182 | helper context (r : rest) | r `Set.member` context = 183 | helper context rest 184 | 185 | helper context (r : rest) = 186 | let 187 | derivatives = 188 | [ derivative c r | p <- Set.toList (next r) 189 | , Just c <- [choose p]] 190 | in 191 | helper (Set.insert r context) (derivatives ++ rest) 192 | 193 | 194 | 195 | -- * Helpers 196 | 197 | -- | Given a regular expression @r@, compute equivalence classes of 198 | -- character classes such that: 199 | -- 200 | -- * @p `member` next r@ and @c1 `member` p && c2 `member` p@ implies 201 | -- @derivative c1 r = derivative c2 r@, 202 | -- * @not $ c `member` ors (next r)@ implies @derivative c r ~ rZero@. 203 | next :: GSet c => RegExp c -> Set (CharacterClass c) 204 | next r = 205 | case view r of 206 | One -> 207 | Set.singleton zero 208 | 209 | Plus r1 r2 -> 210 | join (next r1) (next r2) 211 | 212 | Times r1 r2 | nullable r1 -> 213 | join (next r1) (next r2) 214 | 215 | Times r1 _ | otherwise -> 216 | next r1 217 | 218 | Star r -> 219 | next r 220 | 221 | Literal p -> 222 | Set.singleton p 223 | 224 | 225 | -- | Given two sets of mutually disjoint character classes, compute 226 | -- a set of mutually disjoint character classes that cover both input 227 | -- sets. More concretely, given @s1@ and @s2@ such that 228 | -- 229 | -- @'disjoint' s1 && 'disjoint' s2@ 230 | -- 231 | -- we have: 232 | -- 233 | -- * @'ors' ('join' s1 s2) = 'ors' s1 <+> 'ors' s2@ 234 | -- * @'disjoint' ('join' s1 s2)@ 235 | -- * @'all' (\p -> 'all' (\p1 -> p '<.>' p1 == 'zero' || p `subset` p1) s1) ('join' s1 s2)@ 236 | -- * @'all' (\p -> 'all' (\p2 -> p '<.>' p2 == 'zero' || p `subset` p2) s2) ('join' s1 s2)@ 237 | join :: GSet c 238 | => Set (CharacterClass c) 239 | -> Set (CharacterClass c) 240 | -> Set (CharacterClass c) 241 | join s1 s2 = Set.fromList $ concat $ do 242 | p1 <- Set.toList s1 243 | p2 <- Set.toList s2 244 | return 245 | [ p1 <.> p2 246 | , p1 `butNot` ors s2 247 | , p2 `butNot` ors s1 248 | ] 249 | 250 | 251 | -- | Test if a set of character classes are pairwise disjoint. 
252 | disjoint :: GSet c => Set (CharacterClass c) -> Bool 253 | disjoint s = 254 | let s' = Set.toList s in 255 | and [ p1 <.> p2 == zero | p1 <- s', p2 <- s', p1 /= p2 ] -------------------------------------------------------------------------------- /src/Data/GSet.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies #-} 2 | 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE UndecidableInstances #-} 7 | 8 | 9 | -- | A generic interface for sets and an instance 10 | -- supporting finite and cofinite subsets of a type. 11 | module Data.GSet 12 | ( GSet (..) 13 | ) where 14 | 15 | import Prelude hiding (and, or) 16 | import Flow 17 | 18 | import Data.String (IsString(..)) 19 | import GHC.Exts (IsList(..)) 20 | import qualified Text.ParserCombinators.ReadP as Parser 21 | 22 | import Data.Function (on) 23 | import Data.List (intercalate) 24 | import qualified Data.Set 25 | 26 | import Data.BooleanAlgebra (BooleanAlgebra(..)) 27 | import Data.Semiring (Semiring(..), DetectableZero(..)) 28 | 29 | import Test.QuickCheck as QuickCheck 30 | import Test.SmallCheck.Series as SmallCheck 31 | 32 | 33 | -- | A generic interface for sets over a type @a@. 34 | class (BooleanAlgebra (Set a), DetectableZero (Set a), Ord (Set a)) => GSet a where 35 | -- | Sets of elements of @a@. 36 | type Set a :: * 37 | 38 | -- | The set containing a single element. 39 | singleton :: a -> Set a 40 | 41 | -- | Determine if an element is a member of a set. 42 | member :: a -> Set a -> Bool 43 | member a s = 44 | singleton a <.> s == singleton a 45 | 46 | -- | Return an arbitrary element from a non-empty set; and 47 | -- 'Nothing' if the set is empty. That is, the following properties 48 | -- should hold: 49 | -- 50 | -- @'choose' 'zero' = 'Nothing'@ 51 | -- 52 | -- @s /= 'zero' ==> exists a. 'choose' s = 'Just' a && 'member' a s@ 53 | choose :: Set a -> Maybe a 54 | 55 | 56 | 57 | -- | Finite and cofinite subsets of a type form a 'Semiring'. 58 | instance Ord a => Semiring (FiniteSet a) where 59 | zero = 60 | empty 61 | 62 | one = 63 | full 64 | 65 | (<+>) = 66 | union 67 | 68 | (<.>) = 69 | intersection 70 | 71 | 72 | -- | We know when a finite or cofinite subset of a finite type is empty. 73 | instance (Bounded a, Enum a, Ord a) => DetectableZero (FiniteSet a) where 74 | isZero p = 75 | size p == 0 76 | 77 | 78 | -- | Finite and cofinite subsets of a type form a 'BooleanAlgebra'. 79 | instance Ord a => BooleanAlgebra (FiniteSet a) where 80 | complement = 81 | setComplement 82 | 83 | 84 | -- | Finite and cofinite sets over the elements of a finite type. 85 | instance (Bounded a, Enum a, Ord a) => GSet a where 86 | type Set a = FiniteSet a 87 | 88 | singleton = 89 | These . Data.Set.singleton 90 | 91 | member a (These s) = 92 | Data.Set.member a s 93 | member a (ComplementOf s) = 94 | Data.Set.notMember a s 95 | 96 | choose (These s) = 97 | Data.Set.lookupMin s 98 | choose p@(ComplementOf _) = 99 | if size p == 0 then 100 | Nothing 101 | else 102 | Just $ head [a | a <- [minBound..maxBound], member a p] 103 | 104 | 105 | 106 | -- * Implementation of sets with a more efficient complement operation. 107 | 108 | -- | Finite and cofinite sets over the elements of a type. 109 | data FiniteSet a 110 | -- | Set containing the given elements. 111 | = These (Data.Set.Set a) 112 | 113 | -- | Set containing the complement of the given elements. 
114 | | ComplementOf (Data.Set.Set a) 115 | 116 | 117 | -- | Equality of finite and cofinite subsets of a finite type is decidable. 118 | instance (Bounded a, Enum a, Ord a) => Eq (FiniteSet a) where 119 | These s1 == These s2 = 120 | s1 == s2 121 | p1@(These s1) == p2@(ComplementOf s2) = 122 | size p1 == size p2 && Data.Set.null (s1 `Data.Set.intersection` s2) 123 | p1@(ComplementOf _) == p2@(These _) = 124 | p2 == p1 125 | ComplementOf s1 == ComplementOf s2 = 126 | s1 == s2 127 | 128 | 129 | -- | We can totally order the finite and cofinite subsets of a finite type. 130 | instance forall a.(Bounded a, Enum a, Ord a) => Ord (FiniteSet a) where 131 | -- | Order by size first; then use lexicographical order on the elements. 132 | compare p1 p2 = 133 | compare (size p1) (size p2) <> lex p1 p2 134 | where 135 | lex (ComplementOf s1) (ComplementOf s2) = 136 | -- This is a lot more efficient than turning complemented 137 | -- sets into lists. Note that the order of arguments to the 138 | -- comparison is reversed. 139 | (compare `on` Data.Set.toAscList) s2 s1 140 | lex _ _ = 141 | (compare `on` toList) p1 p2 142 | 143 | 144 | -- | Nicer interface for inputting finite sets over 'Char'. 145 | instance IsString (FiniteSet Char) where 146 | fromString = 147 | These . Data.Set.fromList 148 | 149 | 150 | -- | Allows us to write finite sets as lists. 151 | instance (Bounded a, Enum a, Ord a) => IsList (FiniteSet a) where 152 | type (Item (FiniteSet a)) = a 153 | 154 | fromList = 155 | These . Data.Set.fromList 156 | 157 | -- | We can list all elements in a finite or cofinite subset of 158 | -- a finite type. For infinite types, size of cofinite subsets 159 | -- is infinite, so this is not possible. 160 | toList (These s) = 161 | Data.Set.toAscList s 162 | toList (ComplementOf s) = 163 | [a | a <- [minBound..maxBound], Data.Set.notMember a s] 164 | 165 | 166 | instance Show a => Show (FiniteSet a) where 167 | show (These s) = 168 | "{" ++ intercalate "," (map show $ Data.Set.toList s) ++ "}" 169 | show (ComplementOf s) | Data.Set.null s = 170 | "." 171 | show (ComplementOf s) = 172 | "~{" ++ intercalate "," (map show $ Data.Set.toList s) ++ "}" 173 | 174 | 175 | instance (Read a, Ord a) => Read (FiniteSet a) where 176 | readsPrec _ = 177 | Parser.readP_to_S parser 178 | 179 | where 180 | parser :: Parser.ReadP (FiniteSet a) 181 | parser = do 182 | Parser.skipSpaces 183 | Parser.choice 184 | [ do {Parser.char '.'; return one} 185 | , do {Parser.char '~'; fmap ComplementOf elements} 186 | , do {fmap These elements} 187 | ] 188 | 189 | elements :: Parser.ReadP (Data.Set.Set a) 190 | elements = 191 | Parser.between openBrace closeBrace $ do 192 | elements <- Parser.sepBy (Parser.readS_to_P reads) comma 193 | return $ Data.Set.fromList elements 194 | 195 | openBrace = do 196 | Parser.char '{' 197 | Parser.skipSpaces 198 | 199 | closeBrace = do 200 | Parser.skipSpaces 201 | Parser.char '}' 202 | 203 | comma = do 204 | Parser.char ',' 205 | Parser.skipSpaces 206 | 207 | 208 | -- | Set containing no elements. 209 | empty :: FiniteSet a 210 | empty = 211 | These Data.Set.empty 212 | 213 | 214 | -- | Set containing all elements. 215 | full :: FiniteSet a 216 | full = 217 | ComplementOf Data.Set.empty 218 | 219 | 220 | -- | Complement of a set. 221 | setComplement :: FiniteSet a -> FiniteSet a 222 | setComplement (These s) = 223 | ComplementOf s 224 | setComplement (ComplementOf s) = 225 | These s 226 | 227 | 228 | -- | Intersection of two sets. 
229 | intersection :: Ord a => FiniteSet a -> FiniteSet a -> FiniteSet a 230 | intersection (These s1) (These s2) = 231 | These (Data.Set.intersection s1 s2) 232 | intersection (These s1) (ComplementOf s2) = 233 | These (Data.Set.difference s1 s2) 234 | intersection p1@(ComplementOf _) p2@(These _) = 235 | intersection p2 p1 236 | intersection(ComplementOf s1) (ComplementOf s2) = 237 | ComplementOf (Data.Set.union s1 s2) 238 | 239 | 240 | -- | Intersection of two sets. 241 | union :: Ord a => FiniteSet a -> FiniteSet a -> FiniteSet a 242 | union p1 p2 = 243 | complement $ (complement p1) `intersection` (complement p2) 244 | 245 | 246 | 247 | -- * Operations on finite types 248 | 249 | -- | Number of elements in a finite type. 250 | sizeOfType :: forall a. (Bounded a, Enum a) => a -> Int 251 | sizeOfType _ = 252 | 1 + fromEnum (maxBound :: a) - fromEnum (minBound :: a) 253 | 254 | 255 | -- | We can compute the size of finite or cofinite subsets of 256 | -- a finite type. For infinite types, size of cofinite subsets 257 | -- is infinite. 258 | size :: forall a. (Bounded a, Enum a) => FiniteSet a -> Int 259 | size (These s) = 260 | Data.Set.size s 261 | size (ComplementOf s) = 262 | sizeOfType (undefined :: a) - Data.Set.size s 263 | 264 | 265 | 266 | -- * Testing 267 | 268 | instance (Arbitrary a, Ord a) => Arbitrary (FiniteSet a) where 269 | arbitrary = do 270 | complemented <- arbitrary 271 | set <- arbitrary 272 | case complemented of 273 | False -> 274 | return (These set) 275 | 276 | True -> 277 | return (ComplementOf set) 278 | 279 | shrink (These s) = 280 | fmap These (shrink s) 281 | shrink (ComplementOf s) = 282 | fmap These (shrink s) ++ fmap ComplementOf (shrink s) 283 | 284 | 285 | -- | We use a newtype to define a 'Serial' instance for 'Data.Set.Set' 286 | -- so we don't pollute the global class space. 287 | newtype Set' a = 288 | Set' {unSet' :: Data.Set.Set a} 289 | 290 | instance (Serial m a, Enum a, Bounded a, Ord a) => Serial m (Set' a) where 291 | series = SmallCheck.generate $ \depth -> 292 | Data.Set.toList allSubsets 293 | |> filter ((<= depth) . Data.Set.size) 294 | |> fmap Set' 295 | where 296 | allSubsets = 297 | ([minBound .. maxBound] :: [a]) 298 | |> Data.Set.fromList 299 | |> Data.Set.powerSet 300 | 301 | instance (Serial m a, Enum a, Bounded a, Ord a) => Serial m (FiniteSet a) where 302 | series = 303 | cons1 (These . unSet') \/ cons1 (ComplementOf . unSet') 304 | -------------------------------------------------------------------------------- /src/RegExp/Internal/DFA.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE Rank2Types #-} 3 | {-# LANGUAGE DataKinds #-} 4 | {-# LANGUAGE KindSignatures #-} 5 | {-# LANGUAGE TypeOperators #-} 6 | 7 | {-# LANGUAGE FlexibleContexts #-} 8 | {-# LANGUAGE UndecidableInstances #-} 9 | 10 | -- | Finite state automaton represented as matrices. 
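-- Each machine is stored as a start state, a transition matrix whose
-- entries are character classes, and a vector of accepting states.
-- Converting back to a regular expression interprets this triple
-- @(u, M, v)@ as @u^T M* v@ over languages, following Kozen's matrix
-- construction (see the accompanying report for details).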
11 | module RegExp.Internal.DFA 12 | ( Dfa 13 | 14 | -- * Combining DFAs 15 | , product 16 | 17 | -- * Convert from/to regular expressions 18 | , regexp 19 | , fromRegExp 20 | ) where 21 | 22 | import Prelude hiding (product) 23 | import Flow 24 | 25 | import Control.Exception.Base(assert) 26 | 27 | import Data.Finite 28 | import Data.Singletons 29 | import Data.Singletons.Prelude 30 | import Data.Singletons.TypeLits 31 | 32 | import Data.List (intercalate) 33 | import qualified Data.Set 34 | 35 | import Data.BooleanAlgebra 36 | import Data.Semiring(Semiring(..)) 37 | import Data.KleeneAlgebra 38 | import Data.GSet 39 | 40 | import RegExp.RegExp 41 | import RegExp.Derivative 42 | import RegExp.Language (Language) 43 | import qualified RegExp.Language as Language 44 | 45 | 46 | import SparseVector (SparseVector) 47 | import qualified SparseVector as Vector 48 | 49 | import SparseMatrix (SparseMatrix) 50 | import qualified SparseMatrix as Matrix 51 | 52 | 53 | -- | Deterministic finite state automata that accept words over alphabet @c@. 54 | data Dfa c where 55 | Dfa :: KnownNat n => DfaSize n c -> Dfa c 56 | 57 | 58 | -- | Deterministic finite state automata with @n@ states that accept words 59 | -- over alphabet @c@. 60 | data DfaSize (n :: Nat) c = 61 | DfaSize { 62 | -- | The start state. 63 | start :: Finite n, 64 | 65 | -- | The transition matrix. In order to represent a deterministic 66 | -- machine, the following must hold: 67 | -- * Each row covers the entire alphabet. That is, the union of all 68 | -- entries on a given row must be the entire set of characters. 69 | -- * All entries in a row are pairwise disjoint. 70 | -- 71 | -- The first requirement says that there is at least one transition 72 | -- from every state given a character. This requirement is easy to 73 | -- satisfy by adding an explicit "error" state. 74 | -- 75 | -- The second requirement states that there is at most one transition 76 | -- given a state and a character. 77 | transition :: SparseMatrix n n (CharacterClass c), 78 | 79 | -- | Accepting states. 80 | accept :: SparseVector n Bool 81 | } 82 | 83 | 84 | -- | Verify that the given DFA satisfies the conditions outlined in 'DfaSize', 85 | -- and return the DFA unchanged if so. Raises an exception otherwise. 86 | assertValid :: forall c. GSet c => Dfa c -> Dfa c 87 | assertValid r@(Dfa (d :: DfaSize n c)) = 88 | assert transitionValid $ 89 | r 90 | where 91 | transitionValid = 92 | all 93 | (== one) 94 | [Vector.sum $ Matrix.nthRow r (transition d) | r <- finites] 95 | 96 | 97 | -- | Generic product construction over two DFAs. Intersection 98 | -- and union of DFAs can be recovered as special cases by passing 99 | -- in @('<.>')@ and @('<+>')@, respectively. 100 | product :: forall c. GSet c 101 | => (forall a. BooleanAlgebra a => a -> a -> a) 102 | -> Dfa c 103 | -> Dfa c 104 | -> Dfa c 105 | product f (Dfa (d1 :: DfaSize n c)) (Dfa (d2 :: DfaSize m c)) = 106 | withKnownNat ((sing :: SNat n) %* (sing :: SNat m)) $ 107 | assertValid $ 108 | Dfa $ DfaSize { 109 | start = 110 | state (start d1) (start d2), 111 | 112 | transition = 113 | Matrix.matrix 114 | [ ((state n m, state n' m'), f s1 s2) 115 | | n <- finites 116 | , n' <- finites 117 | , m <- finites 118 | , m' <- finites 119 | , let s1 = transition d1 Matrix.! (n, n') 120 | , let s2 = transition d2 Matrix.! (m, m') 121 | ], 122 | 123 | accept = 124 | Vector.vector 125 | [ (state n m, f a1 a2) 126 | | n <- finites 127 | , m <- finites 128 | , let a1 = accept d1 Vector.! 
n 129 | , let a2 = accept d2 Vector.! m 130 | ] 131 | } 132 | where 133 | -- | State in the product automata that corresponds to the given 134 | -- pair of states. 135 | state :: Finite n -> Finite m -> Finite (n * m) 136 | state i j = 137 | combineProduct (i, j) 138 | 139 | 140 | 141 | -- | DFA that accepts words accepted by both input DFAs. 142 | intersection :: forall c. GSet c => Dfa c -> Dfa c -> Dfa c 143 | intersection (Dfa (d1 :: DfaSize n c)) (Dfa (d2 :: DfaSize m c)) = 144 | withKnownNat ((sing :: SNat n) %* (sing :: SNat m)) $ 145 | assertValid $ 146 | Dfa $ DfaSize { 147 | start = 148 | state (start d1) (start d2), 149 | 150 | transition = 151 | Matrix.matrix 152 | [ ((state n m, state n' m'), s1 <.> s2) 153 | | ((n, n'), s1) <- Matrix.nonZero (transition d1) 154 | , ((m, m'), s2) <- Matrix.nonZero (transition d2) 155 | ], 156 | 157 | accept = 158 | Vector.vector 159 | [ (state n m, a1 <.> a2) 160 | | (n, a1) <- Vector.nonZero (accept d1) 161 | , (m, a2) <- Vector.nonZero (accept d2) 162 | ] 163 | } 164 | where 165 | -- | State in the product automata that corresponds to the given 166 | -- pair of states. 167 | state :: Finite n -> Finite m -> Finite (n * m) 168 | state i j = 169 | combineProduct (i, j) 170 | 171 | 172 | -- | We can form a semiring over DFAs by interpreting them as sets 173 | -- of words. 174 | instance GSet c => Semiring (Dfa c) where 175 | -- | DFA that accepts no words. 176 | zero = 177 | assertValid $ 178 | Dfa $ DfaSize { 179 | start = 180 | finite 0 :: Finite 1, 181 | 182 | transition = 183 | Matrix.matrix [((0, 0), one)], 184 | 185 | accept = 186 | Vector.vector [(0, False)] 187 | } 188 | 189 | -- | DFA that accepts all words. 190 | one = 191 | assertValid $ 192 | Dfa $ DfaSize { 193 | start = 194 | finite 0 :: Finite 1, 195 | 196 | transition = 197 | Matrix.matrix [((0, 0), one)], 198 | 199 | accept = 200 | Vector.vector [(0, True)] 201 | } 202 | 203 | -- | DFA that accepts words accepted by either DFA. 204 | (<+>) = 205 | product (<+>) 206 | 207 | -- | DFA that accepts words accepted by both DFAs. 208 | (<.>) = 209 | intersection 210 | 211 | 212 | -- | We can form a boolean algebra over DFAs by interpreting them as 213 | -- sets of words. 214 | instance GSet c => BooleanAlgebra (Dfa c) where 215 | -- | @complement d@ accepts precisely the words that @d@ doesn't. 216 | complement (Dfa d) = 217 | assertValid $ 218 | Dfa $ 219 | d { 220 | accept = 221 | accept d 222 | |> Vector.toList 223 | |> fmap not 224 | |> zip finites 225 | |> Vector.vector 226 | } 227 | 228 | 229 | 230 | -- * Converting to and from regular expressions 231 | 232 | -- | Convert a DFA to a regular expression. 233 | regexp :: forall c. GSet c => Dfa c -> RegExp c 234 | regexp (Dfa (d :: DfaSize n c)) = 235 | Language.regexp $ 236 | (s `Matrix.times` star m `Matrix.times` t) Matrix.! (0, 0) 237 | where 238 | s :: SparseMatrix 1 n (Language c) 239 | s = 240 | Matrix.fromRows [(0, Vector.vector [(start d, injectBool True)])] 241 | 242 | m :: SparseMatrix n n (Language c) 243 | m = 244 | Matrix.map (Language.language . rLiteral) (transition d) 245 | 246 | t :: SparseMatrix n 1 (Language c) 247 | t = 248 | Matrix.transpose $ 249 | Matrix.fromRows [(0, Vector.map injectBool (accept d))] 250 | 251 | injectBool :: Semiring a => Bool -> a 252 | injectBool True = 253 | one 254 | injectBool False = 255 | zero 256 | 257 | 258 | -- | Convert a regular expression to a DFA. 259 | fromRegExp :: forall c. 
GSet c => RegExp c -> Dfa c 260 | fromRegExp r = 261 | case toSing (fromIntegral $ Data.Set.size derivatives) of 262 | SomeSing (s :: SNat n) -> 263 | withKnownNat s $ 264 | let 265 | -- | States of the constructed DFA will be the derivatives of 266 | -- the input regular expression. We assign each an index. 267 | state :: RegExp c -> Finite n 268 | state r = 269 | finite $ fromIntegral (Data.Set.findIndex r derivatives) 270 | 271 | 272 | -- | Transitions /from/ the given state. 273 | row :: RegExp c -> SparseVector n (CharacterClass c) 274 | row r = 275 | Vector.vector $ 276 | (state rZero, complement $ ors $ next r) : 277 | [ (state (derivative c r), p) 278 | | p <- Data.Set.toList (next r) 279 | , Just c <- [choose p] 280 | ] 281 | in 282 | assertValid $ 283 | Dfa $ DfaSize { 284 | start = 285 | state r, 286 | 287 | transition = 288 | Matrix.fromRows 289 | [(state d, row d) | d <- Data.Set.toList derivatives], 290 | 291 | accept = 292 | Vector.vector [(state d, True) | d <- Data.Set.toList derivatives, nullable d] 293 | } 294 | where 295 | derivatives = 296 | allDerivatives r 297 | 298 | 299 | 300 | -- * Showing DFAs 301 | 302 | instance (GSet c, Show (Set c)) => Show (Dfa c) where 303 | show (Dfa d) = 304 | show d 305 | 306 | 307 | instance (GSet c, KnownNat n, Show (Set c)) => Show (DfaSize n c) where 308 | show d = 309 | intercalate "\n" 310 | [ "Start state:\n " ++ show (start d) 311 | , "Transition matrix:\n" ++ show (transition d) 312 | , "Accepting states:\n " ++ show (accept d) 313 | ] -------------------------------------------------------------------------------- /src/SparseMatrix.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE KindSignatures #-} 4 | {-# LANGUAGE TypeApplications #-} 5 | {-# LANGUAGE TypeOperators #-} 6 | 7 | {-# LANGUAGE FlexibleContexts #-} 8 | {-# LANGUAGE FlexibleInstances #-} 9 | 10 | -- TODO: remove 11 | {-# LANGUAGE AllowAmbiguousTypes #-} 12 | 13 | -- | A very cruddy implementation of sparse matrices. I couldn't 14 | -- find an existing implementation that had all that I needed, so 15 | -- I cooked this up. 16 | -- 17 | -- TODO: find a package or make nicer 18 | module SparseMatrix 19 | ( SparseMatrix 20 | , matrix 21 | , fromRows 22 | , (!) 23 | , nthRow 24 | 25 | , plus 26 | , times 27 | , transpose 28 | 29 | , map 30 | , nonZero 31 | , nonZeroRows 32 | , toList 33 | ) where 34 | 35 | import Flow 36 | import Prelude hiding (map) 37 | 38 | import Data.Function (on) 39 | import Data.List (sortBy, groupBy, intercalate) 40 | import Data.Ord (comparing) 41 | 42 | import Data.Finite 43 | import Data.Singletons 44 | import Data.Singletons.Decide 45 | import Data.Singletons.Prelude 46 | import Data.Singletons.TypeLits 47 | 48 | import Data.Semiring (Semiring(..), DetectableZero(..)) 49 | import Data.KleeneAlgebra 50 | 51 | import SparseVector (SparseVector) 52 | import qualified SparseVector as Vector 53 | 54 | 55 | 56 | -- | A sparse matrix with @r@ rows and @c@ columns over 57 | -- elements of type @a@. 58 | newtype SparseMatrix (r :: Nat) (c :: Nat) a = 59 | UnsafeMakeSparseMatrix { 60 | rows :: SparseVector r (SparseVector c a) 61 | } 62 | 63 | 64 | -- | Value at the given row and column. 65 | (!) :: (DetectableZero a, KnownNat r, KnownNat c) 66 | => SparseMatrix r c a 67 | -> (Finite r, Finite c) 68 | -> a 69 | m ! (r, c) = 70 | (rows m Vector.! r) Vector.! c 71 | 72 | 73 | -- | Row with the given index. 
74 | nthRow :: (DetectableZero a, KnownNat r, KnownNat c) 75 | => Finite r 76 | -> SparseMatrix r c a 77 | -> SparseVector c a 78 | nthRow r m = 79 | rows m Vector.! r 80 | 81 | 82 | -- | Construct a sparse matrix from a list of indexed elements. Indices 83 | -- that don't appear in the list are all set to zero. Duplicate indexes 84 | -- are combined with '(<+>)'. 85 | -- 86 | -- We need detectable zeros so we can filter them out. 87 | matrix :: (DetectableZero a, KnownNat r, KnownNat c) 88 | => [((Finite r, Finite c), a)] 89 | -> SparseMatrix r c a 90 | matrix l = 91 | UnsafeMakeSparseMatrix { 92 | rows = 93 | sortBy (comparing (fst . fst)) l 94 | |> groupBy ((==) `on` (fst . fst)) 95 | |> fmap (\l -> (fst $ fst $ head l, fmap dropRow l)) 96 | |> fmap (\(r, elements) -> (r, Vector.vector elements)) 97 | |> Vector.vector 98 | } 99 | where 100 | dropRow :: ((r, c), a) -> (c, a) 101 | dropRow ((r, c), x)= 102 | (c, x) 103 | 104 | 105 | -- | Construct a sparse matrix from a list of indexed vectors corresponding 106 | -- to the rows of the matrix. 107 | fromRows :: (DetectableZero a, KnownNat r, KnownNat c) 108 | => [(Finite r, SparseVector c a)] 109 | -> SparseMatrix r c a 110 | fromRows rows = 111 | UnsafeMakeSparseMatrix { 112 | rows = 113 | Vector.vector rows 114 | } 115 | 116 | 117 | -- | Matrix addition. 118 | plus :: (DetectableZero a, KnownNat r, KnownNat c) 119 | => SparseMatrix r c a 120 | -> SparseMatrix r c a 121 | -> SparseMatrix r c a 122 | plus m1 m2 = 123 | UnsafeMakeSparseMatrix { 124 | rows = 125 | rows m1 <+> rows m2 126 | } 127 | 128 | 129 | -- | Matrix multiplication. 130 | times :: (DetectableZero a, KnownNat r, KnownNat m, KnownNat c) 131 | => SparseMatrix r m a 132 | -> SparseMatrix m c a 133 | -> SparseMatrix r c a 134 | times m1 m2 = 135 | UnsafeMakeSparseMatrix { 136 | rows = 137 | Vector.map (\r -> Vector.map (\c -> r `cross` c) (rows m2Tr)) (rows m1) 138 | } 139 | where 140 | m2Tr = 141 | transpose m2 142 | 143 | cross v1 v2 = 144 | Vector.sum (v1 <.> v2) 145 | 146 | 147 | -- | Swap the rows of a matrix with its columns. 148 | transpose :: (DetectableZero a, KnownNat r, KnownNat c) 149 | => SparseMatrix r c a 150 | -> SparseMatrix c r a 151 | transpose m = 152 | UnsafeMakeSparseMatrix { 153 | rows = 154 | Vector.vector [(i, Vector.map (Vector.! i) (rows m)) | i <- finites] 155 | } 156 | 157 | 158 | -- | Split a square matrix into four quadrants. 159 | split :: forall s t a. (DetectableZero a, KnownNat s, KnownNat t) 160 | => SparseMatrix (s + t) (s + t) a 161 | -> ( SparseMatrix s s a 162 | , SparseMatrix s t a 163 | , SparseMatrix t s a 164 | , SparseMatrix t t a 165 | ) 166 | split m = 167 | withKnownNat ((sing :: SNat s) %+ (sing :: SNat t)) $ 168 | let 169 | (top, bottom) = 170 | Vector.split (rows m) 171 | 172 | topSplit = 173 | Vector.map Vector.split top 174 | 175 | bottomSplit = 176 | Vector.map Vector.split bottom 177 | 178 | a = UnsafeMakeSparseMatrix { rows = Vector.map fst topSplit } 179 | b = UnsafeMakeSparseMatrix { rows = Vector.map snd topSplit } 180 | c = UnsafeMakeSparseMatrix { rows = Vector.map fst bottomSplit } 181 | d = UnsafeMakeSparseMatrix { rows = Vector.map snd bottomSplit } 182 | in 183 | (a, b, c, d) 184 | 185 | 186 | -- | Combine four quadrants into a single square matrix. 187 | combine :: forall s t a. 
(DetectableZero a, KnownNat s, KnownNat t) 188 | => ( SparseMatrix s s a 189 | , SparseMatrix s t a 190 | , SparseMatrix t s a 191 | , SparseMatrix t t a 192 | ) 193 | -> SparseMatrix (s + t) (s + t) a 194 | combine (a, b, c, d) = 195 | withKnownNat ((sing :: SNat s) %+ (sing :: SNat t)) $ 196 | let 197 | top = 198 | Vector.zipWith (Vector.++) (rows a) (rows b) 199 | 200 | bottom = 201 | Vector.zipWith (Vector.++) (rows c) (rows d) 202 | in 203 | UnsafeMakeSparseMatrix { 204 | rows = 205 | top Vector.++ bottom 206 | } 207 | 208 | 209 | -- | We can map from matrices with one type for elements to another given 210 | -- a semiring homomorphism. Note that this does not work for arbitrary 211 | -- functions. Specifically, this function must map zeros to zeros. 212 | map :: (DetectableZero a, DetectableZero b, KnownNat r, KnownNat c) 213 | => (a -> b) 214 | -> SparseMatrix r c a 215 | -> SparseMatrix r c b 216 | map f m = 217 | UnsafeMakeSparseMatrix { 218 | rows = 219 | Vector.map (Vector.map f) (rows m) 220 | } 221 | 222 | 223 | -- | Iterate over non-zero elements in a matrix. 224 | nonZero :: (KnownNat r, KnownNat c) 225 | => SparseMatrix r c a 226 | -> [((Finite r, Finite c), a)] 227 | nonZero m = 228 | concatMap 229 | (\(r, row) -> [((r, c), a) | (c, a) <- Vector.nonZero row]) 230 | (Vector.nonZero $ rows m) 231 | 232 | 233 | -- | Iterate over non-zero elements in a matrix grouped by rows. 234 | nonZeroRows :: (KnownNat r, KnownNat c) 235 | => SparseMatrix r c a 236 | -> [(Finite r, [(Finite c, a)])] 237 | nonZeroRows m = 238 | fmap 239 | (\(r, row) -> (r, Vector.nonZero row)) 240 | (Vector.nonZero $ rows m) 241 | 242 | 243 | -- | Convert a vector to a list. 244 | toList :: (DetectableZero a, KnownNat r, KnownNat c) => SparseMatrix r c a -> [[a]] 245 | toList m = 246 | fmap Vector.toList (Vector.toList $ rows m) 247 | 248 | 249 | 250 | -- | Square matrices form a semiring. 251 | instance (DetectableZero a, KnownNat n) => Semiring (SparseMatrix n n a) where 252 | -- | Matrix where all entries are zero. 253 | zero = 254 | matrix [] 255 | 256 | -- | Matrix where the diagonal is one. 257 | one = 258 | matrix [((i, i), one) | i <- [0..]] 259 | 260 | -- | Matrix addition. 261 | (<+>) = 262 | plus 263 | 264 | -- | Matrix multiplication. 265 | (<.>) = 266 | times 267 | 268 | 269 | -- | We can recognize zero matrices. 270 | instance (DetectableZero a, KnownNat n) => DetectableZero (SparseMatrix n n a) where 271 | isZero m = 272 | isZero (rows m) 273 | 274 | 275 | -- | Square matrices over Kleene algebra form a Kleene algebra. 276 | instance (DetectableZero a, KleeneAlgebra a, KnownNat n) => KleeneAlgebra (SparseMatrix n n a) where 277 | star m | Proved Refl <- (sing :: SNat n) %~ (sing :: SNat 0) = 278 | m 279 | star m | Proved Refl <- (sing :: SNat n) %~ (sing :: SNat 1) = 280 | matrix [((0,0), star (m ! (0, 0)))] 281 | star m = 282 | -- TODO: get rid of 'unsafeCoerce' or limit it to proving @n = small + large@. 
283 | withKnownNat ((sing :: SNat n) `sDiv` (sing :: SNat 2)) $ 284 | withKnownNat (((sing :: SNat n) %+ (sing :: SNat 1)) `sDiv` (sing :: SNat 2)) $ 285 | withKnownNat (((sing :: SNat n) `sDiv` (sing :: SNat 2)) 286 | %+ 287 | (((sing :: SNat n) %+ (sing :: SNat 1)) `sDiv` (sing :: SNat 2)) 288 | ) $ 289 | case (sing :: SNat n) %~ (sing :: SNat ((n `Div` 2) + ((n + 1) `Div` 2))) of 290 | Proved Refl -> 291 | combine (a', b', c', d') 292 | where 293 | a :: SparseMatrix (n `Div` 2) (n `Div` 2) a 294 | (a, b, c, d) = 295 | split m 296 | 297 | -- a' :: SparseMatrix small small a 298 | a' = star (a `plus` (b `times` star d `times` c)) 299 | 300 | -- b' :: SparseMatrix small large a 301 | b' = star (a `plus` (b `times` star d `times` c)) `times` b `times` star d 302 | 303 | -- c' :: SparseMatrix large small a 304 | c' = star (d `plus` (c `times` star a `times` b)) `times` c `times` star a 305 | 306 | -- d' :: SparseMatrix large large a 307 | d' = star (d `plus` (c `times` star a `times` b)) 308 | 309 | Disproved _-> 310 | error "impossible" 311 | 312 | 313 | 314 | -- | Equality of matrices is decidable. 315 | deriving instance Eq a => Eq (SparseMatrix r c a) 316 | 317 | 318 | -- | We can totally order matrices. 319 | deriving instance Ord a => Ord (SparseMatrix r c a) 320 | 321 | 322 | instance (DetectableZero a, Show a, KnownNat r, KnownNat c) => Show (SparseMatrix r c a) where 323 | show m = 324 | intercalate "\n" 325 | [ intercalate " " (fmap (padded widest) row) | row <- grid ] 326 | 327 | where 328 | -- | Matrix as a list of lists. 329 | grid :: [[a]] 330 | grid = 331 | fmap Vector.toList (Vector.toList $ rows m) 332 | 333 | -- | Show an element of the matrix. 334 | show :: a -> String 335 | show a = 336 | showsPrec 11 a "" 337 | 338 | -- | Width of the widest entry in the matrix. 339 | widest :: Int 340 | widest = 341 | foldr max 0 [ length (show a) | a <- concat grid ] 342 | 343 | -- | Show with a constant width. 
344 | padded :: Int -> a -> String 345 | padded width a = 346 | let 347 | s = show a 348 | in 349 | s ++ take (width - length s) (repeat ' ') -------------------------------------------------------------------------------- /paper/report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt]{article} 2 | 3 | \usepackage{fullpage} % More space 4 | \usepackage{tgpagella} % Better font 5 | \usepackage{microtype} % Slightly improved typography 6 | 7 | \usepackage{hyperref} 8 | \usepackage{cleveref} 9 | \usepackage{todonotes} 10 | 11 | \usepackage{amsmath, amsfonts, amsthm, amssymb, mathtools} % math 12 | \usepackage{mathpartir} 13 | 14 | \usepackage{syntax} 15 | \setlength{\grammarindent}{4em} % increase separation between LHS/RHS 16 | 17 | \usepackage{macro/generic} 18 | \usepackage{macro/code} 19 | 20 | 21 | %===================================================================== 22 | % Author 23 | %===================================================================== 24 | 25 | \title{A Regular Expression Library for Haskell} 26 | \author{Josh Acay \\ \href{mailto:ca483@cornell.edu}{ca483@cornell.edu}} 27 | \date{May 22, 2018} 28 | 29 | 30 | %===================================================================== 31 | % Macros 32 | %===================================================================== 33 | 34 | \newtheorem{theorem}{Theorem} 35 | \newtheorem{example}{Example} 36 | 37 | \DeclareMathOperator{\lang}{\mathcal{L}} 38 | \DeclareMathOperator{\derivative}{D} 39 | \DeclareMathOperator{\nullable}{nullable} 40 | 41 | \newcommand{\transpose}{^\top} 42 | 43 | \newcommand{\by}[1]{\parens{\text{#1}}} 44 | \newcommand{\since}[1]{\parens{\text{since #1}}} 45 | \newcommand{\eqBy}[1]{\braces{\text{#1}}} 46 | 47 | \newcommand{\haskell}{\lstinline} 48 | 49 | 50 | 51 | \begin{document} 52 | \maketitle 53 | 54 | \begin{abstract} 55 | I detail the implementation of a regular expression library for Haskell.\footnote{% 56 | Available online at \url{https://github.com/cacay/regexp}.} 57 | Unlike similar libraries in the wild, this one supports more than just matching strings: it can compute intersections and complements, take derivatives \'a la Brzozowski \cite{Brzozowski64}, check for equivalence, and solve systems of linear equations with regular expression coefficients. In addition, the library is designed to be generic over the alphabet (even allowing infinite ones) so it is not tied to Haskell's builtin \haskell{Char} and \haskell{String} types. 58 | \end{abstract} 59 | 60 | 61 | \section{Introduction} 62 | Regular expressions provide a simple yet powerful language for string searching and matching. They are expressive enough to describe many common patterns that arise in practice (e.g.\ alternatives and repetition) but restrictive enough that many desirable properties are effectively decidable (e.g.\ equivalence and containment checking). Additionally, regular languages are closed under intersection, union, and complement making it possible to expose a natural interface that uses familiar connectives like ``and'', ``or'', and ``not''. 63 | 64 | Unfortunately, practical implementations of regular expressions focus too much on string matching and forego all the benefits of having a restricted language. For instance, I was 65 | unable to find a single popular regular expression library (for any programming language) that supports equivalence checking. 
66 | % Very few support complement and/or intersection, but do so by extending the language 67 | % rather than computing the complement or intersection in terms of the basic operators. 68 | In fact, many modern implementations add features such as capture groups and backreferences which break most closure properties and make equivalence checking undecidable \cite{CampeanuSY03}. 69 | 70 | Although string matching is sufficient for many application, there are cases where the additional power is useful (e.g.\ during design and development) or necessary.\footnote{% 71 | I sketch the application this library was built for in \cref{application}.} 72 | Here, I detail the design and development of a Haskell library that exposes this additional power to the user. 73 | 74 | 75 | \section{Specification} 76 | 77 | Given an alphabet $\Sigma$, regular expressions have the following syntax: 78 | \begin{grammar} 79 | ::= 1 80 | \alt + 81 | \alt $\cdot$ 82 | \alt $^*$ 83 | \alt $l \subseteq \Sigma$ 84 | \end{grammar} 85 | 86 | The interpretation of these expressions is standard: 87 | $1$ matches the empty word, 88 | $e_1 + e_2$ matches either $e_1$ or $e_2$, 89 | $e_1 \cdot e_2$ matches $e_1$ followed by $e_2$, 90 | $e^*$ matches zero or more copies of $e$, 91 | and the literal $l$ matches single character words $a$ where $a \in l$. 92 | This is a generalization of the standard syntax which only allows single characters as literals rather than sets of characters or \emph{character classes} and encodes the set $\set{a_1, a_2, \ldots, a_n}$ as $a_1 + a_2 + \cdots + a_n$. I use character classes directly since they generalize to infinite alphabets (which I will discuss in \cref{effective-boolean-algebra}) and are more efficient to implement. The expression 0---which matches no strings---is represented with the empty character class $\emptyset$. 93 | 94 | It is easy enough to define a parametrized algebraic data type \haskell{RegExp $\Sigma$} that mirrors this syntax. I implement the following operations on regular expressions: 95 | \begin{itemize} 96 | \item \haskell{matches :: RegExp $\Sigma$ -> [$\Sigma$] -> Bool}\\ 97 | \haskell{matches $e$ $w$} returns \haskell{True} whenever the expression $e$ matches the word $w$. 98 | 99 | \item \haskell{complement :: RegExp $\Sigma$ -> RegExp $\Sigma$}\\ 100 | \haskell{complement $e$} returns a regular expression that matches precisely the words $e$ does not match. 101 | 102 | \item \haskell{intersection :: RegExp $\Sigma$ -> RegExp $\Sigma$ -> RegExp $\Sigma$}\\ 103 | \haskell{intersection $e_1$ $e_2$} returns a regular expression that matches words both $e_1$ and $e_2$ match. 104 | 105 | \item \haskell{equivalent :: RegExp $\Sigma$ -> RegExp $\Sigma$ -> Either [$\Sigma$] ()}\\ 106 | \haskell{equivalent $e_1$ $e_2$} returns \haskell{Right ()} (i.e.\ ``true'') if $e_1$ and $e_2$ are equivalent, and \haskell{Left $w$} otherwise. Here, the word $w$ is a counterexample that is matched by one expression but not the other. 
107 | 108 | \item \haskell{solve :: LinearSystem $\Sigma$ -> RegExp $\Sigma$}: given a system of linear equations of the form: 109 | \begin{gather*} 110 | X_1 = e_1^0 + e_1^1 X_1 + e_1^2 X_2 + \cdots e_1^n X_n\\ 111 | X_2 = e_2^0 + e_2^1 X_1 + e_2^2 X_2 + \cdots e_2^n X_n\\ 112 | \vdots\\ 113 | X_m = e_m^0 + e_m^1 X_1 + e_m^2 X_2 + \cdots e_m^n X_n 114 | \end{gather*} 115 | where each $e_i^j$ is non-nullable (i.e.\ doesn't match the empty string), solve for $X_1$.\footnote{% 116 | Actual type of \haskell{solve} is slightly different, but amounts to the same thing.} Note that all coefficients need to be on the same of the variables to ensure there is a solution. I arbitrarily pick left. 117 | 118 | \haskell{solve} is a versatile and powerful function. It can derive \haskell{intersection}, \haskell{complement}, and many other operators. However, I observed that going through deterministic finite automata gives more succinct expressions. 119 | \end{itemize} 120 | 121 | 122 | \section{Implementation} 123 | 124 | \subsection{Derivatives of Regular Expressions} 125 | 126 | The entirety of this library reduces to Brzozowski derivatives, deterministic finite state automata (DFAs), and the correspondence between them. The derivative of a regular expression $e$ with respect to a character $a \in \Sigma$ is a regular expression $e'$ such that $e'$ matches a word $w$ if and only if $e$ matches $a w$. Brzozowski showed that derivatives always exist and that they could be computed syntactically \cite{Brzozowski64}. This has an obvious extension to arbitrary words $w$, which I denote by $\derivative_w{(e)}$. This gives a trivial way to implement matching: 127 | \begin{equation*} 128 | \haskell{matches $e$ $w$ = nullable ($\derivative_w{(e)}$)} 129 | \end{equation*} 130 | where \haskell{nullable $e$} if and only if $e$ can match the empty word. There is no need to explicitly convert to a DFA, although one might still want to for efficiency's sake. 131 | 132 | \subsection{Computing Intersection and Complement} 133 | 134 | To compute intersection and complement, I go through the standard motions of converting regular expressions to DFAs, performing the relevant operation on the DFA representations (product construction for intersection and flipping accepting/non-accepting states for complement), and converting back. Although the concept of going through DFAs is not new, the methods I use are hard to come by in real-world code (even though they are not novel theoretically). 135 | 136 | In the forward direction, I construct a DFA directly from a regular expression using Brzozowski derivatives instead of determinizing a nondeterministic finite state automaton, which is what most implementations do. The idea is simple: derivatives of a regular expression are the states of the deterministic automaton and there is an $a$ transition from $e$ to $e'$ if $\derivative_a{(e)} = e'$. Considering the derivative with respect to all words gives the full automaton. Brzozowski showed in \cite{Brzozowski64} that this process generates only finitely many states as long as regular expressions are compared modulo associativity, commutativity, and idempotence of $+$ (ACI). 
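For a concrete example, take $e = (a \cdot b)^*$ over $\Sigma = \set{a, b}$: we have $\derivative_a{(e)} = b \cdot (a \cdot b)^*$ and $\derivative_b{(b \cdot (a \cdot b)^*)} = e$, while every other derivative is $0$, so the construction yields an automaton with exactly three states.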
I achieve this by keeping the type \haskell{RegExp} abstract and only exposing ``smart'' constructors that normalize regular expressions with respect to the following equalities: 137 | \begin{mathpar} 138 | (r + s) + t = r + (s + t) 139 | \and r + s = s + t 140 | \and r + r = r 141 | % 142 | \\ 0 + r = r 143 | % 144 | \\ (r \cdot s) \cdot t = r \cdot (s \cdot t) 145 | \and 1 \cdot r = r = r \cdot 1 146 | \and 0 \cdot r = 0 = r \cdot 0 147 | % 148 | \\ (r^*)^* = r^* 149 | \and 0^* = 1 150 | \and 1^* = 1 151 | \end{mathpar} 152 | The first three equalities are required for termination as discussed; others reduce the number of states generated which speeds up execution and results in smaller more readable expressions. To get even smaller expressions, I keep regular expressions in strong star normal form, which was introduced as a linear time simplification method in \cite{GruberG10}. Very basically, the strong star normal form limits applications of $^*$ to non-nullable expressions, so $(1 + a)^*$ would get converted to the equivalent $a^*$. 153 | 154 | 155 | The other direction, converting a DFA into a regular expression, is essentially unimplemented: I was unable to find a single library, or even a single piece of working code that does it\footnote{% 156 | I did find a Coq library described in \cite{DoczkalKS13} that implements DFA to regular expression conversion and does so in a mechanically verified way, but being a constructive proof, the implementation was so inefficient it was unable to convert the expression $0$ to a DFA and back.}. 157 | Worse yet, a web search yields no results that are easy to turn into an algorithm. Most sources talk about solving a system of linear equations but offer no insight into how. Such methods might work if one is doing this by hand, but of course a computer requires a more formal specification. 158 | 159 | The method I settled on is due to Kozen \cite{Kozen94}. The idea is simple and elegant: represent the DFA in matrix form as a triple $\angled{u, M, v}$ where $u$ and $v$ are $\set{0, 1}$ vectors representing starting and accepting states, respectively, and $M$ is the transition matrix. Then, $u\transpose M^* v$ gives a regular expression that matches the set of words accepted by this DFA\@. I use a home-baked implementation of sparse vectors and sparse matrices to represent DFAs since I was unable to find an existing Haskell library that was up to the task. The most interesting part of this is the implementation of Kleene star for matrices, which uses the divide-and-conquer method of \cite{Kozen94}. Further details on the technique can be found in \cite{Kozen94}. 160 | 161 | 162 | \subsection{Equality Checking} 163 | 164 | I use Hopcroft and Karp's bisimulation algorithm for checking DFA equivalence \cite{HopcroftK71} modified to use a union-find structure as described in \cite{BonchiP11}. However, I generate the DFAs on the fly using Brzozowski derivatives instead of computing them upfront. This gives the algorithm a chance to fail early and avoid the exponential time process of converting to a DFA\@. The implementation is able to generate a counterexample if the expressions are not equivalent. 
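\begin{example}
Checking $(a \cdot a)^*$ against $a^*$ fails after a single step of the bisimulation: both expressions are nullable, but their derivatives with respect to $a$, namely $a \cdot (a \cdot a)^*$ and $a^*$, disagree on nullability. The algorithm therefore reports the one-character word $a$ as a counterexample; it is matched by $a^*$ but not by $(a \cdot a)^*$.
\end{example}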
165 | 166 | 167 | \subsection{Solving Systems of Linear Equations} 168 | 169 | Given a system of linear equations of the form 170 | \begin{gather*} 171 | X_1 = e_1^0 + e_1^1 X_1 + e_1^2 X_2 + \cdots e_1^n X_n\\ 172 | X_2 = e_2^0 + e_2^1 X_1 + e_2^2 X_2 + \cdots e_2^n X_n\\ 173 | \vdots\\ 174 | X_m = e_m^0 + e_m^1 X_1 + e_m^2 X_2 + \cdots e_m^n X_n 175 | \end{gather*} 176 | where $e_i^j$ are non-nullable, it is possible to solve for $X_1$ using Arden's lemma and Gaussian elimination. Arden's lemma states that the unique solution to the equation $X = A \cdot X + B$ is $A^* B$ as long as $A$ is non-nullable. Using the lemma, it is possible to eliminate variables one at a time by walking down the list\footnote{% 177 | Apply the lemma to the first equation to get an equation for $X_1$ that doesn't refer to itself, then substitute it in for $X_1$ in all following equations. Rinse and repeat until you reach the bottom.} until $X_m$ is reached. At this point, the equation for $X_m$ only refers to itself, so apply Arden's lemma to get a closed expression for $X_m$. This can be propagated back using substitution by walking up the list. At each step, the equation for a variable will only contain that variable so Arden's lemma gives a closed form solution. 178 | 179 | Linear equation solving is useful for many applications, one of which is computing intersection and complement. It is easy to see that 180 | \[ \haskell{intersection $e_1$ $e_2$} 181 | = (\haskell{nullable $e_1$} = \haskell{nullable $e_2$}) + \sum_{a \in \Sigma}{a \cdot \haskell{intersection (derivative $a$ $e_1$) (derivative a $e_2$)}} 182 | \] 183 | and 184 | \[ \haskell{complement $e$} 185 | = \neg(\haskell{nullable $e$}) + \sum_{a \in \Sigma}{a \cdot \haskell{complement (derivative $a$ $e$)}} 186 | \] 187 | (interpreting booleans as regular expressions in the expected way). Unfolding these definitions indefinitely and treating every application of the operator to unique input(s) as a fresh variable results in a finite system of linear equations since every regular expression has a finite set of derivatives modulo ACI as discussed. 188 | 189 | 190 | \subsection{Generalizing Literals}\label{effective-boolean-algebra} 191 | 192 | Rather than fixing literals to be sets of characters, I define them to be elements of an effective boolean algebra $\angled{\Sigma, U, \bracks{\cdot}, \sqcup, \sqcap, \bot, \top}$. 193 | The implementation closely follows the presentation in \cite{KeilT14} so I will elide the details. The important thing to note is that effective boolean algebras have many useful instantiations including finite subsets of a finite alphabet, finite and cofinite (a set whose complement is finite) subsets of an infinite alphabet, a logic of decidable propositions over an alphabet and so on. 194 | 195 | My implementation is generic against this abstract interface so all these options can be made to work easily, but I only provide an implementation of finite and cofinite subsets of a \emph{finite} alphabet. One might wonder the point of supporting cofinite subsets of a finite alphabet since they are finite sets after all. The reason is that supporting cofinite subsets gives major speedups when working with very large alphabets such as the set of Unicode characters. For example, the representation of \haskell{complement $\set{a}$} is simply the complemented set $\neg\set{a}$ which needs to list a single element. Representing this set directly would require listing all characters that are not ``a'' of which there are millions. 
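Concretely, the interface the library is written against (simplified from the \haskell{GSet} class in the implementation) consists of an associated type of character classes equipped with the boolean algebra operations $\sqcup$, $\sqcap$, $\bot$, $\top$, and complement, together with three functions:
\begin{itemize}
\item \haskell{singleton :: $\Sigma$ -> Set $\Sigma$}, the class containing exactly one character;

\item \haskell{member :: $\Sigma$ -> Set $\Sigma$ -> Bool}, which decides whether a character belongs to the denotation $\bracks{\cdot}$ of a class;

\item \haskell{choose :: Set $\Sigma$ -> Maybe $\Sigma$}, which returns some element of a non-empty class and \haskell{Nothing} for the empty class.
\end{itemize}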
196 | 197 | 198 | \section{Testing and Verification} 199 | 200 | I employ a mix of static and dynamic techniques to ensure correctness. Static guarantees are a result of Haskell's incredibly powerful type system. For example, I use algebraic data types to ensure all regular expressions conform to the syntax described in this paper. With normalized regular expressions, I had to go a step further and use generalized algebraic data types (a.k.a.\ indexed algebraic data types) \cite{CheneyH03} to check invariants such as ``Kleene star is only applied to non-nullable regular expressions''. This is achieved by indexing the data type of regular expression by a boolean \haskell{isNullable} with the obvious meaning. The implementation of sparse vectors and sparse matrices goes yet another step further straight into the dependently typed territory. These types are indexed by their size to enforce invariants such as ``only vectors of equal length are added together''. I use the singletons library \cite{Singletons} for this purpose, which gives Haskell limited dependent programming capabilities. I insert dynamic assertions wherever the type system falls short. These dynamic checks are useful for two reasons: (1) they increase the likelihood of catching bugs, and (2) they make tracking down the root cause of said bugs much easier.\footnote{% 201 | I discovered one bug due to an assertion failure. A different unrelated bug in the regular expression to DFA conversion code was discovered during testing by manual inspection of output, but I tracked it down by inserting dynamic checks (which, in hindsight, should have been there to begin with). 202 | } 203 | 204 | In addition to static checks, I use unit tests and property testing to further increase the confidence in the library. Unit tests are assertions on ground terms such as 205 | \[ \haskell{intersection $(a + b)^*$ $a^*$} = a^* \] 206 | where $a$, $b$ are concrete elements of a concrete type $\Sigma$, say, \haskell{Char}. Haskell's hspec library\footnote{% 207 | \url{https://hackage.haskell.org/package/hspec} 208 | } provides a good framework for writing such tests. I use SmallCheck \cite{SmallCheck} and QuickCheck \cite{QuickCheck} for property testing. These libraries allow one to write logical assertions such as 209 | \[ \forall e_1, e_2. \haskell{equivalent $e_1$ $e_2$} = \haskell{Right ()} 210 | \implies \forall w. \haskell{matches $e_1$ $w$} = \haskell{matches $e_2$ $w$} 211 | \] 212 | and verifies these assertions by instantiating them with concrete terms. QuickCheck randomly generates a fixed number of examples whereas SmallCheck systematically generates \emph{all} inputs. I use QuickCheck to test against a large alphabet (Unicode characters), and SmallCheck to tests against a very small alphabet with three characters. SmallCheck was very helpful for catching bugs at corner cases and generating simple counterexamples (since the alphabet is very small). 213 | 214 | During random testing, I frequently ran into exponential behavior with functions \haskell{equivalent}, \haskell{intersection}, and \haskell{complement}. These operations have proven lower bounds \cite{Kozen77,GeladeN12} so occasional bad behavior is unavoidable, but the usual claim is that they fare much better on real-world inputs (see \cite{FosterKM0T15} for instance). It is interesting to note that this claim apparently does not extend to \emph{random} inputs. 
As a solution, I limit the size of randomly generated regular expressions to 4 or 5 operators which is sufficient to get high code coverage. 215 | 216 | This was the first time I took testing seriously and it certainly paid off. I had been of the opinion that having the backing of a powerful type system like Haskell's and writing well-structured code would make it nigh impossible to introduce bugs. However, despite being extra careful and utilizing Haskell's type system to its fullest, I discovered half a dozen bugs during testing. On the plus side, I was able to focus on writing ``interesting'' tests thanks to Haskell's type system (users of untyped languages have to write hundreds of tests essentially asserting their program is well-typed). This goes on to show that type systems and testing go hand in hand. 217 | 218 | \section{An Application: Unicode Grapheme Cluster Breaks}\label{application} 219 | 220 | In this section, I briefly talk about the reason this library exists in the first place. 221 | 222 | Unicode has become the de facto standard for international text. However, it is such a complicated beast that most people have misconceptions about how Unicode works. One of these misconceptions is the idea that each character you see on the screen corresponds to a single Unicode code point. This is a reasonable expectation since ``a'', ``!'', and ``$\Omega$'' are all Unicode code points. However, there are ``characters'' that require more than one Unicode code point to represent. For example, G with acute accent \'G requires two. Such a group of code points that represent a single user-perceived character is called a grapheme cluster. 223 | 224 | Unicode strings are transmitted as flat sequences of code points, so breaking them up into their grapheme clusters might (and should) be a first step in any code that is to handle Unicode correctly. The Unicode standard gives a declarative specification of how this should be done\footnote{\url{http://unicode.org/reports/tr29}} using an ordered list of rules with the following two forms: $e_1 \times e_2$ meaning break at positions where the left-hand side matches $e_1$ and the right-hand side matches $e_2$, and $e_1 \div e_2$ meaning do not break at matching positions. When rules overlap, the first one in the list applies. The Unicode technical report claims without proof that these rules can be converted into a regular expression that can be used to extract grapheme clusters (by repeating a longest match for example). 225 | 226 | I thought this was an interesting way to specify text segmentation rules, so I wanted to come up with an algorithm to do this translation for the general case. I had a translation in mind that used intersection and complement extensively, so I implemented this library to see the algorithm in action. Unfortunately, that algorithm was not correct, and fixing it is left for future work. 227 | 228 | \section{Conclusion and Future Work} 229 | I presented a regular expressions library for Haskell that implements very well-known theoretical results that somehow never made their appearance in popular tools. As always, some work remains to be done. First, there is significant room for optimization. My implementation of sparse vectors and sparse matrices could use a lot of work, and suppressing Haskell's default lazy semantics in computation heavy regions might lead to significant speedups. 
Second, I would like to implement DFA minimization and use it to simplify regular expressions generated by \haskell{intersection} and \haskell{complement} operations. Normalizing regular expressions already simplifies them quite a bit, and going through DFAs using Brzozowski derivatives generally produces readable (and perhaps even minimal) expressions, but there are cases where the output is quite hairy. I'm hoping minimizing DFAs could remedy this situation. Finally, the interface could use some work. The code is modularized in a way that makes sense from the implementer's perspective, but it is not the interface that should be exposed to the user. 230 | 231 | 232 | 233 | %===================================================================== 234 | % Bibliography 235 | %===================================================================== 236 | 237 | \bibliographystyle{acm} 238 | \bibliography{bibliography,bibliography-extra} 239 | 240 | \end{document} 241 | -------------------------------------------------------------------------------- /src/RegExp/RegExp.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE KindSignatures #-} 4 | {-# LANGUAGE TypeOperators #-} 5 | 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | {-# LANGUAGE FlexibleContexts #-} 8 | {-# LANGUAGE FlexibleInstances #-} 9 | {-# LANGUAGE UndecidableInstances #-} 10 | 11 | 12 | -- | Definition of regular expressions over /character classes/ 13 | -- (i.e. sets of characters from an arbitrary type). The definition 14 | -- is left abstract, which frees us to simplify regular expressions 15 | -- using (sound) rewriting rules. We normalize regular expressions 16 | -- enough to guarantee that the set of Brzozowski derivatives are 17 | -- finite. 18 | module RegExp.RegExp 19 | ( CharacterClass 20 | , RegExp 21 | 22 | -- * Deconstructing regular expressions 23 | , RegExpView(..) 24 | , view 25 | , hide 26 | 27 | -- * Properties 28 | , nullable 29 | , empty 30 | 31 | -- * #combining# Combining regular expressions 32 | , rZero 33 | , rOne 34 | , rPlus 35 | , rTimes 36 | , rStar 37 | , rLiteral 38 | ) where 39 | 40 | import Control.Exception.Base(assert) 41 | import Unsafe.Coerce(unsafeCoerce) 42 | 43 | import Data.Singletons 44 | import Data.Singletons.Decide 45 | import Data.Singletons.Prelude 46 | 47 | import Data.Set(Set) 48 | import qualified Data.Set as Set 49 | 50 | import Data.Either (isRight) 51 | import qualified Data.List as List 52 | import Data.String (IsString(..)) 53 | import qualified Text.ParserCombinators.ReadP as Parser 54 | 55 | import Data.GSet (GSet) 56 | import qualified Data.GSet as GSet 57 | import Data.Semiring (Semiring(..)) 58 | 59 | import Test.QuickCheck as QuickCheck 60 | import Test.SmallCheck.Series as SmallCheck 61 | 62 | 63 | -- | Sets of characters from an alphabet 'c'. 64 | type CharacterClass c = 65 | GSet.Set c 66 | 67 | 68 | -- | Regular expressions that support character classes over an alphabet 69 | -- @c@ (so we don't have to encode them using choice). The type 70 | -- is left abstract so we can apply rewriting rules to simplify and 71 | -- normalize expressions. Refer to 'view' and 'RegExpView' for 72 | -- inspecting 'RegExp', and to 'hide' and [the relevant section](#combining) 73 | -- for constructing them. 74 | -- 75 | -- Normalizing expressions not only makes them smaller and more 76 | -- readable, but also ensures termination for some algorithms, so 77 | -- it is a good idea overall. 
We normalize with respect to the 78 | -- following equations: 79 | -- 80 | -- == Associativity, Commutativity, and Idempotence of @+@ (ACI) 81 | -- 82 | -- prop> (r + s) + t = r + (s + t) 83 | -- prop> r + s = s + t 84 | -- prop> r + r = r 85 | -- 86 | -- == Identity for @+@ 87 | -- 88 | -- prop> 0 + r = r 89 | -- 90 | -- == Associativity of @.@ 91 | -- 92 | -- prop> (r . s) . t = r . (s . t) 93 | -- 94 | -- == Identity and Annihilator for @.@ 95 | -- 96 | -- prop> 1 . r = r = r . 1 97 | -- prop> 0 . r = 0 = r . 0 98 | -- 99 | -- == Star 100 | -- 101 | -- prop> (r*)* = r* 102 | -- prop> 0* = 1 103 | -- prop> 1* = 1 104 | -- 105 | -- In addition, we keep regular expressions in strong star normal form 106 | -- which is described in 107 | -- [Simplifying Regular Expressions A Quantitative Perspective](https://pdfs.semanticscholar.org/1b6b/5843442a64523ccb7afd21eabec7881b4219.pdf). 108 | -- In practice, this means @*@ is only applied to expression that 109 | -- cannot match the empty word. 110 | -- 111 | -- Brzozowski proved that normalizing with respect to ACI ensures 112 | -- there are only finitely many derivatives of a regular expression. 113 | -- So ACI is necessary for any algorithm that relies on taking repeated 114 | -- derivatives of regular expressions. 115 | data RegExp c where 116 | RZero :: RegExp c 117 | ROne :: RegExp c 118 | RNormalized :: (NormalizedRegExp c isUnion isSeq isNullable) -> RegExp c 119 | 120 | -- | Syntactic equality. 121 | instance GSet c => Eq (RegExp c) where 122 | RZero == RZero = 123 | True 124 | ROne == ROne = 125 | True 126 | RNormalized r1 == RNormalized r2 = 127 | r1 `hEq` r2 128 | _ == _ = 129 | False 130 | 131 | -- | An arbitrary syntactic ordering. Useful for defining sets and 132 | -- maps over regular expressions. 133 | instance (GSet c, Ord (CharacterClass c)) => Ord (RegExp c) where 134 | compare RZero RZero = 135 | EQ 136 | compare RZero _ = 137 | LT 138 | compare _ RZero = 139 | GT 140 | 141 | compare ROne ROne = 142 | EQ 143 | compare ROne _ = 144 | LT 145 | compare _ ROne = 146 | GT 147 | 148 | compare (RNormalized r1) (RNormalized r2) = 149 | r1 `hCompare` r2 150 | 151 | -- | Nicer interface for inputting regular expression over 'Char'. 152 | -- For example, 153 | -- 154 | -- > "abc" :: 'RegExp' 'Char' 155 | -- 156 | -- is the regular expression that matches single character strings 157 | -- @"a"@, @"b"@, and @"c"@ (it doesn't match the string @"abc"@). 158 | instance IsString (RegExp Char) where 159 | fromString = 160 | rLiteral . fromString 161 | 162 | 163 | 164 | -- * Deconstructing regular expressions 165 | 166 | -- | Standard syntax for regular expressions. We omit @Zero@ since 167 | -- it can be encoded as @'Literal' 'zero'@. 168 | data RegExpView c r where 169 | -- | Match the empty string and nothing else. 170 | One :: RegExpView c r 171 | 172 | -- | Match the left or the right expression. 173 | Plus :: r -> r -> RegExpView c r 174 | 175 | -- | Match the left then the right expression. 176 | Times :: r -> r -> RegExpView c r 177 | 178 | -- | Match zero or more copies of the given expression. 179 | Star :: r -> RegExpView c r 180 | 181 | -- | Match any character in the character class. The character 182 | -- class might be empty, in which case this matches no strings. 183 | Literal :: CharacterClass c -> RegExpView c r 184 | 185 | deriving instance Functor (RegExpView c) 186 | 187 | 188 | 189 | -- | Expose the abstract type 'RegExp' as a 'RegExpView'. 190 | view :: forall c. 
GSet c => RegExp c -> RegExpView c (RegExp c) 191 | view RZero = 192 | Literal zero 193 | view ROne = 194 | One 195 | view (RNormalized r) = 196 | view' r 197 | where 198 | view' :: NormalizedRegExp c isUnion isSeq isNullable 199 | -> RegExpView c (RegExp c) 200 | view' (NUnion p n s) | p /= zero, Set.null n, Set.null s = 201 | Literal p 202 | view' (NUnion p n s) | p /= zero = 203 | nUnion p Set.empty Set.empty `Plus` nUnion zero n s 204 | view' (NUnion _ n s) | otherwise = 205 | case (Set.minView n, Set.minView s) of 206 | (Nothing, Nothing) -> 207 | error "impossible" 208 | 209 | (Just (SubUnion r, n), _) -> 210 | RNormalized r `Plus` nUnion zero n s 211 | 212 | (Nothing, Just (SubUnion r, s)) -> 213 | RNormalized r `Plus` nUnion zero Set.empty s 214 | 215 | view' (NUnionWithOne p s) = 216 | ROne `Plus` nUnion p Set.empty s 217 | 218 | view' (NSeq l) | Some1 (SubSeq r1) ::: (Some1 l') <- nSeqView l 219 | , Some1 (SubSeq r2) ::: (Some1 l'') <- nSeqView l' 220 | , Nil <- nSeqView l'' = 221 | RNormalized r1 `Times` RNormalized r2 222 | view' (NSeq l) | Some1 (SubSeq r1) ::: Some1 l' <- nSeqView l = 223 | RNormalized r1 `Times` RNormalized (NSeq l') 224 | view' (NSeq l) | otherwise = 225 | error "impossible" 226 | 227 | view' (NStar r) = 228 | Star (RNormalized r) 229 | 230 | 231 | -- | Pack the public view 'RegExpView' back into the abstract view 'RegExp'. 232 | hide :: GSet c => RegExpView c (RegExp c) -> RegExp c 233 | hide One = 234 | rOne 235 | hide (Plus r1 r2) = 236 | rPlus r1 r2 237 | hide (Times r1 r2) = 238 | rTimes r1 r2 239 | hide (Star r) = 240 | rStar r 241 | hide (Literal p) = 242 | rLiteral p 243 | 244 | 245 | 246 | -- * Properties 247 | 248 | -- | 'True' if and only if the regular expression can match the 249 | -- empty word. 250 | nullable :: GSet c => RegExp c -> Bool 251 | nullable RZero = 252 | False 253 | nullable ROne = 254 | True 255 | nullable (RNormalized r) = 256 | isRight (nullableNormalized r) 257 | 258 | 259 | -- | 'True' if and only if the regular expression matches no words. 260 | empty :: RegExp c -> Bool 261 | empty RZero = 262 | True 263 | empty _ = 264 | False 265 | 266 | 267 | 268 | -- * Normalized regular expressions 269 | 270 | -- | We define a type of /normalized regular expressions/ to help us 271 | -- statically ensure the properties described in 'RexExp'. Terms of this 272 | -- type denote expressions that 273 | -- 1. are fully normalized with respect to the rewriting rules in 'RegExp', 274 | -- 2. are in strong star normal form, 275 | -- 3. know whether they are nullable (i.e. we can determine in constant time 276 | -- whether they can match the empty word or not). 277 | -- 278 | -- (1) and (2) are direct requirements. (3) helps us specify when a regular 279 | -- expression is in star normal form. Additionally, being able to compute 280 | -- nullability in constant time speeds up derivative based algorithms. 281 | -- 282 | -- To enforce these requirements, we use data structures that intrinsically 283 | -- capture the required properties. For example, instead of using binary 284 | -- unions, we define unions over /sets/ of subexpressions since sets capture 285 | -- associativity, commutativity, and idempotence. Similarly, we use lists 286 | -- for sequencing so we get associativity automatically. 
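-- As a rough illustration, @(r + s) + r@ and @r + (s + r)@ both
-- collapse into a single union node over the set @{r, s}@, and
-- @(r . s) . t@ and @r . (s . t)@ both flatten into the sequence
-- @[r, s, t]@, so these laws hold by construction rather than by
-- rewriting.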
287 | -- 288 | -- Normalized expressions are indexed based on whether the root constructor 289 | -- is a union or a sequence so we can statically disallow nesting unions 290 | -- under unions (since they can be combined into one) and similarly for 291 | -- sequencing. There is an additional index to track whether the expression 292 | -- is nullable. 293 | -- 294 | -- Normalized expressions cannot encode the regular expression that matches 295 | -- no words (i.e. @Zero@) or the regular expression that only matches the 296 | -- empty word (i.e. @One@). These are added by 'RegExp'. This is in line 297 | -- with the development in 298 | -- /Simplifying Regular Expressions A Quantitative Perspective/. 299 | data NormalizedRegExp c (isUnion :: Bool) (isSeq :: Bool) (isNullable :: Bool) where 300 | -- | Union of a literal (a character set) and a set of subexpressions. 301 | -- Note that we disallow standalone literal nodes under unions and 302 | -- instead bundle them with the union nodes themselves to ensure we 303 | -- combine literals using set union instead of syntactic regular expression 304 | -- union. This gives an additional level of normalization. 305 | -- 306 | -- We do not have a separate case for literals since they can be 307 | -- represented as a union node with an empty set of subexpressions. 308 | -- 309 | -- We keep nullable and strict (non-nullable) subexpressions separate to 310 | -- satisfy requirement (3): a union node is nullable if the set of 311 | -- nullable subexpressions is not empty. This is checked dynamically. 312 | -- 313 | -- To satisfy (1) and (2), we require that for every @NUnion p n s@, the 314 | -- following should hold: 315 | -- 316 | -- @p /= zero || 'length' n + 'length' s >= 2@. 317 | -- 318 | -- Additionally, 'NUnion' and 'NUnionWithOne' should not occur as 319 | -- subexpressions since these can be hoisted up. 320 | NUnion :: CharacterClass c 321 | -> Set (SubUnion c True) 322 | -> Set (SubUnion c False) 323 | -> NormalizedRegExp c True False isNullable 324 | 325 | -- | The empty word or a union. This corresponds to the @?@ constructor 326 | -- from the paper, but generalized over a union of subexpressions 327 | -- (as opposed to a single expression) so that expressions like @a? + b?@ 328 | -- get simplified to @(a + b)?@. 329 | -- 330 | -- For every @NUnionWithOne p s@, we require that 331 | -- @p /= zero || 'length' s >= 1@ and @s@ does not contain 'NUnion' or 332 | -- 'NUnionWithOne'. 333 | NUnionWithOne :: CharacterClass c 334 | -> Set (SubUnion c False) 335 | -> NormalizedRegExp c True False True 336 | 337 | 338 | -- | Sequential composition of a list of subexpressions. Each cons node 339 | -- in the list needs to store whether the sequential composition of that 340 | -- and the following nodes is nullable or not in order to satisfy (3). 341 | -- 342 | -- The list must contain at least two elements and 'NSeq' nodes cannot 343 | -- appear as subexpressions. 344 | NSeq :: NSeq c isNullable 345 | -> NormalizedRegExp c False True isNullable 346 | 347 | -- | Iteration. The iterated expression cannot be nullable, but the 348 | -- overall expression always is. 349 | NStar :: NormalizedRegExp c isUnion isSeq False 350 | -> NormalizedRegExp c False False True 351 | 352 | 353 | -- | Normalized regular expressions that can appear under a union node.
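-- Concretely, only sequences ('NSeq') and iterations ('NStar') may sit
-- directly under a union: literals are folded into the union's character
-- class, and nested unions are hoisted into the enclosing union as
-- described above.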
354 | data SubUnion c (isNullable :: Bool) where 355 | SubUnion :: !(NormalizedRegExp c False isSeq isNullable) 356 | -> SubUnion c isNullable 357 | 358 | 359 | -- | Normalized regular expressions that can appear under a seq node. 360 | data SubSeq c (isNullable :: Bool) where 361 | SubSeq :: !(NormalizedRegExp c isUnion False isNullable) 362 | -> SubSeq c isNullable 363 | 364 | 365 | -- | Sequential composition of a list of subexpressions. Each node 366 | -- in the sequence keeps track of whether the subsequence starting 367 | -- with that element is nullable. 368 | data NSeq c (isNullable :: Bool) where 369 | -- | Empty list. Nullable since it corresponds to @One@. 370 | NSeqNil :: NSeq c True 371 | 372 | -- | Tack on a nullable subexpression. The result is nullable if 373 | -- and only if the rest of the list is. 374 | NSeqConsNullable :: Sing (isNullable :: Bool) 375 | -> SubSeq c True 376 | -> NSeq c isNullable 377 | -> NSeq c isNullable 378 | 379 | -- | Tack on a strict (non-nullable) subexpression. The result 380 | -- is always strict. 381 | NSeqConsStrict :: SubSeq c False 382 | -> NSeq c isNullable 383 | -> NSeq c False 384 | 385 | 386 | 387 | -- * Smart constructors to check invariants we cannot encode statically. 388 | 389 | 390 | -- | Compute the union of the given sets of subexpressions. 391 | -- This constructor is always safe: it will always construct 392 | -- a valid expression, and it will always "do the right thing". 393 | nUnion :: GSet c 394 | => CharacterClass c 395 | -> Set (SubUnion c True) 396 | -> Set (SubUnion c False) 397 | -> RegExp c 398 | nUnion p n s | p /= zero || Set.size n + Set.size s >= 2 = 399 | if Set.null n then 400 | RNormalized (nUnionStrict p s) 401 | else 402 | RNormalized (nUnionNullable p n s) 403 | nUnion _ n s | otherwise = 404 | case (Set.lookupMin n, Set.lookupMin s) of 405 | (Nothing, Nothing) -> 406 | RZero 407 | 408 | (Just (SubUnion r), Nothing) -> 409 | RNormalized r 410 | 411 | (Nothing, Just (SubUnion r)) -> 412 | RNormalized r 413 | 414 | (Just _, Just _) -> 415 | error "impossible" 416 | 417 | 418 | -- | Safe and smart constructor for 'NUnion' that always returns a 419 | -- nullable expression. 420 | nUnionNullable :: GSet c 421 | => CharacterClass c 422 | -> Set (SubUnion c True) 423 | -> Set (SubUnion c False) 424 | -> NormalizedRegExp c True False True 425 | nUnionNullable p n s = 426 | assert (not $ Set.null n) $ 427 | assert (p /= zero || Set.size n + Set.size s >= 2) $ 428 | NUnion p n s 429 | 430 | 431 | -- | Safe and smart constructor for 'NUnion' that always returns a 432 | -- non-nullable expression. 433 | nUnionStrict :: GSet c 434 | => CharacterClass c 435 | -> Set (SubUnion c False) 436 | -> NormalizedRegExp c True False False 437 | nUnionStrict p s = 438 | assert (p /= zero || Set.size s >= 2) $ 439 | NUnion p Set.empty s 440 | 441 | 442 | -- | Safe and smart constructor for 'NUnionWithOne'. 443 | nUnionWithOne :: GSet c 444 | => CharacterClass c 445 | -> Set (SubUnion c False) 446 | -> NormalizedRegExp c True False True 447 | nUnionWithOne p s = 448 | assert (p /= zero || Set.size s >= 1) $ 449 | NUnionWithOne p s 450 | 451 | 452 | -- | Safe and smart constructor for 'NSeq'. 453 | nSeq :: GSet c 454 | => NSeq c isNullable 455 | -> NormalizedRegExp c False True isNullable 456 | nSeq l = 457 | assert (isValid l) $ 458 | NSeq l 459 | where 460 | -- | Check that the list has at least two subexpressions. 
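        -- Empty and single-element lists are rejected: such expressions
        -- should never be wrapped in an 'NSeq' node in the first place.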
461 | isValid NSeqNil = 462 | False 463 | isValid (NSeqConsNullable _ _ NSeqNil) = 464 | False 465 | isValid (NSeqConsStrict _ NSeqNil) = 466 | False 467 | isValid _ = 468 | True 469 | 470 | 471 | -- | Alias for 'NStar' for uniformity. 472 | nStar :: GSet c 473 | => NormalizedRegExp c isUnion isSeq False 474 | -> NormalizedRegExp c False False True 475 | nStar = 476 | NStar 477 | 478 | 479 | 480 | -- * Working with normalized expressions 481 | 482 | nullableNormalized :: GSet c 483 | => NormalizedRegExp c isUnion isSeq isNullable 484 | -> Either (isNullable :~: False) (isNullable :~: True) 485 | nullableNormalized (NUnion _ n _)= 486 | if Set.null n then 487 | Left (unsafeCoerce Refl) 488 | else 489 | Right (unsafeCoerce Refl) 490 | nullableNormalized (NUnionWithOne _ _) = 491 | Right Refl 492 | nullableNormalized (NSeq l) = 493 | nullableNSeq l 494 | nullableNormalized (NStar _) = 495 | Right Refl 496 | 497 | 498 | nullableNSeq :: NSeq c isNullable 499 | -> Either (isNullable :~: False) (isNullable :~: True) 500 | nullableNSeq NSeqNil = 501 | Right Refl 502 | nullableNSeq (NSeqConsNullable SFalse _ _) = 503 | Left Refl 504 | nullableNSeq (NSeqConsNullable STrue _ _) = 505 | Right Refl 506 | nullableNSeq (NSeqConsStrict _ _) = 507 | Left Refl 508 | 509 | 510 | 511 | -- | View 'NSeq' as a list. 512 | nSeqView :: NSeq c isNullable 513 | -> ListView (Some1 (NSeq c)) (Some1 (SubSeq c)) 514 | nSeqView NSeqNil = 515 | Nil 516 | nSeqView (NSeqConsNullable _ h t) = 517 | Some1 h ::: (Some1 t) 518 | nSeqView (NSeqConsStrict h t) = 519 | Some1 h ::: (Some1 t) 520 | 521 | 522 | -- | Construct an 'NSeq' that contains a single subexpression. 523 | nSeqSingleton :: GSet c => SubSeq c isNullable -> Some1 (NSeq c) 524 | nSeqSingleton r@(SubSeq n) = 525 | case nullableNormalized n of 526 | Left Refl -> 527 | Some1 $ NSeqConsStrict r NSeqNil 528 | 529 | Right Refl -> 530 | Some1 $ NSeqConsNullable STrue r NSeqNil 531 | 532 | 533 | -- | Append two 'NSeq's. 534 | nSeqAppend :: NSeq c isNullable1 535 | -> NSeq c isNullable2 536 | -> NSeq c (isNullable1 && isNullable2) 537 | nSeqAppend NSeqNil l2 = 538 | l2 539 | nSeqAppend (NSeqConsNullable SFalse h1 t1) l2 = 540 | NSeqConsNullable SFalse h1 (nSeqAppend t1 l2) 541 | nSeqAppend (NSeqConsNullable STrue h1 t1) l2 = 542 | case nullableNSeq l2 of 543 | Left Refl -> 544 | NSeqConsNullable SFalse h1 (nSeqAppend t1 l2) 545 | Right Refl -> 546 | NSeqConsNullable STrue h1 (nSeqAppend t1 l2) 547 | nSeqAppend (NSeqConsStrict h1 t1) l2 = 548 | NSeqConsStrict h1 (nSeqAppend t1 l2) 549 | 550 | 551 | 552 | -- * Comparing normalized expressions 553 | 554 | -- GHC cannot derive 'Eq' and 'Ord' instances for normalized regular 555 | -- expressions because of all the existential quantification going 556 | -- on. Since we only care about ordering so we can put expression 557 | -- in sets, we define ordering on the underlying untyped terms. We 558 | -- do this efficiently by defining heterogeneous equality and comparison. 559 | 560 | -- | Heterogeneous equality. 561 | class HEq a b where 562 | hEq :: a -> b -> Bool 563 | 564 | 565 | -- | Heterogeneous ordering. 
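-- As with 'HEq', the two arguments may differ in their type indices, so
-- we can order, say, a nullable union against a strict sequence, which a
-- plain 'Ord' instance on a single index could not express.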
566 | class HOrd a b where 567 | hCompare :: a -> b -> Ordering 568 | 569 | 570 | instance GSet c => HEq (NormalizedRegExp c u1 s1 n1) (NormalizedRegExp c u2 s2 n2) where 571 | hEq (NUnion p1 n1 s1) (NUnion p2 n2 s2) = 572 | p1 == p2 && n1 == n2 && s1 == s2 573 | hEq (NUnionWithOne p1 s1) (NUnionWithOne p2 s2) = 574 | p1 == p2 && s1 == s2 575 | hEq (NSeq l1) (NSeq l2) = 576 | l1 `hEq` l2 577 | hEq (NStar r1) (NStar r2) = 578 | r1 `hEq` r2 579 | hEq _ _ = 580 | False 581 | 582 | instance GSet c => HEq (SubUnion c n1) (SubUnion c n2) where 583 | hEq (SubUnion r1) (SubUnion r2) = 584 | r1 `hEq` r2 585 | 586 | instance GSet c => HEq (SubSeq c n1) (SubSeq c n2) where 587 | hEq (SubSeq r1) (SubSeq r2) = 588 | r1 `hEq` r2 589 | 590 | instance GSet c => HEq (NSeq c n1) (NSeq c n2) where 591 | hEq NSeqNil NSeqNil = 592 | True 593 | hEq (NSeqConsNullable n1 h1 t1) (NSeqConsNullable n2 h2 t2) = 594 | fromSing n1 == fromSing n2 && h1 `hEq` h2 && t1 `hEq` t2 595 | hEq (NSeqConsStrict h1 t1) (NSeqConsStrict h2 t2) = 596 | h1 `hEq` h2 && t1 `hEq` t2 597 | hEq _ _ = 598 | False 599 | 600 | 601 | instance GSet c => Eq (NormalizedRegExp c isUnion isSeq isNullable) where 602 | (==) = hEq 603 | 604 | instance GSet c => Eq (SubUnion c isNullable) where 605 | (==) = hEq 606 | 607 | instance GSet c => Eq (SubSeq c isNullable) where 608 | (==) = hEq 609 | 610 | instance GSet c => Eq (NSeq c isNullable) where 611 | (==) = hEq 612 | 613 | 614 | 615 | instance (GSet c, Ord (CharacterClass c)) => HOrd (NormalizedRegExp c u1 s1 n1) (NormalizedRegExp c u2 s2 n2) where 616 | hCompare (NUnion p1 n1 s1) (NUnion p2 n2 s2) = 617 | p1 `compare` p2 <> n1 `compare` n2 <> s1 `compare` s2 618 | hCompare (NUnion _ _ _) _ = 619 | LT 620 | hCompare _ (NUnion _ _ _) = 621 | GT 622 | 623 | hCompare (NUnionWithOne p1 s1) (NUnionWithOne p2 s2) = 624 | p1 `compare` p2 <> s1 `compare` s2 625 | hCompare (NUnionWithOne _ _) _ = 626 | LT 627 | hCompare _ (NUnionWithOne _ _) = 628 | GT 629 | 630 | hCompare (NSeq l1) (NSeq l2) = 631 | l1 `hCompare` l2 632 | hCompare (NSeq _) _ = 633 | LT 634 | hCompare _ (NSeq _) = 635 | GT 636 | 637 | hCompare (NStar r1) (NStar r2) = 638 | r1 `hCompare` r2 639 | 640 | instance (GSet c, Ord (CharacterClass c)) => HOrd (SubUnion c n1) (SubUnion c n2) where 641 | hCompare (SubUnion r1) (SubUnion r2) = 642 | r1 `hCompare` r2 643 | 644 | instance (GSet c, Ord (CharacterClass c)) => HOrd (SubSeq c n1) (SubSeq c n2) where 645 | hCompare (SubSeq r1) (SubSeq r2) = 646 | r1 `hCompare` r2 647 | 648 | instance (GSet c, Ord (CharacterClass c)) => HOrd (NSeq c n1) (NSeq c n2) where 649 | hCompare NSeqNil NSeqNil = 650 | EQ 651 | hCompare NSeqNil _ = 652 | LT 653 | hCompare _ NSeqNil = 654 | GT 655 | 656 | hCompare (NSeqConsNullable n1 h1 t1) (NSeqConsNullable n2 h2 t2) = 657 | fromSing n1 `compare` fromSing n2 <> h1 `hCompare` h2 <> t1 `hCompare` t2 658 | hCompare (NSeqConsNullable _ _ _) _ = 659 | LT 660 | hCompare _ (NSeqConsNullable _ _ _) = 661 | GT 662 | 663 | hCompare (NSeqConsStrict h1 t1) (NSeqConsStrict h2 t2) = 664 | h1 `hCompare` h2 <> t1 `hCompare` t2 665 | 666 | 667 | instance (GSet c, Ord (CharacterClass c)) => Ord (NormalizedRegExp c isUnion isSeq isNullable) where 668 | compare = hCompare 669 | 670 | instance (GSet c, Ord (CharacterClass c)) => Ord (SubUnion c isNullable) where 671 | compare = hCompare 672 | 673 | instance (GSet c, Ord (CharacterClass c)) => Ord (SubSeq c isNullable) where 674 | compare = hCompare 675 | 676 | instance (GSet c, Ord (CharacterClass c)) => Ord (NSeq c isNullable) where 677 | 
compare = hCompare 678 | 679 | 680 | 681 | -- * Constructing and combining regular expressions 682 | 683 | -- | Regular expression that matches no strings. 684 | rZero :: RegExp c 685 | rZero = 686 | RZero 687 | 688 | 689 | -- | Regular expression that matches the empty string and nothing else. 690 | rOne :: RegExp c 691 | rOne = 692 | ROne 693 | 694 | 695 | -- | Regular expression that matches strings that either regular expression 696 | -- matches. 697 | rPlus :: forall c. GSet c => RegExp c -> RegExp c -> RegExp c 698 | rPlus RZero r2 = 699 | r2 700 | rPlus r1 RZero = 701 | r1 702 | 703 | rPlus ROne ROne = 704 | ROne 705 | rPlus ROne result@(RNormalized r2) = 706 | case nullableNormalized r2 of 707 | Left Refl -> 708 | case r2 of 709 | NUnion p2 n2 s2 -> 710 | assert (Set.null n2) $ 711 | RNormalized $ nUnionWithOne p2 s2 712 | 713 | NSeq _ -> 714 | RNormalized $ nUnionWithOne zero (Set.singleton $ SubUnion r2) 715 | 716 | Right Refl -> 717 | result 718 | rPlus r1 ROne = 719 | rPlus ROne r1 720 | 721 | rPlus (RNormalized r1) (RNormalized r2) = 722 | rPlus' r1 r2 723 | where 724 | rPlus' :: NormalizedRegExp c isUnion1 isSeq1 isNullable1 725 | -> NormalizedRegExp c isUnion2 isSeq2 isNullable2 726 | -> RegExp c 727 | rPlus' (NUnion p1 n1 s1) (NUnion p2 n2 s2) = 728 | nUnion (p1 <+> p2) (Set.union n1 n2) (Set.union s1 s2) 729 | rPlus' (NUnion p1 n1 s1) (NUnionWithOne p2 s2) | Set.null n1 = 730 | RNormalized $ 731 | nUnionWithOne (p1 <+> p2) (Set.union s1 s2) 732 | rPlus' (NUnion p1 n1 s1) (NUnionWithOne p2 s2) | otherwise = 733 | RNormalized $ 734 | nUnionNullable (p1 <+> p2) n1 (Set.union s1 s2) 735 | rPlus' (NUnion p1 n1 s1) r2@(NSeq _) = 736 | case nullableNormalized r2 of 737 | Left Refl -> 738 | nUnion p1 n1 (Set.insert (SubUnion r2) s1) 739 | 740 | Right Refl -> 741 | nUnion p1 (Set.insert (SubUnion r2) n1) s1 742 | rPlus' (NUnion p1 n1 s1) r2@(NStar _) = 743 | nUnion p1 (Set.insert (SubUnion r2) n1) s1 744 | rPlus' r1 r2@(NUnion _ _ _) = 745 | rPlus' r2 r1 746 | 747 | rPlus' (NUnionWithOne p1 s1) (NUnionWithOne p2 s2) = 748 | RNormalized $ 749 | nUnionWithOne (p1 <+> p2) (Set.union s1 s2) 750 | rPlus' (NUnionWithOne p1 s1) r2@(NSeq _) = 751 | case nullableNormalized r2 of 752 | Left Refl -> 753 | RNormalized $ 754 | nUnionWithOne p1 (Set.insert (SubUnion r2) s1) 755 | 756 | Right Refl -> 757 | nUnion p1 (Set.singleton $ SubUnion r2) s1 758 | rPlus' (NUnionWithOne p1 s1) r2@(NStar _) = 759 | nUnion p1 (Set.singleton $ SubUnion r2) s1 760 | rPlus' r1 r2@(NUnionWithOne _ _) = 761 | rPlus' r2 r1 762 | 763 | rPlus' r1@(NSeq _) r2@(NSeq _) = 764 | rPlusNotUnion r1 r2 765 | rPlus' r1@(NSeq _) r2@(NStar _) = 766 | rPlusNotUnion r1 r2 767 | rPlus' r1@(NStar _) r2@(NSeq _) = 768 | rPlusNotUnion r1 r2 769 | rPlus' r1@(NStar _) r2@(NStar _) = 770 | rPlusNotUnion r1 r2 771 | 772 | 773 | rPlusNotUnion :: NormalizedRegExp c False isSeq1 isNullable1 774 | -> NormalizedRegExp c False isSeq2 isNullable2 775 | -> RegExp c 776 | rPlusNotUnion r1 r2 | r1 `hEq` r2 = 777 | RNormalized r1 778 | rPlusNotUnion r1 r2 | otherwise = 779 | case (nullableNormalized r1, nullableNormalized r2) of 780 | (Right Refl, Right Refl) -> 781 | RNormalized $ 782 | nUnionNullable zero (Set.fromList [SubUnion r1, SubUnion r2]) Set.empty 783 | 784 | (Right Refl, Left Refl) -> 785 | RNormalized $ 786 | nUnionNullable zero (Set.singleton $ SubUnion r1) (Set.singleton $ SubUnion r2) 787 | 788 | (Left Refl, Right Refl) -> 789 | RNormalized $ 790 | nUnionNullable zero (Set.singleton $ SubUnion r2) (Set.singleton $ SubUnion r1) 791 | 
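            -- Neither argument can match the empty word, so the
            -- resulting union is strict as well.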
792 | (Left Refl, Left Refl) -> 793 | RNormalized $ 794 | nUnionStrict zero (Set.fromList [SubUnion r1, SubUnion r2]) 795 | 796 | 797 | -- | Regular expression that matches the first expression followed 798 | -- by the second. 799 | rTimes :: forall c. GSet c => RegExp c -> RegExp c -> RegExp c 800 | rTimes RZero _ = 801 | RZero 802 | rTimes _ RZero = 803 | RZero 804 | 805 | rTimes ROne r2 = 806 | r2 807 | rTimes r1 ROne = 808 | r1 809 | 810 | rTimes (RNormalized r1) (RNormalized r2) = 811 | case (isSeq r1, isSeq r2) of 812 | (Left Refl, Left Refl) -> 813 | case (nSeqSingleton $ SubSeq r1, nSeqSingleton $ SubSeq r2) of 814 | (Some1 l1, Some1 l2) -> 815 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 816 | 817 | (Right Refl, Left Refl) | NSeq l1 <- r1 -> 818 | case nSeqSingleton (SubSeq r2) of 819 | Some1 l2 -> 820 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 821 | 822 | (Left Refl, Right Refl) | NSeq l2 <- r2 -> 823 | case nSeqSingleton (SubSeq r1) of 824 | Some1 l1 -> 825 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 826 | 827 | (Right Refl, Right Refl) | NSeq l1 <- r1 828 | , NSeq l2 <- r2 -> 829 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 830 | where 831 | isSeq :: NormalizedRegExp c isUnion isSeq isNullable 832 | -> Either (isSeq :~: False) (isSeq :~: True) 833 | isSeq (NUnion _ _ _) = 834 | Left Refl 835 | isSeq (NUnionWithOne _ _) = 836 | Left Refl 837 | isSeq (NSeq _) = 838 | Right Refl 839 | isSeq (NStar _) = 840 | Left Refl 841 | 842 | 843 | -- | Regular expression that matches zero or more copies of the given 844 | -- expression. 845 | rStar :: forall c. GSet c => RegExp c -> RegExp c 846 | rStar RZero = 847 | ROne 848 | rStar ROne = 849 | ROne 850 | rStar r@(RNormalized (NStar _)) = -- Optimize a common case 851 | r 852 | rStar (RNormalized r) = 853 | case nullableNormalized r of 854 | Left Refl -> 855 | RNormalized $ nStar r 856 | 857 | Right Refl | (p, s) <- removeOne r -> 858 | if p /= zero || Set.size s >= 2 then 859 | RNormalized $ nStar (nUnionStrict p s) 860 | else 861 | assert (p == zero && Set.size s == 1) $ 862 | case Set.findMin s of 863 | SubUnion r -> 864 | RNormalized $ nStar r 865 | where 866 | -- | If @removeOne r = r'@, then @r* = (uncurry nUnionStrict r')*@. 867 | removeOne :: NormalizedRegExp c isUnion isSeq True 868 | -> (CharacterClass c, Set (SubUnion c False)) 869 | removeOne (NUnion p n s) = 870 | List.foldl' merge (p, s) n' 871 | where 872 | n' = fmap (\(SubUnion r) -> removeOne r) (Set.toList n) 873 | removeOne (NUnionWithOne p s) = 874 | (p, s) 875 | removeOne (NSeq NSeqNil) = 876 | (zero, Set.empty) 877 | removeOne (NSeq (NSeqConsNullable STrue (SubSeq r) t)) = 878 | removeOne r `merge` removeOne (NSeq t) 879 | removeOne (NStar (NUnion p n s)) = 880 | assert (Set.null n) $ 881 | (p, s) 882 | removeOne (NStar r@(NSeq l)) = 883 | (zero, Set.singleton (SubUnion r)) 884 | 885 | -- | Combine two strict union components. 886 | merge :: (CharacterClass c, Set (SubUnion c False)) 887 | -> (CharacterClass c, Set (SubUnion c False)) 888 | -> (CharacterClass c, Set (SubUnion c False)) 889 | merge (p1, s1) (p2, s2) = 890 | (p1 <+> p2, s1 `Set.union` s2) 891 | 892 | 893 | -- | Regular expression that matches single character strings picked 894 | -- from the given character class. 895 | rLiteral :: forall c. 
GSet c => CharacterClass c -> RegExp c 896 | rLiteral p | p == zero = 897 | RZero 898 | rLiteral p | otherwise = 899 | nUnion p Set.empty Set.empty 900 | 901 | 902 | -- * Printing 903 | 904 | instance (GSet c, Show (CharacterClass c)) => Show (RegExp c) where 905 | showsPrec d RZero = 906 | showString "{}" 907 | 908 | showsPrec d ROne = 909 | showString "<>" 910 | 911 | showsPrec d (RNormalized r) = 912 | showsPrec d r 913 | 914 | 915 | instance (GSet c, Show (CharacterClass c), Show r) => Show (RegExpView c r) where 916 | showsPrec _ One = 917 | showString "<>" 918 | showsPrec d (Plus r1 r2) = 919 | showParen (d > plusPrec) $ 920 | showsPrec plusPrec r1 . showString " ++ " . showsPrec plusPrec r2 921 | where 922 | plusPrec = 9 923 | showsPrec d (Times r1 r2) = 924 | showParen (d > timesPrec) $ 925 | showsPrec timesPrec r1 . showString "##" . showsPrec timesPrec r2 926 | where 927 | timesPrec = 10 928 | showsPrec d (Star r) = 929 | showParen (d > starPrec) $ 930 | showsPrec starPrec r . showString "**" 931 | where 932 | starPrec = 11 933 | showsPrec d (Literal p) = 934 | showsPrec d p 935 | 936 | 937 | instance (GSet c, Show (CharacterClass c)) => Show (NormalizedRegExp c isUnion isSeq isNullable) where 938 | showsPrec d (NUnion p n s) = 939 | showUnion d p (n' ++ s') 940 | where 941 | n' = fmap Some1 (Set.toList n) 942 | s' = fmap Some1 (Set.toList s) 943 | 944 | showsPrec d (NUnionWithOne p s) = 945 | showParen (d > unionWithOnePrec) $ 946 | showUnion unionWithOnePrec p s' . showString "?" 947 | where 948 | unionWithOnePrec = 8 949 | s' = fmap Some1 (Set.toList s) 950 | 951 | showsPrec d (NSeq l) = 952 | showParen (d > seqPrec) $ 953 | intercalate (showString "#") (toList l) 954 | where 955 | seqPrec = 7 956 | 957 | toList :: NSeq c n -> [ShowS] 958 | toList NSeqNil = 959 | [] 960 | toList (NSeqConsNullable _ h t) = 961 | showsPrec seqPrec h : toList t 962 | toList (NSeqConsStrict h t) = 963 | showsPrec seqPrec h : toList t 964 | 965 | showsPrec d (NStar r) = 966 | showParen (d > starPrec) $ 967 | showsPrec starPrec r . showString "*" 968 | where 969 | starPrec = 8 970 | 971 | 972 | instance (GSet c, Show (CharacterClass c)) => Show (SubUnion c isNullable) where 973 | showsPrec d (SubUnion r) = 974 | showsPrec d r 975 | 976 | 977 | instance (GSet c, Show (CharacterClass c)) => Show (SubSeq c isNullable) where 978 | showsPrec d (SubSeq r) = 979 | showsPrec d r 980 | 981 | 982 | -- | Behavior common to showing 'NUnion' and 'NUnionWithOne'. 983 | showUnion :: (GSet c, Show (CharacterClass c)) 984 | => Int 985 | -> CharacterClass c 986 | -> [Some1 (SubUnion c)] 987 | -> ShowS 988 | showUnion d p l = 989 | showParen (d > unionPrec && numElements > 1) $ 990 | intercalate (showString " + ") (literal ++ map showSub l) 991 | where 992 | unionPrec = 6 993 | 994 | prec = 995 | if numElements > 1 then unionPrec else d 996 | 997 | showSub (Some1 r) = 998 | showsPrec prec r 999 | 1000 | literal = 1001 | if p == zero then 1002 | [] 1003 | else 1004 | [showsPrec prec p] 1005 | 1006 | numElements = length literal + length l 1007 | 1008 | 1009 | 1010 | -- | Combine a list of string builders using another as a separator. 1011 | intercalate :: ShowS -> [ShowS] -> ShowS 1012 | intercalate sep l = 1013 | foldr (.) 
id (List.intersperse sep l) 1014 | 1015 | 1016 | 1017 | -- * Parsing 1018 | 1019 | instance (GSet c, Read (CharacterClass c)) => Read (RegExp c) where 1020 | readsPrec _ = 1021 | Parser.readP_to_S parser 1022 | 1023 | where 1024 | parser :: Parser.ReadP (RegExp c) 1025 | parser = do 1026 | Parser.skipSpaces 1027 | pPlus 1028 | 1029 | pPlus :: Parser.ReadP (RegExp c) 1030 | pPlus = 1031 | Parser.choice 1032 | [ do 1033 | left <- pTimes 1034 | Parser.char '+' 1035 | Parser.skipSpaces 1036 | right <- pPlus 1037 | return $ left `rPlus` right 1038 | 1039 | , pTimes 1040 | ] 1041 | 1042 | pTimes :: Parser.ReadP (RegExp c) 1043 | pTimes = 1044 | Parser.choice 1045 | [ do 1046 | left <- pStar 1047 | Parser.char '#' 1048 | Parser.skipSpaces 1049 | right <- pTimes 1050 | return $ left `rTimes` right 1051 | 1052 | , pStar 1053 | ] 1054 | 1055 | pStar :: Parser.ReadP (RegExp c) 1056 | pStar = do 1057 | atom <- pAtom 1058 | ops <- Parser.many postfix 1059 | return $ foldr ($) atom ops 1060 | 1061 | pAtom :: Parser.ReadP (RegExp c) 1062 | pAtom = 1063 | Parser.choice 1064 | [ do 1065 | p <- Parser.readS_to_P reads 1066 | Parser.skipSpaces 1067 | return $ rLiteral p 1068 | 1069 | , do 1070 | Parser.string "<>" 1071 | Parser.skipSpaces 1072 | return rOne 1073 | 1074 | , do 1075 | Parser.char '(' 1076 | Parser.skipSpaces 1077 | r <- pPlus 1078 | Parser.char ')' 1079 | Parser.skipSpaces 1080 | return r 1081 | ] 1082 | 1083 | postfix :: Parser.ReadP (RegExp c -> RegExp c) 1084 | postfix = 1085 | Parser.choice 1086 | [ do 1087 | Parser.char '?' 1088 | Parser.skipSpaces 1089 | return $ \r -> rOne `rPlus` r 1090 | 1091 | , do 1092 | Parser.char '*' 1093 | Parser.skipSpaces 1094 | return rStar 1095 | ] 1096 | 1097 | 1098 | 1099 | -- * Helpers 1100 | 1101 | -- | Existentially quantify a single boolean argument. 1102 | data Some1 (f :: Bool -> *) where 1103 | Some1 :: !(f b) -> Some1 f 1104 | 1105 | 1106 | -- | Useful for defining views that look like lists. 1107 | data ListView c e 1108 | = Nil 1109 | | e ::: c 1110 | 1111 | 1112 | 1113 | -- * Testing 1114 | 1115 | -- | For testing with QuickCheck. 
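-- A property over this instance might look like the following sketch
-- (it assumes an 'Arbitrary' instance for the chosen character class):
--
-- > quickCheck $ \(r :: RegExp Char) -> r `rPlus` r == r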
1116 | instance (GSet c, Arbitrary (CharacterClass c)) => Arbitrary (RegExp c) where 1117 | arbitrary = do 1118 | size <- getSize 1119 | if size <= 1 then 1120 | oneof [zero, one, literal] 1121 | else 1122 | oneof [plus, times, star] 1123 | 1124 | where 1125 | zero :: Gen (RegExp c) 1126 | zero = 1127 | return rZero 1128 | 1129 | one :: Gen (RegExp c) 1130 | one = 1131 | return rOne 1132 | 1133 | plus :: Gen (RegExp c) 1134 | plus = do 1135 | l <- subtree 1136 | r <- subtree 1137 | return (l `rPlus` r) 1138 | 1139 | times :: Gen (RegExp c) 1140 | times = do 1141 | l <- subtree 1142 | r <- subtree 1143 | return (l `rTimes` r) 1144 | 1145 | star :: Gen (RegExp c) 1146 | star = do 1147 | r <- subtree 1148 | return (rStar r) 1149 | 1150 | literal :: Gen (RegExp c) 1151 | literal = do 1152 | p <- resize 5 arbitrary 1153 | return (rLiteral p) 1154 | 1155 | -- | Decrement the size parameter before generating 1156 | -- a regular expression 1157 | subtree :: Gen (RegExp c) 1158 | subtree = do 1159 | size <- getSize 1160 | newSize <- choose (0, size - 1) 1161 | resize newSize arbitrary 1162 | 1163 | shrink r = 1164 | case view r of 1165 | One -> 1166 | [] 1167 | Plus r1 r2 -> 1168 | concat [ 1169 | [r1, r2], 1170 | [s1 `rPlus` r2 | s1 <- shrink r1], 1171 | [r1 `rPlus` s2 | s2 <- shrink r2] 1172 | ] 1173 | Times r1 r2 -> 1174 | concat [ 1175 | [r1, r2], 1176 | [s1 `rTimes` r2 | s1 <- shrink r1], 1177 | [r1 `rTimes` s2 | s2 <- shrink r2] 1178 | ] 1179 | Star r -> 1180 | r : [rStar s | s <- shrink r] 1181 | Literal p -> 1182 | [rLiteral s | s <- shrink p] 1183 | 1184 | 1185 | -- | For testing with SmallCheck. 1186 | instance (GSet c, Monad m, Serial m (CharacterClass c)) => Serial m (RegExp c) where 1187 | series = 1188 | cons0 rZero \/ 1189 | cons0 rOne \/ 1190 | cons2 rPlus \/ 1191 | cons2 rTimes \/ 1192 | cons1 rStar \/ 1193 | cons1 rLiteral --------------------------------------------------------------------------------