├── test ├── Spec.hs ├── RegExp │ ├── Internal │ │ └── DFASpec.hs │ ├── OperationsSpec.hs │ └── RegExpSpec.hs ├── Helpers.hs └── Data │ └── GSetSpec.hs ├── Setup.hs ├── paper ├── mathpartir.sty ├── Makefile ├── bibliography-extra.bib ├── bibliography.bibdb ├── macro │ ├── generic.sty │ └── code.sty ├── .gitignore └── report.tex ├── .gitignore ├── stack.yaml ├── submission.sh ├── .travis.yml ├── src ├── Data │ ├── KleeneAlgebra.hs │ ├── BooleanAlgebra.hs │ ├── Semiring.hs │ └── GSet.hs ├── RegExp │ ├── Language.hs │ ├── Operations.hs │ ├── Equation.hs │ ├── Derivative.hs │ ├── Internal │ │ └── DFA.hs │ └── RegExp.hs ├── SparseVector.hs └── SparseMatrix.hs ├── LICENSE ├── regexp.cabal └── README.md /test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /paper/mathpartir.sty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cacay/regexp/HEAD/paper/mathpartir.sty -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Haskell 2 | cabal.config 3 | .stack-work 4 | 5 | # JetBrains 6 | .idea 7 | 8 | # Distribution 9 | *.tar 10 | *.tar.gz 11 | *.pdf 12 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | 3 | packages: 4 | - '.' 5 | 6 | resolver: lts-12.7 7 | 8 | extra-deps: 9 | - semiring-num-1.6.0.1 10 | 11 | nix: 12 | enable: false 13 | packages: 14 | -------------------------------------------------------------------------------- /paper/Makefile: -------------------------------------------------------------------------------- 1 | LATEXMK = latexmk -pdf 2 | 3 | all: report.pdf 4 | 5 | report.pdf: report.tex bibliography.bib bibliography-extra.bib *.sty macro/*.sty 6 | $(LATEXMK) report 7 | 8 | bibliography.bib: bibliography.bibdb 9 | bibdb bibliography.bibdb 10 | 11 | clean: 12 | $(LATEXMK) -C report 13 | rm bibliography.bib 14 | 15 | .PHONY: all clean 16 | -------------------------------------------------------------------------------- /paper/bibliography-extra.bib: -------------------------------------------------------------------------------- 1 | % Entries that [bibdb](https://github.com/cacay/bibdb) cannot track. 
2 | @techreport{CheneyH03, 3 | title={First-class phantom types}, 4 | author={Cheney, James and Hinze, Ralf}, 5 | year={2003}, 6 | institution={Cornell University} 7 | } 8 | 9 | @techreport{HopcroftK71, 10 | title={A linear algorithm for testing equivalence of finite automata}, 11 | author={Hopcroft, John E and Karp, Richard M}, 12 | year={1971}, 13 | institution={Cornell University} 14 | } 15 | -------------------------------------------------------------------------------- /submission.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Create a new temporary branch 6 | git checkout -b dist 7 | 8 | # Generate report.pdf 9 | (cd paper && make && rm -f ../report.pdf && cp report.pdf ../) 10 | 11 | # Commit report.pdf 12 | git add -f report.pdf 13 | git commit -m "Add paper" 14 | 15 | # Create an archive containing the source code and the report 16 | git archive --prefix=regexp/ -o dist.tar.gz HEAD 17 | 18 | # Delete the temporary branch 19 | git checkout master && git branch -D dist 20 | -------------------------------------------------------------------------------- /paper/bibliography.bibdb: -------------------------------------------------------------------------------- 1 | -- Regular Expressions 2 | DBLP:journals/jacm/Brzozowski64 as Brzozowski64 3 | DBLP:journals/ijfcs/CampeanuSY03 as CampeanuSY03 4 | DBLP:conf/lata/GruberG10 as GruberG10 5 | DBLP:conf/cpp/DoczkalKS13 as DoczkalKS13 6 | DBLP:journals/iandc/Kozen94 as Kozen94 7 | DBLP:conf/fsttcs/KeilT14 as KeilT14 8 | HAL:hal-00639716v2 as BonchiP11 9 | 10 | -- Haskell Libraries 11 | DBLP:conf/haskell/EisenbergW12 as Singletons 12 | DBLP:conf/haskell/RuncimanNL08 as SmallCheck 13 | DBLP:conf/icfp/ClaessenH00 as QuickCheck 14 | 15 | -- Complexity 16 | DBLP:conf/focs/Kozen77 as Kozen77 17 | DBLP:journals/tocl/GeladeN12 as GeladeN12 18 | DBLP:conf/popl/FosterKM0T15 as FosterKM0T15 19 | -------------------------------------------------------------------------------- /test/RegExp/Internal/DFASpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE OverloadedLists #-} 3 | 4 | module RegExp.Internal.DFASpec where 5 | 6 | import Test.Hspec 7 | import Test.QuickCheck 8 | import Helpers 9 | 10 | import Data.Either(isRight) 11 | 12 | import RegExp.RegExp 13 | import RegExp.Internal.DFA 14 | import RegExp.Derivative(equivalent) 15 | 16 | import Data.GSet() 17 | 18 | 19 | spec :: Spec 20 | spec = do 21 | describe "regexp" $ do 22 | it "is the inverse of fromRegExp" $ do 23 | mapSize (`div` 4) $ 24 | property $ \(r :: RegExp Helpers.Small) -> 25 | regexp (fromRegExp r) `shouldSatisfy` (isRight . equivalent r) 26 | -------------------------------------------------------------------------------- /test/Helpers.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE UndecidableInstances #-} 7 | 8 | 9 | -- | Helper functions for testing. 10 | module Helpers where 11 | 12 | import GHC.Generics 13 | 14 | import Test.QuickCheck.Arbitrary 15 | import Test.QuickCheck.Gen 16 | import Test.SmallCheck.Series 17 | 18 | 19 | -- | A finite data type with a few constructors. Useful with SmallCheck. 
20 | data Small 21 | = A 22 | | B 23 | | C 24 | deriving (Eq, Ord, Bounded, Enum, Show, Read, Generic) 25 | 26 | 27 | instance Monad m => Serial m Small 28 | 29 | instance Arbitrary Small where 30 | arbitrary = 31 | elements [minBound.. ] 32 | 33 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use new container infrastructure to enable caching 2 | sudo: false 3 | 4 | # Do not choose a language; we provide our own build tools. 5 | language: generic 6 | 7 | # Caching so the next build will be fast too. 8 | cache: 9 | directories: 10 | - $HOME/.stack 11 | 12 | # Ensure necessary system libraries are present 13 | addons: 14 | apt: 15 | packages: 16 | - libgmp-dev 17 | 18 | before_install: 19 | # Download and unpack the stack executable 20 | - mkdir -p ~/.local/bin 21 | - export PATH=$HOME/.local/bin:$PATH 22 | - travis_retry curl -L https://get.haskellstack.org/stable/linux-x86_64.tar.gz | tar xz --wildcards --strip-components=1 -C ~/.local/bin '*/stack' 23 | 24 | install: 25 | # Build dependencies 26 | - stack --no-terminal --install-ghc test --only-dependencies 27 | 28 | script: 29 | # Build the package, its tests, and its docs and run the tests 30 | - stack --no-terminal test --haddock --no-haddock-deps 31 | -------------------------------------------------------------------------------- /src/Data/KleeneAlgebra.hs: -------------------------------------------------------------------------------- 1 | -- | Definition of Kleene algebras. 2 | module Data.KleeneAlgebra 3 | ( KleeneAlgebra(..) 4 | ) where 5 | 6 | import Data.Semiring (Semiring (..)) 7 | 8 | 9 | -- | A Kleene algebra is an /idempotent/ semiring with an additional operation 10 | -- called the Kleene star. In addition to the semiring axioms, a Kleene algebra 11 | -- needs to satisfy the following properties: 12 | -- 13 | -- == Idempotence of '<+>' 14 | -- @a '<+>' a = a@ 15 | -- 16 | -- == Properties of 'star' 17 | -- @'one' '<+>' (a '<.>' 'star' a) <= 'star' a@ 18 | -- 19 | -- @'one' '<+>' ('star' a '<.>' a) <= 'star' a@ 20 | -- 21 | -- @b '<+>' (a '<.>' x) <= x ==> ('star' a) '<.>' b <= x@ 22 | -- 23 | -- @b '<+>' (x '<.>' a) <= x ==> b '<.>' ('star' a) <= x@ 24 | -- 25 | -- Here, @a <= b@ is defined as @a '<+>' b = b@. 26 | class Semiring a => KleeneAlgebra a where 27 | -- | Kleene star. Captures the notion of /iteration/. 28 | star :: a -> a 29 | 30 | 31 | -- | Booleans form a (trivial) Kleene algebra. 32 | instance KleeneAlgebra Bool where 33 | star _ = 34 | True 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Cosku Acay 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /test/RegExp/OperationsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | module RegExp.OperationsSpec where 4 | 5 | import Test.Hspec 6 | import Test.QuickCheck 7 | import Helpers 8 | 9 | import RegExp.RegExp 10 | import RegExp.Operations 11 | import qualified RegExp.Derivative as RegExp 12 | 13 | import Data.GSet(GSet) 14 | 15 | 16 | spec :: Spec 17 | spec = do 18 | describe "intersection with equation solving" $ do 19 | it "is equivalent to intersection with DFAs" $ do 20 | mapSize (`div` 4) $ 21 | property $ \(r1 :: RegExp Helpers.Small) r2 -> 22 | (r1 `intersectionEquation` r2) `shouldMatch` (r1 `intersection` r2) 23 | 24 | describe "complement with equation solving" $ do 25 | it "is equivalent to complement with DFAs" $ do 26 | mapSize (`div` 4) $ 27 | property $ \(r :: RegExp Helpers.Small) -> 28 | complement r `shouldMatch` complementEquation r 29 | 30 | 31 | -- | Set the expectation that the two given regular expressions 32 | -- are equivalent. 33 | shouldMatch :: (GSet c, Show c, Eq c) => RegExp c -> RegExp c -> Expectation 34 | shouldMatch r1 r2 = 35 | RegExp.equivalent r1 r2 `shouldBe` Right () 36 | -------------------------------------------------------------------------------- /test/Data/GSetSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | module Data.GSetSpec where 4 | 5 | import Test.Hspec 6 | import Test.Hspec.SmallCheck 7 | import Test.SmallCheck 8 | import Helpers 9 | 10 | import GHC.Exts (IsList(..)) 11 | 12 | import Data.GSet 13 | import Data.BooleanAlgebra 14 | 15 | 16 | spec :: Spec 17 | spec = do 18 | describe "Eq.(==)" $ do 19 | it "is reflexive" $ do 20 | property $ \(s :: Set Small) -> s `shouldBe` s 21 | 22 | it "agrees with Ord.compare" $ do 23 | property $ \(s :: Set Small) t -> compare s t == EQ ==> s `shouldBe` t 24 | 25 | it "handles complements" $ do 26 | property $ \(s :: Set Small) -> 27 | s `shouldBe` complement (fromList [a | a <- [A .. 
], not (a `member` s)]) 28 | 29 | 30 | 31 | describe "Ord.compare" $ do 32 | it "is reflexive" $ do 33 | property $ \(s :: Set Small) -> compare s s `shouldBe` EQ 34 | 35 | it "agrees with Eq.(==)" $ do 36 | property $ \(s :: Set Small) t -> s == t ==> compare s t `shouldBe` EQ 37 | 38 | it "is flippable" $ do 39 | property $ \(s :: Set Small) t -> compare s t == LT ==> compare t s`shouldBe` GT 40 | it "is flippable" $ do 41 | property $ \(s :: Set Small) t -> compare s t == GT ==> compare t s`shouldBe` LT 42 | 43 | it "is transitive" $ do 44 | property $ \(s :: Set Small) t u -> 45 | s <= t ==> t <= u ==> s <= u `shouldBe` True 46 | 47 | 48 | -------------------------------------------------------------------------------- /paper/macro/generic.sty: -------------------------------------------------------------------------------- 1 | \NeedsTeXFormat{LaTeX2e} 2 | \ProvidesPackage{macro/generic} 3 | [2016/01/29 version 1.0.0 Generic Macros] 4 | 5 | \RequirePackage{mathtools} 6 | 7 | %%% Paired delimiters 8 | \DeclarePairedDelimiter\parens{(}{)} % parenthesis 9 | \DeclarePairedDelimiter\bracks{\lbrack}{\rbrack} % brackets 10 | \DeclarePairedDelimiter\braces{\lbrace}{\rbrace} % braces 11 | \DeclarePairedDelimiter\abs{\lvert}{\rvert} % absolute value 12 | \DeclarePairedDelimiter\norm{\lVert}{\rVert} % double verts 13 | \DeclarePairedDelimiter\angled{\langle}{\rangle} % angle brackets 14 | \DeclarePairedDelimiter\set{\lbrace}{\rbrace} % braces 15 | \DeclarePairedDelimiter\denot{\llbracket}{\rrbracket} 16 | \DeclarePairedDelimiterX\setdef[2]{\{}{\}} 17 | {#1 \mathrel{}\mathclose{}\delimsize|\mathopen{}\mathrel{} #2} 18 | 19 | 20 | %%% Quotation marks for code 21 | \newcommand{\sq}{\text{\ttfamily{\char'15}}} % Single quote 22 | \newcommand{\qq}{\text{\ttfamily"}} % Double quote 23 | \newcommand{\qquote}[1]{\qq #1\qq{}} % Strings 24 | 25 | 26 | %%% Logic 27 | \newcommand{\DD}{\mathcal{D}} 28 | \newcommand{\EE}{\mathcal{E}} 29 | \newcommand{\FF}{\mathcal{F}} 30 | 31 | \newcommand\defined{\mathrel{\triangleq}} 32 | 33 | 34 | %%% Math 35 | \DeclareMathOperator{\dom}{dom} 36 | \DeclareMathOperator{\cod}{cod} 37 | \DeclareMathOperator{\im}{im} 38 | 39 | 40 | %%% Induction 41 | \newcommand\centerLine[1]{ 42 | \begingroup\setbox0=\hbox{#1}% 43 | \parbox{\wd0}{\box0}\endgroup} 44 | \newcommand\pred[1]{\mathcal{P}\parens*{#1}} 45 | \newcommand\predC[1]{\pred{\centerLine{#1}}} 46 | \newcommand\predQ[1]{\mathcal{Q}\parens*{#1}} 47 | \newcommand\predQC[1]{\predQ{\centerLine{#1}}} 48 | 49 | -------------------------------------------------------------------------------- /src/Data/BooleanAlgebra.hs: -------------------------------------------------------------------------------- 1 | -- | Definition of Boolean algebras. We base the definition on semirings 2 | -- so it works nicely with "Data.KleeneAlgebra". 3 | module Data.BooleanAlgebra 4 | ( BooleanAlgebra(..) 5 | -- * Operations 6 | , implies 7 | , butNot 8 | , ands 9 | , ors 10 | ) where 11 | 12 | import Data.Semiring (Semiring (..)) 13 | 14 | 15 | -- | A Boolean algebra is a distributive commutative idempotent semiring with 16 | -- complement satisfying some extra equations. 
More concretely, on top of the 17 | -- semiring axioms, the following axioms need to hold: 18 | -- 19 | -- == Distributivity of '<+>' over '<.>' 20 | -- @a '<+>' (b '<.>' c) = (a '<+>' b) '<.>' (a '<+>' c)@ 21 | -- 22 | -- == Commutativity of '<.>' 23 | -- @a '<.>' b = b '<.>' a@ 24 | -- 25 | -- == Idempotence of '<+>' and '<.>' 26 | -- @a '<+>' a = a@ 27 | -- 28 | -- @a '<.>' a = a@ 29 | -- 30 | -- == Upper bounded by 'one' 31 | -- @a '<+>' 'one' = 'one'@ 32 | -- 33 | -- == Properties of 'complement' 34 | -- @a '<+>' 'complement' a = 'one'@ 35 | -- 36 | -- @a '<.>' 'complement' a = 'zero'@ 37 | class Semiring a => BooleanAlgebra a where 38 | -- | Complement or negation. 39 | complement :: a -> a 40 | 41 | 42 | -- | Booleans form a boolean algebra. 43 | instance BooleanAlgebra Bool where 44 | complement = not 45 | 46 | 47 | -- * Functions over Boolean algebras 48 | 49 | -- | @p `implies` q@ holds if @p@ holds implies @q@ holds. 50 | implies :: BooleanAlgebra a => a -> a -> a 51 | implies p q = 52 | complement p <+> q 53 | 54 | 55 | -- | @p `butNot` q@ holds when @p@ holds but @q@ doesn't. 56 | butNot :: BooleanAlgebra a => a -> a -> a 57 | butNot p q = 58 | p <.> complement q 59 | 60 | 61 | -- | Logical conjunction of all elements in a container. 62 | ands :: (BooleanAlgebra a, Foldable t) => t a -> a 63 | ands t = 64 | foldr (<.>) one t 65 | 66 | 67 | -- | Logical disjunction of all elements in a container. 68 | ors :: (BooleanAlgebra a, Foldable t) => t a -> a 69 | ors = 70 | foldr (<+>) zero 71 | -------------------------------------------------------------------------------- /src/RegExp/Language.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MonoLocalBinds #-} 2 | 3 | -- | The language of a regular expression is the set of all words 4 | -- matched by that expression. Here, we show that languages of regular 5 | -- expressions (also called regular languages) form a Kleene algebra. 6 | module RegExp.Language 7 | ( Language 8 | , language 9 | , regexp 10 | ) where 11 | 12 | import Data.Either (isRight) 13 | 14 | import RegExp.RegExp 15 | import RegExp.Derivative 16 | 17 | import Data.Semiring (Semiring(..), DetectableZero(..)) 18 | import Data.KleeneAlgebra (KleeneAlgebra(..)) 19 | import Data.GSet (GSet(..)) 20 | 21 | 22 | -- | Regular languages over the alphabet @c@, i.e. set of strings that 23 | -- are matched by some regular expression. 24 | newtype Language c = 25 | Language (RegExp c) 26 | 27 | 28 | -- | Compute the set of all strings given regular expression matches. 29 | language :: RegExp c -> Language c 30 | language = 31 | Language 32 | 33 | 34 | -- | Given a regular language, construct a regular expression that 35 | -- matches precisely that language. 36 | regexp :: Language c -> RegExp c 37 | regexp (Language r) = 38 | r 39 | 40 | 41 | -- | Equivalence of regular languages is decidable. 42 | instance GSet c => Eq (Language c) where 43 | l1 == l2 = 44 | isRight $ equivalent (regexp l1) (regexp l2) 45 | 46 | 47 | -- | Regular languages form a semiring. 48 | instance GSet c => Semiring (Language c) where 49 | zero = 50 | Language rZero 51 | 52 | one = 53 | Language rOne 54 | 55 | l1 <+> l2 = 56 | Language (regexp l1 `rPlus` regexp l2) 57 | 58 | l1 <.> l2 = 59 | Language (regexp l1 `rTimes` regexp l2) 60 | 61 | 62 | -- | We can tell when a regular language is empty. 63 | instance GSet c => DetectableZero (Language c) where 64 | -- | TODO: we can do this a lot more efficiently. 
65 | isZero l = 66 | empty (regexp l) 67 | 68 | 69 | -- | Regular languages form a Kleene algebra. 70 | instance GSet c => KleeneAlgebra (Language c) where 71 | star l = 72 | Language (rStar $ regexp l) 73 | 74 | 75 | -- TODO: regular languages form a 'GSet', but concatenation is not intersection. 76 | -- How do we reconcile that? -------------------------------------------------------------------------------- /paper/.gitignore: -------------------------------------------------------------------------------- 1 | bibliography.bib 2 | 3 | ## Core latex/pdflatex auxiliary files: 4 | *.aux 5 | *.lof 6 | *.log 7 | *.lot 8 | *.fls 9 | *.out 10 | *.toc 11 | *.vtc 12 | 13 | ## Intermediate documents: 14 | *.dvi 15 | *-converted-to.* 16 | # these rules might exclude image files for figures etc. 17 | *.ps 18 | # *.eps 19 | *.pdf 20 | 21 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 22 | *.bbl 23 | *.bcf 24 | *.blg 25 | *-blx.aux 26 | *-blx.bib 27 | *.brf 28 | *.run.xml 29 | 30 | ## Build tool auxiliary files: 31 | *.fdb_latexmk 32 | *.synctex 33 | *.synctex.gz 34 | *.synctex.gz(busy) 35 | *.pdfsync 36 | 37 | ## Auxiliary and intermediate files from other packages: 38 | 39 | 40 | # algorithms 41 | *.alg 42 | *.loa 43 | 44 | # achemso 45 | acs-*.bib 46 | 47 | # amsthm 48 | *.thm 49 | 50 | # beamer 51 | *.nav 52 | *.snm 53 | *.vrb 54 | 55 | #(e)ledmac/(e)ledpar 56 | *.end 57 | *.[1-9] 58 | *.[1-9][0-9] 59 | *.[1-9][0-9][0-9] 60 | *.[1-9]R 61 | *.[1-9][0-9]R 62 | *.[1-9][0-9][0-9]R 63 | *.eledsec[1-9] 64 | *.eledsec[1-9]R 65 | *.eledsec[1-9][0-9] 66 | *.eledsec[1-9][0-9]R 67 | *.eledsec[1-9][0-9][0-9] 68 | *.eledsec[1-9][0-9][0-9]R 69 | 70 | # glossaries 71 | *.acn 72 | *.acr 73 | *.glg 74 | *.glo 75 | *.gls 76 | 77 | # gnuplottex 78 | *-gnuplottex-* 79 | 80 | # hyperref 81 | *.brf 82 | 83 | # knitr 84 | *-concordance.tex 85 | *.tikz 86 | *-tikzDictionary 87 | 88 | # listings 89 | *.lol 90 | 91 | # makeidx 92 | *.idx 93 | *.ilg 94 | *.ind 95 | *.ist 96 | 97 | # minitoc 98 | *.maf 99 | *.mtc 100 | *.mtc[0-9] 101 | *.mtc[1-9][0-9] 102 | 103 | # minted 104 | _minted* 105 | *.pyg 106 | 107 | # morewrites 108 | *.mw 109 | 110 | # mylatexformat 111 | *.fmt 112 | 113 | # nomencl 114 | *.nlo 115 | 116 | # sagetex 117 | *.sagetex.sage 118 | *.sagetex.py 119 | *.sagetex.scmd 120 | 121 | # sympy 122 | *.sout 123 | *.sympy 124 | sympy-plots-for-*.tex/ 125 | 126 | #pythontex 127 | *.pytxcode 128 | pythontex-files-*/ 129 | 130 | # Texpad 131 | .texpadtmp 132 | 133 | # TikZ & PGF 134 | *.dpth 135 | *.md5 136 | *.auxlock 137 | 138 | # todonotes 139 | *.tdo 140 | 141 | # xindy 142 | *.xdy 143 | 144 | # WinEdt 145 | *.bak 146 | *.sav 147 | 148 | # endfloat 149 | *.ttt 150 | *.fff 151 | 152 | # Packages 153 | *.zip 154 | *.tar 155 | -------------------------------------------------------------------------------- /paper/macro/code.sty: -------------------------------------------------------------------------------- 1 | \NeedsTeXFormat{LaTeX2e} 2 | \ProvidesPackage{macro/code} 3 | [2016/01/29 version 1.0.0 Code] 4 | 5 | \RequirePackage{listings} 6 | \RequirePackage{color} 7 | 8 | % Types 9 | \newcommand\nat{\text{\lstinline{Nat}}} 10 | \newcommand\pos{\text{\lstinline{Pos}}} 11 | \newcommand\even{\text{\lstinline{Even}}} 12 | \newcommand\odd{\text{\lstinline{Odd}}} 13 | 14 | % Listings settings 15 | \lstdefinestyle{custom}{% 16 | commentstyle=\it, 17 | keywordstyle=\bf, 18 | % 19 | breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace 20 | breaklines=true, % sets automatic line 
breaking 21 | captionpos=b, % sets the caption-position to bottom 22 | frame=none, % adds a frame around the code 23 | keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible) 24 | numbers=none, % where to put the line-numbers; possible values are (none, left, right) 25 | numbersep=5pt, % how far the line-numbers are from the code 26 | numberstyle=\tiny\color{gray}, % the style that is used for the line-numbers 27 | stepnumber=2, % the step between two line-numbers. If it's 1, each line will be numbered 28 | rulecolor=\color{black}, % if not set, the frame-color may be changed on line-breaks within not-black text (e.g. comments (green here)) 29 | showspaces=false, % show spaces everywhere adding particular underscores; it overrides 'showstringspaces' 30 | showstringspaces=false, % underline spaces within strings only 31 | showtabs=false, % show tabs within strings adding particular underscores 32 | tabsize=2, % sets default tabsize to 2 spaces 33 | } 34 | 35 | \lstdefinelanguage{krill}{% 36 | % Language 37 | morekeywords={type, data}, 38 | morekeywords={close, wait, send, recv, case, of, 1}, 39 | sensitive=true, 40 | morecomment=[l]{--}, 41 | morecomment=[n]{\{-}{-\}}, 42 | % Comments 43 | mathescape=true, % Latex math mode using $...$ 44 | escapechar=@, % Latex inside code with @...@ 45 | escapeinside={\{*}{*\}}, % Latex comment with {* ... *} 46 | } 47 | 48 | \lstset{language=haskell, style=custom} 49 | \lstset{mathescape, keepspaces} 50 | 51 | -------------------------------------------------------------------------------- /regexp.cabal: -------------------------------------------------------------------------------- 1 | name: regexp 2 | version: 0.1.0.0 3 | category: Text 4 | synopsis: Derivatives, intersections, complement, and equivalence of regular expressions. 
5 | homepage: https://github.com/cacay/regexp 6 | 7 | license: MIT 8 | license-file: LICENSE 9 | 10 | author: Cosku Acay 11 | maintainer: coskuacay@gmail.com 12 | 13 | build-type: Simple 14 | cabal-version: >=1.10 15 | 16 | extra-source-files: README.md 17 | 18 | 19 | library 20 | hs-source-dirs: src 21 | 22 | default-language: Haskell2010 23 | 24 | exposed-modules: RegExp.RegExp 25 | RegExp.Language 26 | RegExp.Derivative 27 | RegExp.Equation 28 | RegExp.Operations 29 | Data.BooleanAlgebra 30 | Data.KleeneAlgebra 31 | Data.GSet 32 | Data.Semiring 33 | 34 | RegExp.Internal.DFA 35 | 36 | other-modules: SparseVector 37 | SparseMatrix 38 | 39 | build-depends: base >= 4.11, 40 | flow, 41 | mtl, 42 | containers, 43 | equivalence, 44 | -- semiring-num, 45 | singletons, 46 | finite-typelits, 47 | QuickCheck, 48 | smallcheck 49 | 50 | default-extensions: ScopedTypeVariables 51 | OverloadedStrings 52 | StandaloneDeriving 53 | DeriveFunctor 54 | 55 | other-extensions: GADTs 56 | DataKinds 57 | KindSignatures 58 | TypeFamilies 59 | FlexibleInstances 60 | FlexibleContexts 61 | UndecidableInstances 62 | OverloadedLists 63 | 64 | ghc-options: -fwarn-incomplete-patterns 65 | -fwarn-unused-imports 66 | 67 | 68 | test-suite spec 69 | type: exitcode-stdio-1.0 70 | 71 | hs-source-dirs: test 72 | 73 | default-language: Haskell2010 74 | 75 | main-is: Spec.hs 76 | 77 | other-modules: Data.GSetSpec 78 | RegExp.RegExpSpec 79 | RegExp.OperationsSpec 80 | RegExp.Internal.DFASpec 81 | Helpers 82 | 83 | build-depends: base >= 4.11, 84 | regexp, 85 | hspec, 86 | hspec-smallcheck, 87 | QuickCheck, 88 | smallcheck 89 | 90 | default-extensions: ScopedTypeVariables 91 | OverloadedStrings 92 | StandaloneDeriving 93 | DeriveFunctor 94 | 95 | ghc-options: -Wall -------------------------------------------------------------------------------- /src/RegExp/Operations.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | -- | Operations over regular expressions. 4 | module RegExp.Operations 5 | ( intersection 6 | , complement 7 | , difference 8 | , intersectionEquation 9 | , complementEquation 10 | ) where 11 | 12 | 13 | import qualified Data.Map as Map 14 | import qualified Data.Set as Set 15 | 16 | import qualified RegExp.Internal.DFA as DFA 17 | import RegExp.RegExp 18 | import RegExp.Derivative 19 | import RegExp.Equation 20 | 21 | import qualified Data.BooleanAlgebra as BooleanAlgebra 22 | import Data.Semiring(Semiring(..)) 23 | import Data.GSet hiding (Set) 24 | 25 | 26 | -- | Regular expression that accepts words both expressions accept. 27 | intersection :: forall c. GSet c => RegExp c -> RegExp c -> RegExp c 28 | intersection r1 r2 = 29 | DFA.regexp $ 30 | DFA.fromRegExp r1 <.> DFA.fromRegExp r2 31 | 32 | 33 | -- | Regular expression that accepts words given expression doesn't. 34 | complement :: GSet c => RegExp c -> RegExp c 35 | complement r = 36 | DFA.regexp $ 37 | BooleanAlgebra.complement $ DFA.fromRegExp r 38 | 39 | 40 | -- | Regular expression that accepts words the first expression does but 41 | -- the second doesn't. 42 | difference :: GSet c => RegExp c -> RegExp c -> RegExp c 43 | difference r1 r2 = 44 | DFA.regexp $ 45 | DFA.fromRegExp r1 <.> BooleanAlgebra.complement (DFA.fromRegExp r2) 46 | 47 | 48 | 49 | -- | Intersection of two regular expressions computed directly by solving 50 | -- linear equations instead of going through DFAs. 51 | intersectionEquation :: forall c. 
GSet c => RegExp c -> RegExp c -> RegExp c 52 | intersectionEquation r1 r2 = 53 | solve step (r1, r2) 54 | where 55 | step :: (RegExp c, RegExp c) -> RightHand c (RegExp c, RegExp c) 56 | step (r1, r2) = 57 | (hasEmpty, Map.fromListWith rPlus subTerms) 58 | where 59 | hasEmpty = 60 | if nullable r1 && nullable r2 then 61 | rOne 62 | else 63 | rZero 64 | 65 | subTerms = 66 | [ ((derivative c r1, derivative c r2), rLiteral p) 67 | | p <- Set.toList $ join (next r1) (next r2) 68 | , Just c <- [choose p] 69 | ] 70 | 71 | 72 | -- | Complement of a regular expression computed directly by solving 73 | -- linear equations instead of going through DFAs. 74 | complementEquation :: forall c. GSet c => RegExp c -> RegExp c 75 | complementEquation r = 76 | solve step r 77 | where 78 | step :: RegExp c -> RightHand c (RegExp c) 79 | step r = 80 | (hasEmpty `rPlus` (notNext `rTimes` any), Map.fromListWith rPlus subTerms) 81 | where 82 | hasEmpty = 83 | if nullable r then rZero else rOne 84 | 85 | notNext = 86 | rLiteral $ BooleanAlgebra.complement $ BooleanAlgebra.ors $ next r 87 | 88 | any :: RegExp c 89 | any = 90 | rStar $ rLiteral $ one 91 | 92 | subTerms = 93 | [ (derivative c r, rLiteral p) 94 | | p <- Set.toList $ next r 95 | , Just c <- [choose p] 96 | ] 97 | -------------------------------------------------------------------------------- /src/Data/Semiring.hs: -------------------------------------------------------------------------------- 1 | -- | Copied from @semiring-num-1.6.0.1@ because that doesn't work 2 | -- with Stack nightly build. TODO: delete when that works. 3 | module Data.Semiring 4 | ( Semiring(..) 5 | , DetectableZero(..) 6 | ) where 7 | 8 | -- $setup 9 | -- >>> import Data.Function 10 | 11 | -- | A is like the 12 | -- the combination of two 'Data.Monoid.Monoid's. The first 13 | -- is called '<+>'; it has the identity element 'zero', and it is 14 | -- commutative. The second is called '<.>'; it has identity element 'one', 15 | -- and it must distribute over '<+>'. 16 | -- 17 | -- = Laws 18 | -- == Normal 'Monoid' laws 19 | -- 20 | -- @(a '<+>' b) '<+>' c = a '<+>' (b '<+>' c) 21 | --'zero' '<+>' a = a '<+>' 'zero' = a 22 | --(a '<.>' b) '<.>' c = a '<.>' (b '<.>' c) 23 | --'one' '<.>' a = a '<.>' 'one' = a@ 24 | -- 25 | -- == Commutativity of '<+>' 26 | -- @a '<+>' b = b '<+>' a@ 27 | -- 28 | -- == Distribution of '<.>' over '<+>' 29 | -- @a '<.>' (b '<+>' c) = (a '<.>' b) '<+>' (a '<.>' c) 30 | --(a '<+>' b) '<.>' c = (a '<.>' c) '<+>' (b '<.>' c)@ 31 | -- 32 | -- == Annihilation 33 | -- @'zero' '<.>' a = a '<.>' 'zero' = 'zero'@ 34 | -- 35 | -- An ordered semiring follows the laws: 36 | -- 37 | -- @x '<=' y => x '<+>' z '<=' y '<+>' z 38 | --x '<=' y => x '<+>' z '<=' y '<+>' z 39 | --'zero' '<=' z '&&' x '<=' y => x '<.>' z '<=' y '<.>' z '&&' z '<.>' x '<=' z '<.>' y@ 40 | class Semiring a where 41 | {-# MINIMAL zero , one , (<.>) , (<+>) #-} 42 | -- | The identity of '<+>'. 43 | zero 44 | :: a 45 | -- | The identity of '<.>'. 46 | one 47 | :: a 48 | -- | An associative binary operation, which distributes over '<+>'. 49 | infixl 7 <.> 50 | (<.>) :: a -> a -> a 51 | -- | An associative, commutative binary operation. 52 | infixl 6 <+> 53 | (<+>) :: a -> a -> a 54 | 55 | 56 | -- | Useful for operations where zeroes may need to be discarded: for instance 57 | -- in sparse matrix calculations. 58 | class Semiring a => 59 | DetectableZero a where 60 | -- | 'True' if x is 'zero'. 
61 | isZero 62 | :: a -> Bool 63 | 64 | 65 | instance Semiring Bool where 66 | one = True 67 | zero = False 68 | (<+>) = (||) 69 | (<.>) = (&&) 70 | {-# INLINE zero #-} 71 | {-# INLINE one #-} 72 | {-# INLINE (<+>) #-} 73 | {-# INLINE (<.>) #-} 74 | 75 | 76 | instance DetectableZero Bool where 77 | isZero = not 78 | {-# INLINE isZero #-} 79 | 80 | 81 | instance Semiring () where 82 | one = () 83 | zero = () 84 | _ <+> _ = () 85 | _ <.> _ = () 86 | {-# INLINE zero #-} 87 | {-# INLINE one #-} 88 | {-# INLINE (<+>) #-} 89 | {-# INLINE (<.>) #-} 90 | 91 | instance DetectableZero () where 92 | isZero _ = True 93 | {-# INLINE isZero #-} 94 | 95 | 96 | instance (Semiring a, Semiring b) => Semiring (a, b) where 97 | one = 98 | (one, one) 99 | 100 | zero = 101 | (zero, zero) 102 | 103 | (a1, b1) <+> (a2, b2) = 104 | (a1 <+> a2, b1 <+> b2) 105 | 106 | (a1, b1) <.> (a2, b2) = 107 | (a1 <.> a2, b1 <.> b2) 108 | 109 | instance (DetectableZero a, DetectableZero b) => DetectableZero (a, b) where 110 | isZero (a, b) = 111 | isZero a && isZero b -------------------------------------------------------------------------------- /test/RegExp/RegExpSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE UndecidableInstances #-} 6 | 7 | module RegExp.RegExpSpec where 8 | 9 | import Test.Hspec 10 | import Test.QuickCheck 11 | 12 | import RegExp.RegExp 13 | import RegExp.Derivative (matches) 14 | 15 | import Data.GSet 16 | 17 | 18 | spec :: Spec 19 | spec = do 20 | describe "zero" $ do 21 | it "does not match the empty string" $ do 22 | (rZero `matches` "") `shouldBe` False 23 | it "does not match non-empty strings" $ do 24 | (rZero `matches` "a") `shouldBe` False 25 | (rZero `matches` "abc") `shouldBe` False 26 | it "does not match any string" $ do 27 | property $ \(w :: String) -> (rZero `matches` w) `shouldBe` False 28 | 29 | describe "one" $ do 30 | it "matches the empty string" $ do 31 | (rOne `matches` "") `shouldBe` True 32 | it "does not match non-empty strings" $ do 33 | property $ \c (w :: String) -> (rOne `matches` (c : w)) `shouldBe` False 34 | 35 | describe "plus" $ do 36 | it "matches when either subexpression matches" $ do 37 | let r1 = "abc" 38 | let r2 = "def" 39 | (rPlus r1 r2 `matches` "a") `shouldBe` True 40 | 41 | describe "times" $ do 42 | it "matches when both subexpressions match in order" $ do 43 | let r1 = "abc" 44 | let r2 = "def" 45 | (rTimes r1 r2 `matches` "af") `shouldBe` True 46 | 47 | describe "star" $ do 48 | it "matches the empty string" $ do 49 | (rStar "abc" `matches` "") `shouldBe` True 50 | it "matches once" $ do 51 | (rStar "abc" `matches` "c") `shouldBe` True 52 | it "matches multiple times" $ do 53 | (rStar "abc" `matches` "abcbabc") `shouldBe` True 54 | 55 | describe "literal" $ do 56 | it "matches one of the options" $ do 57 | (rLiteral "ab" `matches` "a") `shouldBe` True 58 | (rLiteral "ab" `matches` "b") `shouldBe` True 59 | it "does not match characters not in the class" $ do 60 | (rLiteral "ab" `matches` "c") `shouldBe` False 61 | 62 | 63 | describe "hide" $ do 64 | it "is the inverse of view" $ do 65 | property $ \(r :: RegExp Char) -> hide (view r) `shouldBe` r 66 | 67 | it "is the deep inverse of view" $ do 68 | property $ \(r :: RegExp Char) -> hideAll (viewAll r) `shouldBe` r 69 | 70 | 71 | describe "read" $ do 72 | it "is the inverse of show" $ do 73 | withMaxSuccess 5 $ -- Reading and/or printing is 
super slow! 74 | property $ \(r :: RegExp Char) -> read (show r) `shouldBe` r 75 | 76 | 77 | 78 | -- | Fixed point of a functor. 79 | data Fix f = Fix {unFix :: f (Fix f)} 80 | 81 | deriving instance Eq (f (Fix f)) => Eq (Fix f) 82 | deriving instance Ord (f (Fix f)) => Ord (Fix f) 83 | deriving instance Show (f (Fix f)) => Show (Fix f) 84 | 85 | 86 | -- | Like 'view', but fully unfolds all subtrees. 87 | viewAll :: GSet c => RegExp c -> Fix (RegExpView c) 88 | viewAll r = 89 | Fix $ fmap viewAll $ view r 90 | 91 | 92 | -- | Like 'hide', but works on a fully viewed regular expression. 93 | hideAll :: GSet c => Fix (RegExpView c) -> RegExp c 94 | hideAll r = 95 | hide (fmap hideAll $ unFix r) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regexp 2 | 3 | [![Build Status](https://travis-ci.com/cacay/regexp.svg?branch=master)](https://travis-ci.com/cacay/regexp) 4 | 5 | This is a regular expression library for Haskell that focuses on higher level 6 | operations like computing the intersection of regular expressions or deciding 7 | whether two regular expressions match the same set of strings. This is in stark 8 | contrast to pretty much every single regular expression library out there (including 9 | ones for other languages), which are only concerned with matching strings. Unfortunately, 10 | deprioritizing string matching means it isn't very efficient, so if that's all you need, 11 | you should use a different library. 12 | 13 | Here is a summary of supported features: 14 | * Intersection and complement 15 | * Derivatives à la [Brzozowski](https://en.wikipedia.org/wiki/Brzozowski_derivative) 16 | * Equivalence checking 17 | * Solving systems of linear equations with regular expression coefficients 18 | (which can be used to implement intersection, complement, and more) 19 | * Arbitrary alphabets, even infinite ones! 20 | 21 | 22 | ## Usage and Development 23 | 24 | We use [Stack](https://docs.haskellstack.org) so it's pretty much 25 | trivial to get started. If you don't have Stack already, 26 | [install](https://docs.haskellstack.org/en/stable/README/#how-to-install) 27 | it and set it up by running 28 | ```shell 29 | stack setup 30 | ``` 31 | in your shell. You only need to do this once. Then, you can run 32 | ```shell 33 | stack repl 34 | ``` 35 | to be dropped in GHCi where you can play around with the library. This will 36 | install all dependencies, build the library, and do whatever is necessary so 37 | everything "Just Works™". 38 | ```shell 39 | stack haddock --open regexp 40 | ``` 41 | will open the documentation in your browser and 42 | ```shell 43 | stack test 44 | ``` 45 | will run the test suite. 46 | 47 | Stack is all about reproducible builds, so you should not run into any issues. 48 | 49 | 50 | ## Examples 51 | 52 | Load up the library in GHCi: 53 | ```shell 54 | stack repl 55 | ``` 56 | 57 | ### Creating Regular Expressions 58 | 59 | The simplest regular expressions are `rZero`, which matches no strings, and 60 | `rOne`, which matches only the empty string. You can combine regular expressions 61 | using `rPlus` and `rTimes` (choice and sequencing). Kleene star is implemented 62 | by `rStar`. For example: 63 | ```haskell 64 | rOne 65 | rStar rZero 66 | rOne `rTimes` (rZero `rPlus` rOne) 67 | ``` 68 | are all valid expressions, though they are boring since they are all equivalent to 69 | `rOne`. 
More interesting expressions can be constructed using _character 70 | classes_. Standard Haskell string notation is interpreted as a character class 71 | containing all characters in the string. For example, 72 | ```haskell 73 | "abc" :: RegExp Char 74 | ``` 75 | is the (regular expression formed by) the character class containing the characters 76 | `a`, `b`, and `c`. This expression will match single character strings `"a"`, `"b"`, 77 | and `"c"` and nothing else. Note that the type annotation is required for Haskell 78 | to interpret the string as a regular expression. 79 | 80 | 81 | ### String Matching 82 | 83 | We can check that this is indeed how character classes behave by trying to match them 84 | against strings: 85 | ```haskell 86 | matches ("abc" :: RegExp Char) "a" 87 | ==> True 88 | 89 | matches ("abc" :: RegExp Char) "ab" 90 | ==> False 91 | 92 | matches (rStar "abc" :: RegExp Char) "ab" 93 | ==> True 94 | ``` 95 | 96 | 97 | ### Checking Equivalence 98 | 99 | We can check if two regular expressions are equivalent and get a counter example 100 | in the case they are not: 101 | ```haskell 102 | equivalent ("abc" :: RegExp Char) ("abc" `rPlus` rZero) 103 | ==> Right () 104 | 105 | equivalent ("abc" :: RegExp Char) ("ab") 106 | ==> Left "c" 107 | 108 | equivalent (rStar "abc" :: RegExp Char) ("abc" `rTimes` rStar "abc") 109 | ==> Left "" 110 | ``` 111 | 112 | 113 | ### Intersection and Complement 114 | 115 | We can compute intersections and complements: 116 | ```haskell 117 | intersection (rStar "ab" :: RegExp Char) (rStar "a") 118 | ==> rStar "a" 119 | 120 | intersection (rStar "a" :: RegExp Char) (RegExp.Operations.complement $ rStar "a") 121 | ==> rZero 122 | ``` 123 | -------------------------------------------------------------------------------- /src/RegExp/Equation.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | 5 | -- | Solving systems of linear equations with regular expression coefficients. 6 | module RegExp.Equation 7 | ( RightHand 8 | , solve 9 | , scale 10 | , combine 11 | ) where 12 | 13 | import Control.Exception.Base(assert) 14 | import Control.Monad.State 15 | 16 | import Data.Map.Strict(Map) 17 | import qualified Data.Map.Strict as Map 18 | 19 | import RegExp.RegExp 20 | import Data.GSet(GSet) 21 | 22 | 23 | -- | Right-hand side of an equation. A sum of terms where a term is 24 | -- either a constant, or a constant times a variable. These look 25 | -- like @r0 + r1 X1 + r2 X2 ...@ where @ri@ are regular expressions 26 | -- and @Xi@ are variables. 27 | type RightHand c v = 28 | (RegExp c, Map v (RegExp c)) 29 | 30 | 31 | -- | Solve a system of linear equations with regular expression coefficients. 32 | -- Equations are generated on demand using the given function. 33 | -- Coefficients in front of variables must be non-nullable to ensure the 34 | -- system has a unique solution. 35 | solve :: forall v c. (GSet c, Ord v) => (v -> RightHand c v) -> v -> RegExp c 36 | solve f v = 37 | evalState (go v) (Context Map.empty Map.empty) 38 | where 39 | go :: (MonadState (Context c v) m) => v -> m (RegExp c) 40 | go v = do 41 | context <- get 42 | case Map.lookup v (solved context) of 43 | Just r -> 44 | -- We are done if the result was computed beforehand. 45 | return r 46 | 47 | Nothing -> 48 | assert (not $ Map.member v (partial context)) $ do 49 | resolved@(c, l) <- resolve (f v) 50 | 51 | -- Eliminate @v@ in @resolved@. 
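-- The right-hand side for @v@ may mention @v@ itself, i.e. it has the
-- shape @v = cv v + rest@, where @rest@ collects the constant and the
-- terms over the other variables. By Arden's lemma, when @cv@ is not
-- nullable this equation has the unique solution @v = cv* rest@, which
-- is what the 'scale' call below computes.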
52 | let resolved'@(c', l') = 53 | case Map.lookup v l of 54 | -- Nothing to do if @v@ doesn't occur. 55 | Nothing -> 56 | resolved 57 | 58 | -- Otherwise, use Arden's lemma to rewrite @resolved@. 59 | Just cv -> 60 | assert (not $ nullable cv) 61 | scale (rStar cv) (c, Map.delete v l) 62 | 63 | -- Add a partial solution for @v@ in the context. This is 64 | -- essentially substituting the solution for @v@ in all 65 | -- "following" equations. 66 | put (context {partial = Map.insert v resolved' (partial context)}) 67 | 68 | -- Recursively solve all variables appearing in the 69 | -- equation for @v@. 70 | terms <- 71 | mapM 72 | (\(v, c) -> do {r <- go v; return (c `rTimes` r)}) 73 | (Map.toList l') 74 | 75 | let result = foldr rPlus c' terms 76 | 77 | put (context {solved = Map.insert v result (solved context)}) 78 | return result 79 | 80 | 81 | -- | Resolve variables in a 'RightHand' using the context. 82 | -- Variables not in the context are kept as is. 83 | resolve :: (MonadState (Context c v) m) 84 | => RightHand c v 85 | -> m (RightHand c v) 86 | resolve (c, l) = do 87 | resolved <- mapM resolveTerm (Map.toList l) 88 | return $ foldr combine (c, Map.empty) resolved 89 | 90 | 91 | -- | Resolve the variable in a single term using the context. 92 | -- If the variable is not in the context, it is returned as is. 93 | resolveTerm :: (MonadState (Context c v) m) 94 | => (v, RegExp c) 95 | -> m (RightHand c v) 96 | resolveTerm (v, r) = do 97 | context <- get 98 | case Map.lookup v (partial context) of 99 | Nothing -> 100 | return (rZero, Map.singleton v r) 101 | 102 | Just right -> do 103 | resolved <- resolve right 104 | return (scale r resolved) 105 | 106 | 107 | -- | Multiply a 'RightHand' on the left with the given coefficient. 108 | scale :: (GSet c, Ord v) => RegExp c -> RightHand c v -> RightHand c v 109 | scale r (c, l) = 110 | (r `rTimes` c, Map.map (r `rTimes`) l) 111 | 112 | 113 | -- | Merge two 'RightHand's using `rPlus`. 114 | combine :: (GSet c, Ord v) => RightHand c v -> RightHand c v -> RightHand c v 115 | combine (c1, l1) (c2, l2) = 116 | (c1 `rPlus` c2, Map.unionWith rPlus l1 l2) 117 | 118 | 119 | -- * Internals 120 | 121 | -- | Context used in the implementation of 'solve'. 122 | data Context c v = 123 | Context { 124 | -- | Variables that have a partial solution in terms of 125 | -- other variables. 126 | partial :: Map v (RightHand c v), 127 | 128 | -- | Variables that are fully solved. 129 | solved :: Map v (RegExp c) 130 | } 131 | 132 | 133 | -------------------------------------------------------------------------------- /src/SparseVector.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE KindSignatures #-} 3 | {-# LANGUAGE TypeApplications #-} 4 | {-# LANGUAGE TypeOperators #-} 5 | 6 | -- | A very cruddy implementation of sparse vectors. We define vectors 7 | -- with a known length @n@ so we can define a 'Semiring' instance. 8 | -- 9 | -- TODO: find a package or make nicer 10 | module SparseVector 11 | ( SparseVector 12 | , vector 13 | , (!) 
14 | , length 15 | 16 | , (++) 17 | , split 18 | 19 | , map 20 | , zipWith 21 | 22 | , sum 23 | , nonZero 24 | , toList 25 | ) where 26 | 27 | import Prelude hiding (length, map, sum, zipWith, (++)) 28 | import Control.Exception (assert) 29 | 30 | import Data.Finite 31 | import Data.Singletons 32 | import Data.Singletons.Prelude 33 | import Data.Singletons.TypeLits 34 | 35 | import qualified Data.IntMap.Strict as IntMap 36 | import Data.Semiring (Semiring(..), DetectableZero(..)) 37 | 38 | 39 | -- | Sparse vectors of length @n@ over elements of type @a@. 40 | newtype SparseVector (n :: Nat) a = 41 | UnsafeMakeSparseVector { 42 | elements :: IntMap.IntMap a 43 | } 44 | 45 | 46 | -- | Construct a sparse vector from a list of indexed elements. Indexes 47 | -- not in the list are all set to zero. Duplicate indexes are combined 48 | -- with '(<+>)'. We need to be able to tell when elements are zero so we 49 | -- can filter them out. 50 | vector :: (DetectableZero a, KnownNat n) => [(Finite n, a)] -> SparseVector n a 51 | vector l = 52 | UnsafeMakeSparseVector { 53 | elements = 54 | removeZeros $ 55 | IntMap.fromListWith (<+>) [(fromIntegral i, a) | (i, a) <- l] 56 | } 57 | 58 | 59 | -- | The value at a given index. 60 | (!) :: (Semiring a, KnownNat n) => SparseVector n a -> Finite n -> a 61 | v ! i = 62 | IntMap.findWithDefault zero (fromIntegral i) (elements v) 63 | 64 | 65 | -- | Length of a vector. 66 | length :: forall n a r. (KnownNat n, Integral r) => SparseVector n a -> r 67 | length _ = 68 | fromIntegral $ fromSing (sing :: SNat n) 69 | 70 | 71 | -- | Sum of all elements in a vector. 72 | sum :: Semiring a => SparseVector n a -> a 73 | sum v = 74 | IntMap.foldr (<+>) zero (elements v) 75 | 76 | 77 | -- | Concatenate two vectors. 78 | (++) :: KnownNat n 79 | => SparseVector n a 80 | -> SparseVector m a 81 | -> SparseVector (n + m) a 82 | v1 ++ v2 = 83 | UnsafeMakeSparseVector { 84 | elements = 85 | IntMap.union 86 | (elements v1) 87 | (IntMap.mapKeysMonotonic (length v1 +) (elements v2)) 88 | } 89 | 90 | 91 | -- | We can map from vectors with one type for elements to another given 92 | -- a semiring homomorphism. Note that this does not work for arbitrary 93 | -- functions. Specifically, this function must map zeros to zeros. 94 | map :: (DetectableZero a, DetectableZero b) 95 | => (a -> b) 96 | -> SparseVector n a 97 | -> SparseVector n b 98 | map f v = 99 | assert (isZero $ f zero) $ 100 | UnsafeMakeSparseVector { 101 | elements = 102 | removeZeros $ 103 | IntMap.map f (elements v) 104 | } 105 | 106 | 107 | -- | Combine two vectors of equal length with the given function. 108 | -- The function should return zero when /both/ its arguments are 109 | -- zero. 110 | zipWith :: (DetectableZero a, DetectableZero b, DetectableZero c) 111 | => (a -> b -> c) 112 | -> SparseVector n a 113 | -> SparseVector n b 114 | -> SparseVector n c 115 | zipWith f v1 v2 = 116 | assert (isZero $ f zero zero) $ 117 | UnsafeMakeSparseVector { 118 | elements = 119 | removeZeros $ 120 | IntMap.mergeWithKey 121 | (\_ a b -> Just (f a b)) 122 | (IntMap.map (`f` zero)) 123 | (IntMap.map (zero `f`)) 124 | (elements v1) 125 | (elements v2) 126 | } 127 | 128 | 129 | 130 | -- | Split a vector into two vectors. 131 | split :: forall n m a. 
(KnownNat n, KnownNat m) 132 | => SparseVector (n + m) a 133 | -> (SparseVector n a, SparseVector m a) 134 | split v = 135 | ( UnsafeMakeSparseVector { elements = v1 } 136 | , UnsafeMakeSparseVector { elements = v2 } 137 | ) 138 | where 139 | (v1, v2') = 140 | IntMap.partitionWithKey (\k _ -> k < n) (elements v) 141 | 142 | v2 = 143 | IntMap.mapKeysMonotonic (subtract n) v2' 144 | 145 | n = 146 | fromIntegral $ fromSing (sing :: SNat n) 147 | 148 | 149 | 150 | -- | Iterate over non-zero elements in a vector. 151 | nonZero :: KnownNat n => SparseVector n a -> [(Finite n, a)] 152 | nonZero v = 153 | fmap (\(i, x) -> (finite $ fromIntegral i, x)) (IntMap.toList $ elements v) 154 | 155 | 156 | -- | Convert a vector to a list. 157 | toList :: (Semiring a, KnownNat n) => SparseVector n a -> [a] 158 | toList v = 159 | [v ! i | i <- finites] 160 | 161 | 162 | -- | Vectors of length @n@ over elements drawn from a semiring also 163 | -- form a semiring. 164 | instance (DetectableZero a, KnownNat n) => Semiring (SparseVector n a) where 165 | 166 | -- | Vector where all entries are zero. 167 | zero = 168 | UnsafeMakeSparseVector { 169 | elements = 170 | IntMap.empty 171 | } 172 | 173 | 174 | -- | Vector where all entries are one. 175 | one = 176 | -- We need to treat the trivial semiring as a special case 177 | if isZero (one :: a) then 178 | zero 179 | else 180 | UnsafeMakeSparseVector { 181 | elements = 182 | IntMap.fromList [(i, one) | i <- [0 .. length - 1]] 183 | } 184 | where 185 | length = 186 | fromIntegral $ fromSing (sing :: SNat n) 187 | 188 | 189 | -- | Vector addition. 190 | v1 <+> v2 = 191 | UnsafeMakeSparseVector { 192 | elements = 193 | removeZeros $ 194 | IntMap.unionWith (<+>) (elements v1) (elements v2) 195 | } 196 | 197 | 198 | -- | Vector dot product. 199 | v1 <.> v2 = 200 | UnsafeMakeSparseVector { 201 | elements = 202 | removeZeros $ 203 | IntMap.intersectionWith (<.>) (elements v1) (elements v2) 204 | } 205 | 206 | 207 | -- | We can recognize the zero vector. 208 | instance (DetectableZero a, KnownNat n) => DetectableZero (SparseVector n a) where 209 | isZero v1 = 210 | IntMap.null (elements v1) 211 | 212 | 213 | -- | Equality of vectors is decidable. 214 | deriving instance Eq a => Eq (SparseVector n a) 215 | 216 | 217 | -- | We can totally order vectors. 218 | deriving instance Ord a => Ord (SparseVector n a) 219 | 220 | 221 | instance (Semiring a, KnownNat n, Show a) => Show (SparseVector n a) where 222 | show = 223 | show . toList 224 | 225 | 226 | 227 | -- * Helper functions 228 | 229 | -- | Remove zero elements. 230 | removeZeros :: DetectableZero a => IntMap.IntMap a -> IntMap.IntMap a 231 | removeZeros = 232 | IntMap.filter (not . isZero) 233 | 234 | -------------------------------------------------------------------------------- /src/RegExp/Derivative.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE MultiParamTypeClasses #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | 5 | -- | Derivatives of regular expressions that support character classes. 6 | -- The development follows 7 | -- [Symbolic Solving of Extended Regular Expression Inequalities](https://arxiv.org/abs/1410.3227). 
8 | module RegExp.Derivative 9 | ( Word 10 | 11 | -- * Derivatives 12 | , derivative 13 | , partialDerivative 14 | 15 | -- * Application of derivatives 16 | , matches 17 | , equivalent 18 | 19 | -- * Automata construction 20 | , allDerivatives 21 | , next 22 | , join 23 | ) where 24 | 25 | import Prelude hiding (Word) 26 | 27 | import Control.Exception.Base(assert) 28 | import Control.Monad(unless, when) 29 | import Control.Monad.Except(MonadError(..), runExceptT) 30 | import qualified Data.Equivalence.Monad as Equiv 31 | import qualified Data.Equivalence.STT as EquivSTT 32 | 33 | import Data.Set(Set) 34 | import qualified Data.Set as Set 35 | 36 | import RegExp.RegExp 37 | 38 | import Data.BooleanAlgebra 39 | import Data.Semiring (Semiring(..)) 40 | import Data.GSet hiding (Set) 41 | 42 | 43 | 44 | -- | String of characters from an alphabet @c@. 45 | type Word c = 46 | [c] 47 | 48 | 49 | 50 | -- * Derivatives 51 | 52 | -- | Brzozowski derivative of a regular expression with respect to a character. 53 | -- @derivative c r@ matches a word @w@ if and only if @r@ matches @cw@. 54 | derivative :: GSet c => c -> RegExp c -> RegExp c 55 | derivative c r = 56 | case view r of 57 | One -> 58 | rZero 59 | 60 | Plus r1 r2 -> 61 | rPlus (derivative c r1) (derivative c r2) 62 | 63 | Times r1 r2 | nullable r1 -> 64 | rPlus (derivative c r1 `rTimes` r2) (derivative c r2) 65 | 66 | Times r1 r2 | otherwise -> 67 | derivative c r1 `rTimes` r2 68 | 69 | Star r' -> 70 | derivative c r' `rTimes` r 71 | 72 | Literal p -> 73 | if c `member` p then rOne else rZero 74 | 75 | 76 | -- | Antimirov derivative of a regular expression with respect to a character. 77 | -- This is similar to 'derivative', but returns a set of regular expressions 78 | -- whose union is equivalent to the Brzozowski derivative. 79 | partialDerivative :: forall c. GSet c 80 | => c 81 | -> RegExp c 82 | -> Set (RegExp c) 83 | partialDerivative c r = 84 | case view r of 85 | One -> 86 | Set.empty 87 | 88 | Plus r1 r2 -> 89 | partialDerivative c r1 `Set.union` partialDerivative c r2 90 | 91 | Times r1 r2 | nullable r1 -> 92 | Set.union 93 | (partialDerivative c r1 `setTimes` r2) 94 | (partialDerivative c r2) 95 | 96 | Times r1 r2 | otherwise -> 97 | partialDerivative c r1 `setTimes` r2 98 | 99 | Star r' -> 100 | partialDerivative c r' `setTimes` r 101 | 102 | Literal p -> 103 | if c `member` p then Set.singleton rOne else Set.empty 104 | 105 | where 106 | setTimes :: Set (RegExp c) -> RegExp c -> Set (RegExp c) 107 | setTimes s r = 108 | Set.map (`rTimes` r) s 109 | 110 | 111 | 112 | -- * Applications 113 | 114 | -- | @r `matches` w@ if the regular expression @r@ accepts word @w@. 115 | matches :: GSet c => RegExp c -> Word c -> Bool 116 | matches r [] = 117 | nullable r 118 | matches r (c : w) = 119 | matches (derivative c r) w 120 | 121 | 122 | -- | Two regular expressions are equivalent if and only if they match 123 | -- the same set of strings. This function will check for equivalence, 124 | -- and return a witness in the case the expressions are different. 125 | -- One of the expressions will match the witness and the other won't. 126 | equivalent :: forall c. GSet c => RegExp c -> RegExp c -> Either (Word c) () 127 | equivalent r1 r2 = 128 | case Equiv.runEquivM' (runExceptT (check r1 r2)) of 129 | Left w -> 130 | assert (r1 `matches` w /= r2 `matches` w) $ 131 | Left w 132 | 133 | Right () -> 134 | Right () 135 | where 136 | -- | Hopcroft and Karp's bisimulation algorithm. 
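-- Two regular expressions are equivalent exactly when they agree on
-- nullability and their derivatives with respect to every character are
-- again equivalent. 'check' optimistically equates @r1@ and @r2@, then
-- verifies that assumption on the derivatives for each character class
-- in @next r1 `join` next r2@; a mismatch in nullability along the way
-- is reported as the word of characters read so far.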
137 | check :: (MonadError (Word c) m, Equiv.MonadEquiv (EquivSTT.Class s () (RegExp c)) (RegExp c) () m) 138 | => RegExp c 139 | -> RegExp c 140 | -> m () 141 | check r1 r2 = do 142 | weAlreadyChecked <- Equiv.equivalent r1 r2 143 | unless weAlreadyChecked $ do 144 | when (nullable r1 /= nullable r2) $ 145 | -- These expressions differ since one can match the empty 146 | -- word and the other cannot. The empty word is our witness. 147 | throwError [] 148 | 149 | -- Assume these "states" are equivalent, check following states. 150 | Equiv.equate r1 r2 151 | 152 | let derivatives = 153 | [ (c, derivative c r1, derivative c r2) 154 | | p <- Set.toList (next r1 `join` next r2) 155 | , Just c <- [choose p] 156 | ] 157 | 158 | mapM_ checkNext derivatives 159 | 160 | 161 | -- | Check states reached by a character. 162 | checkNext :: (MonadError (Word c) m, Equiv.MonadEquiv (EquivSTT.Class s () (RegExp c)) (RegExp c) () m) 163 | => (c, RegExp c, RegExp c) 164 | -> m () 165 | checkNext (c, r1, r2) = 166 | check r1 r2 `catchError` \w -> 167 | throwError (c : w) 168 | 169 | 170 | 171 | -- * Automata construction 172 | 173 | -- | Set of derivatives of a regular expression under all words. 174 | allDerivatives :: forall c. GSet c => RegExp c -> Set (RegExp c) 175 | allDerivatives r = 176 | Set.insert rZero (helper Set.empty [r]) 177 | where 178 | helper :: Set (RegExp c) -> [RegExp c] -> Set (RegExp c) 179 | helper context [] = 180 | context 181 | 182 | helper context (r : rest) | r `Set.member` context = 183 | helper context rest 184 | 185 | helper context (r : rest) = 186 | let 187 | derivatives = 188 | [ derivative c r | p <- Set.toList (next r) 189 | , Just c <- [choose p]] 190 | in 191 | helper (Set.insert r context) (derivatives ++ rest) 192 | 193 | 194 | 195 | -- * Helpers 196 | 197 | -- | Given a regular expression @r@, compute equivalence classes of 198 | -- character classes such that: 199 | -- 200 | -- * @p `member` next r@ and @c1 `member` p && c2 `member` p@ implies 201 | -- @derivative c1 r = derivative c2 r@, 202 | -- * @not $ c `member` ors (next r)@ implies @derivative c r ~ rZero@. 203 | next :: GSet c => RegExp c -> Set (CharacterClass c) 204 | next r = 205 | case view r of 206 | One -> 207 | Set.singleton zero 208 | 209 | Plus r1 r2 -> 210 | join (next r1) (next r2) 211 | 212 | Times r1 r2 | nullable r1 -> 213 | join (next r1) (next r2) 214 | 215 | Times r1 _ | otherwise -> 216 | next r1 217 | 218 | Star r -> 219 | next r 220 | 221 | Literal p -> 222 | Set.singleton p 223 | 224 | 225 | -- | Given two sets of mutually disjoint character classes, compute 226 | -- a set of mutually disjoint character classes that cover both input 227 | -- sets. More concretely, given @s1@ and @s2@ such that 228 | -- 229 | -- @'disjoint' s1 && 'disjoint' s2@ 230 | -- 231 | -- we have: 232 | -- 233 | -- * @'ors' ('join' s1 s2) = 'ors' s1 <+> 'ors' s2@ 234 | -- * @'disjoint' ('join' s1 s2)@ 235 | -- * @'all' (\p -> 'all' (\p1 -> p '<.>' p1 == 'zero' || p `subset` p1) s1) ('join' s1 s2)@ 236 | -- * @'all' (\p -> 'all' (\p2 -> p '<.>' p2 == 'zero' || p `subset` p2) s2) ('join' s1 s2)@ 237 | join :: GSet c 238 | => Set (CharacterClass c) 239 | -> Set (CharacterClass c) 240 | -> Set (CharacterClass c) 241 | join s1 s2 = Set.fromList $ concat $ do 242 | p1 <- Set.toList s1 243 | p2 <- Set.toList s2 244 | return 245 | [ p1 <.> p2 246 | , p1 `butNot` ors s2 247 | , p2 `butNot` ors s1 248 | ] 249 | 250 | 251 | -- | Test if a set of character classes are pairwise disjoint. 
252 | disjoint :: GSet c => Set (CharacterClass c) -> Bool 253 | disjoint s = 254 | let s' = Set.toList s in 255 | and [ p1 <.> p2 == zero | p1 <- s', p2 <- s', p1 /= p2 ] -------------------------------------------------------------------------------- /src/Data/GSet.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeFamilies #-} 2 | 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | {-# LANGUAGE FlexibleContexts #-} 5 | {-# LANGUAGE FlexibleInstances #-} 6 | {-# LANGUAGE UndecidableInstances #-} 7 | 8 | 9 | -- | A generic interface for sets and an instance 10 | -- supporting finite and cofinite subsets of a type. 11 | module Data.GSet 12 | ( GSet (..) 13 | ) where 14 | 15 | import Prelude hiding (and, or) 16 | import Flow 17 | 18 | import Data.String (IsString(..)) 19 | import GHC.Exts (IsList(..)) 20 | import qualified Text.ParserCombinators.ReadP as Parser 21 | 22 | import Data.Function (on) 23 | import Data.List (intercalate) 24 | import qualified Data.Set 25 | 26 | import Data.BooleanAlgebra (BooleanAlgebra(..)) 27 | import Data.Semiring (Semiring(..), DetectableZero(..)) 28 | 29 | import Test.QuickCheck as QuickCheck 30 | import Test.SmallCheck.Series as SmallCheck 31 | 32 | 33 | -- | A generic interface for sets over a type @a@. 34 | class (BooleanAlgebra (Set a), DetectableZero (Set a), Ord (Set a)) => GSet a where 35 | -- | Sets of elements of @a@. 36 | type Set a :: * 37 | 38 | -- | The set containing a single element. 39 | singleton :: a -> Set a 40 | 41 | -- | Determine if an element is a member of a set. 42 | member :: a -> Set a -> Bool 43 | member a s = 44 | singleton a <.> s == singleton a 45 | 46 | -- | Return an arbitrary element from a non-empty set; and 47 | -- 'Nothing' if the set is empty. That is, the following properties 48 | -- should hold: 49 | -- 50 | -- @'choose' 'zero' = 'Nothing'@ 51 | -- 52 | -- @s /= 'zero' ==> exists a. 'choose' s = 'Just' a && 'member' a s@ 53 | choose :: Set a -> Maybe a 54 | 55 | 56 | 57 | -- | Finite and cofinite subsets of a type form a 'Semiring'. 58 | instance Ord a => Semiring (FiniteSet a) where 59 | zero = 60 | empty 61 | 62 | one = 63 | full 64 | 65 | (<+>) = 66 | union 67 | 68 | (<.>) = 69 | intersection 70 | 71 | 72 | -- | We know when a finite or cofinite subset of a finite type is empty. 73 | instance (Bounded a, Enum a, Ord a) => DetectableZero (FiniteSet a) where 74 | isZero p = 75 | size p == 0 76 | 77 | 78 | -- | Finite and cofinite subsets of a type form a 'BooleanAlgebra'. 79 | instance Ord a => BooleanAlgebra (FiniteSet a) where 80 | complement = 81 | setComplement 82 | 83 | 84 | -- | Finite and cofinite sets over the elements of a finite type. 85 | instance (Bounded a, Enum a, Ord a) => GSet a where 86 | type Set a = FiniteSet a 87 | 88 | singleton = 89 | These . Data.Set.singleton 90 | 91 | member a (These s) = 92 | Data.Set.member a s 93 | member a (ComplementOf s) = 94 | Data.Set.notMember a s 95 | 96 | choose (These s) = 97 | Data.Set.lookupMin s 98 | choose p@(ComplementOf _) = 99 | if size p == 0 then 100 | Nothing 101 | else 102 | Just $ head [a | a <- [minBound..maxBound], member a p] 103 | 104 | 105 | 106 | -- * Implementation of sets with a more efficient complement operation. 107 | 108 | -- | Finite and cofinite sets over the elements of a type. 109 | data FiniteSet a 110 | -- | Set containing the given elements. 111 | = These (Data.Set.Set a) 112 | 113 | -- | Set containing the complement of the given elements. 
114 | | ComplementOf (Data.Set.Set a) 115 | 116 | 117 | -- | Equality of finite and cofinite subsets of a finite type is decidable. 118 | instance (Bounded a, Enum a, Ord a) => Eq (FiniteSet a) where 119 | These s1 == These s2 = 120 | s1 == s2 121 | p1@(These s1) == p2@(ComplementOf s2) = 122 | size p1 == size p2 && Data.Set.null (s1 `Data.Set.intersection` s2) 123 | p1@(ComplementOf _) == p2@(These _) = 124 | p2 == p1 125 | ComplementOf s1 == ComplementOf s2 = 126 | s1 == s2 127 | 128 | 129 | -- | We can totally order the finite and cofinite subsets of a finite type. 130 | instance forall a.(Bounded a, Enum a, Ord a) => Ord (FiniteSet a) where 131 | -- | Order by size first; then use lexicographical order on the elements. 132 | compare p1 p2 = 133 | compare (size p1) (size p2) <> lex p1 p2 134 | where 135 | lex (ComplementOf s1) (ComplementOf s2) = 136 | -- This is a lot more efficient than turning complemented 137 | -- sets into lists. Note that the order of arguments to the 138 | -- comparison is reversed. 139 | (compare `on` Data.Set.toAscList) s2 s1 140 | lex _ _ = 141 | (compare `on` toList) p1 p2 142 | 143 | 144 | -- | Nicer interface for inputting finite sets over 'Char'. 145 | instance IsString (FiniteSet Char) where 146 | fromString = 147 | These . Data.Set.fromList 148 | 149 | 150 | -- | Allows us to write finite sets as lists. 151 | instance (Bounded a, Enum a, Ord a) => IsList (FiniteSet a) where 152 | type (Item (FiniteSet a)) = a 153 | 154 | fromList = 155 | These . Data.Set.fromList 156 | 157 | -- | We can list all elements in a finite or cofinite subset of 158 | -- a finite type. For infinite types, size of cofinite subsets 159 | -- is infinite, so this is not possible. 160 | toList (These s) = 161 | Data.Set.toAscList s 162 | toList (ComplementOf s) = 163 | [a | a <- [minBound..maxBound], Data.Set.notMember a s] 164 | 165 | 166 | instance Show a => Show (FiniteSet a) where 167 | show (These s) = 168 | "{" ++ intercalate "," (map show $ Data.Set.toList s) ++ "}" 169 | show (ComplementOf s) | Data.Set.null s = 170 | "." 171 | show (ComplementOf s) = 172 | "~{" ++ intercalate "," (map show $ Data.Set.toList s) ++ "}" 173 | 174 | 175 | instance (Read a, Ord a) => Read (FiniteSet a) where 176 | readsPrec _ = 177 | Parser.readP_to_S parser 178 | 179 | where 180 | parser :: Parser.ReadP (FiniteSet a) 181 | parser = do 182 | Parser.skipSpaces 183 | Parser.choice 184 | [ do {Parser.char '.'; return one} 185 | , do {Parser.char '~'; fmap ComplementOf elements} 186 | , do {fmap These elements} 187 | ] 188 | 189 | elements :: Parser.ReadP (Data.Set.Set a) 190 | elements = 191 | Parser.between openBrace closeBrace $ do 192 | elements <- Parser.sepBy (Parser.readS_to_P reads) comma 193 | return $ Data.Set.fromList elements 194 | 195 | openBrace = do 196 | Parser.char '{' 197 | Parser.skipSpaces 198 | 199 | closeBrace = do 200 | Parser.skipSpaces 201 | Parser.char '}' 202 | 203 | comma = do 204 | Parser.char ',' 205 | Parser.skipSpaces 206 | 207 | 208 | -- | Set containing no elements. 209 | empty :: FiniteSet a 210 | empty = 211 | These Data.Set.empty 212 | 213 | 214 | -- | Set containing all elements. 215 | full :: FiniteSet a 216 | full = 217 | ComplementOf Data.Set.empty 218 | 219 | 220 | -- | Complement of a set. 221 | setComplement :: FiniteSet a -> FiniteSet a 222 | setComplement (These s) = 223 | ComplementOf s 224 | setComplement (ComplementOf s) = 225 | These s 226 | 227 | 228 | -- | Intersection of two sets. 
229 | intersection :: Ord a => FiniteSet a -> FiniteSet a -> FiniteSet a 230 | intersection (These s1) (These s2) = 231 | These (Data.Set.intersection s1 s2) 232 | intersection (These s1) (ComplementOf s2) = 233 | These (Data.Set.difference s1 s2) 234 | intersection p1@(ComplementOf _) p2@(These _) = 235 | intersection p2 p1 236 | intersection(ComplementOf s1) (ComplementOf s2) = 237 | ComplementOf (Data.Set.union s1 s2) 238 | 239 | 240 | -- | Intersection of two sets. 241 | union :: Ord a => FiniteSet a -> FiniteSet a -> FiniteSet a 242 | union p1 p2 = 243 | complement $ (complement p1) `intersection` (complement p2) 244 | 245 | 246 | 247 | -- * Operations on finite types 248 | 249 | -- | Number of elements in a finite type. 250 | sizeOfType :: forall a. (Bounded a, Enum a) => a -> Int 251 | sizeOfType _ = 252 | 1 + fromEnum (maxBound :: a) - fromEnum (minBound :: a) 253 | 254 | 255 | -- | We can compute the size of finite or cofinite subsets of 256 | -- a finite type. For infinite types, size of cofinite subsets 257 | -- is infinite. 258 | size :: forall a. (Bounded a, Enum a) => FiniteSet a -> Int 259 | size (These s) = 260 | Data.Set.size s 261 | size (ComplementOf s) = 262 | sizeOfType (undefined :: a) - Data.Set.size s 263 | 264 | 265 | 266 | -- * Testing 267 | 268 | instance (Arbitrary a, Ord a) => Arbitrary (FiniteSet a) where 269 | arbitrary = do 270 | complemented <- arbitrary 271 | set <- arbitrary 272 | case complemented of 273 | False -> 274 | return (These set) 275 | 276 | True -> 277 | return (ComplementOf set) 278 | 279 | shrink (These s) = 280 | fmap These (shrink s) 281 | shrink (ComplementOf s) = 282 | fmap These (shrink s) ++ fmap ComplementOf (shrink s) 283 | 284 | 285 | -- | We use a newtype to define a 'Serial' instance for 'Data.Set.Set' 286 | -- so we don't pollute the global class space. 287 | newtype Set' a = 288 | Set' {unSet' :: Data.Set.Set a} 289 | 290 | instance (Serial m a, Enum a, Bounded a, Ord a) => Serial m (Set' a) where 291 | series = SmallCheck.generate $ \depth -> 292 | Data.Set.toList allSubsets 293 | |> filter ((<= depth) . Data.Set.size) 294 | |> fmap Set' 295 | where 296 | allSubsets = 297 | ([minBound .. maxBound] :: [a]) 298 | |> Data.Set.fromList 299 | |> Data.Set.powerSet 300 | 301 | instance (Serial m a, Enum a, Bounded a, Ord a) => Serial m (FiniteSet a) where 302 | series = 303 | cons1 (These . unSet') \/ cons1 (ComplementOf . unSet') 304 | -------------------------------------------------------------------------------- /src/RegExp/Internal/DFA.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE Rank2Types #-} 3 | {-# LANGUAGE DataKinds #-} 4 | {-# LANGUAGE KindSignatures #-} 5 | {-# LANGUAGE TypeOperators #-} 6 | 7 | {-# LANGUAGE FlexibleContexts #-} 8 | {-# LANGUAGE UndecidableInstances #-} 9 | 10 | -- | Finite state automaton represented as matrices. 
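-- Each machine is stored as a start state, a transition matrix whose
-- entries are character classes, and a vector of accepting states.
-- Converting back to a regular expression interprets this triple
-- @(u, M, v)@ as @u^T M* v@ over languages, following Kozen's matrix
-- construction (see the accompanying report for details).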
11 | module RegExp.Internal.DFA 12 | ( Dfa 13 | 14 | -- * Combining DFAs 15 | , product 16 | 17 | -- * Convert from/to regular expressions 18 | , regexp 19 | , fromRegExp 20 | ) where 21 | 22 | import Prelude hiding (product) 23 | import Flow 24 | 25 | import Control.Exception.Base(assert) 26 | 27 | import Data.Finite 28 | import Data.Singletons 29 | import Data.Singletons.Prelude 30 | import Data.Singletons.TypeLits 31 | 32 | import Data.List (intercalate) 33 | import qualified Data.Set 34 | 35 | import Data.BooleanAlgebra 36 | import Data.Semiring(Semiring(..)) 37 | import Data.KleeneAlgebra 38 | import Data.GSet 39 | 40 | import RegExp.RegExp 41 | import RegExp.Derivative 42 | import RegExp.Language (Language) 43 | import qualified RegExp.Language as Language 44 | 45 | 46 | import SparseVector (SparseVector) 47 | import qualified SparseVector as Vector 48 | 49 | import SparseMatrix (SparseMatrix) 50 | import qualified SparseMatrix as Matrix 51 | 52 | 53 | -- | Deterministic finite state automata that accept words over alphabet @c@. 54 | data Dfa c where 55 | Dfa :: KnownNat n => DfaSize n c -> Dfa c 56 | 57 | 58 | -- | Deterministic finite state automata with @n@ states that accept words 59 | -- over alphabet @c@. 60 | data DfaSize (n :: Nat) c = 61 | DfaSize { 62 | -- | The start state. 63 | start :: Finite n, 64 | 65 | -- | The transition matrix. In order to represent a deterministic 66 | -- machine, the following must hold: 67 | -- * Each row covers the entire alphabet. That is, the union of all 68 | -- entries on a given row must be the entire set of characters. 69 | -- * All entries in a row are pairwise disjoint. 70 | -- 71 | -- The first requirement says that there is at least one transition 72 | -- from every state given a character. This requirement is easy to 73 | -- satisfy by adding an explicit "error" state. 74 | -- 75 | -- The second requirement states that there is at most one transition 76 | -- given a state and a character. 77 | transition :: SparseMatrix n n (CharacterClass c), 78 | 79 | -- | Accepting states. 80 | accept :: SparseVector n Bool 81 | } 82 | 83 | 84 | -- | Verify that the given DFA satisfies the conditions outlined in 'DfaSize', 85 | -- and return the DFA unchanged if so. Raises an exception otherwise. 86 | assertValid :: forall c. GSet c => Dfa c -> Dfa c 87 | assertValid r@(Dfa (d :: DfaSize n c)) = 88 | assert transitionValid $ 89 | r 90 | where 91 | transitionValid = 92 | all 93 | (== one) 94 | [Vector.sum $ Matrix.nthRow r (transition d) | r <- finites] 95 | 96 | 97 | -- | Generic product construction over two DFAs. Intersection 98 | -- and union of DFAs can be recovered as special cases by passing 99 | -- in @('<.>')@ and @('<+>')@, respectively. 100 | product :: forall c. GSet c 101 | => (forall a. BooleanAlgebra a => a -> a -> a) 102 | -> Dfa c 103 | -> Dfa c 104 | -> Dfa c 105 | product f (Dfa (d1 :: DfaSize n c)) (Dfa (d2 :: DfaSize m c)) = 106 | withKnownNat ((sing :: SNat n) %* (sing :: SNat m)) $ 107 | assertValid $ 108 | Dfa $ DfaSize { 109 | start = 110 | state (start d1) (start d2), 111 | 112 | transition = 113 | Matrix.matrix 114 | [ ((state n m, state n' m'), f s1 s2) 115 | | n <- finites 116 | , n' <- finites 117 | , m <- finites 118 | , m' <- finites 119 | , let s1 = transition d1 Matrix.! (n, n') 120 | , let s2 = transition d2 Matrix.! (m, m') 121 | ], 122 | 123 | accept = 124 | Vector.vector 125 | [ (state n m, f a1 a2) 126 | | n <- finites 127 | , m <- finites 128 | , let a1 = accept d1 Vector.! 
n 129 | , let a2 = accept d2 Vector.! m 130 | ] 131 | } 132 | where 133 | -- | State in the product automata that corresponds to the given 134 | -- pair of states. 135 | state :: Finite n -> Finite m -> Finite (n * m) 136 | state i j = 137 | combineProduct (i, j) 138 | 139 | 140 | 141 | -- | DFA that accepts words accepted by both input DFAs. 142 | intersection :: forall c. GSet c => Dfa c -> Dfa c -> Dfa c 143 | intersection (Dfa (d1 :: DfaSize n c)) (Dfa (d2 :: DfaSize m c)) = 144 | withKnownNat ((sing :: SNat n) %* (sing :: SNat m)) $ 145 | assertValid $ 146 | Dfa $ DfaSize { 147 | start = 148 | state (start d1) (start d2), 149 | 150 | transition = 151 | Matrix.matrix 152 | [ ((state n m, state n' m'), s1 <.> s2) 153 | | ((n, n'), s1) <- Matrix.nonZero (transition d1) 154 | , ((m, m'), s2) <- Matrix.nonZero (transition d2) 155 | ], 156 | 157 | accept = 158 | Vector.vector 159 | [ (state n m, a1 <.> a2) 160 | | (n, a1) <- Vector.nonZero (accept d1) 161 | , (m, a2) <- Vector.nonZero (accept d2) 162 | ] 163 | } 164 | where 165 | -- | State in the product automata that corresponds to the given 166 | -- pair of states. 167 | state :: Finite n -> Finite m -> Finite (n * m) 168 | state i j = 169 | combineProduct (i, j) 170 | 171 | 172 | -- | We can form a semiring over DFAs by interpreting them as sets 173 | -- of words. 174 | instance GSet c => Semiring (Dfa c) where 175 | -- | DFA that accepts no words. 176 | zero = 177 | assertValid $ 178 | Dfa $ DfaSize { 179 | start = 180 | finite 0 :: Finite 1, 181 | 182 | transition = 183 | Matrix.matrix [((0, 0), one)], 184 | 185 | accept = 186 | Vector.vector [(0, False)] 187 | } 188 | 189 | -- | DFA that accepts all words. 190 | one = 191 | assertValid $ 192 | Dfa $ DfaSize { 193 | start = 194 | finite 0 :: Finite 1, 195 | 196 | transition = 197 | Matrix.matrix [((0, 0), one)], 198 | 199 | accept = 200 | Vector.vector [(0, True)] 201 | } 202 | 203 | -- | DFA that accepts words accepted by either DFA. 204 | (<+>) = 205 | product (<+>) 206 | 207 | -- | DFA that accepts words accepted by both DFAs. 208 | (<.>) = 209 | intersection 210 | 211 | 212 | -- | We can form a boolean algebra over DFAs by interpreting them as 213 | -- sets of words. 214 | instance GSet c => BooleanAlgebra (Dfa c) where 215 | -- | @complement d@ accepts precisely the words that @d@ doesn't. 216 | complement (Dfa d) = 217 | assertValid $ 218 | Dfa $ 219 | d { 220 | accept = 221 | accept d 222 | |> Vector.toList 223 | |> fmap not 224 | |> zip finites 225 | |> Vector.vector 226 | } 227 | 228 | 229 | 230 | -- * Converting to and from regular expressions 231 | 232 | -- | Convert a DFA to a regular expression. 233 | regexp :: forall c. GSet c => Dfa c -> RegExp c 234 | regexp (Dfa (d :: DfaSize n c)) = 235 | Language.regexp $ 236 | (s `Matrix.times` star m `Matrix.times` t) Matrix.! (0, 0) 237 | where 238 | s :: SparseMatrix 1 n (Language c) 239 | s = 240 | Matrix.fromRows [(0, Vector.vector [(start d, injectBool True)])] 241 | 242 | m :: SparseMatrix n n (Language c) 243 | m = 244 | Matrix.map (Language.language . rLiteral) (transition d) 245 | 246 | t :: SparseMatrix n 1 (Language c) 247 | t = 248 | Matrix.transpose $ 249 | Matrix.fromRows [(0, Vector.map injectBool (accept d))] 250 | 251 | injectBool :: Semiring a => Bool -> a 252 | injectBool True = 253 | one 254 | injectBool False = 255 | zero 256 | 257 | 258 | -- | Convert a regular expression to a DFA. 259 | fromRegExp :: forall c. 
GSet c => RegExp c -> Dfa c 260 | fromRegExp r = 261 | case toSing (fromIntegral $ Data.Set.size derivatives) of 262 | SomeSing (s :: SNat n) -> 263 | withKnownNat s $ 264 | let 265 | -- | States of the constructed DFA will be the derivatives of 266 | -- the input regular expression. We assign each an index. 267 | state :: RegExp c -> Finite n 268 | state r = 269 | finite $ fromIntegral (Data.Set.findIndex r derivatives) 270 | 271 | 272 | -- | Transitions /from/ the given state. 273 | row :: RegExp c -> SparseVector n (CharacterClass c) 274 | row r = 275 | Vector.vector $ 276 | (state rZero, complement $ ors $ next r) : 277 | [ (state (derivative c r), p) 278 | | p <- Data.Set.toList (next r) 279 | , Just c <- [choose p] 280 | ] 281 | in 282 | assertValid $ 283 | Dfa $ DfaSize { 284 | start = 285 | state r, 286 | 287 | transition = 288 | Matrix.fromRows 289 | [(state d, row d) | d <- Data.Set.toList derivatives], 290 | 291 | accept = 292 | Vector.vector [(state d, True) | d <- Data.Set.toList derivatives, nullable d] 293 | } 294 | where 295 | derivatives = 296 | allDerivatives r 297 | 298 | 299 | 300 | -- * Showing DFAs 301 | 302 | instance (GSet c, Show (Set c)) => Show (Dfa c) where 303 | show (Dfa d) = 304 | show d 305 | 306 | 307 | instance (GSet c, KnownNat n, Show (Set c)) => Show (DfaSize n c) where 308 | show d = 309 | intercalate "\n" 310 | [ "Start state:\n " ++ show (start d) 311 | , "Transition matrix:\n" ++ show (transition d) 312 | , "Accepting states:\n " ++ show (accept d) 313 | ] -------------------------------------------------------------------------------- /src/SparseMatrix.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE KindSignatures #-} 4 | {-# LANGUAGE TypeApplications #-} 5 | {-# LANGUAGE TypeOperators #-} 6 | 7 | {-# LANGUAGE FlexibleContexts #-} 8 | {-# LANGUAGE FlexibleInstances #-} 9 | 10 | -- TODO: remove 11 | {-# LANGUAGE AllowAmbiguousTypes #-} 12 | 13 | -- | A very cruddy implementation of sparse matrices. I couldn't 14 | -- find an existing implementation that had all that I needed, so 15 | -- I cooked this up. 16 | -- 17 | -- TODO: find a package or make nicer 18 | module SparseMatrix 19 | ( SparseMatrix 20 | , matrix 21 | , fromRows 22 | , (!) 23 | , nthRow 24 | 25 | , plus 26 | , times 27 | , transpose 28 | 29 | , map 30 | , nonZero 31 | , nonZeroRows 32 | , toList 33 | ) where 34 | 35 | import Flow 36 | import Prelude hiding (map) 37 | 38 | import Data.Function (on) 39 | import Data.List (sortBy, groupBy, intercalate) 40 | import Data.Ord (comparing) 41 | 42 | import Data.Finite 43 | import Data.Singletons 44 | import Data.Singletons.Decide 45 | import Data.Singletons.Prelude 46 | import Data.Singletons.TypeLits 47 | 48 | import Data.Semiring (Semiring(..), DetectableZero(..)) 49 | import Data.KleeneAlgebra 50 | 51 | import SparseVector (SparseVector) 52 | import qualified SparseVector as Vector 53 | 54 | 55 | 56 | -- | A sparse matrix with @r@ rows and @c@ columns over 57 | -- elements of type @a@. 58 | newtype SparseMatrix (r :: Nat) (c :: Nat) a = 59 | UnsafeMakeSparseMatrix { 60 | rows :: SparseVector r (SparseVector c a) 61 | } 62 | 63 | 64 | -- | Value at the given row and column. 65 | (!) :: (DetectableZero a, KnownNat r, KnownNat c) 66 | => SparseMatrix r c a 67 | -> (Finite r, Finite c) 68 | -> a 69 | m ! (r, c) = 70 | (rows m Vector.! r) Vector.! c 71 | 72 | 73 | -- | Row with the given index. 
74 | nthRow :: (DetectableZero a, KnownNat r, KnownNat c) 75 | => Finite r 76 | -> SparseMatrix r c a 77 | -> SparseVector c a 78 | nthRow r m = 79 | rows m Vector.! r 80 | 81 | 82 | -- | Construct a sparse matrix from a list of indexed elements. Indices 83 | -- that don't appear in the list are all set to zero. Duplicate indexes 84 | -- are combined with '(<+>)'. 85 | -- 86 | -- We need detectable zeros so we can filter them out. 87 | matrix :: (DetectableZero a, KnownNat r, KnownNat c) 88 | => [((Finite r, Finite c), a)] 89 | -> SparseMatrix r c a 90 | matrix l = 91 | UnsafeMakeSparseMatrix { 92 | rows = 93 | sortBy (comparing (fst . fst)) l 94 | |> groupBy ((==) `on` (fst . fst)) 95 | |> fmap (\l -> (fst $ fst $ head l, fmap dropRow l)) 96 | |> fmap (\(r, elements) -> (r, Vector.vector elements)) 97 | |> Vector.vector 98 | } 99 | where 100 | dropRow :: ((r, c), a) -> (c, a) 101 | dropRow ((r, c), x)= 102 | (c, x) 103 | 104 | 105 | -- | Construct a sparse matrix from a list of indexed vectors corresponding 106 | -- to the rows of the matrix. 107 | fromRows :: (DetectableZero a, KnownNat r, KnownNat c) 108 | => [(Finite r, SparseVector c a)] 109 | -> SparseMatrix r c a 110 | fromRows rows = 111 | UnsafeMakeSparseMatrix { 112 | rows = 113 | Vector.vector rows 114 | } 115 | 116 | 117 | -- | Matrix addition. 118 | plus :: (DetectableZero a, KnownNat r, KnownNat c) 119 | => SparseMatrix r c a 120 | -> SparseMatrix r c a 121 | -> SparseMatrix r c a 122 | plus m1 m2 = 123 | UnsafeMakeSparseMatrix { 124 | rows = 125 | rows m1 <+> rows m2 126 | } 127 | 128 | 129 | -- | Matrix multiplication. 130 | times :: (DetectableZero a, KnownNat r, KnownNat m, KnownNat c) 131 | => SparseMatrix r m a 132 | -> SparseMatrix m c a 133 | -> SparseMatrix r c a 134 | times m1 m2 = 135 | UnsafeMakeSparseMatrix { 136 | rows = 137 | Vector.map (\r -> Vector.map (\c -> r `cross` c) (rows m2Tr)) (rows m1) 138 | } 139 | where 140 | m2Tr = 141 | transpose m2 142 | 143 | cross v1 v2 = 144 | Vector.sum (v1 <.> v2) 145 | 146 | 147 | -- | Swap the rows of a matrix with its columns. 148 | transpose :: (DetectableZero a, KnownNat r, KnownNat c) 149 | => SparseMatrix r c a 150 | -> SparseMatrix c r a 151 | transpose m = 152 | UnsafeMakeSparseMatrix { 153 | rows = 154 | Vector.vector [(i, Vector.map (Vector.! i) (rows m)) | i <- finites] 155 | } 156 | 157 | 158 | -- | Split a square matrix into four quadrants. 159 | split :: forall s t a. (DetectableZero a, KnownNat s, KnownNat t) 160 | => SparseMatrix (s + t) (s + t) a 161 | -> ( SparseMatrix s s a 162 | , SparseMatrix s t a 163 | , SparseMatrix t s a 164 | , SparseMatrix t t a 165 | ) 166 | split m = 167 | withKnownNat ((sing :: SNat s) %+ (sing :: SNat t)) $ 168 | let 169 | (top, bottom) = 170 | Vector.split (rows m) 171 | 172 | topSplit = 173 | Vector.map Vector.split top 174 | 175 | bottomSplit = 176 | Vector.map Vector.split bottom 177 | 178 | a = UnsafeMakeSparseMatrix { rows = Vector.map fst topSplit } 179 | b = UnsafeMakeSparseMatrix { rows = Vector.map snd topSplit } 180 | c = UnsafeMakeSparseMatrix { rows = Vector.map fst bottomSplit } 181 | d = UnsafeMakeSparseMatrix { rows = Vector.map snd bottomSplit } 182 | in 183 | (a, b, c, d) 184 | 185 | 186 | -- | Combine four quadrants into a single square matrix. 187 | combine :: forall s t a. 
(DetectableZero a, KnownNat s, KnownNat t) 188 | => ( SparseMatrix s s a 189 | , SparseMatrix s t a 190 | , SparseMatrix t s a 191 | , SparseMatrix t t a 192 | ) 193 | -> SparseMatrix (s + t) (s + t) a 194 | combine (a, b, c, d) = 195 | withKnownNat ((sing :: SNat s) %+ (sing :: SNat t)) $ 196 | let 197 | top = 198 | Vector.zipWith (Vector.++) (rows a) (rows b) 199 | 200 | bottom = 201 | Vector.zipWith (Vector.++) (rows c) (rows d) 202 | in 203 | UnsafeMakeSparseMatrix { 204 | rows = 205 | top Vector.++ bottom 206 | } 207 | 208 | 209 | -- | We can map from matrices with one type for elements to another given 210 | -- a semiring homomorphism. Note that this does not work for arbitrary 211 | -- functions. Specifically, this function must map zeros to zeros. 212 | map :: (DetectableZero a, DetectableZero b, KnownNat r, KnownNat c) 213 | => (a -> b) 214 | -> SparseMatrix r c a 215 | -> SparseMatrix r c b 216 | map f m = 217 | UnsafeMakeSparseMatrix { 218 | rows = 219 | Vector.map (Vector.map f) (rows m) 220 | } 221 | 222 | 223 | -- | Iterate over non-zero elements in a matrix. 224 | nonZero :: (KnownNat r, KnownNat c) 225 | => SparseMatrix r c a 226 | -> [((Finite r, Finite c), a)] 227 | nonZero m = 228 | concatMap 229 | (\(r, row) -> [((r, c), a) | (c, a) <- Vector.nonZero row]) 230 | (Vector.nonZero $ rows m) 231 | 232 | 233 | -- | Iterate over non-zero elements in a matrix grouped by rows. 234 | nonZeroRows :: (KnownNat r, KnownNat c) 235 | => SparseMatrix r c a 236 | -> [(Finite r, [(Finite c, a)])] 237 | nonZeroRows m = 238 | fmap 239 | (\(r, row) -> (r, Vector.nonZero row)) 240 | (Vector.nonZero $ rows m) 241 | 242 | 243 | -- | Convert a vector to a list. 244 | toList :: (DetectableZero a, KnownNat r, KnownNat c) => SparseMatrix r c a -> [[a]] 245 | toList m = 246 | fmap Vector.toList (Vector.toList $ rows m) 247 | 248 | 249 | 250 | -- | Square matrices form a semiring. 251 | instance (DetectableZero a, KnownNat n) => Semiring (SparseMatrix n n a) where 252 | -- | Matrix where all entries are zero. 253 | zero = 254 | matrix [] 255 | 256 | -- | Matrix where the diagonal is one. 257 | one = 258 | matrix [((i, i), one) | i <- [0..]] 259 | 260 | -- | Matrix addition. 261 | (<+>) = 262 | plus 263 | 264 | -- | Matrix multiplication. 265 | (<.>) = 266 | times 267 | 268 | 269 | -- | We can recognize zero matrices. 270 | instance (DetectableZero a, KnownNat n) => DetectableZero (SparseMatrix n n a) where 271 | isZero m = 272 | isZero (rows m) 273 | 274 | 275 | -- | Square matrices over Kleene algebra form a Kleene algebra. 276 | instance (DetectableZero a, KleeneAlgebra a, KnownNat n) => KleeneAlgebra (SparseMatrix n n a) where 277 | star m | Proved Refl <- (sing :: SNat n) %~ (sing :: SNat 0) = 278 | m 279 | star m | Proved Refl <- (sing :: SNat n) %~ (sing :: SNat 1) = 280 | matrix [((0,0), star (m ! (0, 0)))] 281 | star m = 282 | -- TODO: get rid of 'unsafeCoerce' or limit it to proving @n = small + large@. 
283 | withKnownNat ((sing :: SNat n) `sDiv` (sing :: SNat 2)) $ 284 | withKnownNat (((sing :: SNat n) %+ (sing :: SNat 1)) `sDiv` (sing :: SNat 2)) $ 285 | withKnownNat (((sing :: SNat n) `sDiv` (sing :: SNat 2)) 286 | %+ 287 | (((sing :: SNat n) %+ (sing :: SNat 1)) `sDiv` (sing :: SNat 2)) 288 | ) $ 289 | case (sing :: SNat n) %~ (sing :: SNat ((n `Div` 2) + ((n + 1) `Div` 2))) of 290 | Proved Refl -> 291 | combine (a', b', c', d') 292 | where 293 | a :: SparseMatrix (n `Div` 2) (n `Div` 2) a 294 | (a, b, c, d) = 295 | split m 296 | 297 | -- a' :: SparseMatrix small small a 298 | a' = star (a `plus` (b `times` star d `times` c)) 299 | 300 | -- b' :: SparseMatrix small large a 301 | b' = star (a `plus` (b `times` star d `times` c)) `times` b `times` star d 302 | 303 | -- c' :: SparseMatrix large small a 304 | c' = star (d `plus` (c `times` star a `times` b)) `times` c `times` star a 305 | 306 | -- d' :: SparseMatrix large large a 307 | d' = star (d `plus` (c `times` star a `times` b)) 308 | 309 | Disproved _-> 310 | error "impossible" 311 | 312 | 313 | 314 | -- | Equality of matrices is decidable. 315 | deriving instance Eq a => Eq (SparseMatrix r c a) 316 | 317 | 318 | -- | We can totally order matrices. 319 | deriving instance Ord a => Ord (SparseMatrix r c a) 320 | 321 | 322 | instance (DetectableZero a, Show a, KnownNat r, KnownNat c) => Show (SparseMatrix r c a) where 323 | show m = 324 | intercalate "\n" 325 | [ intercalate " " (fmap (padded widest) row) | row <- grid ] 326 | 327 | where 328 | -- | Matrix as a list of lists. 329 | grid :: [[a]] 330 | grid = 331 | fmap Vector.toList (Vector.toList $ rows m) 332 | 333 | -- | Show an element of the matrix. 334 | show :: a -> String 335 | show a = 336 | showsPrec 11 a "" 337 | 338 | -- | Width of the widest entry in the matrix. 339 | widest :: Int 340 | widest = 341 | foldr max 0 [ length (show a) | a <- concat grid ] 342 | 343 | -- | Show with a constant width. 
344 | padded :: Int -> a -> String 345 | padded width a = 346 | let 347 | s = show a 348 | in 349 | s ++ take (width - length s) (repeat ' ') -------------------------------------------------------------------------------- /paper/report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt]{article} 2 | 3 | \usepackage{fullpage} % More space 4 | \usepackage{tgpagella} % Better font 5 | \usepackage{microtype} % Slightly improved typography 6 | 7 | \usepackage{hyperref} 8 | \usepackage{cleveref} 9 | \usepackage{todonotes} 10 | 11 | \usepackage{amsmath, amsfonts, amsthm, amssymb, mathtools} % math 12 | \usepackage{mathpartir} 13 | 14 | \usepackage{syntax} 15 | \setlength{\grammarindent}{4em} % increase separation between LHS/RHS 16 | 17 | \usepackage{macro/generic} 18 | \usepackage{macro/code} 19 | 20 | 21 | %===================================================================== 22 | % Author 23 | %===================================================================== 24 | 25 | \title{A Regular Expression Library for Haskell} 26 | \author{Josh Acay \\ \href{mailto:ca483@cornell.edu}{ca483@cornell.edu}} 27 | \date{May 22, 2018} 28 | 29 | 30 | %===================================================================== 31 | % Macros 32 | %===================================================================== 33 | 34 | \newtheorem{theorem}{Theorem} 35 | \newtheorem{example}{Example} 36 | 37 | \DeclareMathOperator{\lang}{\mathcal{L}} 38 | \DeclareMathOperator{\derivative}{D} 39 | \DeclareMathOperator{\nullable}{nullable} 40 | 41 | \newcommand{\transpose}{^\top} 42 | 43 | \newcommand{\by}[1]{\parens{\text{#1}}} 44 | \newcommand{\since}[1]{\parens{\text{since #1}}} 45 | \newcommand{\eqBy}[1]{\braces{\text{#1}}} 46 | 47 | \newcommand{\haskell}{\lstinline} 48 | 49 | 50 | 51 | \begin{document} 52 | \maketitle 53 | 54 | \begin{abstract} 55 | I detail the implementation of a regular expression library for Haskell.\footnote{% 56 | Available online at \url{https://github.com/cacay/regexp}.} 57 | Unlike similar libraries in the wild, this one supports more than just matching strings: it can compute intersections and complements, take derivatives \'a la Brzozowski \cite{Brzozowski64}, check for equivalence, and solve systems of linear equations with regular expression coefficients. In addition, the library is designed to be generic over the alphabet (even allowing infinite ones) so it is not tied to Haskell's builtin \haskell{Char} and \haskell{String} types. 58 | \end{abstract} 59 | 60 | 61 | \section{Introduction} 62 | Regular expressions provide a simple yet powerful language for string searching and matching. They are expressive enough to describe many common patterns that arise in practice (e.g.\ alternatives and repetition) but restrictive enough that many desirable properties are effectively decidable (e.g.\ equivalence and containment checking). Additionally, regular languages are closed under intersection, union, and complement making it possible to expose a natural interface that uses familiar connectives like ``and'', ``or'', and ``not''. 63 | 64 | Unfortunately, practical implementations of regular expressions focus too much on string matching and forego all the benefits of having a restricted language. For instance, I was 65 | unable to find a single popular regular expression library (for any programming language) that supports equivalence checking. 
66 | % Very few support complement and/or intersection, but do so by extending the language 67 | % rather than computing the complement or intersection in terms of the basic operators. 68 | In fact, many modern implementations add features such as capture groups and backreferences which break most closure properties and make equivalence checking undecidable \cite{CampeanuSY03}. 69 | 70 | Although string matching is sufficient for many application, there are cases where the additional power is useful (e.g.\ during design and development) or necessary.\footnote{% 71 | I sketch the application this library was built for in \cref{application}.} 72 | Here, I detail the design and development of a Haskell library that exposes this additional power to the user. 73 | 74 | 75 | \section{Specification} 76 | 77 | Given an alphabet $\Sigma$, regular expressions have the following syntax: 78 | \begin{grammar} 79 | ::= 1 80 | \alt + 81 | \alt $\cdot$ 82 | \alt $^*$ 83 | \alt $l \subseteq \Sigma$ 84 | \end{grammar} 85 | 86 | The interpretation of these expressions is standard: 87 | $1$ matches the empty word, 88 | $e_1 + e_2$ matches either $e_1$ or $e_2$, 89 | $e_1 \cdot e_2$ matches $e_1$ followed by $e_2$, 90 | $e^*$ matches zero or more copies of $e$, 91 | and the literal $l$ matches single character words $a$ where $a \in l$. 92 | This is a generalization of the standard syntax which only allows single characters as literals rather than sets of characters or \emph{character classes} and encodes the set $\set{a_1, a_2, \ldots, a_n}$ as $a_1 + a_2 + \cdots + a_n$. I use character classes directly since they generalize to infinite alphabets (which I will discuss in \cref{effective-boolean-algebra}) and are more efficient to implement. The expression 0---which matches no strings---is represented with the empty character class $\emptyset$. 93 | 94 | It is easy enough to define a parametrized algebraic data type \haskell{RegExp $\Sigma$} that mirrors this syntax. I implement the following operations on regular expressions: 95 | \begin{itemize} 96 | \item \haskell{matches :: RegExp $\Sigma$ -> [$\Sigma$] -> Bool}\\ 97 | \haskell{matches $e$ $w$} returns \haskell{True} whenever the expression $e$ matches the word $w$. 98 | 99 | \item \haskell{complement :: RegExp $\Sigma$ -> RegExp $\Sigma$}\\ 100 | \haskell{complement $e$} returns a regular expression that matches precisely the words $e$ does not match. 101 | 102 | \item \haskell{intersection :: RegExp $\Sigma$ -> RegExp $\Sigma$ -> RegExp $\Sigma$}\\ 103 | \haskell{intersection $e_1$ $e_2$} returns a regular expression that matches words both $e_1$ and $e_2$ match. 104 | 105 | \item \haskell{equivalent :: RegExp $\Sigma$ -> RegExp $\Sigma$ -> Either [$\Sigma$] ()}\\ 106 | \haskell{equivalent $e_1$ $e_2$} returns \haskell{Right ()} (i.e.\ ``true'') if $e_1$ and $e_2$ are equivalent, and \haskell{Left $w$} otherwise. Here, the word $w$ is a counterexample that is matched by one expression but not the other. 
107 | 108 | \item \haskell{solve :: LinearSystem $\Sigma$ -> RegExp $\Sigma$}: given a system of linear equations of the form: 109 | \begin{gather*} 110 | X_1 = e_1^0 + e_1^1 X_1 + e_1^2 X_2 + \cdots e_1^n X_n\\ 111 | X_2 = e_2^0 + e_2^1 X_1 + e_2^2 X_2 + \cdots e_2^n X_n\\ 112 | \vdots\\ 113 | X_m = e_m^0 + e_m^1 X_1 + e_m^2 X_2 + \cdots e_m^n X_n 114 | \end{gather*} 115 | where each $e_i^j$ is non-nullable (i.e.\ doesn't match the empty string), solve for $X_1$.\footnote{% 116 | Actual type of \haskell{solve} is slightly different, but amounts to the same thing.} Note that all coefficients need to be on the same of the variables to ensure there is a solution. I arbitrarily pick left. 117 | 118 | \haskell{solve} is a versatile and powerful function. It can derive \haskell{intersection}, \haskell{complement}, and many other operators. However, I observed that going through deterministic finite automata gives more succinct expressions. 119 | \end{itemize} 120 | 121 | 122 | \section{Implementation} 123 | 124 | \subsection{Derivatives of Regular Expressions} 125 | 126 | The entirety of this library reduces to Brzozowski derivatives, deterministic finite state automata (DFAs), and the correspondence between them. The derivative of a regular expression $e$ with respect to a character $a \in \Sigma$ is a regular expression $e'$ such that $e'$ matches a word $w$ if and only if $e$ matches $a w$. Brzozowski showed that derivatives always exist and that they could be computed syntactically \cite{Brzozowski64}. This has an obvious extension to arbitrary words $w$, which I denote by $\derivative_w{(e)}$. This gives a trivial way to implement matching: 127 | \begin{equation*} 128 | \haskell{matches $e$ $w$ = nullable ($\derivative_w{(e)}$)} 129 | \end{equation*} 130 | where \haskell{nullable $e$} if and only if $e$ can match the empty word. There is no need to explicitly convert to a DFA, although one might still want to for efficiency's sake. 131 | 132 | \subsection{Computing Intersection and Complement} 133 | 134 | To compute intersection and complement, I go through the standard motions of converting regular expressions to DFAs, performing the relevant operation on the DFA representations (product construction for intersection and flipping accepting/non-accepting states for complement), and converting back. Although the concept of going through DFAs is not new, the methods I use are hard to come by in real-world code (even though they are not novel theoretically). 135 | 136 | In the forward direction, I construct a DFA directly from a regular expression using Brzozowski derivatives instead of determinizing a nondeterministic finite state automaton, which is what most implementations do. The idea is simple: derivatives of a regular expression are the states of the deterministic automaton and there is an $a$ transition from $e$ to $e'$ if $\derivative_a{(e)} = e'$. Considering the derivative with respect to all words gives the full automaton. Brzozowski showed in \cite{Brzozowski64} that this process generates only finitely many states as long as regular expressions are compared modulo associativity, commutativity, and idempotence of $+$ (ACI). 
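For a concrete example, take $e = (a \cdot b)^*$ over $\Sigma = \set{a, b}$: we have $\derivative_a{(e)} = b \cdot (a \cdot b)^*$ and $\derivative_b{(b \cdot (a \cdot b)^*)} = e$, while every other derivative is $0$, so the construction yields an automaton with exactly three states.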
I achieve this by keeping the type \haskell{RegExp} abstract and only exposing ``smart'' constructors that normalize regular expressions with respect to the following equalities: 137 | \begin{mathpar} 138 | (r + s) + t = r + (s + t) 139 | \and r + s = s + t 140 | \and r + r = r 141 | % 142 | \\ 0 + r = r 143 | % 144 | \\ (r \cdot s) \cdot t = r \cdot (s \cdot t) 145 | \and 1 \cdot r = r = r \cdot 1 146 | \and 0 \cdot r = 0 = r \cdot 0 147 | % 148 | \\ (r^*)^* = r^* 149 | \and 0^* = 1 150 | \and 1^* = 1 151 | \end{mathpar} 152 | The first three equalities are required for termination as discussed; others reduce the number of states generated which speeds up execution and results in smaller more readable expressions. To get even smaller expressions, I keep regular expressions in strong star normal form, which was introduced as a linear time simplification method in \cite{GruberG10}. Very basically, the strong star normal form limits applications of $^*$ to non-nullable expressions, so $(1 + a)^*$ would get converted to the equivalent $a^*$. 153 | 154 | 155 | The other direction, converting a DFA into a regular expression, is essentially unimplemented: I was unable to find a single library, or even a single piece of working code that does it\footnote{% 156 | I did find a Coq library described in \cite{DoczkalKS13} that implements DFA to regular expression conversion and does so in a mechanically verified way, but being a constructive proof, the implementation was so inefficient it was unable to convert the expression $0$ to a DFA and back.}. 157 | Worse yet, a web search yields no results that are easy to turn into an algorithm. Most sources talk about solving a system of linear equations but offer no insight into how. Such methods might work if one is doing this by hand, but of course a computer requires a more formal specification. 158 | 159 | The method I settled on is due to Kozen \cite{Kozen94}. The idea is simple and elegant: represent the DFA in matrix form as a triple $\angled{u, M, v}$ where $u$ and $v$ are $\set{0, 1}$ vectors representing starting and accepting states, respectively, and $M$ is the transition matrix. Then, $u\transpose M^* v$ gives a regular expression that matches the set of words accepted by this DFA\@. I use a home-baked implementation of sparse vectors and sparse matrices to represent DFAs since I was unable to find an existing Haskell library that was up to the task. The most interesting part of this is the implementation of Kleene star for matrices, which uses the divide-and-conquer method of \cite{Kozen94}. Further details on the technique can be found in \cite{Kozen94}. 160 | 161 | 162 | \subsection{Equality Checking} 163 | 164 | I use Hopcroft and Karp's bisimulation algorithm for checking DFA equivalence \cite{HopcroftK71} modified to use a union-find structure as described in \cite{BonchiP11}. However, I generate the DFAs on the fly using Brzozowski derivatives instead of computing them upfront. This gives the algorithm a chance to fail early and avoid the exponential time process of converting to a DFA\@. The implementation is able to generate a counterexample if the expressions are not equivalent. 
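\begin{example}
Checking $(a \cdot a)^*$ against $a^*$ fails after a single step of the bisimulation: both expressions are nullable, but their derivatives with respect to $a$, namely $a \cdot (a \cdot a)^*$ and $a^*$, disagree on nullability. The algorithm therefore reports the one-character word $a$ as a counterexample; it is matched by $a^*$ but not by $(a \cdot a)^*$.
\end{example}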
165 | 166 | 167 | \subsection{Solving Systems of Linear Equations} 168 | 169 | Given a system of linear equations of the form 170 | \begin{gather*} 171 | X_1 = e_1^0 + e_1^1 X_1 + e_1^2 X_2 + \cdots e_1^n X_n\\ 172 | X_2 = e_2^0 + e_2^1 X_1 + e_2^2 X_2 + \cdots e_2^n X_n\\ 173 | \vdots\\ 174 | X_m = e_m^0 + e_m^1 X_1 + e_m^2 X_2 + \cdots e_m^n X_n 175 | \end{gather*} 176 | where $e_i^j$ are non-nullable, it is possible to solve for $X_1$ using Arden's lemma and Gaussian elimination. Arden's lemma states that the unique solution to the equation $X = A \cdot X + B$ is $A^* B$ as long as $A$ is non-nullable. Using the lemma, it is possible to eliminate variables one at a time by walking down the list\footnote{% 177 | Apply the lemma to the first equation to get an equation for $X_1$ that doesn't refer to itself, then substitute it in for $X_1$ in all following equations. Rinse and repeat until you reach the bottom.} until $X_m$ is reached. At this point, the equation for $X_m$ only refers to itself, so apply Arden's lemma to get a closed expression for $X_m$. This can be propagated back using substitution by walking up the list. At each step, the equation for a variable will only contain that variable so Arden's lemma gives a closed form solution. 178 | 179 | Linear equation solving is useful for many applications, one of which is computing intersection and complement. It is easy to see that 180 | \[ \haskell{intersection $e_1$ $e_2$} 181 | = (\haskell{nullable $e_1$} = \haskell{nullable $e_2$}) + \sum_{a \in \Sigma}{a \cdot \haskell{intersection (derivative $a$ $e_1$) (derivative a $e_2$)}} 182 | \] 183 | and 184 | \[ \haskell{complement $e$} 185 | = \neg(\haskell{nullable $e$}) + \sum_{a \in \Sigma}{a \cdot \haskell{complement (derivative $a$ $e$)}} 186 | \] 187 | (interpreting booleans as regular expressions in the expected way). Unfolding these definitions indefinitely and treating every application of the operator to unique input(s) as a fresh variable results in a finite system of linear equations since every regular expression has a finite set of derivatives modulo ACI as discussed. 188 | 189 | 190 | \subsection{Generalizing Literals}\label{effective-boolean-algebra} 191 | 192 | Rather than fixing literals to be sets of characters, I define them to be elements of an effective boolean algebra $\angled{\Sigma, U, \bracks{\cdot}, \sqcup, \sqcap, \bot, \top}$. 193 | The implementation closely follows the presentation in \cite{KeilT14} so I will elide the details. The important thing to note is that effective boolean algebras have many useful instantiations including finite subsets of a finite alphabet, finite and cofinite (a set whose complement is finite) subsets of an infinite alphabet, a logic of decidable propositions over an alphabet and so on. 194 | 195 | My implementation is generic against this abstract interface so all these options can be made to work easily, but I only provide an implementation of finite and cofinite subsets of a \emph{finite} alphabet. One might wonder the point of supporting cofinite subsets of a finite alphabet since they are finite sets after all. The reason is that supporting cofinite subsets gives major speedups when working with very large alphabets such as the set of Unicode characters. For example, the representation of \haskell{complement $\set{a}$} is simply the complemented set $\neg\set{a}$ which needs to list a single element. Representing this set directly would require listing all characters that are not ``a'' of which there are millions. 
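Concretely, the interface the library is written against (simplified from the \haskell{GSet} class in the implementation) consists of an associated type of character classes equipped with the boolean algebra operations $\sqcup$, $\sqcap$, $\bot$, $\top$, and complement, together with three functions:
\begin{itemize}
\item \haskell{singleton :: $\Sigma$ -> Set $\Sigma$}, the class containing exactly one character;

\item \haskell{member :: $\Sigma$ -> Set $\Sigma$ -> Bool}, which decides whether a character belongs to the denotation $\bracks{\cdot}$ of a class;

\item \haskell{choose :: Set $\Sigma$ -> Maybe $\Sigma$}, which returns some element of a non-empty class and \haskell{Nothing} for the empty class.
\end{itemize}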
196 | 197 | 198 | \section{Testing and Verification} 199 | 200 | I employ a mix of static and dynamic techniques to ensure correctness. Static guarantees are a result of Haskell's incredibly powerful type system. For example, I use algebraic data types to ensure all regular expressions conform to the syntax described in this paper. With normalized regular expressions, I had to go a step further and use generalized algebraic data types (a.k.a.\ indexed algebraic data types) \cite{CheneyH03} to check invariants such as ``Kleene star is only applied to non-nullable regular expressions''. This is achieved by indexing the data type of regular expression by a boolean \haskell{isNullable} with the obvious meaning. The implementation of sparse vectors and sparse matrices goes yet another step further straight into the dependently typed territory. These types are indexed by their size to enforce invariants such as ``only vectors of equal length are added together''. I use the singletons library \cite{Singletons} for this purpose, which gives Haskell limited dependent programming capabilities. I insert dynamic assertions wherever the type system falls short. These dynamic checks are useful for two reasons: (1) they increase the likelihood of catching bugs, and (2) they make tracking down the root cause of said bugs much easier.\footnote{% 201 | I discovered one bug due to an assertion failure. A different unrelated bug in the regular expression to DFA conversion code was discovered during testing by manual inspection of output, but I tracked it down by inserting dynamic checks (which, in hindsight, should have been there to begin with). 202 | } 203 | 204 | In addition to static checks, I use unit tests and property testing to further increase the confidence in the library. Unit tests are assertions on ground terms such as 205 | \[ \haskell{intersection $(a + b)^*$ $a^*$} = a^* \] 206 | where $a$, $b$ are concrete elements of a concrete type $\Sigma$, say, \haskell{Char}. Haskell's hspec library\footnote{% 207 | \url{https://hackage.haskell.org/package/hspec} 208 | } provides a good framework for writing such tests. I use SmallCheck \cite{SmallCheck} and QuickCheck \cite{QuickCheck} for property testing. These libraries allow one to write logical assertions such as 209 | \[ \forall e_1, e_2. \haskell{equivalent $e_1$ $e_2$} = \haskell{Right ()} 210 | \implies \forall w. \haskell{matches $e_1$ $w$} = \haskell{matches $e_2$ $w$} 211 | \] 212 | and verifies these assertions by instantiating them with concrete terms. QuickCheck randomly generates a fixed number of examples whereas SmallCheck systematically generates \emph{all} inputs. I use QuickCheck to test against a large alphabet (Unicode characters), and SmallCheck to tests against a very small alphabet with three characters. SmallCheck was very helpful for catching bugs at corner cases and generating simple counterexamples (since the alphabet is very small). 213 | 214 | During random testing, I frequently ran into exponential behavior with functions \haskell{equivalent}, \haskell{intersection}, and \haskell{complement}. These operations have proven lower bounds \cite{Kozen77,GeladeN12} so occasional bad behavior is unavoidable, but the usual claim is that they fare much better on real-world inputs (see \cite{FosterKM0T15} for instance). It is interesting to note that this claim apparently does not extend to \emph{random} inputs. 
As a solution, I limit the size of randomly generated regular expressions to 4 or 5 operators which is sufficient to get high code coverage. 215 | 216 | This was the first time I took testing seriously and it certainly paid off. I had been of the opinion that having the backing of a powerful type system like Haskell's and writing well-structured code would make it nigh impossible to introduce bugs. However, despite being extra careful and utilizing Haskell's type system to its fullest, I discovered half a dozen bugs during testing. On the plus side, I was able to focus on writing ``interesting'' tests thanks to Haskell's type system (users of untyped languages have to write hundreds of tests essentially asserting their program is well-typed). This goes on to show that type systems and testing go hand in hand. 217 | 218 | \section{An Application: Unicode Grapheme Cluster Breaks}\label{application} 219 | 220 | In this section, I briefly talk about the reason this library exists in the first place. 221 | 222 | Unicode has become the de facto standard for international text. However, it is such a complicated beast that most people have misconceptions about how Unicode works. One of these misconceptions is the idea that each character you see on the screen corresponds to a single Unicode code point. This is a reasonable expectation since ``a'', ``!'', and ``$\Omega$'' are all Unicode code points. However, there are ``characters'' that require more than one Unicode code point to represent. For example, G with acute accent \'G requires two. Such a group of code points that represent a single user-perceived character is called a grapheme cluster. 223 | 224 | Unicode strings are transmitted as flat sequences of code points, so breaking them up into their grapheme clusters might (and should) be a first step in any code that is to handle Unicode correctly. The Unicode standard gives a declarative specification of how this should be done\footnote{\url{http://unicode.org/reports/tr29}} using an ordered list of rules with the following two forms: $e_1 \times e_2$ meaning break at positions where the left-hand side matches $e_1$ and the right-hand side matches $e_2$, and $e_1 \div e_2$ meaning do not break at matching positions. When rules overlap, the first one in the list applies. The Unicode technical report claims without proof that these rules can be converted into a regular expression that can be used to extract grapheme clusters (by repeating a longest match for example). 225 | 226 | I thought this was an interesting way to specify text segmentation rules, so I wanted to come up with an algorithm to do this translation for the general case. I had a translation in mind that used intersection and complement extensively, so I implemented this library to see the algorithm in action. Unfortunately, that algorithm was not correct, and fixing it is left for future work. 227 | 228 | \section{Conclusion and Future Work} 229 | I presented a regular expressions library for Haskell that implements very well-known theoretical results that somehow never made their appearance in popular tools. As always, some work remains to be done. First, there is significant room for optimization. My implementation of sparse vectors and sparse matrices could use a lot of work, and suppressing Haskell's default lazy semantics in computation heavy regions might lead to significant speedups. 
Second, I would like to implement DFA minimization and use it to simplify regular expressions generated by \haskell{intersection} and \haskell{complement} operations. Normalizing regular expressions already simplifies them quite a bit, and going through DFAs using Brzozowski derivatives generally produces readable (and perhaps even minimal) expressions, but there are cases where the output is quite hairy. I'm hoping minimizing DFAs could remedy this situation. Finally, the interface could use some work. The code is modularized in a way that makes sense from the implementer's perspective, but it is not the interface that should be exposed to the user. 230 | 231 | 232 | 233 | %===================================================================== 234 | % Bibliography 235 | %===================================================================== 236 | 237 | \bibliographystyle{acm} 238 | \bibliography{bibliography,bibliography-extra} 239 | 240 | \end{document} 241 | -------------------------------------------------------------------------------- /src/RegExp/RegExp.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE KindSignatures #-} 4 | {-# LANGUAGE TypeOperators #-} 5 | 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | {-# LANGUAGE FlexibleContexts #-} 8 | {-# LANGUAGE FlexibleInstances #-} 9 | {-# LANGUAGE UndecidableInstances #-} 10 | 11 | 12 | -- | Definition of regular expressions over /character classes/ 13 | -- (i.e. sets of characters from an arbitrary type). The definition 14 | -- is left abstract, which frees us to simplify regular expressions 15 | -- using (sound) rewriting rules. We normalize regular expressions 16 | -- enough to guarantee that the set of Brzozowski derivatives are 17 | -- finite. 18 | module RegExp.RegExp 19 | ( CharacterClass 20 | , RegExp 21 | 22 | -- * Deconstructing regular expressions 23 | , RegExpView(..) 24 | , view 25 | , hide 26 | 27 | -- * Properties 28 | , nullable 29 | , empty 30 | 31 | -- * #combining# Combining regular expressions 32 | , rZero 33 | , rOne 34 | , rPlus 35 | , rTimes 36 | , rStar 37 | , rLiteral 38 | ) where 39 | 40 | import Control.Exception.Base(assert) 41 | import Unsafe.Coerce(unsafeCoerce) 42 | 43 | import Data.Singletons 44 | import Data.Singletons.Decide 45 | import Data.Singletons.Prelude 46 | 47 | import Data.Set(Set) 48 | import qualified Data.Set as Set 49 | 50 | import Data.Either (isRight) 51 | import qualified Data.List as List 52 | import Data.String (IsString(..)) 53 | import qualified Text.ParserCombinators.ReadP as Parser 54 | 55 | import Data.GSet (GSet) 56 | import qualified Data.GSet as GSet 57 | import Data.Semiring (Semiring(..)) 58 | 59 | import Test.QuickCheck as QuickCheck 60 | import Test.SmallCheck.Series as SmallCheck 61 | 62 | 63 | -- | Sets of characters from an alphabet 'c'. 64 | type CharacterClass c = 65 | GSet.Set c 66 | 67 | 68 | -- | Regular expressions that support character classes over an alphabet 69 | -- @c@ (so we don't have to encode them using choice). The type 70 | -- is left abstract so we can apply rewriting rules to simplify and 71 | -- normalize expressions. Refer to 'view' and 'RegExpView' for 72 | -- inspecting 'RegExp', and to 'hide' and [the relevant section](#combining) 73 | -- for constructing them. 74 | -- 75 | -- Normalizing expressions not only makes them smaller and more 76 | -- readable, but also ensures termination for some algorithms, so 77 | -- it is a good idea overall. 
We normalize with respect to the 78 | -- following equations: 79 | -- 80 | -- == Associativity, Commutativity, and Idempotence of @+@ (ACI) 81 | -- 82 | -- prop> (r + s) + t = r + (s + t) 83 | -- prop> r + s = s + t 84 | -- prop> r + r = r 85 | -- 86 | -- == Identity for @+@ 87 | -- 88 | -- prop> 0 + r = r 89 | -- 90 | -- == Associativity of @.@ 91 | -- 92 | -- prop> (r . s) . t = r . (s . t) 93 | -- 94 | -- == Identity and Annihilator for @.@ 95 | -- 96 | -- prop> 1 . r = r = r . 1 97 | -- prop> 0 . r = 0 = r . 0 98 | -- 99 | -- == Star 100 | -- 101 | -- prop> (r*)* = r* 102 | -- prop> 0* = 1 103 | -- prop> 1* = 1 104 | -- 105 | -- In addition, we keep regular expressions in strong star normal form 106 | -- which is described in 107 | -- [Simplifying Regular Expressions A Quantitative Perspective](https://pdfs.semanticscholar.org/1b6b/5843442a64523ccb7afd21eabec7881b4219.pdf). 108 | -- In practice, this means @*@ is only applied to expression that 109 | -- cannot match the empty word. 110 | -- 111 | -- Brzozowski proved that normalizing with respect to ACI ensures 112 | -- there are only finitely many derivatives of a regular expression. 113 | -- So ACI is necessary for any algorithm that relies on taking repeated 114 | -- derivatives of regular expressions. 115 | data RegExp c where 116 | RZero :: RegExp c 117 | ROne :: RegExp c 118 | RNormalized :: (NormalizedRegExp c isUnion isSeq isNullable) -> RegExp c 119 | 120 | -- | Syntactic equality. 121 | instance GSet c => Eq (RegExp c) where 122 | RZero == RZero = 123 | True 124 | ROne == ROne = 125 | True 126 | RNormalized r1 == RNormalized r2 = 127 | r1 `hEq` r2 128 | _ == _ = 129 | False 130 | 131 | -- | An arbitrary syntactic ordering. Useful for defining sets and 132 | -- maps over regular expressions. 133 | instance (GSet c, Ord (CharacterClass c)) => Ord (RegExp c) where 134 | compare RZero RZero = 135 | EQ 136 | compare RZero _ = 137 | LT 138 | compare _ RZero = 139 | GT 140 | 141 | compare ROne ROne = 142 | EQ 143 | compare ROne _ = 144 | LT 145 | compare _ ROne = 146 | GT 147 | 148 | compare (RNormalized r1) (RNormalized r2) = 149 | r1 `hCompare` r2 150 | 151 | -- | Nicer interface for inputting regular expression over 'Char'. 152 | -- For example, 153 | -- 154 | -- > "abc" :: 'RegExp' 'Char' 155 | -- 156 | -- is the regular expression that matches single character strings 157 | -- @"a"@, @"b"@, and @"c"@ (it doesn't match the string @"abc"@). 158 | instance IsString (RegExp Char) where 159 | fromString = 160 | rLiteral . fromString 161 | 162 | 163 | 164 | -- * Deconstructing regular expressions 165 | 166 | -- | Standard syntax for regular expressions. We omit @Zero@ since 167 | -- it can be encoded as @'Literal' 'zero'@. 168 | data RegExpView c r where 169 | -- | Match the empty string and nothing else. 170 | One :: RegExpView c r 171 | 172 | -- | Match the left or the right expression. 173 | Plus :: r -> r -> RegExpView c r 174 | 175 | -- | Match the left then the right expression. 176 | Times :: r -> r -> RegExpView c r 177 | 178 | -- | Match zero or more copies of the given expression. 179 | Star :: r -> RegExpView c r 180 | 181 | -- | Match any character in the character class. The character 182 | -- class might be empty, in which case this matches no strings. 183 | Literal :: CharacterClass c -> RegExpView c r 184 | 185 | deriving instance Functor (RegExpView c) 186 | 187 | 188 | 189 | -- | Expose the abstract type 'RegExp' as a 'RegExpView'. 190 | view :: forall c. 
GSet c => RegExp c -> RegExpView c (RegExp c) 191 | view RZero = 192 | Literal zero 193 | view ROne = 194 | One 195 | view (RNormalized r) = 196 | view' r 197 | where 198 | view' :: NormalizedRegExp c isUnion isSeq isNullable 199 | -> RegExpView c (RegExp c) 200 | view' (NUnion p n s) | p /= zero, Set.null n, Set.null s = 201 | Literal p 202 | view' (NUnion p n s) | p /= zero = 203 | nUnion p Set.empty Set.empty `Plus` nUnion zero n s 204 | view' (NUnion _ n s) | otherwise = 205 | case (Set.minView n, Set.minView s) of 206 | (Nothing, Nothing) -> 207 | error "impossible" 208 | 209 | (Just (SubUnion r, n), _) -> 210 | RNormalized r `Plus` nUnion zero n s 211 | 212 | (Nothing, Just (SubUnion r, s)) -> 213 | RNormalized r `Plus` nUnion zero Set.empty s 214 | 215 | view' (NUnionWithOne p s) = 216 | ROne `Plus` nUnion p Set.empty s 217 | 218 | view' (NSeq l) | Some1 (SubSeq r1) ::: (Some1 l') <- nSeqView l 219 | , Some1 (SubSeq r2) ::: (Some1 l'') <- nSeqView l' 220 | , Nil <- nSeqView l'' = 221 | RNormalized r1 `Times` RNormalized r2 222 | view' (NSeq l) | Some1 (SubSeq r1) ::: Some1 l' <- nSeqView l = 223 | RNormalized r1 `Times` RNormalized (NSeq l') 224 | view' (NSeq l) | otherwise = 225 | error "impossible" 226 | 227 | view' (NStar r) = 228 | Star (RNormalized r) 229 | 230 | 231 | -- | Pack the public view 'RegExpView' back into the abstract view 'RegExp'. 232 | hide :: GSet c => RegExpView c (RegExp c) -> RegExp c 233 | hide One = 234 | rOne 235 | hide (Plus r1 r2) = 236 | rPlus r1 r2 237 | hide (Times r1 r2) = 238 | rTimes r1 r2 239 | hide (Star r) = 240 | rStar r 241 | hide (Literal p) = 242 | rLiteral p 243 | 244 | 245 | 246 | -- * Properties 247 | 248 | -- | 'True' if and only if the regular expression can match the 249 | -- empty word. 250 | nullable :: GSet c => RegExp c -> Bool 251 | nullable RZero = 252 | False 253 | nullable ROne = 254 | True 255 | nullable (RNormalized r) = 256 | isRight (nullableNormalized r) 257 | 258 | 259 | -- | 'True' if and only if the regular expression matches no words. 260 | empty :: RegExp c -> Bool 261 | empty RZero = 262 | True 263 | empty _ = 264 | False 265 | 266 | 267 | 268 | -- * Normalized regular expressions 269 | 270 | -- | We define a type of /normalized regular expressions/ to help us 271 | -- statically ensure the properties described in 'RexExp'. Terms of this 272 | -- type denote expressions that 273 | -- 1. are fully normalized with respect to the rewriting rules in 'RegExp', 274 | -- 2. are in strong star normal form, 275 | -- 3. know whether they are nullable (i.e. we can determine in constant time 276 | -- whether they can match the empty word or not). 277 | -- 278 | -- (1) and (2) are direct requirements. (3) helps us specify when a regular 279 | -- expression is in star normal form. Additionally, being able to compute 280 | -- nullability in constant time speeds up derivative based algorithms. 281 | -- 282 | -- To enforce these requirements, we use data structures that intrinsically 283 | -- capture the required properties. For example, instead of using binary 284 | -- unions, we define unions over /sets/ of subexpressions since sets capture 285 | -- associativity, commutativity, and idempotence. Similarly, we use lists 286 | -- for sequencing so we get associativity automatically. 
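-- As a rough illustration, @(r + s) + r@ and @r + (s + r)@ both
-- collapse into a single union node over the set @{r, s}@, and
-- @(r . s) . t@ and @r . (s . t)@ both flatten into the sequence
-- @[r, s, t]@, so these laws hold by construction rather than by
-- rewriting.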
287 | -- 288 | -- Normalized expressions are indexed based on whether the root constructor 289 | -- is a union or a sequence so we can statically disallow nesting unions 290 | -- under unions (since they can be combined into one) and similarly for 291 | -- sequencing. There is an additional index to track whether the expression 292 | -- is nullable. 293 | -- 294 | -- Normalized expressions cannot encode the regular expression that matches 295 | -- no words (i.e. @Zero@) or the regular expression that only matches the 296 | -- empty word (i.e. @One@). These are added by 'RegExp'. This is in line 297 | -- with the development in 298 | -- /Simplifying Regular Expressions A Quantitative Perspective/. 299 | data NormalizedRegExp c (isUnion :: Bool) (isSeq :: Bool) (isNullable :: Bool) where 300 | -- | Union of a literal (a character set) and a set of subexpressions. 301 | -- Note that we disallow standalone literal nodes under unions and 302 | -- instead bundle them with the union nodes themselves to ensure we 303 | -- combine literals using set union instead of syntactic regular expression 304 | -- union. This gives an additional level of normalization. 305 | -- 306 | -- We do not have a separate case for literals since they can be 307 | -- represented as a union node with an empty set of subexpressions. 308 | -- 309 | -- We keep nullable and strict (non-nullable) subexpressions separate to 310 | -- satisfy requirement (3): a union node is nullable if the set of 311 | -- nullable subexpressions is not empty. This is checked dynamically. 312 | -- 313 | -- To satisfy (1) and (2), we require that for every @NUnion p n s@, the 314 | -- following should hold: 315 | -- 316 | -- @p /= zero || 'length' n + 'length' s >= 2@. 317 | -- 318 | -- Additionally, 'NUnion' and 'NUnionWithOne' should not occur as 319 | -- subexpressions since these can be hoisted up. 320 | NUnion :: CharacterClass c 321 | -> Set (SubUnion c True) 322 | -> Set (SubUnion c False) 323 | -> NormalizedRegExp c True False isNullable 324 | 325 | -- | The empty word or a union. This corresponds to the @?@ constructor 326 | -- from the paper, but generalized over a union of subexpressions 327 | -- (as opposed to a single expression) so that expressions like @a? + b?@ 328 | -- get simplified to @(a + b)?@. 329 | -- 330 | -- For every @NUnionWithOne p s@, we require that 331 | -- @p /= zero || 'length' s >= 1@ and @s@ does not contain 'NUnion' or 332 | -- 'NUnionWithOne'. 333 | NUnionWithOne :: CharacterClass c 334 | -> Set (SubUnion c False) 335 | -> NormalizedRegExp c True False True 336 | 337 | 338 | -- | Sequential composition of a list of subexpressions. Each cons node 339 | -- in the list needs to store whether the sequential composition of that 340 | -- and the following nodes is nullable or not in order to satisfy (3). 341 | -- 342 | -- The list must contain at least two elements and 'NSeq' nodes cannot 343 | -- appear as subexpressions. 344 | NSeq :: NSeq c isNullable 345 | -> NormalizedRegExp c False True isNullable 346 | 347 | -- | Iteration. The iterated expression cannot be nullable, but the 348 | -- overall expression always is. 349 | NStar :: NormalizedRegExp c isUnion isSeq False 350 | -> NormalizedRegExp c False False True 351 | 352 | 353 | -- | Normalized regular expressions that can appear under a union node.
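-- Concretely, only sequences ('NSeq') and iterations ('NStar') may sit
-- directly under a union: literals are folded into the union's character
-- class, and nested unions are hoisted into the enclosing union as
-- described above.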
354 | data SubUnion c (isNullable :: Bool) where 355 | SubUnion :: !(NormalizedRegExp c False isSeq isNullable) 356 | -> SubUnion c isNullable 357 | 358 | 359 | -- | Normalized regular expressions that can appear under a seq node. 360 | data SubSeq c (isNullable :: Bool) where 361 | SubSeq :: !(NormalizedRegExp c isUnion False isNullable) 362 | -> SubSeq c isNullable 363 | 364 | 365 | -- | Sequential composition of a list of subexpressions. Each node 366 | -- in the sequence keeps track of whether the subsequence starting 367 | -- with that element is nullable. 368 | data NSeq c (isNullable :: Bool) where 369 | -- | Empty list. Nullable since it corresponds to @One@. 370 | NSeqNil :: NSeq c True 371 | 372 | -- | Tack on a nullable subexpression. The result is nullable if 373 | -- and only if the rest of the list is. 374 | NSeqConsNullable :: Sing (isNullable :: Bool) 375 | -> SubSeq c True 376 | -> NSeq c isNullable 377 | -> NSeq c isNullable 378 | 379 | -- | Tack on a strict (non-nullable) subexpression. The result 380 | -- is always strict. 381 | NSeqConsStrict :: SubSeq c False 382 | -> NSeq c isNullable 383 | -> NSeq c False 384 | 385 | 386 | 387 | -- * Smart constructors to check invariants we cannot encode statically. 388 | 389 | 390 | -- | Compute the union of the given sets of subexpressions. 391 | -- This constructor is always safe: it will always construct 392 | -- a valid expression, and it will always "do the right thing". 393 | nUnion :: GSet c 394 | => CharacterClass c 395 | -> Set (SubUnion c True) 396 | -> Set (SubUnion c False) 397 | -> RegExp c 398 | nUnion p n s | p /= zero || Set.size n + Set.size s >= 2 = 399 | if Set.null n then 400 | RNormalized (nUnionStrict p s) 401 | else 402 | RNormalized (nUnionNullable p n s) 403 | nUnion _ n s | otherwise = 404 | case (Set.lookupMin n, Set.lookupMin s) of 405 | (Nothing, Nothing) -> 406 | RZero 407 | 408 | (Just (SubUnion r), Nothing) -> 409 | RNormalized r 410 | 411 | (Nothing, Just (SubUnion r)) -> 412 | RNormalized r 413 | 414 | (Just _, Just _) -> 415 | error "impossible" 416 | 417 | 418 | -- | Safe and smart constructor for 'NUnion' that always returns a 419 | -- nullable expression. 420 | nUnionNullable :: GSet c 421 | => CharacterClass c 422 | -> Set (SubUnion c True) 423 | -> Set (SubUnion c False) 424 | -> NormalizedRegExp c True False True 425 | nUnionNullable p n s = 426 | assert (not $ Set.null n) $ 427 | assert (p /= zero || Set.size n + Set.size s >= 2) $ 428 | NUnion p n s 429 | 430 | 431 | -- | Safe and smart constructor for 'NUnion' that always returns a 432 | -- non-nullable expression. 433 | nUnionStrict :: GSet c 434 | => CharacterClass c 435 | -> Set (SubUnion c False) 436 | -> NormalizedRegExp c True False False 437 | nUnionStrict p s = 438 | assert (p /= zero || Set.size s >= 2) $ 439 | NUnion p Set.empty s 440 | 441 | 442 | -- | Safe and smart constructor for 'NUnionWithOne'. 443 | nUnionWithOne :: GSet c 444 | => CharacterClass c 445 | -> Set (SubUnion c False) 446 | -> NormalizedRegExp c True False True 447 | nUnionWithOne p s = 448 | assert (p /= zero || Set.size s >= 1) $ 449 | NUnionWithOne p s 450 | 451 | 452 | -- | Safe and smart constructor for 'NSeq'. 453 | nSeq :: GSet c 454 | => NSeq c isNullable 455 | -> NormalizedRegExp c False True isNullable 456 | nSeq l = 457 | assert (isValid l) $ 458 | NSeq l 459 | where 460 | -- | Check that the list has at least two subexpressions. 
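        -- Empty and single-element lists are rejected: such expressions
        -- should never be wrapped in an 'NSeq' node in the first place.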
461 | isValid NSeqNil = 462 | False 463 | isValid (NSeqConsNullable _ _ NSeqNil) = 464 | False 465 | isValid (NSeqConsStrict _ NSeqNil) = 466 | False 467 | isValid _ = 468 | True 469 | 470 | 471 | -- | Alias for 'NStar' for uniformity. 472 | nStar :: GSet c 473 | => NormalizedRegExp c isUnion isSeq False 474 | -> NormalizedRegExp c False False True 475 | nStar = 476 | NStar 477 | 478 | 479 | 480 | -- * Working with normalized expressions 481 | 482 | nullableNormalized :: GSet c 483 | => NormalizedRegExp c isUnion isSeq isNullable 484 | -> Either (isNullable :~: False) (isNullable :~: True) 485 | nullableNormalized (NUnion _ n _)= 486 | if Set.null n then 487 | Left (unsafeCoerce Refl) 488 | else 489 | Right (unsafeCoerce Refl) 490 | nullableNormalized (NUnionWithOne _ _) = 491 | Right Refl 492 | nullableNormalized (NSeq l) = 493 | nullableNSeq l 494 | nullableNormalized (NStar _) = 495 | Right Refl 496 | 497 | 498 | nullableNSeq :: NSeq c isNullable 499 | -> Either (isNullable :~: False) (isNullable :~: True) 500 | nullableNSeq NSeqNil = 501 | Right Refl 502 | nullableNSeq (NSeqConsNullable SFalse _ _) = 503 | Left Refl 504 | nullableNSeq (NSeqConsNullable STrue _ _) = 505 | Right Refl 506 | nullableNSeq (NSeqConsStrict _ _) = 507 | Left Refl 508 | 509 | 510 | 511 | -- | View 'NSeq' as a list. 512 | nSeqView :: NSeq c isNullable 513 | -> ListView (Some1 (NSeq c)) (Some1 (SubSeq c)) 514 | nSeqView NSeqNil = 515 | Nil 516 | nSeqView (NSeqConsNullable _ h t) = 517 | Some1 h ::: (Some1 t) 518 | nSeqView (NSeqConsStrict h t) = 519 | Some1 h ::: (Some1 t) 520 | 521 | 522 | -- | Construct an 'NSeq' that contains a single subexpression. 523 | nSeqSingleton :: GSet c => SubSeq c isNullable -> Some1 (NSeq c) 524 | nSeqSingleton r@(SubSeq n) = 525 | case nullableNormalized n of 526 | Left Refl -> 527 | Some1 $ NSeqConsStrict r NSeqNil 528 | 529 | Right Refl -> 530 | Some1 $ NSeqConsNullable STrue r NSeqNil 531 | 532 | 533 | -- | Append two 'NSeq's. 534 | nSeqAppend :: NSeq c isNullable1 535 | -> NSeq c isNullable2 536 | -> NSeq c (isNullable1 && isNullable2) 537 | nSeqAppend NSeqNil l2 = 538 | l2 539 | nSeqAppend (NSeqConsNullable SFalse h1 t1) l2 = 540 | NSeqConsNullable SFalse h1 (nSeqAppend t1 l2) 541 | nSeqAppend (NSeqConsNullable STrue h1 t1) l2 = 542 | case nullableNSeq l2 of 543 | Left Refl -> 544 | NSeqConsNullable SFalse h1 (nSeqAppend t1 l2) 545 | Right Refl -> 546 | NSeqConsNullable STrue h1 (nSeqAppend t1 l2) 547 | nSeqAppend (NSeqConsStrict h1 t1) l2 = 548 | NSeqConsStrict h1 (nSeqAppend t1 l2) 549 | 550 | 551 | 552 | -- * Comparing normalized expressions 553 | 554 | -- GHC cannot derive 'Eq' and 'Ord' instances for normalized regular 555 | -- expressions because of all the existential quantification going 556 | -- on. Since we only care about ordering so we can put expression 557 | -- in sets, we define ordering on the underlying untyped terms. We 558 | -- do this efficiently by defining heterogeneous equality and comparison. 559 | 560 | -- | Heterogeneous equality. 561 | class HEq a b where 562 | hEq :: a -> b -> Bool 563 | 564 | 565 | -- | Heterogeneous ordering. 
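-- As with 'HEq', the two arguments may differ in their type indices, so
-- we can order, say, a nullable union against a strict sequence, which a
-- plain 'Ord' instance on a single index could not express.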
566 | class HOrd a b where 567 | hCompare :: a -> b -> Ordering 568 | 569 | 570 | instance GSet c => HEq (NormalizedRegExp c u1 s1 n1) (NormalizedRegExp c u2 s2 n2) where 571 | hEq (NUnion p1 n1 s1) (NUnion p2 n2 s2) = 572 | p1 == p2 && n1 == n2 && s1 == s2 573 | hEq (NUnionWithOne p1 s1) (NUnionWithOne p2 s2) = 574 | p1 == p2 && s1 == s2 575 | hEq (NSeq l1) (NSeq l2) = 576 | l1 `hEq` l2 577 | hEq (NStar r1) (NStar r2) = 578 | r1 `hEq` r2 579 | hEq _ _ = 580 | False 581 | 582 | instance GSet c => HEq (SubUnion c n1) (SubUnion c n2) where 583 | hEq (SubUnion r1) (SubUnion r2) = 584 | r1 `hEq` r2 585 | 586 | instance GSet c => HEq (SubSeq c n1) (SubSeq c n2) where 587 | hEq (SubSeq r1) (SubSeq r2) = 588 | r1 `hEq` r2 589 | 590 | instance GSet c => HEq (NSeq c n1) (NSeq c n2) where 591 | hEq NSeqNil NSeqNil = 592 | True 593 | hEq (NSeqConsNullable n1 h1 t1) (NSeqConsNullable n2 h2 t2) = 594 | fromSing n1 == fromSing n2 && h1 `hEq` h2 && t1 `hEq` t2 595 | hEq (NSeqConsStrict h1 t1) (NSeqConsStrict h2 t2) = 596 | h1 `hEq` h2 && t1 `hEq` t2 597 | hEq _ _ = 598 | False 599 | 600 | 601 | instance GSet c => Eq (NormalizedRegExp c isUnion isSeq isNullable) where 602 | (==) = hEq 603 | 604 | instance GSet c => Eq (SubUnion c isNullable) where 605 | (==) = hEq 606 | 607 | instance GSet c => Eq (SubSeq c isNullable) where 608 | (==) = hEq 609 | 610 | instance GSet c => Eq (NSeq c isNullable) where 611 | (==) = hEq 612 | 613 | 614 | 615 | instance (GSet c, Ord (CharacterClass c)) => HOrd (NormalizedRegExp c u1 s1 n1) (NormalizedRegExp c u2 s2 n2) where 616 | hCompare (NUnion p1 n1 s1) (NUnion p2 n2 s2) = 617 | p1 `compare` p2 <> n1 `compare` n2 <> s1 `compare` s2 618 | hCompare (NUnion _ _ _) _ = 619 | LT 620 | hCompare _ (NUnion _ _ _) = 621 | GT 622 | 623 | hCompare (NUnionWithOne p1 s1) (NUnionWithOne p2 s2) = 624 | p1 `compare` p2 <> s1 `compare` s2 625 | hCompare (NUnionWithOne _ _) _ = 626 | LT 627 | hCompare _ (NUnionWithOne _ _) = 628 | GT 629 | 630 | hCompare (NSeq l1) (NSeq l2) = 631 | l1 `hCompare` l2 632 | hCompare (NSeq _) _ = 633 | LT 634 | hCompare _ (NSeq _) = 635 | GT 636 | 637 | hCompare (NStar r1) (NStar r2) = 638 | r1 `hCompare` r2 639 | 640 | instance (GSet c, Ord (CharacterClass c)) => HOrd (SubUnion c n1) (SubUnion c n2) where 641 | hCompare (SubUnion r1) (SubUnion r2) = 642 | r1 `hCompare` r2 643 | 644 | instance (GSet c, Ord (CharacterClass c)) => HOrd (SubSeq c n1) (SubSeq c n2) where 645 | hCompare (SubSeq r1) (SubSeq r2) = 646 | r1 `hCompare` r2 647 | 648 | instance (GSet c, Ord (CharacterClass c)) => HOrd (NSeq c n1) (NSeq c n2) where 649 | hCompare NSeqNil NSeqNil = 650 | EQ 651 | hCompare NSeqNil _ = 652 | LT 653 | hCompare _ NSeqNil = 654 | GT 655 | 656 | hCompare (NSeqConsNullable n1 h1 t1) (NSeqConsNullable n2 h2 t2) = 657 | fromSing n1 `compare` fromSing n2 <> h1 `hCompare` h2 <> t1 `hCompare` t2 658 | hCompare (NSeqConsNullable _ _ _) _ = 659 | LT 660 | hCompare _ (NSeqConsNullable _ _ _) = 661 | GT 662 | 663 | hCompare (NSeqConsStrict h1 t1) (NSeqConsStrict h2 t2) = 664 | h1 `hCompare` h2 <> t1 `hCompare` t2 665 | 666 | 667 | instance (GSet c, Ord (CharacterClass c)) => Ord (NormalizedRegExp c isUnion isSeq isNullable) where 668 | compare = hCompare 669 | 670 | instance (GSet c, Ord (CharacterClass c)) => Ord (SubUnion c isNullable) where 671 | compare = hCompare 672 | 673 | instance (GSet c, Ord (CharacterClass c)) => Ord (SubSeq c isNullable) where 674 | compare = hCompare 675 | 676 | instance (GSet c, Ord (CharacterClass c)) => Ord (NSeq c isNullable) where 677 | 
compare = hCompare 678 | 679 | 680 | 681 | -- * Constructing and combining regular expressions 682 | 683 | -- | Regular expression that matches no strings. 684 | rZero :: RegExp c 685 | rZero = 686 | RZero 687 | 688 | 689 | -- | Regular expression that matches the empty string and nothing else. 690 | rOne :: RegExp c 691 | rOne = 692 | ROne 693 | 694 | 695 | -- | Regular expression that matches strings that either regular expression 696 | -- matches. 697 | rPlus :: forall c. GSet c => RegExp c -> RegExp c -> RegExp c 698 | rPlus RZero r2 = 699 | r2 700 | rPlus r1 RZero = 701 | r1 702 | 703 | rPlus ROne ROne = 704 | ROne 705 | rPlus ROne result@(RNormalized r2) = 706 | case nullableNormalized r2 of 707 | Left Refl -> 708 | case r2 of 709 | NUnion p2 n2 s2 -> 710 | assert (Set.null n2) $ 711 | RNormalized $ nUnionWithOne p2 s2 712 | 713 | NSeq _ -> 714 | RNormalized $ nUnionWithOne zero (Set.singleton $ SubUnion r2) 715 | 716 | Right Refl -> 717 | result 718 | rPlus r1 ROne = 719 | rPlus ROne r1 720 | 721 | rPlus (RNormalized r1) (RNormalized r2) = 722 | rPlus' r1 r2 723 | where 724 | rPlus' :: NormalizedRegExp c isUnion1 isSeq1 isNullable1 725 | -> NormalizedRegExp c isUnion2 isSeq2 isNullable2 726 | -> RegExp c 727 | rPlus' (NUnion p1 n1 s1) (NUnion p2 n2 s2) = 728 | nUnion (p1 <+> p2) (Set.union n1 n2) (Set.union s1 s2) 729 | rPlus' (NUnion p1 n1 s1) (NUnionWithOne p2 s2) | Set.null n1 = 730 | RNormalized $ 731 | nUnionWithOne (p1 <+> p2) (Set.union s1 s2) 732 | rPlus' (NUnion p1 n1 s1) (NUnionWithOne p2 s2) | otherwise = 733 | RNormalized $ 734 | nUnionNullable (p1 <+> p2) n1 (Set.union s1 s2) 735 | rPlus' (NUnion p1 n1 s1) r2@(NSeq _) = 736 | case nullableNormalized r2 of 737 | Left Refl -> 738 | nUnion p1 n1 (Set.insert (SubUnion r2) s1) 739 | 740 | Right Refl -> 741 | nUnion p1 (Set.insert (SubUnion r2) n1) s1 742 | rPlus' (NUnion p1 n1 s1) r2@(NStar _) = 743 | nUnion p1 (Set.insert (SubUnion r2) n1) s1 744 | rPlus' r1 r2@(NUnion _ _ _) = 745 | rPlus' r2 r1 746 | 747 | rPlus' (NUnionWithOne p1 s1) (NUnionWithOne p2 s2) = 748 | RNormalized $ 749 | nUnionWithOne (p1 <+> p2) (Set.union s1 s2) 750 | rPlus' (NUnionWithOne p1 s1) r2@(NSeq _) = 751 | case nullableNormalized r2 of 752 | Left Refl -> 753 | RNormalized $ 754 | nUnionWithOne p1 (Set.insert (SubUnion r2) s1) 755 | 756 | Right Refl -> 757 | nUnion p1 (Set.singleton $ SubUnion r2) s1 758 | rPlus' (NUnionWithOne p1 s1) r2@(NStar _) = 759 | nUnion p1 (Set.singleton $ SubUnion r2) s1 760 | rPlus' r1 r2@(NUnionWithOne _ _) = 761 | rPlus' r2 r1 762 | 763 | rPlus' r1@(NSeq _) r2@(NSeq _) = 764 | rPlusNotUnion r1 r2 765 | rPlus' r1@(NSeq _) r2@(NStar _) = 766 | rPlusNotUnion r1 r2 767 | rPlus' r1@(NStar _) r2@(NSeq _) = 768 | rPlusNotUnion r1 r2 769 | rPlus' r1@(NStar _) r2@(NStar _) = 770 | rPlusNotUnion r1 r2 771 | 772 | 773 | rPlusNotUnion :: NormalizedRegExp c False isSeq1 isNullable1 774 | -> NormalizedRegExp c False isSeq2 isNullable2 775 | -> RegExp c 776 | rPlusNotUnion r1 r2 | r1 `hEq` r2 = 777 | RNormalized r1 778 | rPlusNotUnion r1 r2 | otherwise = 779 | case (nullableNormalized r1, nullableNormalized r2) of 780 | (Right Refl, Right Refl) -> 781 | RNormalized $ 782 | nUnionNullable zero (Set.fromList [SubUnion r1, SubUnion r2]) Set.empty 783 | 784 | (Right Refl, Left Refl) -> 785 | RNormalized $ 786 | nUnionNullable zero (Set.singleton $ SubUnion r1) (Set.singleton $ SubUnion r2) 787 | 788 | (Left Refl, Right Refl) -> 789 | RNormalized $ 790 | nUnionNullable zero (Set.singleton $ SubUnion r2) (Set.singleton $ SubUnion r1) 791 | 
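            -- Neither argument can match the empty word, so the
            -- resulting union is strict as well.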
792 | (Left Refl, Left Refl) -> 793 | RNormalized $ 794 | nUnionStrict zero (Set.fromList [SubUnion r1, SubUnion r2]) 795 | 796 | 797 | -- | Regular expression that matches the first expression followed 798 | -- by the second. 799 | rTimes :: forall c. GSet c => RegExp c -> RegExp c -> RegExp c 800 | rTimes RZero _ = 801 | RZero 802 | rTimes _ RZero = 803 | RZero 804 | 805 | rTimes ROne r2 = 806 | r2 807 | rTimes r1 ROne = 808 | r1 809 | 810 | rTimes (RNormalized r1) (RNormalized r2) = 811 | case (isSeq r1, isSeq r2) of 812 | (Left Refl, Left Refl) -> 813 | case (nSeqSingleton $ SubSeq r1, nSeqSingleton $ SubSeq r2) of 814 | (Some1 l1, Some1 l2) -> 815 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 816 | 817 | (Right Refl, Left Refl) | NSeq l1 <- r1 -> 818 | case nSeqSingleton (SubSeq r2) of 819 | Some1 l2 -> 820 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 821 | 822 | (Left Refl, Right Refl) | NSeq l2 <- r2 -> 823 | case nSeqSingleton (SubSeq r1) of 824 | Some1 l1 -> 825 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 826 | 827 | (Right Refl, Right Refl) | NSeq l1 <- r1 828 | , NSeq l2 <- r2 -> 829 | RNormalized $ NSeq (l1 `nSeqAppend` l2) 830 | where 831 | isSeq :: NormalizedRegExp c isUnion isSeq isNullable 832 | -> Either (isSeq :~: False) (isSeq :~: True) 833 | isSeq (NUnion _ _ _) = 834 | Left Refl 835 | isSeq (NUnionWithOne _ _) = 836 | Left Refl 837 | isSeq (NSeq _) = 838 | Right Refl 839 | isSeq (NStar _) = 840 | Left Refl 841 | 842 | 843 | -- | Regular expression that matches zero or more copies of the given 844 | -- expression. 845 | rStar :: forall c. GSet c => RegExp c -> RegExp c 846 | rStar RZero = 847 | ROne 848 | rStar ROne = 849 | ROne 850 | rStar r@(RNormalized (NStar _)) = -- Optimize a common case 851 | r 852 | rStar (RNormalized r) = 853 | case nullableNormalized r of 854 | Left Refl -> 855 | RNormalized $ nStar r 856 | 857 | Right Refl | (p, s) <- removeOne r -> 858 | if p /= zero || Set.size s >= 2 then 859 | RNormalized $ nStar (nUnionStrict p s) 860 | else 861 | assert (p == zero && Set.size s == 1) $ 862 | case Set.findMin s of 863 | SubUnion r -> 864 | RNormalized $ nStar r 865 | where 866 | -- | If @removeOne r = r'@, then @r* = (uncurry nUnionStrict r')*@. 867 | removeOne :: NormalizedRegExp c isUnion isSeq True 868 | -> (CharacterClass c, Set (SubUnion c False)) 869 | removeOne (NUnion p n s) = 870 | List.foldl' merge (p, s) n' 871 | where 872 | n' = fmap (\(SubUnion r) -> removeOne r) (Set.toList n) 873 | removeOne (NUnionWithOne p s) = 874 | (p, s) 875 | removeOne (NSeq NSeqNil) = 876 | (zero, Set.empty) 877 | removeOne (NSeq (NSeqConsNullable STrue (SubSeq r) t)) = 878 | removeOne r `merge` removeOne (NSeq t) 879 | removeOne (NStar (NUnion p n s)) = 880 | assert (Set.null n) $ 881 | (p, s) 882 | removeOne (NStar r@(NSeq l)) = 883 | (zero, Set.singleton (SubUnion r)) 884 | 885 | -- | Combine two strict union components. 886 | merge :: (CharacterClass c, Set (SubUnion c False)) 887 | -> (CharacterClass c, Set (SubUnion c False)) 888 | -> (CharacterClass c, Set (SubUnion c False)) 889 | merge (p1, s1) (p2, s2) = 890 | (p1 <+> p2, s1 `Set.union` s2) 891 | 892 | 893 | -- | Regular expression that matches single character strings picked 894 | -- from the given character class. 895 | rLiteral :: forall c. 
GSet c => CharacterClass c -> RegExp c 896 | rLiteral p | p == zero = 897 | RZero 898 | rLiteral p | otherwise = 899 | nUnion p Set.empty Set.empty 900 | 901 | 902 | -- * Printing 903 | 904 | instance (GSet c, Show (CharacterClass c)) => Show (RegExp c) where 905 | showsPrec d RZero = 906 | showString "{}" 907 | 908 | showsPrec d ROne = 909 | showString "<>" 910 | 911 | showsPrec d (RNormalized r) = 912 | showsPrec d r 913 | 914 | 915 | instance (GSet c, Show (CharacterClass c), Show r) => Show (RegExpView c r) where 916 | showsPrec _ One = 917 | showString "<>" 918 | showsPrec d (Plus r1 r2) = 919 | showParen (d > plusPrec) $ 920 | showsPrec plusPrec r1 . showString " ++ " . showsPrec plusPrec r2 921 | where 922 | plusPrec = 9 923 | showsPrec d (Times r1 r2) = 924 | showParen (d > timesPrec) $ 925 | showsPrec timesPrec r1 . showString "##" . showsPrec timesPrec r2 926 | where 927 | timesPrec = 10 928 | showsPrec d (Star r) = 929 | showParen (d > starPrec) $ 930 | showsPrec starPrec r . showString "**" 931 | where 932 | starPrec = 11 933 | showsPrec d (Literal p) = 934 | showsPrec d p 935 | 936 | 937 | instance (GSet c, Show (CharacterClass c)) => Show (NormalizedRegExp c isUnion isSeq isNullable) where 938 | showsPrec d (NUnion p n s) = 939 | showUnion d p (n' ++ s') 940 | where 941 | n' = fmap Some1 (Set.toList n) 942 | s' = fmap Some1 (Set.toList s) 943 | 944 | showsPrec d (NUnionWithOne p s) = 945 | showParen (d > unionWithOnePrec) $ 946 | showUnion unionWithOnePrec p s' . showString "?" 947 | where 948 | unionWithOnePrec = 8 949 | s' = fmap Some1 (Set.toList s) 950 | 951 | showsPrec d (NSeq l) = 952 | showParen (d > seqPrec) $ 953 | intercalate (showString "#") (toList l) 954 | where 955 | seqPrec = 7 956 | 957 | toList :: NSeq c n -> [ShowS] 958 | toList NSeqNil = 959 | [] 960 | toList (NSeqConsNullable _ h t) = 961 | showsPrec seqPrec h : toList t 962 | toList (NSeqConsStrict h t) = 963 | showsPrec seqPrec h : toList t 964 | 965 | showsPrec d (NStar r) = 966 | showParen (d > starPrec) $ 967 | showsPrec starPrec r . showString "*" 968 | where 969 | starPrec = 8 970 | 971 | 972 | instance (GSet c, Show (CharacterClass c)) => Show (SubUnion c isNullable) where 973 | showsPrec d (SubUnion r) = 974 | showsPrec d r 975 | 976 | 977 | instance (GSet c, Show (CharacterClass c)) => Show (SubSeq c isNullable) where 978 | showsPrec d (SubSeq r) = 979 | showsPrec d r 980 | 981 | 982 | -- | Behavior common to showing 'NUnion' and 'NUnionWithOne'. 983 | showUnion :: (GSet c, Show (CharacterClass c)) 984 | => Int 985 | -> CharacterClass c 986 | -> [Some1 (SubUnion c)] 987 | -> ShowS 988 | showUnion d p l = 989 | showParen (d > unionPrec && numElements > 1) $ 990 | intercalate (showString " + ") (literal ++ map showSub l) 991 | where 992 | unionPrec = 6 993 | 994 | prec = 995 | if numElements > 1 then unionPrec else d 996 | 997 | showSub (Some1 r) = 998 | showsPrec prec r 999 | 1000 | literal = 1001 | if p == zero then 1002 | [] 1003 | else 1004 | [showsPrec prec p] 1005 | 1006 | numElements = length literal + length l 1007 | 1008 | 1009 | 1010 | -- | Combine a list of string builders using another as a separator. 1011 | intercalate :: ShowS -> [ShowS] -> ShowS 1012 | intercalate sep l = 1013 | foldr (.) 
id (List.intersperse sep l) 1014 | 1015 | 1016 | 1017 | -- * Parsing 1018 | 1019 | instance (GSet c, Read (CharacterClass c)) => Read (RegExp c) where 1020 | readsPrec _ = 1021 | Parser.readP_to_S parser 1022 | 1023 | where 1024 | parser :: Parser.ReadP (RegExp c) 1025 | parser = do 1026 | Parser.skipSpaces 1027 | pPlus 1028 | 1029 | pPlus :: Parser.ReadP (RegExp c) 1030 | pPlus = 1031 | Parser.choice 1032 | [ do 1033 | left <- pTimes 1034 | Parser.char '+' 1035 | Parser.skipSpaces 1036 | right <- pPlus 1037 | return $ left `rPlus` right 1038 | 1039 | , pTimes 1040 | ] 1041 | 1042 | pTimes :: Parser.ReadP (RegExp c) 1043 | pTimes = 1044 | Parser.choice 1045 | [ do 1046 | left <- pStar 1047 | Parser.char '#' 1048 | Parser.skipSpaces 1049 | right <- pTimes 1050 | return $ left `rTimes` right 1051 | 1052 | , pStar 1053 | ] 1054 | 1055 | pStar :: Parser.ReadP (RegExp c) 1056 | pStar = do 1057 | atom <- pAtom 1058 | ops <- Parser.many postfix 1059 | return $ foldr ($) atom ops 1060 | 1061 | pAtom :: Parser.ReadP (RegExp c) 1062 | pAtom = 1063 | Parser.choice 1064 | [ do 1065 | p <- Parser.readS_to_P reads 1066 | Parser.skipSpaces 1067 | return $ rLiteral p 1068 | 1069 | , do 1070 | Parser.string "<>" 1071 | Parser.skipSpaces 1072 | return rOne 1073 | 1074 | , do 1075 | Parser.char '(' 1076 | Parser.skipSpaces 1077 | r <- pPlus 1078 | Parser.char ')' 1079 | Parser.skipSpaces 1080 | return r 1081 | ] 1082 | 1083 | postfix :: Parser.ReadP (RegExp c -> RegExp c) 1084 | postfix = 1085 | Parser.choice 1086 | [ do 1087 | Parser.char '?' 1088 | Parser.skipSpaces 1089 | return $ \r -> rOne `rPlus` r 1090 | 1091 | , do 1092 | Parser.char '*' 1093 | Parser.skipSpaces 1094 | return rStar 1095 | ] 1096 | 1097 | 1098 | 1099 | -- * Helpers 1100 | 1101 | -- | Existentially quantify a single boolean argument. 1102 | data Some1 (f :: Bool -> *) where 1103 | Some1 :: !(f b) -> Some1 f 1104 | 1105 | 1106 | -- | Useful for defining views that look like lists. 1107 | data ListView c e 1108 | = Nil 1109 | | e ::: c 1110 | 1111 | 1112 | 1113 | -- * Testing 1114 | 1115 | -- | For testing with QuickCheck. 
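-- A property over this instance might look like the following sketch
-- (it assumes an 'Arbitrary' instance for the chosen character class):
--
-- > quickCheck $ \(r :: RegExp Char) -> r `rPlus` r == r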
1116 | instance (GSet c, Arbitrary (CharacterClass c)) => Arbitrary (RegExp c) where 1117 | arbitrary = do 1118 | size <- getSize 1119 | if size <= 1 then 1120 | oneof [zero, one, literal] 1121 | else 1122 | oneof [plus, times, star] 1123 | 1124 | where 1125 | zero :: Gen (RegExp c) 1126 | zero = 1127 | return rZero 1128 | 1129 | one :: Gen (RegExp c) 1130 | one = 1131 | return rOne 1132 | 1133 | plus :: Gen (RegExp c) 1134 | plus = do 1135 | l <- subtree 1136 | r <- subtree 1137 | return (l `rPlus` r) 1138 | 1139 | times :: Gen (RegExp c) 1140 | times = do 1141 | l <- subtree 1142 | r <- subtree 1143 | return (l `rTimes` r) 1144 | 1145 | star :: Gen (RegExp c) 1146 | star = do 1147 | r <- subtree 1148 | return (rStar r) 1149 | 1150 | literal :: Gen (RegExp c) 1151 | literal = do 1152 | p <- resize 5 arbitrary 1153 | return (rLiteral p) 1154 | 1155 | -- | Decrement the size parameter before generating 1156 | -- a regular expression 1157 | subtree :: Gen (RegExp c) 1158 | subtree = do 1159 | size <- getSize 1160 | newSize <- choose (0, size - 1) 1161 | resize newSize arbitrary 1162 | 1163 | shrink r = 1164 | case view r of 1165 | One -> 1166 | [] 1167 | Plus r1 r2 -> 1168 | concat [ 1169 | [r1, r2], 1170 | [s1 `rPlus` r2 | s1 <- shrink r1], 1171 | [r1 `rPlus` s2 | s2 <- shrink r2] 1172 | ] 1173 | Times r1 r2 -> 1174 | concat [ 1175 | [r1, r2], 1176 | [s1 `rTimes` r2 | s1 <- shrink r1], 1177 | [r1 `rTimes` s2 | s2 <- shrink r2] 1178 | ] 1179 | Star r -> 1180 | r : [rStar s | s <- shrink r] 1181 | Literal p -> 1182 | [rLiteral s | s <- shrink p] 1183 | 1184 | 1185 | -- | For testing with SmallCheck. 1186 | instance (GSet c, Monad m, Serial m (CharacterClass c)) => Serial m (RegExp c) where 1187 | series = 1188 | cons0 rZero \/ 1189 | cons0 rOne \/ 1190 | cons2 rPlus \/ 1191 | cons2 rTimes \/ 1192 | cons1 rStar \/ 1193 | cons1 rLiteral --------------------------------------------------------------------------------