├── .github └── workflows │ └── haskell.yml ├── .gitignore ├── .travis.yml ├── ChangeLog.md ├── LICENSE ├── README.md ├── Setup.hs ├── bench ├── Bench.hs └── data │ ├── bible.txt │ ├── big.txt │ └── small-bible.txt ├── examples └── LogParser.hs ├── lens-regex-pcre.cabal ├── mac-build.sh ├── package.yaml ├── src └── Control │ └── Lens │ └── Regex │ ├── ByteString.hs │ └── Text.hs ├── stack.yaml ├── stack.yaml.lock └── test ├── ByteString.hs ├── Spec.hs └── Text.hs /.github/workflows/haskell.yml: -------------------------------------------------------------------------------- 1 | name: Haskell CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - uses: actions/setup-haskell@v1 13 | with: 14 | ghc-version: '8.6.5' 15 | cabal-version: '3.0' 16 | - name: Install dependencies 17 | run: | 18 | cabal update 19 | cabal new-build --only-dependencies 20 | - name: Build 21 | run: | 22 | cabal new-configure --enable-tests 23 | cabal new-build 24 | - name: Run tests 25 | run: cabal new-test 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .stack-work/ 2 | *~ 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: haskell 2 | dist: trusty 3 | 4 | cache: 5 | directories: 6 | - $HOME/.cabal/store 7 | 8 | cabal: "2.4" 9 | 10 | matrix: 11 | include: 12 | - ghc: "8.6.5" 13 | - ghc: "8.4.4" 14 | 15 | install: 16 | - cabal --version 17 | - ghc --version 18 | 19 | script: 20 | - cabal v2-update 21 | - cabal v2-build 22 | - cabal v2-test --enable-test 23 | - cabal new-haddock 24 | - cabal check 25 | - cabal sdist # tests that a source-distribution can be generated 26 | 27 | # Check that the resulting source distribution can be built & installed. 28 | # If there are no other `.tar.gz` files in `dist`, this can be even simpler: 29 | # `cabal install --force-reinstalls dist/*-*.tar.gz` 30 | # - SRC_TGZ=$(cabal info . | awk '{print $2;exit}').tar.gz && 31 | # (cd dist && cabal install --force-reinstalls "$SRC_TGZ") 32 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | # Changelog for lens-regex-pcre 2 | 3 | # 1.0.1.0 4 | 5 | ### *BREAKING CHANGES* 6 | This release fixes a pretty major bugs surrounding the behaviour of optional groups. It's **unlikely** but still possible that the change to grouping behaviour has changed the behaviour of your application, but most likely it just fixed some bugs you didn't know you had yet... 7 | 8 | - Handle optional or alternated groups like `pcre-heavy`. This may change group behaviour on regular expressions which had groups with optional groups. E.g.: 9 | - `A(x)?(B)` 10 | - `(A)|(B)|(C)` 11 | - Switch `groups` from `IndexedTraversal'` to `IndexedLens'`. Since all lenses are valid traversals this shouldn't cause any breakages. 12 | - Add `namedGroups` and `namedGroup` 13 | 14 | # 1.0.0.0 15 | - Add `regexing` and `makeRegexTraversalQQ` 16 | - Replace `regex` traversal maker with `regex` QuasiQuoter 17 | - Split Control.Lens.Regex into Control.Lens.Regex.Text and Control.Lens.Regex.ByteString 18 | - Move regexBS to `Control.Lens.Regex.ByteString.regex` 19 | - Change whole implementation to use ByteString Builders for a massive speedup 20 | - Monomorphise `Match text` -> `Match` 21 | - Add groups to index of `match` and match to index of `groups` & `group` 22 | - Add `group = groups . ix n` for accessing a single group. 23 | 24 | # 0.3.1.0 25 | - Match -> Match text 26 | - Added regexBS to run regex on ByteStrings directly 27 | 28 | # 0.3.0.0 29 | - Unify `iregex` into `regex` as a single indexed traversal 30 | 31 | # 0.2.0.0 32 | - Unify `grouped`, `groups`, and `igroups` into just `groups` with optional traversal 33 | 34 | # 0.1.1.0 35 | - Adds `grouped` and `matchAndGroups` 36 | 37 | # 0.1.0.1 38 | - Doc fixes 39 | 40 | # 0.1.0.0 41 | - Initial Release 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Author name here (c) 2019 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Author name here nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lens-regex-pcre 2 | 3 | [Hackage and Docs](http://hackage.haskell.org/package/lens-regex-pcre) 4 | 5 | Based on `pcre-heavy`; so it should support any regexes or options which it supports. 6 | 7 | Performance is [equal, sometimes **better**](https://github.com/ChrisPenner/lens-regex-pcre#performance) than that of `pcre-heavy` alone. 8 | 9 | Which module should you use? 10 | 11 | If you need unicode support, use `Control.Lens.Regex.Text`, if not then `Control.Lens.Regex.ByteString` is faster. 12 | 13 | Working with Regexes in Haskell kinda sucks; it's tough to figure out which libs 14 | to use, and even after you pick one it's tough to figure out how to use it; `lens-regex-pcre` hopes to replace most other solutions by being fast, easy to set up, more adaptable with a more consistent interface. 15 | 16 | It helps that there are already HUNDREDS of combinators which interop with lenses :smile:. 17 | 18 | As it turns out; regexes are a very lens-like tool; Traversals allow you to select 19 | and alter zero or more matches; traversals can even carry indexes so you know which match or group you're working 20 | on. 21 | 22 | # Examples 23 | 24 | ```haskell 25 | import Control.Lens.Regex.Text 26 | 27 | txt :: Text 28 | txt = "raindrops on roses and whiskers on kittens" 29 | 30 | -- Search 31 | >>> has [regex|whisk|] txt 32 | True 33 | 34 | -- Get matches 35 | >>> txt ^.. [regex|\br\w+|] . match 36 | ["raindrops","roses"] 37 | 38 | -- Edit matches 39 | >>> txt & [regex|\br\w+|] . match %~ T.intersperse '-' . T.toUpper 40 | "R-A-I-N-D-R-O-P-S on R-O-S-E-S and whiskers on kittens" 41 | 42 | -- Get Groups 43 | >>> txt ^.. [regex|(\w+) on (\w+)|] . groups 44 | [["raindrops","roses"],["whiskers","kittens"]] 45 | 46 | -- Edit Groups 47 | >>> txt & [regex|(\w+) on (\w+)|] . groups %~ reverse 48 | "roses on raindrops and kittens on whiskers" 49 | 50 | -- Get the third match 51 | >>> txt ^? [regex|\w+|] . index 2 . match 52 | Just "roses" 53 | 54 | -- Match integers, 'Read' them into ints, then sort them in-place 55 | -- dumping them back into the source text afterwards. 56 | >>> "Monday: 29, Tuesday: 99, Wednesday: 3" 57 | & partsOf ([regex|\d+|] . match . unpacked . _Show @Int) %~ sort 58 | "Monday: 3, Tuesday: 29, Wednesday: 99" 59 | 60 | ``` 61 | 62 | Basically anything you want to do is possible somehow. 63 | 64 | # Performance 65 | 66 | See the [benchmarks](https://github.com/ChrisPenner/lens-regex-pcre/blob/master/bench/Bench.hs). 67 | 68 | ## Summary 69 | 70 | Caveat: I'm by no means a benchmarking expert; if you have tips on how to do this better I'm all ears! 71 | 72 | * **Search** `lens-regex-pcre` is *marginally* slower than `pcre-heavy`, but well within acceptable margins (within 0.6%) 73 | * **Replace** `lens-regex-pcre` beats `pcre-heavy` by ~10% 74 | * **Modify** `pcre-heavy` doesn't support this operation at all, so I guess `lens-regex-pcre` wins here :) 75 | 76 | How can it possibly be **faster** if it's based on `pcre-heavy`? `lens-regex-pcre` only uses `pcre-heavy` for **finding** the matches, not substitution/replacement. After that it splits the text into chunks and traverses over them with whichever operation you've chosen. The nature of this implementation makes it a lot easier to understand than imperative implementations of the same thing. This means it's pretty easy to make edits, and is also the reason we can support arbitrary traversals/actions. It was easy enough, so I went ahead and made the whole thing use ByteString Builders, which sped it up a lot. I suspect that `pcre-heavy` can benefit from the same optimization if anyone feels like back-porting it; it could be (almost) as nicely using simple `traverse` without any lenses. The whole thing is only about 25 LOC. 77 | 78 | I'm neither a benchmarks nor stats person, so please open an issue if anything here seems fishy. 79 | 80 | Without `pcre-light` and `pcre-heavy` this library wouldn't be possible, so huge thanks to all contributors! 81 | 82 | Here are the benchmarks on my 2013 Macbook (2.6 Ghz i5) 83 | 84 | ```haskell 85 | benchmarking static pattern search/pcre-heavy ... took 20.78 s, total 56 iterations 86 | benchmarked static pattern search/pcre-heavy 87 | time 375.3 ms (372.0 ms .. 378.5 ms) 88 | 1.000 R² (0.999 R² .. 1.000 R²) 89 | mean 378.1 ms (376.4 ms .. 380.8 ms) 90 | std dev 3.747 ms (922.3 μs .. 5.609 ms) 91 | 92 | benchmarking static pattern search/lens-regex-pcre ... took 20.79 s, total 56 iterations 93 | benchmarked static pattern search/lens-regex-pcre 94 | time 379.5 ms (376.2 ms .. 382.4 ms) 95 | 1.000 R² (1.000 R² .. 1.000 R²) 96 | mean 377.3 ms (376.5 ms .. 378.4 ms) 97 | std dev 1.667 ms (1.075 ms .. 2.461 ms) 98 | 99 | benchmarking complex pattern search/pcre-heavy ... took 95.95 s, total 56 iterations 100 | benchmarked complex pattern search/pcre-heavy 101 | time 1.741 s (1.737 s .. 1.746 s) 102 | 1.000 R² (1.000 R² .. 1.000 R²) 103 | mean 1.746 s (1.744 s .. 1.749 s) 104 | std dev 4.499 ms (3.186 ms .. 6.080 ms) 105 | 106 | benchmarking complex pattern search/lens-regex-pcre ... took 97.26 s, total 56 iterations 107 | benchmarked complex pattern search/lens-regex-pcre 108 | time 1.809 s (1.736 s .. 1.908 s) 109 | 0.996 R² (0.991 R² .. 1.000 R²) 110 | mean 1.757 s (1.742 s .. 1.810 s) 111 | std dev 42.83 ms (11.51 ms .. 70.69 ms) 112 | 113 | benchmarking simple replacement/pcre-heavy ... took 23.32 s, total 56 iterations 114 | benchmarked simple replacement/pcre-heavy 115 | time 423.8 ms (422.4 ms .. 425.3 ms) 116 | 1.000 R² (1.000 R² .. 1.000 R²) 117 | mean 424.0 ms (422.9 ms .. 426.2 ms) 118 | std dev 2.684 ms (1.239 ms .. 4.270 ms) 119 | 120 | benchmarking simple replacement/lens-regex-pcre ... took 20.84 s, total 56 iterations 121 | benchmarked simple replacement/lens-regex-pcre 122 | time 382.8 ms (374.3 ms .. 391.5 ms) 123 | 0.999 R² (0.999 R² .. 1.000 R²) 124 | mean 378.2 ms (376.3 ms .. 381.0 ms) 125 | std dev 3.794 ms (2.577 ms .. 5.418 ms) 126 | 127 | benchmarking complex replacement/pcre-heavy ... took 24.77 s, total 56 iterations 128 | benchmarked complex replacement/pcre-heavy 129 | time 448.1 ms (444.7 ms .. 450.0 ms) 130 | 1.000 R² (1.000 R² .. 1.000 R²) 131 | mean 450.8 ms (449.5 ms .. 453.9 ms) 132 | std dev 3.129 ms (947.0 μs .. 4.841 ms) 133 | 134 | benchmarking complex replacement/lens-regex-pcre ... took 21.99 s, total 56 iterations 135 | benchmarked complex replacement/lens-regex-pcre 136 | time 399.9 ms (398.4 ms .. 402.2 ms) 137 | 1.000 R² (1.000 R² .. 1.000 R²) 138 | mean 399.6 ms (399.0 ms .. 400.4 ms) 139 | std dev 1.135 ms (826.2 μs .. 1.604 ms) 140 | 141 | Benchmark lens-regex-pcre-bench: FINISH 142 | ``` 143 | 144 | # Behaviour 145 | 146 | Precise Expected behaviour (and examples) can be found in the test suites: 147 | 148 | * [ByteString tests](https://github.com/ChrisPenner/lens-regex-pcre/blob/master/test/ByteString.hs) 149 | * [Text tests](https://github.com/ChrisPenner/lens-regex-pcre/blob/master/test/Text.hs) 150 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /bench/Bench.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE QuasiQuotes #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | import Gauge.Benchmark 5 | import Gauge.Main 6 | import Data.ByteString as BS 7 | import qualified Text.Regex.PCRE.Heavy as PCRE 8 | import Control.Lens 9 | import Control.Lens.Regex.ByteString 10 | 11 | main :: IO () 12 | main = do 13 | srcFile <- BS.readFile "./bench/data/small-bible.txt" 14 | defaultMain 15 | [ bgroup "static pattern search" 16 | [ bench "pcre-heavy" $ nf (heavySearch [PCRE.re|Moses|]) srcFile 17 | , bench "lens-regex-pcre" $ nf (lensSearch [PCRE.re|Moses|]) srcFile 18 | ] 19 | , bgroup "complex pattern search" 20 | [ bench "pcre-heavy" $ nf (heavySearch [PCRE.re|l\w+e|]) srcFile 21 | , bench "lens-regex-pcre" $ nf (lensSearch [PCRE.re|l\w+e|]) srcFile 22 | ] 23 | , bgroup "simple replacement" 24 | [ bench "pcre-heavy" $ nf (heavyReplace [PCRE.re|Moses|] "Jarvis") srcFile 25 | , bench "lens-regex-pcre" $ nf (lensReplace [PCRE.re|Moses|] "Jarvis") srcFile 26 | ] 27 | , bgroup "complex replacement" 28 | [ bench "pcre-heavy" $ nf (heavyReplace [PCRE.re|M\w*s\w*s|] "Jarvis") srcFile 29 | , bench "lens-regex-pcre" $ nf (lensReplace [PCRE.re|M\w*s\w*s|] "Jarvis") srcFile 30 | ] 31 | ] 32 | 33 | heavySearch :: Regex -> BS.ByteString -> [BS.ByteString] 34 | heavySearch pat src = fst <$> PCRE.scan pat src 35 | 36 | lensSearch :: Regex -> BS.ByteString -> [BS.ByteString] 37 | lensSearch pat src = src ^.. regexing pat . match 38 | 39 | heavyReplace :: Regex -> BS.ByteString -> BS.ByteString -> BS.ByteString 40 | heavyReplace pat replacement src = PCRE.gsub pat replacement src 41 | 42 | lensReplace :: Regex -> BS.ByteString -> BS.ByteString -> BS.ByteString 43 | lensReplace pat replacement src = src & regexing pat . match .~ replacement 44 | -------------------------------------------------------------------------------- /examples/LogParser.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE QuasiQuotes #-} 3 | module LogParser where 4 | 5 | import Control.Lens 6 | import Control.Lens.Regex 7 | import Text.RawString.QQ 8 | import Data.Text as T 9 | import Text.Read 10 | 11 | logs :: Text 12 | logs =[r| Config-file not found 13 | No network access 14 | Waldo logged in 15 | 90% of disk space used 16 | Carmen logged out 17 | Something really bad happened|] 18 | 19 | -- define types to represent our log messages 20 | data LogLevel = Info | Warn | Error deriving (Show, Read, Eq) 21 | data LogService = Kafka | App deriving (Show, Read, Eq) 22 | data Log = 23 | Log 24 | { logLevel :: LogLevel 25 | , logService :: LogService 26 | , logText :: Text 27 | } deriving Show 28 | 29 | -- Our parser is just a prism from a list of groups to our type! 30 | _Log :: Prism' [Text] Log 31 | _Log = prism' toGroups toLog 32 | where 33 | toGroups (Log level service msgText) = [pack (show level), pack (show service), msgText] 34 | toLog [levelText, serviceText, msgText] = do 35 | Log <$> readMaybe (unpack levelText) 36 | <*> readMaybe (unpack serviceText) 37 | <*> pure msgText 38 | toLog _ = Nothing 39 | 40 | logPattern :: Regex 41 | logPattern = [rx|<([\w]+)> <(\w+)> (.*)|] 42 | 43 | -- Now you're ready to hack'n'slash! 44 | allInfos :: [Log] 45 | allInfos = logs ^.. regex logPattern . groups . _Log . filtered ((== Info) . logLevel) 46 | -- [ Log {logLevel = Info, logService = App, logText = "Waldo logged in"} 47 | -- , Log {logLevel = Info, logService = App, logText = "Carmen logged out"}] 48 | 49 | allKafkaErrorMessages :: [Text] 50 | allKafkaErrorMessages = 51 | logs 52 | ^.. regex logPattern 53 | . groups 54 | . _Log 55 | . filtered ((== Error) . logLevel) 56 | . filtered ((== Kafka) . logService) 57 | . to logText 58 | -- ["No network access","Something really bad happened"] 59 | 60 | downGradeErrors :: Text 61 | downGradeErrors = 62 | logs & regex logPattern . groups . _Log . filtered ((== Error) . logLevel) %~ toWarning 63 | where 64 | toWarning (Log _level service txt) = Log Warn service (T.toUpper txt) 65 | -- CONFIG-FILE NOT FOUND 66 | -- NO NETWORK ACCESS 67 | -- Waldo logged in 68 | -- 90% of disk space used 69 | -- Carmen logged out 70 | -- SOMETHING REALLY BAD HAPPENED 71 | -------------------------------------------------------------------------------- /lens-regex-pcre.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 1.12 2 | 3 | -- This file has been generated from package.yaml by hpack version 0.37.0. 4 | -- 5 | -- see: https://github.com/sol/hpack 6 | -- 7 | -- hash: 6ab2b296761109d7cc7b2a86dd4ca15b111214086ceff217b38b72b7de9e5629 8 | 9 | name: lens-regex-pcre 10 | version: 1.1.1.0 11 | synopsis: A lensy interface to regular expressions 12 | description: Please see the README on GitHub at 13 | category: Regex 14 | homepage: https://github.com/ChrisPenner/lens-regex-pcre#readme 15 | bug-reports: https://github.com/ChrisPenner/lens-regex-pcre/issues 16 | author: Chris Penner 17 | maintainer: christopher.penner@gmail.com 18 | copyright: 2019 Chris Penner 19 | license: BSD3 20 | license-file: LICENSE 21 | build-type: Simple 22 | extra-source-files: 23 | README.md 24 | ChangeLog.md 25 | 26 | source-repository head 27 | type: git 28 | location: https://github.com/ChrisPenner/lens-regex-pcre 29 | 30 | library 31 | exposed-modules: 32 | Control.Lens.Regex.ByteString 33 | Control.Lens.Regex.Text 34 | other-modules: 35 | Paths_lens_regex_pcre 36 | hs-source-dirs: 37 | src 38 | ghc-options: -Wall 39 | build-depends: 40 | base >=4.7 && <5 41 | , bytestring 42 | , containers 43 | , lens 44 | , pcre-heavy 45 | , pcre-light >=0.4.1.0 46 | , template-haskell 47 | , text 48 | default-language: Haskell2010 49 | 50 | test-suite lens-regex-pcre-test 51 | type: exitcode-stdio-1.0 52 | main-is: Spec.hs 53 | other-modules: 54 | ByteString 55 | Text 56 | Paths_lens_regex_pcre 57 | hs-source-dirs: 58 | test 59 | ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N 60 | build-depends: 61 | base >=4.7 && <5 62 | , bytestring 63 | , containers 64 | , hspec 65 | , lens 66 | , lens-regex-pcre 67 | , pcre-heavy 68 | , pcre-light >=0.4.1.0 69 | , template-haskell 70 | , text 71 | default-language: Haskell2010 72 | -------------------------------------------------------------------------------- /mac-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensures we have all the correct C-libs in scope 4 | stack build --fast --extra-include-dirs "$(brew --prefix)/include" --extra-lib-dirs "$(brew --prefix)/lib" 5 | -------------------------------------------------------------------------------- /package.yaml: -------------------------------------------------------------------------------- 1 | name: lens-regex-pcre 2 | version: 1.1.2.0 3 | github: "ChrisPenner/lens-regex-pcre" 4 | license: BSD3 5 | author: "Chris Penner" 6 | maintainer: "christopher.penner@gmail.com" 7 | copyright: "2019 Chris Penner" 8 | 9 | extra-source-files: 10 | - README.md 11 | - ChangeLog.md 12 | 13 | # Metadata used when publishing your package 14 | synopsis: A lensy interface to regular expressions 15 | category: Regex 16 | 17 | # To avoid duplicated efforts in documentation and dealing with the 18 | # complications of embedding Haddock markup inside cabal files, it is 19 | # common to point users to the README.md file. 20 | description: Please see the README on GitHub at 21 | 22 | dependencies: 23 | - base >= 4.7 && < 5 24 | - pcre-heavy 25 | - pcre-light >= 0.4.1.0 26 | - text 27 | - bytestring 28 | - lens 29 | - template-haskell 30 | - containers 31 | 32 | ghc-options: 33 | - -Wall 34 | library: 35 | source-dirs: src 36 | 37 | tests: 38 | lens-regex-pcre-test: 39 | main: Spec.hs 40 | source-dirs: test 41 | ghc-options: 42 | - -threaded 43 | - -rtsopts 44 | - -with-rtsopts=-N 45 | dependencies: 46 | - lens-regex-pcre 47 | - hspec 48 | 49 | # benchmarks: 50 | # lens-regex-pcre-bench: 51 | # main: Bench.hs 52 | # source-dirs: bench 53 | # ghc-options: 54 | # - -threaded 55 | # - -rtsopts 56 | # - -with-rtsopts=-N 57 | # dependencies: 58 | # - lens-regex-pcre 59 | # - gauge 60 | -------------------------------------------------------------------------------- /src/Control/Lens/Regex/ByteString.hs: -------------------------------------------------------------------------------- 1 | {-| 2 | Module : Control.Lens.Regex.ByteString 3 | Description : ByteString PCRE Regex library with a lensy interface. 4 | Copyright : (c) Chris Penner, 2019 5 | License : BSD3 6 | -} 7 | 8 | {-# LANGUAGE FlexibleContexts #-} 9 | {-# LANGUAGE OverloadedStrings #-} 10 | {-# LANGUAGE PartialTypeSignatures #-} 11 | {-# LANGUAGE QuasiQuotes #-} 12 | {-# LANGUAGE RankNTypes #-} 13 | {-# LANGUAGE TemplateHaskell #-} 14 | {-# LANGUAGE DataKinds #-} 15 | {-# LANGUAGE UndecidableInstances #-} 16 | {-# LANGUAGE TypeOperators #-} 17 | {-# LANGUAGE TupleSections #-} 18 | {-# LANGUAGE DerivingStrategies #-} 19 | 20 | module Control.Lens.Regex.ByteString 21 | ( 22 | -- * Basics 23 | regex 24 | , match 25 | , groups 26 | , group 27 | , namedGroups 28 | , namedGroup 29 | , matchAndGroups 30 | 31 | -- * Compiling regexes to Traversals 32 | , regexing 33 | , mkRegexTraversalQQ 34 | 35 | -- * Types 36 | , Match 37 | , PCRE.Regex 38 | ) where 39 | 40 | import qualified Data.ByteString as BS 41 | import qualified Data.ByteString.Lazy as BL 42 | import qualified Data.ByteString.Builder as BS 43 | import qualified Text.Regex.PCRE.Heavy as PCRE 44 | import qualified Text.Regex.PCRE.Light as PCRE 45 | import Control.Lens hiding (re) 46 | import Data.Bifunctor 47 | import qualified Language.Haskell.TH.Quote as TH 48 | import qualified Language.Haskell.TH.Syntax as TH 49 | import qualified Language.Haskell.TH as TH 50 | import GHC.TypeLits 51 | import qualified Data.Map as M 52 | import Data.Tuple (swap) 53 | 54 | -- $setup 55 | -- >>> :set -XQuasiQuotes 56 | -- >>> :set -XOverloadedStrings 57 | -- >>> :set -XTypeApplications 58 | -- >>> import qualified Data.ByteString.Char8 as Char8 59 | -- >>> import Data.Char 60 | -- >>> import Data.List hiding (group) 61 | -- >>> import Data.ByteString.Lens 62 | 63 | type MatchRange = (Int, Int) 64 | type GroupRanges = [(Int, Int)] 65 | 66 | unBuilder :: BS.Builder -> BS.ByteString 67 | unBuilder = BL.toStrict . BS.toLazyByteString 68 | 69 | building :: Iso' BS.Builder BS.ByteString 70 | building = iso unBuilder BS.byteString 71 | 72 | 73 | -- | Match represents an opaque regex match. 74 | -- You can drill into it using 'match', 'groups', 'group', 'namedGroup', 'namedGroups' or 'matchAndGroups' 75 | data Match = 76 | Match { _chunks :: [Either BS.Builder BS.Builder] 77 | , _matchRegex :: PCRE.Regex 78 | } 79 | 80 | instance Eq Match where 81 | a == b = (_matchRegex a == _matchRegex b) 82 | && (((bimap unBuilder unBuilder) <$> _chunks a) == ((bimap unBuilder unBuilder) <$> _chunks b)) 83 | 84 | instance Ord Match where 85 | compare a b = compare (_matchRegex a) (_matchRegex b) 86 | <> compare ((bimap unBuilder unBuilder) <$> _chunks a) ((bimap unBuilder unBuilder) <$> _chunks b) 87 | 88 | 89 | 90 | makeLensesFor [("_chunks", "chunks")] ''Match 91 | 92 | instance TypeError 93 | ('Text "You're trying to 'show' a raw 'Match' object." 94 | ':$$: 'Text "You likely missed adding a 'match' or 'groups' or 'group' call after your 'regex' call :)") 95 | => Show Match where 96 | show _ = "This is a raw Match object, did you miss a 'match' or 'groups' or 'group' call after your 'regex'?" 97 | 98 | -- | Access all groups of a match as a list. Stashes the full match text as the index in case 99 | -- you need it. 100 | -- 101 | -- Changing the length of the list has behaviour similar to 'partsOf'. 102 | -- 103 | -- Get all matched groups: 104 | -- 105 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups 106 | -- [["raindrops","roses"],["whiskers","kittens"]] 107 | -- 108 | -- You can access a specific group combining with 'ix', or just use 'group' instead 109 | -- 110 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups . ix 1 111 | -- ["roses","kittens"] 112 | -- 113 | -- Editing groups: 114 | -- 115 | -- >>> "raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups . ix 1 %~ Char8.map toUpper 116 | -- "raindrops on ROSES and whiskers on KITTENS" 117 | -- 118 | -- Editing the list rearranges groups 119 | -- 120 | -- >>> "raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups %~ reverse 121 | -- "roses on raindrops and kittens on whiskers" 122 | -- 123 | -- You can traverse the list to flatten out all groups 124 | -- 125 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups . traversed 126 | -- ["raindrops","roses","whiskers","kittens"] 127 | -- 128 | -- Use indexed helpers to access the full match when operating on a group. 129 | -- 130 | -- This replaces each group with the full match text wrapped in parens: 131 | -- 132 | -- >>> "one-two" & [regex|(\w+)-(\w+)|] . groups <. traversed %@~ \mtch grp -> grp <> ":(" <> mtch <> ")" 133 | -- "one:(one-two)-two:(one-two)" 134 | groups :: IndexedLens' BS.ByteString Match [BS.ByteString] 135 | groups = conjoined groupsT (reindexed (view match) selfIndex <. groupsT) 136 | where 137 | groupsT :: Lens' Match [BS.ByteString] 138 | groupsT = chunks . partsOf (traversed . _Right . building) 139 | 140 | -- | Access a specific group of a match. Numbering starts at 0. 141 | -- 142 | -- Stashes the full match text as the index in case you need it. 143 | -- 144 | -- See 'groups' for more info on grouping 145 | -- 146 | -- >>> "key:value, a:b" ^.. [regex|(\w+):(\w+)|] . group 0 147 | -- ["key","a"] 148 | -- 149 | -- >>> "key:value, a:b" ^.. [regex|(\w+):(\w+)|] . group 1 150 | -- ["value","b"] 151 | -- 152 | -- >>> "key:value, a:b" & [regex|(\w+):(\w+)|] . group 1 %~ Char8.map toUpper 153 | -- "key:VALUE, a:B" 154 | -- 155 | -- Replace the first capture group with the full match: 156 | -- 157 | -- >>> "a, b" & [regex|(\w+), (\w+)|] . group 0 .@~ \i -> "(" <> i <> ")" 158 | -- "(a, b), b" 159 | group :: Int -> IndexedTraversal' BS.ByteString Match BS.ByteString 160 | group n = groups <. ix n 161 | 162 | -- | Access all the named groups of a match as a 'M.Map'. Stashes the full match text as the index in case 163 | -- you need it. 164 | -- 165 | -- Note that you can edit the groups through this lens, but the behaviour is undefined when editing inner elements of __nested__ groups. 166 | -- Behaviour is undefined if groups are removed from the map (so don't do that). 167 | -- 168 | -- NOTE: There's currently some strange behaviour in pcre-heavy where trailing unmatched optional groups are omitted, I'm looking into getting that patched, but for now, note the difference in behaviour: 169 | -- 170 | -- >>> "A" ^? [regex|(?A)|(?B)|] . namedGroups 171 | -- Just (fromList [("a","A")]) 172 | -- 173 | -- >>> "B" ^? [regex|(?A)|(?B)|] . namedGroups 174 | -- Just (fromList [("a",""),("b","B")]) 175 | -- 176 | -- Get all matched groups: 177 | -- 178 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups 179 | -- [fromList [("first","raindrops"),("second","roses")],fromList [("first","whiskers"),("second","kittens")]] 180 | -- 181 | -- You can access a specific group combining with 'ix', or just use 'namedGroup' instead 182 | -- 183 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups . ix "second" 184 | -- ["roses","kittens"] 185 | -- 186 | -- Editing groups: 187 | -- 188 | -- >>> "raindrops on roses and whiskers on kittens" & [regex|(?\w+) on (?\w+)|] . namedGroups . ix "second" %~ Char8.map toUpper 189 | -- "raindrops on ROSES and whiskers on KITTENS" 190 | -- 191 | -- Use indexed helpers to access the full match when operating on a group. 192 | -- 193 | -- This replaces the "first" group with the full match text wrapped in parens: 194 | -- 195 | -- >>> "one-two" & [regex|(?\w+)-(\w+)|] . namedGroups <. ix "first" %@~ \mtch grp -> grp <> ":(" <> mtch <> ")" 196 | -- "one:(one-two)-two" 197 | namedGroups :: IndexedLens' BS.ByteString Match (M.Map BS.ByteString BS.ByteString) 198 | namedGroups = conjoined stepOne (reindexed (view match) selfIndex <. stepOne) 199 | where 200 | -- stepOne :: Traversal' Match (M.Map BS.ByteString BS.ByteString) 201 | stepOne :: Lens' Match (M.Map BS.ByteString BS.ByteString) 202 | stepOne f m = m & (groups . zipT . converterT (_matchRegex m) . partsOf (traversed . _Right) . mapL) %%~ f 203 | zipT :: Iso' [a] [(Int, a)] 204 | zipT = iso (zip [0..]) (fmap snd) 205 | converterT :: PCRE.Regex -> Lens' [(Int, BS.ByteString)] [Either (Int, BS.ByteString) (BS.ByteString, BS.ByteString)] 206 | converterT pattern f xs = 207 | f (converter pattern xs) <&> itraversed %@~ \i l -> either id ((i,) . snd) l 208 | converter :: PCRE.Regex -> [(Int, BS.ByteString)] -> [Either (Int, BS.ByteString) (BS.ByteString, BS.ByteString)] 209 | converter pattern = fmap $ \(i, s) -> 210 | case M.lookup i (names pattern) of 211 | Nothing -> Left (i, s) 212 | Just n -> Right (n, s) 213 | mapL :: Lens' [(BS.ByteString, BS.ByteString)] (M.Map BS.ByteString BS.ByteString) 214 | mapL = lens M.fromList setter 215 | where 216 | setter :: [(BS.ByteString, BS.ByteString)] -> M.Map BS.ByteString BS.ByteString -> [(BS.ByteString, BS.ByteString)] 217 | setter xs m = xs <&> \(k, _) -> (k, M.findWithDefault "" k m) 218 | names :: PCRE.Regex -> M.Map Int BS.ByteString 219 | names pattern = M.fromList . fmap swap $ PCRE.captureNames pattern 220 | 221 | -- | Access a specific named group of a match 222 | -- 223 | -- See 'namedGroups' for caveats and more info. 224 | -- 225 | -- Stashes the full match text as the index in case you need it. 226 | -- 227 | -- >>> "key:value, a:b" ^.. [regex|(?\w+):(?\w+)|] . namedGroup "first" 228 | -- ["key","a"] 229 | -- 230 | -- >>> "key:value, a:b" ^.. [regex|(?\w+):(?\w+)|] . namedGroup "second" 231 | -- ["value","b"] 232 | -- 233 | -- >>> "key:value, a:b" & [regex|(?\w+):(?\w+)|] . namedGroup "second" %~ Char8.map toUpper 234 | -- "key:VALUE, a:B" 235 | -- 236 | -- Replace the first capture group with the full match: 237 | -- 238 | -- >>> "a, b" & [regex|(?\w+), (?\w+)|] . namedGroup "first" .@~ \i -> "(" <> i <> ")" 239 | -- "(a, b), b" 240 | namedGroup :: BS.ByteString -> IndexedTraversal' BS.ByteString Match BS.ByteString 241 | namedGroup name = namedGroups <. ix name 242 | 243 | -- | Traverse each match 244 | -- 245 | -- Stashes any matched groups into the index in case you need them. 246 | -- 247 | -- Get a match if one exists: 248 | -- 249 | -- >>> "find a needle in a haystack" ^? [regex|n..dle|] . match 250 | -- Just "needle" 251 | -- 252 | -- Collect all matches 253 | -- 254 | -- >>> "one _two_ three _four_" ^.. [regex|_\w+_|] . match 255 | -- ["_two_","_four_"] 256 | -- 257 | -- You can edit the traversal to perform a regex replace/substitution 258 | -- 259 | -- >>> "one _two_ three _four_" & [regex|_\w+_|] . match %~ Char8.map toUpper 260 | -- "one _TWO_ three _FOUR_" 261 | -- 262 | -- Here we use the group matches stored in the index to form key-value pairs, replacing the entire match. 263 | -- 264 | -- >>> "abc-def, ghi-jkl" & [regex|(\w+)-(\w+)|] . match %@~ \[k, v] _ -> "{" <> k <> ":" <> v <> "}" 265 | -- "{abc:def}, {ghi:jkl}" 266 | match :: IndexedTraversal' [BS.ByteString] Match BS.ByteString 267 | match = conjoined matchBS (reindexed (view groups) selfIndex <. matchBS) 268 | where 269 | matchBS :: Traversal' Match BS.ByteString 270 | matchBS = chunks . matchT . building 271 | matchT :: Traversal' [Either BS.Builder BS.Builder] BS.Builder 272 | matchT f grps = 273 | (:[]) . Right <$> f (grps ^. folded . chosen) 274 | 275 | -- | Build a traversal from the provided 'PCRE.Regex', this is handy if you're QuasiQuoter 276 | -- averse, or if you already have a 'PCRE.Regex' object floating around. 277 | -- 278 | -- Also see 'mkRegexTraversalQQ' 279 | regexing :: PCRE.Regex -> IndexedTraversal' Int BS.ByteString Match 280 | regexing pattern = conjoined (regexT pattern) (indexing (regexT pattern)) . asMatch 281 | where 282 | -- Unlawful iso, but since the Regex field of Match isn't exported it's fine. 283 | asMatch :: Iso' [Either BS.Builder BS.Builder] Match 284 | asMatch = iso to' from' 285 | to' xs = Match xs pattern 286 | from' (Match xs _) = xs 287 | 288 | -- | Base regex traversal helper 289 | regexT :: PCRE.Regex -> Traversal' BS.ByteString [Either BS.Builder BS.Builder] 290 | regexT pattern f txt = unBuilder . collapseMatch <$> apply (splitAll txt matches) 291 | where 292 | matches :: [(MatchRange, GroupRanges)] 293 | matches = PCRE.scanRanges pattern txt 294 | collapseMatch :: [Either BS.Builder [Either BS.Builder BS.Builder]] -> BS.Builder 295 | collapseMatch xs = xs ^. folded . beside id (traversed . chosen) 296 | apply xs = xs & traversed . _Right %%~ f 297 | 298 | -- | Collect both the match text AND all the matching groups 299 | -- 300 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . matchAndGroups 301 | -- [("raindrops on roses",["raindrops","roses"]),("whiskers on kittens",["whiskers","kittens"])] 302 | matchAndGroups :: Getter Match (BS.ByteString, [BS.ByteString]) 303 | matchAndGroups = to $ \m -> (m ^. match, m ^. groups) 304 | 305 | -- | Builds a traversal over text using a Regex pattern 306 | -- 307 | -- It's a 'TH.QuasiQuoter' which creates a Traversal out of the given regex string. 308 | -- It's equivalent to calling 'regexing' on a 'PCRE.Regex' created using the 309 | -- 'PCRE.re' QuasiQuoter. 310 | -- 311 | -- The "real" type is: 312 | -- 313 | -- > regex :: Regex -> IndexedTraversal' Int BS.ByteString Match 314 | -- 315 | -- It's a traversal which selects 'Match'es; compose it with 'match' or 'groups' 316 | -- to get the relevant parts of your match. 317 | -- 318 | -- >>> txt = "raindrops on roses and whiskers on kittens" 319 | -- 320 | -- Search 321 | -- 322 | -- >>> has ([regex|whisk|]) txt 323 | -- True 324 | -- 325 | -- Get matches 326 | -- 327 | -- >>> txt ^.. [regex|\br\w+|] . match 328 | -- ["raindrops","roses"] 329 | -- 330 | -- Edit matches 331 | -- 332 | -- >>> txt & [regex|\br\w+|] . match %~ Char8.intersperse '-' . Char8.map toUpper 333 | -- "R-A-I-N-D-R-O-P-S on R-O-S-E-S and whiskers on kittens" 334 | -- 335 | -- Get Groups 336 | -- 337 | -- >>> txt ^.. [regex|(\w+) on (\w+)|] . groups 338 | -- [["raindrops","roses"],["whiskers","kittens"]] 339 | -- 340 | -- Edit Groups 341 | -- 342 | -- >>> txt & [regex|(\w+) on (\w+)|] . groups %~ reverse 343 | -- "roses on raindrops and kittens on whiskers" 344 | -- 345 | -- Get the third match 346 | -- 347 | -- >>> txt ^? [regex|\w+|] . index 2 . match 348 | --Just "roses" 349 | -- 350 | -- Edit matches 351 | -- 352 | -- >>> txt & [regex|\br\w+|] . match %~ Char8.intersperse '-' . Char8.map toUpper 353 | -- "R-A-I-N-D-R-O-P-S on R-O-S-E-S and whiskers on kittens" 354 | -- 355 | -- Get Groups 356 | -- 357 | -- >>> txt ^.. [regex|(\w+) on (\w+)|] . groups 358 | -- [["raindrops","roses"],["whiskers","kittens"]] 359 | -- 360 | -- Edit Groups 361 | -- 362 | -- >>> txt & [regex|(\w+) on (\w+)|] . groups %~ reverse 363 | -- "roses on raindrops and kittens on whiskers" 364 | -- 365 | -- Get the third match 366 | -- 367 | -- >>> txt ^? [regex|\w+|] . index 2 . match 368 | -- Just "roses" 369 | -- 370 | -- Match integers, 'Read' them into ints, then sort them in-place 371 | -- dumping them back into the source text afterwards. 372 | -- 373 | -- >>> "Monday: 29, Tuesday: 99, Wednesday: 3" & partsOf ([regex|\d+|] . match . from packedChars . _Show @Int) %~ sort 374 | -- "Monday: 3, Tuesday: 29, Wednesday: 99" 375 | -- 376 | -- To alter behaviour of the regex you may wish to pass 'PCRE.PCREOption's when compiling it. 377 | -- The default behaviour may seem strange in certain cases; e.g. it operates in 'single-line' 378 | -- mode. You can 'PCRE.compile' the 'PCRE.Regex' separately and add any options you like, then pass the resulting 379 | -- 'PCRE.Regex' into 'regex'; 380 | -- Alternatively can make your own version of the QuasiQuoter with any options you want embedded 381 | -- by using 'PCRE.mkRegexQQ'. 382 | regex :: TH.QuasiQuoter 383 | regex = PCRE.re{TH.quoteExp=quoter} 384 | where 385 | quoter str = do 386 | rgx <- TH.quoteExp PCRE.re str 387 | regexExpr <- TH.varE 'regexing 388 | return $ TH.AppE regexExpr rgx 389 | 390 | -- | Build a QuasiQuoter just like 'regex' but with the provided 'PCRE.PCREOption' overrides. 391 | mkRegexTraversalQQ :: [PCRE.PCREOption] -> TH.QuasiQuoter 392 | mkRegexTraversalQQ opts = (PCRE.mkRegexQQ opts){TH.quoteExp=quoter} 393 | where 394 | quoter str = do 395 | rgx <- TH.quoteExp (PCRE.mkRegexQQ opts) str 396 | regexExpr <- TH.varE 'regexing 397 | return $ TH.AppE regexExpr rgx 398 | 399 | --------------------------------------------------------------------------------------------- 400 | 401 | splitAll :: BS.ByteString -> [(MatchRange, GroupRanges)] -> [Either BS.Builder [Either BS.Builder BS.Builder]] 402 | splitAll txt matches = fmap (second (\(txt', (start,_), grps) -> groupSplit txt' start grps)) splitUp 403 | where 404 | splitUp = splits txt 0 matches 405 | 406 | groupSplit :: BS.ByteString -> Int -> GroupRanges -> [Either BS.Builder BS.Builder] 407 | groupSplit txt _ [] = [Left $ BS.byteString txt] 408 | groupSplit txt offset ((-1, -1) : rest) = Right "" : groupSplit txt offset rest 409 | groupSplit txt offset ((grpStart, grpEnd) : rest) | offset == grpStart = 410 | let (prefix, suffix) = BS.splitAt (grpEnd - offset) txt 411 | in Right (BS.byteString prefix) : groupSplit suffix grpEnd rest 412 | groupSplit txt offset ((grpStart, grpEnd) : rest) = 413 | let (prefix, suffix) = BS.splitAt (grpStart - offset) txt 414 | in Left (BS.byteString prefix) : groupSplit suffix grpStart ((grpStart, grpEnd) : rest) 415 | 416 | splits :: BS.ByteString -> Int -> [(MatchRange, GroupRanges)] -> [Either BS.Builder (BS.ByteString, MatchRange, GroupRanges)] 417 | -- No more matches left 418 | splits txt _ [] = [Left $ BS.byteString txt] 419 | -- We're positioned at a match 420 | splits txt offset (((start, end), grps) : rest) | offset == start = 421 | let (prefix, suffix) = BS.splitAt (end - offset) txt 422 | in (Right (prefix, (start, end), grps)) : splits suffix end rest 423 | -- jump to the next match 424 | splits txt offset matches@(((start, _), _) : _) = 425 | let (prefix, suffix) = BS.splitAt (start - offset) txt 426 | in (Left $ BS.byteString prefix) : splits suffix start matches 427 | -------------------------------------------------------------------------------- /src/Control/Lens/Regex/Text.hs: -------------------------------------------------------------------------------- 1 | {-| 2 | Module : Control.Lens.Regex.Text 3 | Description : Text PCRE Regex library with a lensy interface. 4 | Copyright : (c) Chris Penner, 2019 5 | License : BSD3 6 | -} 7 | 8 | {-# LANGUAGE FlexibleContexts #-} 9 | {-# LANGUAGE OverloadedStrings #-} 10 | {-# LANGUAGE PartialTypeSignatures #-} 11 | {-# LANGUAGE QuasiQuotes #-} 12 | {-# LANGUAGE RankNTypes #-} 13 | {-# LANGUAGE TemplateHaskell #-} 14 | 15 | module Control.Lens.Regex.Text 16 | ( 17 | -- * Basics 18 | regex 19 | , match 20 | , groups 21 | , group 22 | , namedGroups 23 | , namedGroup 24 | , matchAndGroups 25 | 26 | -- * Compiling regexes to Traversals 27 | , regexing 28 | , mkRegexTraversalQQ 29 | 30 | -- * Types 31 | , RBS.Match 32 | , PCRE.Regex 33 | ) where 34 | 35 | import qualified Data.Text as T 36 | import qualified Data.Text.Encoding as T 37 | import qualified Data.Text.Encoding.Error as T 38 | import qualified Data.ByteString as BS 39 | import qualified Text.Regex.PCRE.Heavy as PCRE 40 | import qualified Data.Map as M 41 | import Control.Lens hiding (re, matching) 42 | import qualified Language.Haskell.TH as TH 43 | import qualified Language.Haskell.TH.Quote as TH 44 | 45 | import qualified Control.Lens.Regex.ByteString as RBS 46 | 47 | -- $setup 48 | -- >>> :set -XQuasiQuotes 49 | -- >>> :set -XOverloadedStrings 50 | -- >>> :set -XTypeApplications 51 | -- >>> import Data.Text.Lens (unpacked) 52 | -- >>> import qualified Data.Text as T 53 | -- >>> import Data.List (sort) 54 | 55 | utf8 :: Iso' T.Text BS.ByteString 56 | utf8 = iso T.encodeUtf8 (T.decodeUtf8With T.lenientDecode) 57 | 58 | -- | Builds a traversal over text using a Regex pattern 59 | -- 60 | -- It's a 'TH.QuasiQuoter' which creates a Traversal out of the given regex string. 61 | -- It's equivalent to calling 'regexing' on a 'PCRE.Regex' created using the 62 | -- 'PCRE.re' QuasiQuoter. 63 | -- 64 | -- The "real" type is: 65 | -- 66 | -- > regex :: Regex -> IndexedTraversal' Int T.Text Match 67 | -- 68 | -- It's a traversal which selects 'RBS.Match'es; compose it with 'match' or 'groups' 69 | -- to get the relevant parts of your match. 70 | -- 71 | -- >>> txt = "raindrops on roses and whiskers on kittens" 72 | -- 73 | -- Search 74 | -- 75 | -- >>> has ([regex|whisk|]) txt 76 | -- True 77 | -- 78 | -- Get matches 79 | -- 80 | -- >>> txt ^.. [regex|\br\w+|] . match 81 | -- ["raindrops","roses"] 82 | -- 83 | -- Edit matches 84 | -- 85 | -- >>> txt & [regex|\br\w+|] . match %~ T.intersperse '-' . T.toUpper 86 | -- "R-A-I-N-D-R-O-P-S on R-O-S-E-S and whiskers on kittens" 87 | -- 88 | -- Get Groups 89 | -- 90 | -- >>> txt ^.. [regex|(\w+) on (\w+)|] . groups 91 | -- [["raindrops","roses"],["whiskers","kittens"]] 92 | -- 93 | -- Edit Groups 94 | -- 95 | -- >>> txt & [regex|(\w+) on (\w+)|] . groups %~ reverse 96 | -- "roses on raindrops and kittens on whiskers" 97 | -- 98 | -- Get the third match 99 | -- 100 | -- >>> txt ^? [regex|\w+|] . index 2 . match 101 | --Just "roses" 102 | -- 103 | -- Edit matches 104 | -- 105 | -- >>> txt & [regex|\br\w+|] . match %~ T.intersperse '-' . T.toUpper 106 | -- "R-A-I-N-D-R-O-P-S on R-O-S-E-S and whiskers on kittens" 107 | -- 108 | -- Get Groups 109 | -- 110 | -- >>> txt ^.. [regex|(\w+) on (\w+)|] . groups 111 | -- [["raindrops","roses"],["whiskers","kittens"]] 112 | -- 113 | -- Edit Groups 114 | -- 115 | -- >>> txt & [regex|(\w+) on (\w+)|] . groups %~ reverse 116 | -- "roses on raindrops and kittens on whiskers" 117 | -- 118 | -- Get the third match 119 | -- 120 | -- >>> txt ^? [regex|\w+|] . index 2 . match 121 | -- Just "roses" 122 | -- 123 | -- Match integers, 'Read' them into ints, then sort them in-place 124 | -- dumping them back into the source text afterwards. 125 | -- 126 | -- >>> "Monday: 29, Tuesday: 99, Wednesday: 3" & partsOf ([regex|\d+|] . match . unpacked . _Show @Int) %~ sort 127 | -- "Monday: 3, Tuesday: 29, Wednesday: 99" 128 | -- 129 | -- To alter behaviour of the regex you may wish to pass 'PCRE.PCREOption's when compiling it. 130 | -- The default behaviour may seem strange in certain cases; e.g. it operates in 'single-line' 131 | -- mode. You can 'PCRE.compile' the 'PCRE.Regex' separately and add any options you like, then pass the resulting 132 | -- 'PCRE.Regex' into 'regex'; 133 | -- Alternatively can make your own version of the QuasiQuoter with any options you want embedded 134 | -- by using 'PCRE.mkRegexQQ'. 135 | -- regex :: Regex -> IndexedTraversal' Int T.Text RBS.Match 136 | regex :: TH.QuasiQuoter 137 | regex = PCRE.re{TH.quoteExp=quoter} 138 | where 139 | quoter str = do 140 | rgx <- TH.quoteExp PCRE.re str 141 | regexExpr <- TH.varE 'regexing 142 | return $ TH.AppE regexExpr rgx 143 | 144 | -- | Build a QuasiQuoter just like 'regex' but with the provided 'PCRE.PCREOption' overrides. 145 | mkRegexTraversalQQ :: [PCRE.PCREOption] -> TH.QuasiQuoter 146 | mkRegexTraversalQQ opts = (PCRE.mkRegexQQ opts){TH.quoteExp=quoter} 147 | where 148 | quoter str = do 149 | rgx <- TH.quoteExp (PCRE.mkRegexQQ opts) str 150 | regexExpr <- TH.varE 'regexing 151 | return $ TH.AppE regexExpr rgx 152 | 153 | -- | Build a traversal from the provided 'PCRE.Regex', this is handy if you're QuasiQuoter 154 | -- averse, or if you already have a 'PCRE.Regex' object floating around. 155 | -- 156 | -- Also see 'mkRegexTraversalQQ' 157 | regexing :: PCRE.Regex -> IndexedTraversal' Int T.Text RBS.Match 158 | regexing pat = utf8 . RBS.regexing pat 159 | 160 | -- | Access all groups of a match as a list. Also keeps full match text as the index in case 161 | -- you need it. 162 | -- 163 | -- Note that you can edit the groups through this lens, 164 | -- Changing the length of the list has behaviour similar to 'partsOf'. 165 | -- 166 | -- Get all matched groups: 167 | -- 168 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups 169 | -- [["raindrops","roses"],["whiskers","kittens"]] 170 | -- 171 | -- You can access a specific group combining with 'ix', or just use 'group' instead 172 | -- 173 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups . ix 1 174 | -- ["roses","kittens"] 175 | -- 176 | -- Editing groups: 177 | -- 178 | -- >>> "raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups . ix 1 %~ T.toUpper 179 | -- "raindrops on ROSES and whiskers on KITTENS" 180 | -- 181 | -- Editing the list rearranges groups 182 | -- 183 | -- >>> "raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups %~ Prelude.reverse 184 | -- "roses on raindrops and kittens on whiskers" 185 | -- 186 | -- You can traverse the list to flatten out all groups 187 | -- 188 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups . traversed 189 | -- ["raindrops","roses","whiskers","kittens"] 190 | -- 191 | -- This replaces each group with the full match text wrapped in parens: 192 | -- 193 | -- >>> "one-two" & [regex|(\w+)-(\w+)|] . groups <. traversed %@~ \mtch grp -> grp <> ":(" <> mtch <> ")" 194 | -- "one:(one-two)-two:(one-two)" 195 | groups :: IndexedLens' T.Text RBS.Match [T.Text] 196 | groups = reindexed (view $ from utf8) RBS.groups <. mapping (from utf8) 197 | 198 | -- | Access a specific group of a match. Numbering starts at 0. 199 | -- 200 | -- Stashes the full match text as the index in case you need it. 201 | -- 202 | -- See 'groups' for more info on grouping 203 | -- 204 | -- >>> "key:value, a:b" ^.. [regex|(\w+):(\w+)|] . group 0 205 | -- ["key","a"] 206 | -- 207 | -- >>> "key:value, a:b" ^.. [regex|(\w+):(\w+)|] . group 1 208 | -- ["value","b"] 209 | -- 210 | -- >>> "key:value, a:b" & [regex|(\w+):(\w+)|] . group 1 %~ T.toUpper 211 | -- "key:VALUE, a:B" 212 | -- 213 | -- >>> "key:value, a:b" & [regex|(\w+):(\w+)|] . group 1 %~ T.toUpper 214 | -- "key:VALUE, a:B" 215 | -- 216 | -- Replace the first capture group with the full match: 217 | -- 218 | -- >>> "a, b" & [regex|(\w+), (\w+)|] . group 0 .@~ \i -> "(" <> i <> ")" 219 | -- "(a, b), b" 220 | group :: Int -> IndexedTraversal' T.Text RBS.Match T.Text 221 | group n = groups <. ix n 222 | 223 | -- | Access all the named groups of a match as a 'M.Map'. Stashes the full match text as the index in case 224 | -- you need it. 225 | -- 226 | -- Note that you can edit the groups through this lens, but the behaviour is undefined when editing inner elements of __nested__ groups. 227 | -- Behaviour is undefined if groups are removed from the map (so don't do that). 228 | -- 229 | -- NOTE: There's currently some strange behaviour in pcre-heavy where trailing unmatched optional groups are omitted, I'm looking into getting that patched, but for now, note the difference in behaviour: 230 | -- 231 | -- >>> "A" ^? [regex|(?A)|(?B)|] . namedGroups 232 | -- Just (fromList [("a","A")]) 233 | -- 234 | -- >>> "B" ^? [regex|(?A)|(?B)|] . namedGroups 235 | -- Just (fromList [("a",""),("b","B")]) 236 | -- 237 | -- Get all matched groups: 238 | -- 239 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups 240 | -- [fromList [("first","raindrops"),("second","roses")],fromList [("first","whiskers"),("second","kittens")]] 241 | -- 242 | -- You can access a specific group combining with 'ix', or just use 'namedGroup' instead 243 | -- 244 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups . ix "second" 245 | -- ["roses","kittens"] 246 | -- 247 | -- Editing groups: 248 | -- 249 | -- >>> "raindrops on roses and whiskers on kittens" & [regex|(?\w+) on (?\w+)|] . namedGroups . ix "second" %~ T.toUpper 250 | -- "raindrops on ROSES and whiskers on KITTENS" 251 | -- 252 | -- Use indexed helpers to access the full match when operating on a group. 253 | -- 254 | -- This replaces the "first" group with the full match text wrapped in parens: 255 | -- 256 | -- >>> "one-two" & [regex|(?\w+)-(\w+)|] . namedGroups <. ix "first" %@~ \mtch grp -> grp <> ":(" <> mtch <> ")" 257 | -- "one:(one-two)-two" 258 | namedGroups :: IndexedLens' T.Text RBS.Match (M.Map T.Text T.Text) 259 | namedGroups = reindexed (view $ from utf8) RBS.namedGroups <. mapAsTxt 260 | where 261 | mapAsTxt :: Iso' (M.Map BS.ByteString BS.ByteString) (M.Map T.Text T.Text) 262 | mapAsTxt = iso (M.mapKeys (review utf8)) (M.mapKeys (view utf8)) . mapping (from utf8) 263 | 264 | -- | Access a specific named group of a match 265 | -- 266 | -- See 'namedGroups' for caveats and more info. 267 | -- 268 | -- Stashes the full match text as the index in case you need it. 269 | -- 270 | -- >>> "key:value, a:b" ^.. [regex|(?\w+):(?\w+)|] . namedGroup "first" 271 | -- ["key","a"] 272 | -- 273 | -- >>> "key:value, a:b" ^.. [regex|(?\w+):(?\w+)|] . namedGroup "second" 274 | -- ["value","b"] 275 | -- 276 | -- >>> "key:value, a:b" & [regex|(?\w+):(?\w+)|] . namedGroup "second" %~ T.toUpper 277 | -- "key:VALUE, a:B" 278 | -- 279 | -- Replace the first capture group with the full match: 280 | -- 281 | -- >>> "a, b" & [regex|(?\w+), (?\w+)|] . namedGroup "first" .@~ \i -> "(" <> i <> ")" 282 | -- "(a, b), b" 283 | namedGroup :: T.Text -> IndexedTraversal' T.Text RBS.Match T.Text 284 | namedGroup name = namedGroups <. ix name 285 | 286 | -- | Traverse each match 287 | -- 288 | -- Stashes any matched groups into the index in case you need them. 289 | -- 290 | -- Get a match if one exists: 291 | -- 292 | -- >>> "find a needle in a haystack" ^? [regex|n..dle|] . match 293 | -- Just "needle" 294 | -- 295 | -- Collect all matches 296 | -- 297 | -- >>> "one _two_ three _four_" ^.. [regex|_\w+_|] . match 298 | -- ["_two_","_four_"] 299 | -- 300 | -- You can edit the traversal to perform a regex replace/substitution 301 | -- 302 | -- >>> "one _two_ three _four_" & [regex|_\w+_|] . match %~ T.toUpper 303 | -- "one _TWO_ three _FOUR_" 304 | -- 305 | -- Here we use the group matches stored in the index to form key-value pairs, replacing the entire match. 306 | -- 307 | -- >>> "abc-def, ghi-jkl" & [regex|(\w+)-(\w+)|] . match %@~ \[k, v] _ -> "{" <> k <> ":" <> v <> "}" 308 | -- "{abc:def}, {ghi:jkl}" 309 | match :: IndexedTraversal' [T.Text] RBS.Match T.Text 310 | match = reindexed (fmap (view $ from utf8)) RBS.match <. from utf8 311 | 312 | -- | Collect both the match text AND all the matching groups 313 | -- 314 | -- >>> "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . matchAndGroups 315 | -- [("raindrops on roses",["raindrops","roses"]),("whiskers on kittens",["whiskers","kittens"])] 316 | matchAndGroups :: Getter RBS.Match (T.Text, [T.Text]) 317 | matchAndGroups = to $ \m -> (m ^. match, m ^. groups) 318 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-18.28 2 | 3 | packages: 4 | - . 5 | 6 | extra-deps: [] 7 | -------------------------------------------------------------------------------- /stack.yaml.lock: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by Stack. 2 | # You should not edit this file by hand. 3 | # For more information, please see the documentation at: 4 | # https://docs.haskellstack.org/en/stable/lock_files 5 | 6 | packages: [] 7 | snapshots: 8 | - completed: 9 | sha256: 428ec8d5ce932190d3cbe266b9eb3c175cd81e984babf876b64019e2cbe4ea68 10 | size: 590100 11 | url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/18/28.yaml 12 | original: lts-18.28 13 | -------------------------------------------------------------------------------- /test/ByteString.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE QuasiQuotes #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module ByteString where 4 | 5 | import Control.Lens 6 | import Control.Lens.Regex.ByteString 7 | import qualified Data.ByteString.Char8 as C8 hiding (index) 8 | import Data.Char 9 | import qualified Data.Map as M 10 | import Test.Hspec 11 | 12 | spec :: Spec 13 | spec = do 14 | describe "regex" $ do 15 | xdescribe "pcre-heavy-compat" $ do 16 | it "should handle crazy nested groups" $ do 17 | "abcdefhijklm" ^? [regex|^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$|] . matchAndGroups 18 | `shouldBe` Just ("abcdefhijklm", ["bc", "c", "ef", "f", "ij", "j", "lm", "m"]) 19 | describe "match" $ do 20 | describe "getting" $ do 21 | it "should find one match" $ do 22 | "abc" ^.. [regex|b|] . match 23 | `shouldBe` ["b"] 24 | 25 | it "should find many matches" $ do 26 | "a b c" ^.. [regex|\w|] . match 27 | `shouldBe` ["a", "b", "c"] 28 | 29 | it "should fold" $ do 30 | "a b c" ^. [regex|\w|] . match 31 | `shouldBe` "abc" 32 | 33 | it "should match with a group" $ do 34 | "a b c" ^.. [regex|(\w)|] . match 35 | `shouldBe` ["a", "b", "c"] 36 | 37 | it "should match with many groups" $ do 38 | "a b c" ^.. [regex|(\w) (\w)|] . match 39 | `shouldBe` ["a b"] 40 | 41 | it "should be greedy when overlapping" $ do 42 | "abc" ^.. [regex|\w+|] . match 43 | `shouldBe`["abc"] 44 | 45 | it "should respect lazy modifiers" $ do 46 | "abc" ^.. [regex|\w+?|] . match 47 | `shouldBe`["a", "b", "c"] 48 | 49 | describe "setting" $ do 50 | it "should allow setting" $ do 51 | ("one two three" & [regex|two|] . match .~ "new") 52 | `shouldBe` "one new three" 53 | 54 | it "should allow setting many" $ do 55 | ("one three" & [regex|\w+|] . match .~ "new") 56 | `shouldBe` "new new" 57 | 58 | it "should allow mutating" $ do 59 | ("one two three" & [regex|two|] . match %~ (<> "!!"). C8.map toUpper) 60 | `shouldBe` "one TWO!! three" 61 | 62 | it "should allow mutating many" $ do 63 | ("one two three" & [regex|two|] . match %~ C8.map toUpper) 64 | `shouldBe` "one TWO three" 65 | 66 | describe "indexed" $ do 67 | it "should allow folding with index" $ do 68 | ("one two three" ^.. ([regex|\w+|] <. match) . withIndex) 69 | `shouldBe` [(0, "one"), (1, "two"), (2, "three")] 70 | 71 | it "should allow getting with index" $ do 72 | ("one two three" ^.. [regex|\w+|] . index 1 . match) 73 | `shouldBe` ["two"] 74 | 75 | it "should allow setting with index" $ do 76 | ("one two three" & [regex|\w+|] <. match .@~ C8.pack . show) 77 | `shouldBe` "0 1 2" 78 | 79 | it "should allow mutating with index" $ do 80 | ("one two three" & [regex|\w+|] <. match %@~ \i s -> (C8.pack $ show i) <> ": " <> s) 81 | `shouldBe` "0: one 1: two 2: three" 82 | 83 | describe "groups" $ do 84 | describe "getting" $ do 85 | it "should get groups" $ do 86 | "a b c" ^.. [regex|(\w)|] . groups 87 | `shouldBe` [["a"], ["b"], ["c"]] 88 | 89 | it "should get multiple groups" $ do 90 | "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups 91 | `shouldBe` [["raindrops","roses"],["whiskers","kittens"]] 92 | 93 | it "should allow getting a specific index" $ do 94 | ("one two three four" ^.. [regex|(\w+) (\w+)|] . groups . ix 1) 95 | `shouldBe` ["two", "four"] 96 | 97 | it "should handle weird group alternation" $ do 98 | ("AB" ^.. [regex|A(x)?(B)|] . groups `shouldBe` [["", "B"]]) 99 | ("B" ^.. [regex|(A)|(B)|] . groups `shouldBe` [["", "B"]]) 100 | -- This behaviour is consistent with pcre-heavy 101 | ("A" ^.. [regex|(A)|(B)|] . groups `shouldBe` [["A"]]) 102 | 103 | describe "setting" $ do 104 | it "should allow setting groups as a list" $ do 105 | ("one two three" & [regex|(\w+) (\w+)|] . groups .~ ["1", "2"]) 106 | `shouldBe` "1 2 three" 107 | 108 | it "should allow editing when result list is the same length" $ do 109 | ("raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups %~ reverse) 110 | `shouldBe` "roses on raindrops and kittens on whiskers" 111 | 112 | describe "group" $ do 113 | it "should get a single group" $ do 114 | "a:b c:d" ^.. [regex|(\w):(\w)|] . group 1 115 | `shouldBe` ["b", "d"] 116 | 117 | it "should set a single group" $ do 118 | "a:b c:d" & [regex|(\w):(\w)|] . group 1 %~ C8.map toUpper 119 | `shouldBe` "a:B c:D" 120 | 121 | describe "traversed" $ do 122 | it "should allow setting all group matches" $ do 123 | ("one two three" & [regex|(\w+) (\w+)|] . groups . traversed .~ "new") 124 | `shouldBe` "new new three" 125 | 126 | it "should allow mutating" $ do 127 | ("one two three four" & [regex|one (two) (three)|] . groups . traversed %~ (<> "!!") . C8.map toUpper) 128 | `shouldBe` "one TWO!! THREE!! four" 129 | 130 | it "should allow folding with index" $ do 131 | ("one two three four" ^.. [regex|(\w+) (\w+)|] . groups . traversed . withIndex) 132 | `shouldBe` [(0, "one"), (1, "two"), (0, "three"), (1, "four")] 133 | 134 | it "should allow setting with index" $ do 135 | ("one two three four" & [regex|(\w+) (\w+)|] . groups . traversed .@~ C8.pack . show) 136 | `shouldBe` "0 1 0 1" 137 | 138 | it "should allow mutating with index" $ do 139 | ("one two three four" & [regex|(\w+) (\w+)|] . groups . traversed %@~ \i s -> (C8.pack $ show i) <> ": " <> s) 140 | `shouldBe` "0: one 1: two 0: three 1: four" 141 | 142 | it "should compose indices with matches" $ do 143 | ("one two three four" ^.. ([regex|(\w+) (\w+)|] <.> groups . traversed) . withIndex) 144 | `shouldBe` [((0, 0), "one"), ((0, 1), "two"), ((1, 0), "three"), ((1, 1), "four")] 145 | 146 | describe "namedGroups" $ do 147 | describe "getting" $ do 148 | it "should get named groups" $ do 149 | "a b c" ^.. [regex|(?\w)|] . namedGroups 150 | `shouldBe` [M.fromList [("mygroup", "a")], M.fromList [("mygroup", "b")], M.fromList [("mygroup", "c")]] 151 | 152 | it "should get multiple named groups" $ do 153 | "raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups 154 | `shouldBe` [M.fromList [("one", "raindrops"), ("two", "roses")], M.fromList [("one", "whiskers"), ("two", "kittens")]] 155 | 156 | it "should allow getting a specific named group" $ do 157 | ("raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups . ix "two") 158 | `shouldBe` ["roses", "kittens"] 159 | 160 | it "should handle weird group alternation" $ do 161 | ("AB" ^.. [regex|A(?x)?(?B)|] . namedGroups `shouldBe` [M.fromList [("opt", ""), ("always", "B")]]) 162 | ("B" ^.. [regex|(?A)|(?B)|] . namedGroups `shouldBe` [M.fromList [("a", ""), ("b", "B")]]) 163 | -- This is the behaviour of pcre-heavy, it's a bit unfortunate 164 | ("A" ^.. [regex|(?A)|(?B)|] . namedGroups `shouldBe` [M.fromList [("a", "A")]]) 165 | 166 | describe "setting" $ do 167 | it "should allow setting groups as a map" $ do 168 | ("one two three" & [regex|(?\w+) (?\w+)|] . namedGroups .~ M.fromList [("a", "1"), ("b", "2")]) 169 | `shouldBe` "1 2 three" 170 | 171 | describe "namedGroup" $ do 172 | it "should get a single named group" $ do 173 | "a:b c:d" ^.. [regex|(?\w):(?\w)|] . namedGroup "after" 174 | `shouldBe` ["b", "d"] 175 | 176 | it "should set a single group" $ do 177 | "a:b c:d" & [regex|(\w):(?\w)|] . namedGroup "after" %~ C8.map toUpper 178 | `shouldBe` "a:B c:D" 179 | 180 | describe "matchAndGroups" $ do 181 | it "should get match and groups" $ do 182 | "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . matchAndGroups 183 | `shouldBe` [("raindrops on roses",["raindrops","roses"]),("whiskers on kittens",["whiskers","kittens"])] 184 | 185 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | import Test.Hspec 2 | import Text 3 | import ByteString 4 | 5 | main :: IO () 6 | main = hspec $ do 7 | describe "text" Text.spec 8 | describe "bytestring" ByteString.spec 9 | -------------------------------------------------------------------------------- /test/Text.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE QuasiQuotes #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Text where 4 | 5 | import Control.Lens 6 | import Control.Lens.Regex.Text 7 | import qualified Data.Text as T 8 | import qualified Data.Map as M 9 | import Test.Hspec 10 | 11 | spec :: Spec 12 | spec = do 13 | describe "regex" $ do 14 | describe "match" $ do 15 | describe "getting" $ do 16 | it "should find one match" $ do 17 | "abc" ^.. [regex|b|] . match 18 | `shouldBe` ["b"] 19 | 20 | it "should find many matches" $ do 21 | "a b c" ^.. [regex|\w|] . match 22 | `shouldBe` ["a", "b", "c"] 23 | 24 | it "should fold" $ do 25 | "a b c" ^. [regex|\w|] . match 26 | `shouldBe` "abc" 27 | 28 | it "should match with a group" $ do 29 | "a b c" ^.. [regex|(\w)|] . match 30 | `shouldBe` ["a", "b", "c"] 31 | 32 | it "should match with many groups" $ do 33 | "a b c" ^.. [regex|(\w) (\w)|] . match 34 | `shouldBe` ["a b"] 35 | 36 | it "should be greedy when overlapping" $ do 37 | "abc" ^.. [regex|\w+|] . match 38 | `shouldBe`["abc"] 39 | 40 | it "should respect lazy modifiers" $ do 41 | "abc" ^.. [regex|\w+?|] . match 42 | `shouldBe`["a", "b", "c"] 43 | 44 | it "should handle unicode in source text properly" $ do 45 | "🍕 test 🍔" ^. [regex|test|] . match 46 | `shouldBe` "test" 47 | ("🍕 test 🍔" & [regex|🍔|] . match .~ "👻🙈") 48 | `shouldBe` "🍕 test 👻🙈" 49 | 50 | it "should handle unicode in patterns properly" $ do 51 | "*🍕 test 🍔*" ^. [regex|🍕 \w+ 🍔|] . match 52 | `shouldBe` "🍕 test 🍔" 53 | 54 | describe "setting" $ do 55 | it "should allow setting" $ do 56 | ("one two three" & [regex|two|] . match .~ "new") 57 | `shouldBe` "one new three" 58 | 59 | it "should allow setting many" $ do 60 | ("one three" & [regex|\w+|] . match .~ "new") 61 | `shouldBe` "new new" 62 | 63 | it "should allow mutating" $ do 64 | ("one two three" & [regex|two|] . match %~ (<> "!!"). T.toUpper) 65 | `shouldBe` "one TWO!! three" 66 | 67 | it "should allow mutating many" $ do 68 | ("one two three" & [regex|two|] . match %~ T.toUpper) 69 | `shouldBe` "one TWO three" 70 | 71 | describe "indexed" $ do 72 | it "should allow folding with index" $ do 73 | ("one two three" ^.. ([regex|\w+|] <. match) . withIndex) 74 | `shouldBe` [(0, "one"), (1, "two"), (2, "three")] 75 | 76 | it "should allow getting with index" $ do 77 | ("one two three" ^.. [regex|\w+|] . index 1 . match) 78 | `shouldBe` ["two"] 79 | 80 | it "should allow setting with index" $ do 81 | ("one two three" & [regex|\w+|] <. match .@~ T.pack . show) 82 | `shouldBe` "0 1 2" 83 | 84 | it "should allow mutating with index" $ do 85 | ("one two three" & [regex|\w+|] <. match %@~ \i s -> (T.pack $ show i) <> ": " <> s) 86 | `shouldBe` "0: one 1: two 2: three" 87 | 88 | describe "groups" $ do 89 | describe "getting" $ do 90 | it "should get groups" $ do 91 | "a b c" ^.. [regex|(\w)|] . groups 92 | `shouldBe` [["a"], ["b"], ["c"]] 93 | 94 | it "should get multiple groups" $ do 95 | "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . groups 96 | `shouldBe` [["raindrops","roses"],["whiskers","kittens"]] 97 | 98 | it "should allow getting a specific index" $ do 99 | ("one two three four" ^.. [regex|(\w+) (\w+)|] . groups . ix 1) 100 | `shouldBe` ["two", "four"] 101 | 102 | it "should handle weird group alternation" $ do 103 | ("AB" ^.. [regex|A(x)?(B)|] . groups `shouldBe` [["", "B"]]) 104 | ("B" ^.. [regex|(A)|(B)|] . groups `shouldBe` [["", "B"]]) 105 | -- This behaviour is consistent with pcre-heavy 106 | ("A" ^.. [regex|(A)|(B)|] . groups `shouldBe` [["A"]]) 107 | 108 | describe "setting" $ do 109 | it "should allow setting groups as a list" $ do 110 | ("one two three" & [regex|(\w+) (\w+)|] . groups .~ ["1", "2"]) 111 | `shouldBe` "1 2 three" 112 | 113 | it "should allow editing when result list is the same length" $ do 114 | ("raindrops on roses and whiskers on kittens" & [regex|(\w+) on (\w+)|] . groups %~ reverse) 115 | `shouldBe` "roses on raindrops and kittens on whiskers" 116 | 117 | describe "group" $ do 118 | it "should get a single group" $ do 119 | "a:b c:d" ^.. [regex|(\w):(\w)|] . group 1 120 | `shouldBe` ["b", "d"] 121 | 122 | it "should set a single group" $ do 123 | "a:b c:d" & [regex|(\w):(\w)|] . group 1 %~ T.toUpper 124 | `shouldBe` "a:B c:D" 125 | 126 | describe "traversed" $ do 127 | it "should allow setting all group matches" $ do 128 | ("one two three" & [regex|(\w+) (\w+)|] . groups . traversed .~ "new") 129 | `shouldBe` "new new three" 130 | 131 | it "should allow mutating" $ do 132 | ("one two three four" & [regex|one (two) (three)|] . groups . traversed %~ (<> "!!") . T.toUpper) 133 | `shouldBe` "one TWO!! THREE!! four" 134 | 135 | it "should allow folding with index" $ do 136 | ("one two three four" ^.. [regex|(\w+) (\w+)|] . groups . traversed . withIndex) 137 | `shouldBe` [(0, "one"), (1, "two"), (0, "three"), (1, "four")] 138 | 139 | it "should allow setting with index" $ do 140 | ("one two three four" & [regex|(\w+) (\w+)|] . groups . traversed .@~ T.pack . show) 141 | `shouldBe` "0 1 0 1" 142 | 143 | it "should allow mutating with index" $ do 144 | ("one two three four" & [regex|(\w+) (\w+)|] . groups . traversed %@~ \i s -> (T.pack $ show i) <> ": " <> s) 145 | `shouldBe` "0: one 1: two 0: three 1: four" 146 | 147 | it "should compose indices with matches" $ do 148 | ("one two three four" ^.. ([regex|(\w+) (\w+)|] <.> groups . traversed) . withIndex) 149 | `shouldBe` [((0, 0), "one"), ((0, 1), "two"), ((1, 0), "three"), ((1, 1), "four")] 150 | 151 | describe "namedGroups" $ do 152 | describe "getting" $ do 153 | it "should get named groups" $ do 154 | "a b c" ^.. [regex|(?\w)|] . namedGroups 155 | `shouldBe` [M.fromList [("mygroup", "a")], M.fromList [("mygroup", "b")], M.fromList [("mygroup", "c")]] 156 | 157 | it "should get multiple named groups" $ do 158 | "raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups 159 | `shouldBe` [M.fromList [("one", "raindrops"), ("two", "roses")], M.fromList [("one", "whiskers"), ("two", "kittens")]] 160 | 161 | it "should allow getting a specific named group" $ do 162 | ("raindrops on roses and whiskers on kittens" ^.. [regex|(?\w+) on (?\w+)|] . namedGroups . ix "two") 163 | `shouldBe` ["roses", "kittens"] 164 | 165 | it "should handle weird group alternation" $ do 166 | ("AB" ^.. [regex|A(?x)?(?B)|] . namedGroups `shouldBe` [M.fromList [("opt", ""), ("always", "B")]]) 167 | ("B" ^.. [regex|(?A)|(?B)|] . namedGroups `shouldBe` [M.fromList [("a", ""), ("b", "B")]]) 168 | -- This is the behaviour of pcre-heavy, it's a bit unfortunate 169 | ("A" ^.. [regex|(?A)|(?B)|] . namedGroups `shouldBe` [M.fromList [("a", "A")]]) 170 | 171 | describe "setting" $ do 172 | it "should allow setting groups as a map" $ do 173 | ("one two three" & [regex|(?\w+) (?\w+)|] . namedGroups .~ M.fromList [("a", "1"), ("b", "2")]) 174 | `shouldBe` "1 2 three" 175 | 176 | describe "namedGroup" $ do 177 | it "should get a single named group" $ do 178 | "a:b c:d" ^.. [regex|(?\w):(?\w)|] . namedGroup "after" 179 | `shouldBe` ["b", "d"] 180 | 181 | it "should set a single group" $ do 182 | "a:b c:d" & [regex|(\w):(?\w)|] . namedGroup "after" %~ T.toUpper 183 | `shouldBe` "a:B c:D" 184 | 185 | describe "matchAndGroups" $ do 186 | it "should get match and groups" $ do 187 | "raindrops on roses and whiskers on kittens" ^.. [regex|(\w+) on (\w+)|] . matchAndGroups 188 | `shouldBe` [("raindrops on roses",["raindrops","roses"]),("whiskers on kittens",["whiskers","kittens"])] 189 | 190 | --------------------------------------------------------------------------------