├── .gitignore ├── LICENSE ├── Main.hs ├── Makefile ├── README ├── Setup.lhs ├── binaries └── osx │ └── genex ├── dist └── doc │ └── html │ └── regex-genex │ ├── Regex-Genex.html │ ├── doc-index.html │ ├── frames.html │ ├── haddock-util.js │ ├── hslogo-16.png │ ├── index-frames.html │ ├── index.html │ ├── mini_Regex-Genex.html │ ├── minus.gif │ ├── ocean.css │ ├── plus.gif │ ├── regex-genex.haddock │ └── synopsis.png ├── regex-genex.cabal └── src └── Regex ├── Genex.hs └── Genex ├── Normalize.hs └── Pure.hs /.gitignore: -------------------------------------------------------------------------------- 1 | binaries/osx/yices 2 | dist 3 | .*~ 4 | tags 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | The "Artistic License" 6 | 7 | Preamble 8 | 9 | The intent of this document is to state the conditions under which a 10 | Package may be copied, such that the Copyright Holder maintains some 11 | semblance of artistic control over the development of the package, 12 | while giving the users of the package the right to use and distribute 13 | the Package in a more-or-less customary fashion, plus the right to make 14 | reasonable modifications. 15 | 16 | Definitions: 17 | 18 | "Package" refers to the collection of files distributed by the 19 | Copyright Holder, and derivatives of that collection of files 20 | created through textual modification. 21 | 22 | "Standard Version" refers to such a Package if it has not been 23 | modified, or has been modified in accordance with the wishes 24 | of the Copyright Holder as specified below. 25 | 26 | "Copyright Holder" is whoever is named in the copyright or 27 | copyrights for the package. 28 | 29 | "You" is you, if you're thinking about copying or distributing 30 | this Package. 31 | 32 | "Reasonable copying fee" is whatever you can justify on the 33 | basis of media cost, duplication charges, time of people involved, 34 | and so on. (You will not be required to justify it to the 35 | Copyright Holder, but only to the computing community at large 36 | as a market that must bear the fee.) 37 | 38 | "Freely Available" means that no fee is charged for the item 39 | itself, though there may be fees involved in handling the item. 40 | It also means that recipients of the item may redistribute it 41 | under the same conditions they received it. 42 | 43 | 1. You may make and give away verbatim copies of the source form of the 44 | Standard Version of this Package without restriction, provided that you 45 | duplicate all of the original copyright notices and associated disclaimers. 46 | 47 | 2. You may apply bug fixes, portability fixes and other modifications 48 | derived from the Public Domain or from the Copyright Holder. A Package 49 | modified in such a way shall still be considered the Standard Version. 50 | 51 | 3. 
You may otherwise modify your copy of this Package in any way, provided 52 | that you insert a prominent notice in each changed file stating how and 53 | when you changed that file, and provided that you do at least ONE of the 54 | following: 55 | 56 | a) place your modifications in the Public Domain or otherwise make them 57 | Freely Available, such as by posting said modifications to Usenet or 58 | an equivalent medium, or placing the modifications on a major archive 59 | site such as uunet.uu.net, or by allowing the Copyright Holder to include 60 | your modifications in the Standard Version of the Package. 61 | 62 | b) use the modified Package only within your corporation or organization. 63 | 64 | c) rename any non-standard executables so the names do not conflict 65 | with standard executables, which must also be provided, and provide 66 | a separate manual page for each non-standard executable that clearly 67 | documents how it differs from the Standard Version. 68 | 69 | d) make other distribution arrangements with the Copyright Holder. 70 | 71 | 4. You may distribute the programs of this Package in object code or 72 | executable form, provided that you do at least ONE of the following: 73 | 74 | a) distribute a Standard Version of the executables and library files, 75 | together with instructions (in the manual page or equivalent) on where 76 | to get the Standard Version. 77 | 78 | b) accompany the distribution with the machine-readable source of 79 | the Package with your modifications. 80 | 81 | c) give non-standard executables non-standard names, and clearly 82 | document the differences in manual pages (or equivalent), together 83 | with instructions on where to get the Standard Version. 84 | 85 | d) make other distribution arrangements with the Copyright Holder. 86 | 87 | 5. You may charge a reasonable copying fee for any distribution of this 88 | Package. You may charge any fee you choose for support of this 89 | Package. You may not charge a fee for this Package itself. However, 90 | you may distribute this Package in aggregate with other (possibly 91 | commercial) programs as part of a larger (possibly commercial) software 92 | distribution provided that you do not advertise this Package as a 93 | product of your own. You may embed this Package's interpreter within 94 | an executable of yours (by linking); this shall be construed as a mere 95 | form of aggregation, provided that the complete Standard Version of the 96 | interpreter is so embedded. 97 | 98 | 6. The scripts and library files supplied as input to or produced as 99 | output from the programs of this Package do not automatically fall 100 | under the copyright of this Package, but belong to whoever generated 101 | them, and may be sold commercially, and may be aggregated with this 102 | Package. If such scripts or library files are aggregated with this 103 | Package via the so-called "undump" or "unexec" methods of producing a 104 | binary executable image, then distribution of such an image shall 105 | neither be construed as a distribution of this Package nor shall it 106 | fall under the restrictions of Paragraphs 3 and 4, provided that you do 107 | not represent such an executable image as a Standard Version of this 108 | Package. 109 | 110 | 7. 
C subroutines (or comparably compiled subroutines in other 111 | languages) supplied by you and linked into this Package in order to 112 | emulate subroutines and variables of the language defined by this 113 | Package shall not be considered part of this Package, but are the 114 | equivalent of input as in Paragraph 6, provided these subroutines do 115 | not change the language in any way that would cause it to fail the 116 | regression tests for the language. 117 | 118 | 8. Aggregation of this Package with a commercial distribution is always 119 | permitted provided that the use of this Package is embedded; that is, 120 | when no overt attempt is made to make this Package's interfaces visible 121 | to the end user of the commercial distribution. Such use shall not be 122 | construed as a distribution of this Package. 123 | 124 | 9. The name of the Copyright Holder may not be used to endorse or promote 125 | products derived from this software without specific prior written permission. 126 | 127 | 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR 128 | IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 129 | WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 130 | 131 | The End 132 | -------------------------------------------------------------------------------- /Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | import Regex.Genex 3 | import System.IO 4 | import System.Environment 5 | import Data.Char (isDigit) 6 | 7 | defaultRegex :: String 8 | defaultRegex = "a(b|c)d{2,3}e*" 9 | 10 | main :: IO () 11 | main = do 12 | hSetBuffering stdout NoBuffering 13 | args <- getArgs 14 | case args of 15 | [] -> do 16 | prog <- getProgName 17 | if prog == "" then run defaultRegex else do 18 | fail $ "Usage: " ++ prog ++ " regex [regex...]" 19 | rx | all isPure rx -> mapM_ ((putStr "0 " >>) . 
print) (genexPure rx) 20 | | otherwise -> genexPrint rx 21 | where 22 | isPure [] = True 23 | isPure ('\\':'\\':cs) = isPure cs 24 | isPure ('\\':'b':_) = False 25 | isPure ('\\':c:cs) 26 | | isDigit c = False 27 | | otherwise = isPure cs 28 | isPure ('^':_) = False 29 | isPure ('$':_) = False 30 | isPure (_:cs) = isPure cs 31 | 32 | run :: String -> IO () 33 | run regex = genexPrint [regex] 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all :: install 2 | 3 | test :: binaries/osx/z3 binaries/osx/genex 4 | env PATH=./binaries/osx:$$PATH genex "a(b|c)d{2,3}e*" 5 | env PATH=./binaries/osx:$$PATH genex "a(b|c)d{2,3}e*\1" 6 | 7 | binaries/osx/z3 : 8 | curl https://research.microsoft.com/en-us/um/redmond/projects/z3/z3-osx-4.1-x64.tar.gz | tar zxf - 9 | cp z3/bin/z3 binaries/osx/ 10 | rm -rf z3 11 | 12 | binaries/osx/yices : 13 | curl 'http://yices.csl.sri.com/cgi-bin/yices-newdownload.cgi?file=yices2smt09-x86_64-apple-darwin9.8.0-static-gmp.tgz&accept=I+accept' | tar zxf - 14 | cp yices2smt09/bin/yices binaries/osx/ 15 | rm -rf yices2smt09 16 | 17 | binaries/osx/genex : 18 | cabal configure 19 | cabal build 20 | cp dist/build/genex/genex binaries/osx/ 21 | strip binaries/osx/genex 22 | 23 | install :: 24 | cabal install 25 | cp dist/build/genex/genex binaries/osx/ 26 | strip binaries/osx/genex 27 | 28 | ghci :: 29 | ghci -isrc Main.hs 30 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Available on Hackage as: http://hackage.haskell.org/package/regex-genex 2 | 3 | The "genex" program finds all permutations of strings that matches every 4 | regular expressions specified in the command line, with full support 5 | for back references (\1 .. \9) and word boundaries (\b). 6 | 7 | The output is unsorted, but the order is deterministic across multiple runs: 8 | 9 | $ genex '\d' '[123abc]' # Must match both 10 | 1.00000000 "2" 11 | 1.00000000 "3" 12 | 1.00000000 "1" 13 | 14 | To enforce a fixed ordering for alternations, pipe the output to "sort -n": 15 | 16 | $ genex '(__|<>){1,3}' | sort -n 17 | 2.00000000 "<>" 18 | 2.00000001 "__" 19 | 4.00000002 "<><>" 20 | 4.00000003 "__<>" 21 | 4.00000006 "<>__" 22 | 4.00000007 "____" 23 | 6.00000010 "<><><>" 24 | 6.00000011 "__<><>" 25 | 6.00000014 "<>__<>" 26 | 6.00000015 "____<>" 27 | 6.00000026 "<><>__" 28 | 6.00000027 "__<>__" 29 | 6.00000030 "<>____" 30 | 6.00000031 "______" 31 | 32 | Output size and maximum string length are both capped at 65535 currently, 33 | but both can be raised if needed. 34 | 35 | Because genex generates matches lazily, we can use "head -n" to display 36 | only part of its output: 37 | 38 | genex '[abc]+[123]+.+' | head -n 10 39 | 40 | Some caveats: 41 | 42 | - We translate * and + quantifiers into {0,3} and {1,4}, to make output 43 | appear more unique. 44 | 45 | - The set of . \D \W \S characters are limited to printable characters, 46 | again to make the output more pretty. 47 | 48 | - The ^ and $ anchors are taken to mean begin-of-line and end-of-line 49 | (implicit /m), since we already implicitly anchor on both ends. 50 | 51 | - No support yet for \l \u \L \U \Q \E (case and quotemeta modifiers) 52 | 53 | - No named Unicode properties or POSIX [[:upper:]] classes yet. 
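The same generation is available from Haskell code via the library API; a minimal,
illustrative sketch follows (the regex is just an example, and like the executable it
needs the solver binary on PATH):

  import Regex.Genex (genexPrint)

  main :: IO ()
  main = genexPrint ["a(b|c)d{2,3}e*"]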
54 | 55 | Required Hackage libraries: sbv regex-tdfa stream-monad text 56 | 57 | Required binary in PATH: 58 | 59 | yices # Download it from http://yices.csl.sri.com/download-yices2.shtml 60 | 61 | You can directly run the Main.hs in the checkout directory as well: 62 | 63 | runghc Main.hs 'your regex here' 64 | 65 | Pre-built MacOSX binaries are in binaries/osx/; try "make test" for a sample run. 66 | 67 | Share and enjoy! 68 | Audrey 69 | -------------------------------------------------------------------------------- /Setup.lhs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env runghc 2 | > import Distribution.Simple 3 | > import System.Cmd (rawSystem) 4 | > 5 | > main :: IO () 6 | > main = defaultMainWithHooks simpleUserHooks 7 | -------------------------------------------------------------------------------- /binaries/osx/genex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/audreyt/regex-genex/eacf18333725e32dd6baaee8cc5b9bc709d861ca/binaries/osx/genex -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/Regex-Genex.html: -------------------------------------------------------------------------------- 1 | Regex.Genex

regex-genex-0.6.0: From a regex, generate all possible strings it can match

Safe Haskell: Safe-Inferred

Regex.Genex

Description

This module and the accompanying genex program find all permutations 5 | of strings that match every input regular expression, ordered from 6 | shortest to longest, with full support for back references ('\1' .. '\9') 7 | and word boundaries ('\b'). 8 |

It requires the yices binary in PATH; please download it from: 9 | http://yices.csl.sri.com/download-yices2.shtml 10 |

Synopsis

Documentation

data Model

A match consists of a string (a list of code points) and a rank representing the alternation order. 11 |

Constructors

Model 

Fields

modelChars :: [Word8]
 
modelRank :: Word64
 

Instances

genex :: [String] -> IO [String]

Given a list of regular expressions, returns all possible strings that match every one of them. 12 | Shorter strings are guaranteed to be returned before longer ones. 13 |
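For illustration only (not part of the generated documentation), a minimal sketch of calling genex from a program; it assumes the package is installed, the required SMT solver binary (yices) is on PATH, and the regexes are arbitrary examples:

    import Regex.Genex (genex)

    main :: IO ()
    main = do
        -- Strings that satisfy both patterns at once, shortest first;
        -- the result list is produced lazily, so take a finite prefix.
        results <- genex ["\\d", "[123abc]"]
        mapM_ putStrLn (take 3 results)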

genexPure :: [String] -> [String]

A pure and much faster variant of genex, but without support for 14 | back references, anchors or word boundaries. 15 | Makes no ordering guarantee with respect to string length. 16 | Does not depend on the external yices SMT solver. 17 |
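As an illustrative sketch (not from the package documentation), genexPure can be called as an ordinary pure function; the regex is an arbitrary example:

    import Regex.Genex (genexPure)

    main :: IO ()
    main = do
        -- No solver involved; just print a prefix of the enumerated matches.
        mapM_ putStrLn (take 10 (genexPure ["a[bc]{1,2}"]))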

genexPrint :: [String] -> IO ()

Same as genexModels, but prints the models to standard output instead of returning them. 18 |

genexModels :: [String] -> IO [Model]

Same as genex, but returns the full models (characters and ranks) instead of just the strings. 19 |
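A minimal sketch (for illustration, not from the package documentation) of inspecting the returned models; the regex is an arbitrary example and, like genex, this needs the SMT solver on PATH:

    import Regex.Genex (genexModels, Model(..))
    import Data.Char (chr)

    main :: IO ()
    main = do
        models <- genexModels ["(foo|bar)\\d"]
        mapM_ showModel (take 5 models)
      where
        -- modelChars holds Word8 code points; modelRank encodes alternation order.
        showModel m = putStrLn (map (chr . fromIntegral) (modelChars m)
                                ++ "  rank " ++ show (modelRank m))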

genexWith :: (?maxRepeat :: Int, Monoid a) => ([SatResult] -> Hits -> (Hits -> IO a) -> IO a) -> [[Char]] -> IO a

regexMatch :: (?maxRepeat :: Int) => [[Char]] -> Str -> Symbolic SBool
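regexMatch exposes the matcher as an SBV predicate. The following sketch is illustrative only; it uses the same SBV 5.x helpers (such as mkExistVars) that this package itself relies on, and the regexes and string length are arbitrary examples with compatible lengths:

    {-# LANGUAGE ImplicitParams #-}
    import Data.SBV
    import Regex.Genex (regexMatch)

    main :: IO ()
    main = do
        res <- sat $ do
            str <- mkExistVars 3       -- three symbolic Word8 code points
            let ?maxRepeat = 3         -- bound for unbounded quantifiers like +
            regexMatch ["[a-f]+", "\\w{3}"] str
        print res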

-------------------------------------------------------------------------------- /dist/doc/html/regex-genex/doc-index.html: -------------------------------------------------------------------------------- 1 | regex-genex-0.6.0: From a regex, generate all possible strings it can match (Index)

regex-genex-0.6.0: From a regex, generate all possible strings it can match

Index

genex    Regex.Genex
genexModels    Regex.Genex
genexPrint    Regex.Genex
genexPure    Regex.Genex
genexWith    Regex.Genex
Model
  1 (Type/Class)    Regex.Genex
  2 (Data Constructor)    Regex.Genex
modelChars    Regex.Genex
modelRank    Regex.Genex
normalize    Regex.Genex.Normalize
regexMatch    Regex.Genex
-------------------------------------------------------------------------------- /dist/doc/html/regex-genex/frames.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/haddock-util.js: -------------------------------------------------------------------------------- 1 | // Haddock JavaScript utilities 2 | 3 | var rspace = /\s\s+/g, 4 | rtrim = /^\s+|\s+$/g; 5 | 6 | function spaced(s) { return (" " + s + " ").replace(rspace, " "); } 7 | function trim(s) { return s.replace(rtrim, ""); } 8 | 9 | function hasClass(elem, value) { 10 | var className = spaced(elem.className || ""); 11 | return className.indexOf( " " + value + " " ) >= 0; 12 | } 13 | 14 | function addClass(elem, value) { 15 | var className = spaced(elem.className || ""); 16 | if ( className.indexOf( " " + value + " " ) < 0 ) { 17 | elem.className = trim(className + " " + value); 18 | } 19 | } 20 | 21 | function removeClass(elem, value) { 22 | var className = spaced(elem.className || ""); 23 | className = className.replace(" " + value + " ", " "); 24 | elem.className = trim(className); 25 | } 26 | 27 | function toggleClass(elem, valueOn, valueOff, bool) { 28 | if (bool == null) { bool = ! hasClass(elem, valueOn); } 29 | if (bool) { 30 | removeClass(elem, valueOff); 31 | addClass(elem, valueOn); 32 | } 33 | else { 34 | removeClass(elem, valueOn); 35 | addClass(elem, valueOff); 36 | } 37 | return bool; 38 | } 39 | 40 | 41 | function makeClassToggle(valueOn, valueOff) 42 | { 43 | return function(elem, bool) { 44 | return toggleClass(elem, valueOn, valueOff, bool); 45 | } 46 | } 47 | 48 | toggleShow = makeClassToggle("show", "hide"); 49 | toggleCollapser = makeClassToggle("collapser", "expander"); 50 | 51 | function toggleSection(id) 52 | { 53 | var b = toggleShow(document.getElementById("section." + id)); 54 | toggleCollapser(document.getElementById("control." + id), b); 55 | rememberCollapsed(id, b); 56 | return b; 57 | } 58 | 59 | var collapsed = {}; 60 | function rememberCollapsed(id, b) 61 | { 62 | if(b) 63 | delete collapsed[id] 64 | else 65 | collapsed[id] = null; 66 | 67 | var sections = []; 68 | for(var i in collapsed) 69 | { 70 | if(collapsed.hasOwnProperty(i)) 71 | sections.push(i); 72 | } 73 | // cookie specific to this page; don't use setCookie which sets path=/ 74 | document.cookie = "collapsed=" + escape(sections.join('+')); 75 | } 76 | 77 | function restoreCollapsed() 78 | { 79 | var cookie = getCookie("collapsed"); 80 | if(!cookie) 81 | return; 82 | 83 | var ids = cookie.split('+'); 84 | for(var i in ids) 85 | { 86 | if(document.getElementById("section." 
+ ids[i])) 87 | toggleSection(ids[i]); 88 | } 89 | } 90 | 91 | function setCookie(name, value) { 92 | document.cookie = name + "=" + escape(value) + ";path=/;"; 93 | } 94 | 95 | function clearCookie(name) { 96 | document.cookie = name + "=;path=/;expires=Thu, 01-Jan-1970 00:00:01 GMT;"; 97 | } 98 | 99 | function getCookie(name) { 100 | var nameEQ = name + "="; 101 | var ca = document.cookie.split(';'); 102 | for(var i=0;i < ca.length;i++) { 103 | var c = ca[i]; 104 | while (c.charAt(0)==' ') c = c.substring(1,c.length); 105 | if (c.indexOf(nameEQ) == 0) { 106 | return unescape(c.substring(nameEQ.length,c.length)); 107 | } 108 | } 109 | return null; 110 | } 111 | 112 | 113 | 114 | var max_results = 75; // 50 is not enough to search for map in the base libraries 115 | var shown_range = null; 116 | var last_search = null; 117 | 118 | function quick_search() 119 | { 120 | perform_search(false); 121 | } 122 | 123 | function full_search() 124 | { 125 | perform_search(true); 126 | } 127 | 128 | 129 | function perform_search(full) 130 | { 131 | var text = document.getElementById("searchbox").value.toLowerCase(); 132 | if (text == last_search && !full) return; 133 | last_search = text; 134 | 135 | var table = document.getElementById("indexlist"); 136 | var status = document.getElementById("searchmsg"); 137 | var children = table.firstChild.childNodes; 138 | 139 | // first figure out the first node with the prefix 140 | var first = bisect(-1); 141 | var last = (first == -1 ? -1 : bisect(1)); 142 | 143 | if (first == -1) 144 | { 145 | table.className = ""; 146 | status.innerHTML = "No results found, displaying all"; 147 | } 148 | else if (first == 0 && last == children.length - 1) 149 | { 150 | table.className = ""; 151 | status.innerHTML = ""; 152 | } 153 | else if (last - first >= max_results && !full) 154 | { 155 | table.className = ""; 156 | status.innerHTML = "More than " + max_results + ", press Search to display"; 157 | } 158 | else 159 | { 160 | // decide what you need to clear/show 161 | if (shown_range) 162 | setclass(shown_range[0], shown_range[1], "indexrow"); 163 | setclass(first, last, "indexshow"); 164 | shown_range = [first, last]; 165 | table.className = "indexsearch"; 166 | status.innerHTML = ""; 167 | } 168 | 169 | 170 | function setclass(first, last, status) 171 | { 172 | for (var i = first; i <= last; i++) 173 | { 174 | children[i].className = status; 175 | } 176 | } 177 | 178 | 179 | // do a binary search, treating 0 as ... 180 | // return either -1 (no 0's found) or location of most far match 181 | function bisect(dir) 182 | { 183 | var first = 0, finish = children.length - 1; 184 | var mid, success = false; 185 | 186 | while (finish - first > 3) 187 | { 188 | mid = Math.floor((finish + first) / 2); 189 | 190 | var i = checkitem(mid); 191 | if (i == 0) i = dir; 192 | if (i == -1) 193 | finish = mid; 194 | else 195 | first = mid; 196 | } 197 | var a = (dir == 1 ? first : finish); 198 | var b = (dir == 1 ? finish : first); 199 | for (var i = b; i != a - dir; i -= dir) 200 | { 201 | if (checkitem(i) == 0) return i; 202 | } 203 | return -1; 204 | } 205 | 206 | 207 | // from an index, decide what the result is 208 | // 0 = match, -1 is lower, 1 is higher 209 | function checkitem(i) 210 | { 211 | var s = getitem(i).toLowerCase().substr(0, text.length); 212 | if (s == text) return 0; 213 | else return (s > text ? 
-1 : 1); 214 | } 215 | 216 | 217 | // from an index, get its string 218 | // this abstracts over alternates 219 | function getitem(i) 220 | { 221 | for ( ; i >= 0; i--) 222 | { 223 | var s = children[i].firstChild.firstChild.data; 224 | if (s.indexOf(' ') == -1) 225 | return s; 226 | } 227 | return ""; // should never be reached 228 | } 229 | } 230 | 231 | function setSynopsis(filename) { 232 | if (parent.window.synopsis) { 233 | if (parent.window.synopsis.location.replace) { 234 | // In Firefox this avoids adding the change to the history. 235 | parent.window.synopsis.location.replace(filename); 236 | } else { 237 | parent.window.synopsis.location = filename; 238 | } 239 | } 240 | } 241 | 242 | function addMenuItem(html) { 243 | var menu = document.getElementById("page-menu"); 244 | if (menu) { 245 | var btn = menu.firstChild.cloneNode(false); 246 | btn.innerHTML = html; 247 | menu.appendChild(btn); 248 | } 249 | } 250 | 251 | function adjustForFrames() { 252 | var bodyCls; 253 | 254 | if (parent.location.href == window.location.href) { 255 | // not in frames, so add Frames button 256 | addMenuItem("Frames"); 257 | bodyCls = "no-frame"; 258 | } 259 | else { 260 | bodyCls = "in-frame"; 261 | } 262 | addClass(document.body, bodyCls); 263 | } 264 | 265 | function reframe() { 266 | setCookie("haddock-reframe", document.URL); 267 | window.location = "frames.html"; 268 | } 269 | 270 | function postReframe() { 271 | var s = getCookie("haddock-reframe"); 272 | if (s) { 273 | parent.window.main.location = s; 274 | clearCookie("haddock-reframe"); 275 | } 276 | } 277 | 278 | function styles() { 279 | var i, a, es = document.getElementsByTagName("link"), rs = []; 280 | for (i = 0; a = es[i]; i++) { 281 | if(a.rel.indexOf("style") != -1 && a.title) { 282 | rs.push(a); 283 | } 284 | } 285 | return rs; 286 | } 287 | 288 | function addStyleMenu() { 289 | var as = styles(); 290 | var i, a, btns = ""; 291 | for(i=0; a = as[i]; i++) { 292 | btns += "
  • " 294 | + a.title + "
  • " 295 | } 296 | if (as.length > 1) { 297 | var h = "
    " 298 | + "Style ▾" 299 | + "" 300 | + "
    "; 301 | addMenuItem(h); 302 | } 303 | } 304 | 305 | function setActiveStyleSheet(title) { 306 | var as = styles(); 307 | var i, a, found; 308 | for(i=0; a = as[i]; i++) { 309 | a.disabled = true; 310 | // need to do this always, some browsers are edge triggered 311 | if(a.title == title) { 312 | found = a; 313 | } 314 | } 315 | if (found) { 316 | found.disabled = false; 317 | setCookie("haddock-style", title); 318 | } 319 | else { 320 | as[0].disabled = false; 321 | clearCookie("haddock-style"); 322 | } 323 | styleMenu(false); 324 | } 325 | 326 | function resetStyle() { 327 | var s = getCookie("haddock-style"); 328 | if (s) setActiveStyleSheet(s); 329 | } 330 | 331 | 332 | function styleMenu(show) { 333 | var m = document.getElementById('style-menu'); 334 | if (m) toggleShow(m, show); 335 | } 336 | 337 | 338 | function pageLoad() { 339 | addStyleMenu(); 340 | adjustForFrames(); 341 | resetStyle(); 342 | restoreCollapsed(); 343 | } 344 | 345 | -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/hslogo-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/audreyt/regex-genex/eacf18333725e32dd6baaee8cc5b9bc709d861ca/dist/doc/html/regex-genex/hslogo-16.png -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/index-frames.html: -------------------------------------------------------------------------------- 1 | regex-genex-0.6.0: From a regex, generate all possible strings it can match

    Modules

    -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/index.html: -------------------------------------------------------------------------------- 1 | regex-genex-0.6.0: From a regex, generate all possible strings it can match

    regex-genex-0.6.0: From a regex, generate all possible strings it can match

    regex-genex-0.6.0: From a regex, generate all possible strings it can match

    From a regex, generate all possible strings it can match 5 |

    -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/mini_Regex-Genex.html: -------------------------------------------------------------------------------- 1 | Regex.Genex

    Regex.Genex

    data Model

    -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/minus.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/audreyt/regex-genex/eacf18333725e32dd6baaee8cc5b9bc709d861ca/dist/doc/html/regex-genex/minus.gif -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/ocean.css: -------------------------------------------------------------------------------- 1 | /* @group Fundamentals */ 2 | 3 | * { margin: 0; padding: 0 } 4 | 5 | /* Is this portable? */ 6 | html { 7 | background-color: white; 8 | width: 100%; 9 | height: 100%; 10 | } 11 | 12 | body { 13 | background: white; 14 | color: black; 15 | text-align: left; 16 | min-height: 100%; 17 | position: relative; 18 | } 19 | 20 | p { 21 | margin: 0.8em 0; 22 | } 23 | 24 | ul, ol { 25 | margin: 0.8em 0 0.8em 2em; 26 | } 27 | 28 | dl { 29 | margin: 0.8em 0; 30 | } 31 | 32 | dt { 33 | font-weight: bold; 34 | } 35 | dd { 36 | margin-left: 2em; 37 | } 38 | 39 | a { text-decoration: none; } 40 | a[href]:link { color: rgb(196,69,29); } 41 | a[href]:visited { color: rgb(171,105,84); } 42 | a[href]:hover { text-decoration:underline; } 43 | 44 | /* @end */ 45 | 46 | /* @group Fonts & Sizes */ 47 | 48 | /* Basic technique & IE workarounds from YUI 3 49 | For reasons, see: 50 | http://yui.yahooapis.com/3.1.1/build/cssfonts/fonts.css 51 | */ 52 | 53 | body { 54 | font:13px/1.4 sans-serif; 55 | *font-size:small; /* for IE */ 56 | *font:x-small; /* for IE in quirks mode */ 57 | } 58 | 59 | h1 { font-size: 146.5%; /* 19pt */ } 60 | h2 { font-size: 131%; /* 17pt */ } 61 | h3 { font-size: 116%; /* 15pt */ } 62 | h4 { font-size: 100%; /* 13pt */ } 63 | h5 { font-size: 100%; /* 13pt */ } 64 | 65 | select, input, button, textarea { 66 | font:99% sans-serif; 67 | } 68 | 69 | table { 70 | font-size:inherit; 71 | font:100%; 72 | } 73 | 74 | pre, code, kbd, samp, tt, .src { 75 | font-family:monospace; 76 | *font-size:108%; 77 | line-height: 124%; 78 | } 79 | 80 | .links, .link { 81 | font-size: 85%; /* 11pt */ 82 | } 83 | 84 | #module-header .caption { 85 | font-size: 182%; /* 24pt */ 86 | } 87 | 88 | .info { 89 | font-size: 85%; /* 11pt */ 90 | } 91 | 92 | #table-of-contents, #synopsis { 93 | /* font-size: 85%; /* 11pt */ 94 | } 95 | 96 | 97 | /* @end */ 98 | 99 | /* @group Common */ 100 | 101 | .caption, h1, h2, h3, h4, h5, h6 { 102 | font-weight: bold; 103 | color: rgb(78,98,114); 104 | margin: 0.8em 0 0.4em; 105 | } 106 | 107 | * + h1, * + h2, * + h3, * + h4, * + h5, * + h6 { 108 | margin-top: 2em; 109 | } 110 | 111 | h1 + h2, h2 + h3, h3 + h4, h4 + h5, h5 + h6 { 112 | margin-top: inherit; 113 | } 114 | 115 | ul.links { 116 | list-style: none; 117 | text-align: left; 118 | float: right; 119 | display: inline-table; 120 | margin: 0 0 0 1em; 121 | } 122 | 123 | ul.links li { 124 | display: inline; 125 | border-left: 1px solid #d5d5d5; 126 | white-space: nowrap; 127 | padding: 0; 128 | } 129 | 130 | ul.links li a { 131 | padding: 0.2em 0.5em; 132 | } 133 | 134 | .hide { display: none; } 135 | .show { display: inherit; } 136 | .clear { clear: both; } 137 | 138 | .collapser { 139 | background-image: url(minus.gif); 140 | background-repeat: no-repeat; 141 | } 142 | .expander { 143 | background-image: url(plus.gif); 144 | background-repeat: no-repeat; 145 | } 146 | p.caption.collapser, 147 | p.caption.expander { 148 | background-position: 0 0.4em; 149 | } 150 | .collapser, .expander { 151 | 
padding-left: 14px; 152 | margin-left: -14px; 153 | cursor: pointer; 154 | } 155 | 156 | pre { 157 | padding: 0.25em; 158 | margin: 0.8em 0; 159 | background: rgb(229,237,244); 160 | overflow: auto; 161 | border-bottom: 0.25em solid white; 162 | /* white border adds some space below the box to compensate 163 | for visual extra space that paragraphs have between baseline 164 | and the bounding box */ 165 | } 166 | 167 | .src { 168 | background: #f0f0f0; 169 | padding: 0.2em 0.5em; 170 | } 171 | 172 | .keyword { font-weight: normal; } 173 | .def { font-weight: bold; } 174 | 175 | 176 | /* @end */ 177 | 178 | /* @group Page Structure */ 179 | 180 | #content { 181 | margin: 0 auto; 182 | padding: 0 2em 6em; 183 | } 184 | 185 | #package-header { 186 | background: rgb(41,56,69); 187 | border-top: 5px solid rgb(78,98,114); 188 | color: #ddd; 189 | padding: 0.2em; 190 | position: relative; 191 | text-align: left; 192 | } 193 | 194 | #package-header .caption { 195 | background: url(hslogo-16.png) no-repeat 0em; 196 | color: white; 197 | margin: 0 2em; 198 | font-weight: normal; 199 | font-style: normal; 200 | padding-left: 2em; 201 | } 202 | 203 | #package-header a:link, #package-header a:visited { color: white; } 204 | #package-header a:hover { background: rgb(78,98,114); } 205 | 206 | #module-header .caption { 207 | color: rgb(78,98,114); 208 | font-weight: bold; 209 | border-bottom: 1px solid #ddd; 210 | } 211 | 212 | table.info { 213 | float: right; 214 | padding: 0.5em 1em; 215 | border: 1px solid #ddd; 216 | color: rgb(78,98,114); 217 | background-color: #fff; 218 | max-width: 40%; 219 | border-spacing: 0; 220 | position: relative; 221 | top: -0.5em; 222 | margin: 0 0 0 2em; 223 | } 224 | 225 | .info th { 226 | padding: 0 1em 0 0; 227 | } 228 | 229 | div#style-menu-holder { 230 | position: relative; 231 | z-index: 2; 232 | display: inline; 233 | } 234 | 235 | #style-menu { 236 | position: absolute; 237 | z-index: 1; 238 | overflow: visible; 239 | background: #374c5e; 240 | margin: 0; 241 | text-align: center; 242 | right: 0; 243 | padding: 0; 244 | top: 1.25em; 245 | } 246 | 247 | #style-menu li { 248 | display: list-item; 249 | border-style: none; 250 | margin: 0; 251 | padding: 0; 252 | color: #000; 253 | list-style-type: none; 254 | } 255 | 256 | #style-menu li + li { 257 | border-top: 1px solid #919191; 258 | } 259 | 260 | #style-menu a { 261 | width: 6em; 262 | padding: 3px; 263 | display: block; 264 | } 265 | 266 | #footer { 267 | background: #ddd; 268 | border-top: 1px solid #aaa; 269 | padding: 0.5em 0; 270 | color: #666; 271 | text-align: center; 272 | position: absolute; 273 | bottom: 0; 274 | width: 100%; 275 | height: 3em; 276 | } 277 | 278 | /* @end */ 279 | 280 | /* @group Front Matter */ 281 | 282 | #table-of-contents { 283 | float: right; 284 | clear: right; 285 | background: #faf9dc; 286 | border: 1px solid #d8d7ad; 287 | padding: 0.5em 1em; 288 | max-width: 20em; 289 | margin: 0.5em 0 1em 1em; 290 | } 291 | 292 | #table-of-contents .caption { 293 | text-align: center; 294 | margin: 0; 295 | } 296 | 297 | #table-of-contents ul { 298 | list-style: none; 299 | margin: 0; 300 | } 301 | 302 | #table-of-contents ul ul { 303 | margin-left: 2em; 304 | } 305 | 306 | #description .caption { 307 | display: none; 308 | } 309 | 310 | #synopsis { 311 | display: none; 312 | } 313 | 314 | .no-frame #synopsis { 315 | display: block; 316 | position: fixed; 317 | right: 0; 318 | height: 80%; 319 | top: 10%; 320 | padding: 0; 321 | } 322 | 323 | #synopsis .caption { 324 | float: left; 325 | 
width: 29px; 326 | color: rgba(255,255,255,0); 327 | height: 110px; 328 | margin: 0; 329 | font-size: 1px; 330 | padding: 0; 331 | } 332 | 333 | #synopsis p.caption.collapser { 334 | background: url(synopsis.png) no-repeat -64px -8px; 335 | } 336 | 337 | #synopsis p.caption.expander { 338 | background: url(synopsis.png) no-repeat 0px -8px; 339 | } 340 | 341 | #synopsis ul { 342 | height: 100%; 343 | overflow: auto; 344 | padding: 0.5em; 345 | margin: 0; 346 | } 347 | 348 | #synopsis ul ul { 349 | overflow: hidden; 350 | } 351 | 352 | #synopsis ul, 353 | #synopsis ul li.src { 354 | background-color: #faf9dc; 355 | white-space: nowrap; 356 | list-style: none; 357 | margin-left: 0; 358 | } 359 | 360 | /* @end */ 361 | 362 | /* @group Main Content */ 363 | 364 | #interface div.top { margin: 2em 0; } 365 | #interface h1 + div.top, 366 | #interface h2 + div.top, 367 | #interface h3 + div.top, 368 | #interface h4 + div.top, 369 | #interface h5 + div.top { 370 | margin-top: 1em; 371 | } 372 | #interface p.src .link { 373 | float: right; 374 | color: #919191; 375 | border-left: 1px solid #919191; 376 | background: #f0f0f0; 377 | padding: 0 0.5em 0.2em; 378 | margin: 0 -0.5em 0 0.5em; 379 | } 380 | 381 | #interface table { border-spacing: 2px; } 382 | #interface td { 383 | vertical-align: top; 384 | padding-left: 0.5em; 385 | } 386 | #interface td.src { 387 | white-space: nowrap; 388 | } 389 | #interface td.doc p { 390 | margin: 0; 391 | } 392 | #interface td.doc p + p { 393 | margin-top: 0.8em; 394 | } 395 | 396 | .subs dl { 397 | margin: 0; 398 | } 399 | 400 | .subs dt { 401 | float: left; 402 | clear: left; 403 | display: block; 404 | margin: 1px 0; 405 | } 406 | 407 | .subs dd { 408 | float: right; 409 | width: 90%; 410 | display: block; 411 | padding-left: 0.5em; 412 | margin-bottom: 0.5em; 413 | } 414 | 415 | .subs dd.empty { 416 | display: none; 417 | } 418 | 419 | .subs dd p { 420 | margin: 0; 421 | } 422 | 423 | .top p.src { 424 | border-top: 1px solid #ccc; 425 | } 426 | 427 | .subs, .doc { 428 | /* use this selector for one level of indent */ 429 | padding-left: 2em; 430 | } 431 | 432 | .arguments { 433 | margin-top: -0.4em; 434 | } 435 | .arguments .caption { 436 | display: none; 437 | } 438 | 439 | .fields { padding-left: 1em; } 440 | 441 | .fields .caption { display: none; } 442 | 443 | .fields p { margin: 0 0; } 444 | 445 | /* this seems bulky to me 446 | .methods, .constructors { 447 | background: #f8f8f8; 448 | border: 1px solid #eee; 449 | } 450 | */ 451 | 452 | /* @end */ 453 | 454 | /* @group Auxillary Pages */ 455 | 456 | #mini { 457 | margin: 0 auto; 458 | padding: 0 1em 1em; 459 | } 460 | 461 | #mini > * { 462 | font-size: 93%; /* 12pt */ 463 | } 464 | 465 | #mini #module-list .caption, 466 | #mini #module-header .caption { 467 | font-size: 125%; /* 15pt */ 468 | } 469 | 470 | #mini #interface h1, 471 | #mini #interface h2, 472 | #mini #interface h3, 473 | #mini #interface h4 { 474 | font-size: 109%; /* 13pt */ 475 | margin: 1em 0 0; 476 | } 477 | 478 | #mini #interface .top, 479 | #mini #interface .src { 480 | margin: 0; 481 | } 482 | 483 | #mini #module-list ul { 484 | list-style: none; 485 | margin: 0; 486 | } 487 | 488 | #alphabet ul { 489 | list-style: none; 490 | padding: 0; 491 | margin: 0.5em 0 0; 492 | text-align: center; 493 | } 494 | 495 | #alphabet li { 496 | display: inline; 497 | margin: 0 0.25em; 498 | } 499 | 500 | #alphabet a { 501 | font-weight: bold; 502 | } 503 | 504 | #index .caption, 505 | #module-list .caption { font-size: 131%; /* 17pt */ } 506 | 507 | 
#index table { 508 | margin-left: 2em; 509 | } 510 | 511 | #index .src { 512 | font-weight: bold; 513 | } 514 | #index .alt { 515 | font-size: 77%; /* 10pt */ 516 | font-style: italic; 517 | padding-left: 2em; 518 | } 519 | 520 | #index td + td { 521 | padding-left: 1em; 522 | } 523 | 524 | #module-list ul { 525 | list-style: none; 526 | margin: 0 0 0 2em; 527 | } 528 | 529 | #module-list li { 530 | clear: right; 531 | } 532 | 533 | #module-list span.collapser, 534 | #module-list span.expander { 535 | background-position: 0 0.3em; 536 | } 537 | 538 | #module-list .package { 539 | float: right; 540 | } 541 | 542 | /* @end */ 543 | -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/plus.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/audreyt/regex-genex/eacf18333725e32dd6baaee8cc5b9bc709d861ca/dist/doc/html/regex-genex/plus.gif -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/regex-genex.haddock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/audreyt/regex-genex/eacf18333725e32dd6baaee8cc5b9bc709d861ca/dist/doc/html/regex-genex/regex-genex.haddock -------------------------------------------------------------------------------- /dist/doc/html/regex-genex/synopsis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/audreyt/regex-genex/eacf18333725e32dd6baaee8cc5b9bc709d861ca/dist/doc/html/regex-genex/synopsis.png -------------------------------------------------------------------------------- /regex-genex.cabal: -------------------------------------------------------------------------------- 1 | Name : regex-genex 2 | Version : 0.7.0 3 | license : OtherLicense 4 | license-file : LICENSE 5 | cabal-version : >= 1.6 6 | copyright : 2011-2015 Audrey Tang 7 | maintainer : Audrey Tang 8 | category : Text, Regex 9 | stability : experimental 10 | build-type : Simple 11 | homepage : https://github.com/audreyt/regex-genex 12 | synopsis : From a regex, generate all possible strings it can match 13 | description : From a regex, generate all possible strings it can match 14 | author : Audrey Tang 15 | Tested-With: GHC==7.10.1 16 | 17 | library 18 | hs-source-dirs: . src 19 | exposed-modules: Regex.Genex Regex.Genex.Normalize 20 | other-modules: Regex.Genex.Pure 21 | extensions : ImplicitParams, NamedFieldPuns, ParallelListComp, PatternGuards, RecordWildCards 22 | build-depends: 23 | base >= 3 && < 5, mtl, containers, sbv >= 5 && < 6, regex-tdfa, stream-monad, text, logict 24 | 25 | executable genex 26 | main-is: Main.hs 27 | hs-source-dirs: . 
src 28 | extensions : ImplicitParams, NamedFieldPuns, ParallelListComp, PatternGuards, RecordWildCards 29 | build-depends: 30 | base >= 3 && < 5, mtl, containers, sbv >= 5 && < 6, regex-tdfa 31 | 32 | source-repository head 33 | type: git 34 | location: http://github.com/audreyt/regex-genex 35 | -------------------------------------------------------------------------------- /src/Regex/Genex.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ImplicitParams, NamedFieldPuns, ParallelListComp, PatternGuards #-} 2 | {-| 3 | 4 | This module and the accompanying 'genex' program finds all permutations 5 | of strings that matches every input regular expressions, ordered from 6 | shortest to longest, with full support for back references ('\1' .. '\9') 7 | and word boundaries ('\b'). 8 | 9 | It requires the @z3@ or @yices@ binary in PATH. The latter may be downloaded from: 10 | 11 | 12 | -} 13 | module Regex.Genex (Model(..), genex, genexPure, genexPrint, genexModels, genexWith, regexMatch) where 14 | import Data.SBV 15 | import Data.SBV.Internals (SBV) 16 | import Data.Set (toList) 17 | import Data.Monoid 18 | import Control.Monad.State 19 | import qualified Data.Char 20 | import qualified Regex.Genex.Pure as Pure 21 | import Text.Regex.TDFA.Pattern 22 | import Regex.Genex.Normalize (normalize) 23 | import Text.Regex.TDFA.ReadRegex (parseRegex) 24 | import Data.IntSet (IntSet) 25 | import qualified Data.IntSet as IntSet 26 | import Data.IntMap (IntMap) 27 | import qualified Data.IntMap as IntMap 28 | import System.IO.Unsafe (unsafeInterleaveIO) 29 | 30 | -- | Given a list of regular repressions, returns all possible strings that matches every one of them. 31 | -- Guarantees to return shorter strings before longer ones. 32 | genex :: [String] -> IO [String] 33 | genex = let ?maxRepeat = maxRepeatDefault 34 | in genexWith getString 35 | 36 | -- | A match consists of a string (list of codepoints), and a rank representing alternation order. 37 | data Model = Model 38 | { modelChars :: [Word8] 39 | , modelRank :: Word64 40 | } 41 | deriving (Show, Eq, Ord) 42 | 43 | -- | Same as 'genex', but with the entire model returned instead. 44 | genexModels :: [String] -> IO [Model] 45 | genexModels = let ?maxRepeat = maxRepeatDefault 46 | in genexWith (getStringWith id) 47 | 48 | -- | Same as 'genexModels', but print the models to standard output instead. 49 | genexPrint :: [String] -> IO () 50 | genexPrint = let ?maxRepeat = maxRepeatDefault 51 | in genexWith displayString 52 | 53 | -- | A pure and much faster variant of 'genex', but without support for 54 | -- back-references, anchors or word boundaries. 55 | -- Does not guarantee orders about length of strings. 56 | -- Does not depend on the external @yices@ SMT solver. 57 | genexPure :: [String] -> [String] 58 | genexPure = Pure.genexPure 59 | 60 | type Len = Word16 61 | type SChar = SWord8 62 | type Str = [SChar] 63 | type Offset = SBV Len 64 | type Flips = [SWord64] 65 | type Captures = SFunArray Word8 Len 66 | type Hits = Word16 67 | 68 | maxHits :: Hits 69 | maxHits = maxBound -- 65535 70 | 71 | -- controlled by an implicit parameter, but this is the default 72 | -- when instantiated from functions that do not expose the implicit 73 | -- parameter to the user 74 | maxRepeatDefault :: Int 75 | maxRepeatDefault = 3 -- 7 and 15 are also good 76 | 77 | maxLength :: Len 78 | maxLength = maxBound -- 65535 79 | 80 | -- lengths p = let ?grp = mempty in IntSet.toList . 
fst $ runState (possibleLengths $ parse p) mempty 81 | 82 | minLen :: (?maxRepeat :: Int, ?grp :: GroupLens) => Pattern -> Int 83 | minLen p = case p of 84 | PEscape {getPatternChar = ch} 85 | | Data.Char.isDigit ch -> let num = charToDigit ch in 86 | IntSet.findMin (IntMap.findWithDefault (IntSet.singleton 0) num ?grp) 87 | _ -> IntSet.findMin . fst $ runState (possibleLengths p) mempty 88 | 89 | parse :: String -> Pattern 90 | parse r = case parseRegex r of 91 | Right (pattern, _) -> pattern 92 | Left x -> error $ show x 93 | 94 | type GroupLens = IntMap IntSet 95 | type BackReferences = IntSet 96 | 97 | possibleLengths :: (?maxRepeat :: Int, ?grp :: GroupLens) => Pattern -> State (GroupLens, BackReferences) IntSet 98 | possibleLengths pat = case pat of 99 | _ | isOne pat -> one 100 | PGroup (Just idx) p -> do 101 | lenP <- possibleLengths p 102 | modify $ \(g, b) -> (IntMap.insert idx lenP g, b) 103 | return lenP 104 | PGroup _ p -> possibleLengths p 105 | PCarat{} -> zero 106 | PDollar{} -> zero 107 | PQuest p -> maybeGroup p (`mappend` zeroSet) 108 | POr ps -> fmap mconcat $ mapM possibleLengths ps 109 | PConcat [] -> zero 110 | PConcat ps -> fmap (foldl1 sumSets) (mapM possibleLengths ps) 111 | PEscape {getPatternChar = ch} 112 | | ch `elem` "ntrfaedwsWSD" -> one 113 | | ch `elem` "b" -> zero 114 | | Data.Char.isDigit ch -> do 115 | let num = charToDigit ch 116 | modify $ \(g, b) -> (g, IntSet.insert num b) 117 | gets $ (IntMap.findWithDefault (IntMap.findWithDefault (error $ "No such capture: " ++ [ch]) num ?grp) num) . fst 118 | | Data.Char.isAlpha ch -> error $ "Unsupported escape: " ++ [ch] 119 | | otherwise -> one 120 | PBound low (Just high) p -> manyTimes p low high 121 | PBound low _ p -> manyTimes p low (low + ?maxRepeat) 122 | PPlus p -> manyTimes p 1 (?maxRepeat+1) 123 | PStar _ p -> manyTimes p 0 ?maxRepeat 124 | PEmpty -> zero 125 | _ -> error $ show pat 126 | where 127 | one = return $ IntSet.singleton 1 128 | zero = return $ IntSet.singleton 0 129 | zeroSet = IntSet.singleton 0 130 | sumSets s1 s2 = IntSet.unions [ IntSet.map (+elm) s2 | elm <- IntSet.elems s1 ] 131 | manyTimes p low high = maybeGroup p $ \lenP -> IntSet.unions 132 | [ foldl sumSets (IntSet.singleton 0) (replicate i lenP) 133 | | i <- [low..high] 134 | ] 135 | maybeGroup p@(PGroup (Just idx) _) f = do 136 | lenP <- possibleLengths p 137 | let lenP' = f lenP 138 | modify $ \(g, b) -> (IntMap.insert idx lenP' g, b) 139 | return lenP' 140 | maybeGroup p f = fmap f (possibleLengths p) 141 | 142 | charToDigit :: Char -> Int 143 | charToDigit ch = Data.Char.ord ch - Data.Char.ord '0' 144 | 145 | exactMatch :: (?maxRepeat :: Int, ?pats :: [(Pattern, GroupLens)]) => Len -> Symbolic SBool 146 | exactMatch len = do 147 | str <- mkExistVars $ fromEnum len 148 | initialFlips <- mkExistVars 1 149 | captureAt <- newArray_ (Just minBound) 150 | captureLen <- newArray_ (Just minBound) 151 | let ?str = str 152 | let initialStatus = Status 153 | { ok = true 154 | , pos = strLen 155 | , flips = initialFlips 156 | , captureAt = captureAt 157 | , captureLen = captureLen 158 | } 159 | strLen = literal len 160 | runPat s (pat, groupLens) = let ?pat = pat in let ?grp = groupLens in 161 | ite (ok s &&& pos s .== strLen) 162 | (match s{ pos = 0, captureAt, captureLen }) 163 | s{ ok = false, pos = maxBound, flips = [maxBound] } 164 | let Status{ ok, pos, flips } = foldl runPat initialStatus ?pats 165 | return (bAll (.== 0) flips &&& pos .== strLen &&& ok) 166 | 167 | data Status = Status 168 | { ok :: SBool 169 | , pos :: Offset 
170 | , flips :: Flips 171 | , captureAt :: Captures 172 | , captureLen :: Captures 173 | } 174 | 175 | instance Mergeable Status where 176 | symbolicMerge f t s1 s2 = Status 177 | { ok = symbolicMerge f t (ok s1) (ok s2) 178 | , pos = symbolicMerge f t (pos s1) (pos s2) 179 | , flips = symbolicMerge f t (flips s1) (flips s2) 180 | , captureAt = symbolicMerge f t (captureAt s1) (captureAt s2) 181 | , captureLen = symbolicMerge f t (captureLen s1) (captureLen s2) 182 | } 183 | 184 | choice :: (?str :: Str, ?pat :: Pattern) => Flips -> [Flips -> Status] -> Status 185 | choice _ [] = error "X" 186 | choice flips [a] = a flips 187 | choice flips [a, b] = ite (lsb flip) (b flips') (a flips') 188 | where 189 | flip = head flips 190 | flips' = [flip `shiftR` 1] 191 | choice flips xs = select (map ($ flips') xs) (head xs [thisFlip]){ ok = false } thisFlip 192 | where 193 | bits = log2 $ length xs 194 | flips' = [head flips `shiftR` bits] 195 | thisFlip = head flips `shiftL` (64 - bits) `shiftR` (64 - bits) 196 | 197 | log2 :: Int -> Int 198 | log2 1 = 0 199 | log2 n = 1 + log2 ((n + 1) `div` 2) 200 | 201 | writeCapture :: Captures -> Int -> Offset -> Captures 202 | writeCapture cap idx val = writeArray cap (toEnum idx) val 203 | 204 | readCapture :: Captures -> Int -> Offset 205 | readCapture a = readArray a . toEnum 206 | 207 | isOne :: Pattern -> Bool 208 | isOne PChar{} = True 209 | isOne PDot{} = True 210 | isOne PAny {} = True 211 | isOne PAnyNot {} = True 212 | isOne (PGroup Nothing p) = isOne p 213 | isOne PEscape {getPatternChar = ch} 214 | | ch `elem` "ntrfaedwsWSD" = True 215 | | ch `elem` "b" = False 216 | | Data.Char.isDigit ch = False 217 | | Data.Char.isAlpha ch = error $ "Unsupported escape: " ++ [ch] 218 | | otherwise = True 219 | isOne _ = False 220 | 221 | matchOne :: (?pat :: Pattern) => SChar -> SBool 222 | matchOne cur = case ?pat of 223 | PChar {getPatternChar = ch} -> isChar ch 224 | PDot{} -> isDot 225 | PGroup Nothing p -> let ?pat = p in matchOne cur 226 | PAny {getPatternSet = pset} -> case pset of 227 | PatternSet (Just cset) _ _ _ -> oneOf $ toList cset 228 | _ -> error "TODO" 229 | PAnyNot {getPatternSet = pset} -> case pset of 230 | PatternSet (Just cset) _ _ _ -> noneOf $ toList cset 231 | _ -> error "TODO" 232 | PEscape {getPatternChar = ch} -> case ch of 233 | 'n' -> isChar '\n' 234 | 't' -> isChar '\t' 235 | 'r' -> isChar '\r' 236 | 'f' -> isChar '\f' 237 | 'a' -> isChar '\a' 238 | 'e' -> isChar '\ESC' 239 | 'd' -> isDigit 240 | 'w' -> isWordChar 241 | 's' -> isWhiteSpace 242 | 'W' -> (isDot &&& bnot isWordChar) 243 | 'S' -> (isDot &&& bnot isWhiteSpace) 244 | 'D' -> (isDot &&& bnot isDigit) 245 | _ -> isChar ch 246 | _ -> false 247 | where 248 | ord = toEnum . 
Data.Char.ord 249 | isChar ch = cur .== ord ch 250 | isDot = (cur .>= ord ' ' &&& cur .<= ord '~') 251 | oneOf cs = bOr [ ord ch .== cur | ch <- cs ] 252 | noneOf cs = bAnd ((cur .>= ord ' ') : (cur .<= ord '~') : [ ord ch ./= cur | ch <- cs ]) 253 | isDigit = (ord '0' .<= cur &&& ord '9' .>= cur) 254 | isWordChar = (cur .>= ord 'A' &&& cur .<= ord 'Z') 255 | ||| (cur .>= ord 'a' &&& cur .<= ord 'z') 256 | ||| (cur .== ord '_') 257 | isWhiteSpace = cur .== 32 ||| (9 .<= cur &&& 13 .>= cur &&& 11 ./= cur) 258 | 259 | 260 | match :: (?maxRepeat :: Int, ?str :: Str, ?pat :: Pattern, ?grp :: GroupLens) => Status -> Status 261 | match s@Status{ pos, flips, captureAt, captureLen } 262 | | isOne ?pat = ite (pos .>= strLen) __FAIL__ one 263 | | otherwise = ite (pos + (toEnum $ minLen ?pat) .> strLen) __FAIL__ $ case ?pat of 264 | PGroup (Just idx) p -> let s'@Status{ pos = pos', ok = ok' } = next p in 265 | ite ok' (s' 266 | { captureAt = writeCapture captureAt idx pos 267 | , captureLen = writeCapture captureLen idx (pos' - pos) 268 | }) __FAIL__ 269 | PGroup _ p -> next p 270 | PCarat{} -> ite (isBegin ||| (charAt (pos-1) .== ord '\n')) s __FAIL__ 271 | PDollar{} -> ite (isEnd ||| (charAt (pos+1) .== ord '\n')) s __FAIL__ 272 | PQuest p -> choice flips [\b -> let ?pat = p in match s{ flips = b }, \b -> s{ flips = b }] 273 | POr [p] -> next p 274 | POr ps -> choice flips $ map (\p -> \b -> let ?pat = p in match s{ flips = b }) ps 275 | PConcat [] -> s 276 | PConcat [p] -> next p 277 | PConcat ps 278 | | all isOne ps -> ite ( 279 | (bAnd [ let ?pat = p in matchOne (charAt (pos+i)) 280 | | p <- ps 281 | | i <- [0..] 282 | ]) 283 | ) s{ pos = pos + toEnum (length ps) } __FAIL__ 284 | | (ones@(_:_:_), rest) <- span isOne ps -> step [PConcat ones, PConcat rest] s 285 | | (nones@(_:_), rest@(_:_:_)) <- span (not . 
isOne) ps -> step (nones ++ [PConcat rest]) s 286 | | otherwise -> step ps s 287 | where 288 | step [] s' = s' 289 | step (p':ps') s' = 290 | let s''@Status{ ok } = (let ?pat = p' in match s') 291 | res = step ps' s'' 292 | in ite ok res __FAIL__ 293 | PEscape {getPatternChar = ch} -> case ch of 294 | 'b' -> ite isWordBoundary s __FAIL__ 295 | _ | Data.Char.isDigit ch -> 296 | let from = readCapture captureAt num 297 | Just defaultLen = IntMap.lookup num ?grp 298 | possibleLens = IntSet.toList defaultLen 299 | len = case possibleLens of 300 | [] -> 0 301 | [l] -> toEnum l 302 | _ -> readCapture captureLen num 303 | num = charToDigit ch 304 | in ite (matchCapture (from :: Offset) len 0) s{ pos = pos+len } __FAIL__ 305 | | Data.Char.isAlpha ch -> error $ "Unsupported escape: " ++ [ch] 306 | | otherwise -> cond (ord ch .== cur) 307 | PBound low (Just high) p -> let s'@Status{ ok = ok' } = (let ?pat = PConcat (replicate low p) in match s) in 308 | if low == high then s' else ite ok' (let ?pat = p in (manyTimes s' $ high - low)) s' 309 | PBound low _ p -> let ?pat = (PBound low (Just $ low + ?maxRepeat) p) in match s 310 | PPlus p -> 311 | let s'@Status{ok} = next p 312 | res = let ?pat = PStar True p in match s' 313 | in ite ok res s' 314 | PStar _ p -> next $ PBound 0 Nothing p 315 | PEmpty -> s 316 | _ -> error $ show ?pat 317 | where 318 | one = cond $ matchOne cur 319 | next p = let ?pat = p in match s 320 | strLen = toEnum (length ?str) 321 | manyTimes :: (?pat :: Pattern) => Status -> Int -> Status 322 | manyTimes s'@Status{ flips = flips' } n 323 | | n <= 0 = s' 324 | | otherwise = choice flips' [\b -> s'{ flips = b }, nextTime] 325 | where 326 | nextTime b = let s''@Status{ ok = ok'', pos = pos'' } = match s'{ flips = b } in 327 | ite (pos'' .<= strLen &&& ok'') (manyTimes s'' (n-1)) s'' 328 | 329 | cur = charAt pos 330 | charAt = select ?str 0 331 | cond b = ite b s{ pos = pos+1 } __FAIL__ 332 | ord = toEnum . Data.Char.ord 333 | matchCapture :: Offset -> Offset -> Int -> SBool 334 | matchCapture from len n 335 | | n >= (length ?str) = true 336 | | otherwise = (len .<= off) ||| (charAt (pos+off) .== charAt (from+off) &&& matchCapture from len (n+1)) 337 | where 338 | off = toEnum n 339 | __FAIL__ = s{ ok = false, pos = maxBound, flips = [maxBound] } 340 | isEnd = (pos .== toEnum (length ?str)) 341 | isBegin = (pos .== 0) 342 | isWordCharAt at = let char = charAt at in 343 | (char .>= ord 'A' &&& char .<= ord 'Z') 344 | ||| 345 | (char .>= ord 'a' &&& char .<= ord 'z') 346 | ||| 347 | (char .== ord '_') 348 | isWordBoundary = case length ?str of 349 | 0 -> false 350 | _ -> (isEnd &&& isWordCharAt (pos-1)) ||| 351 | (isBegin &&& isWordCharAt pos) ||| 352 | (isWordCharAt (pos-1) <+> isWordCharAt pos) 353 | 354 | 355 | displayString :: [SatResult] -> Hits -> (Hits -> IO ()) -> IO () 356 | displayString [] a next = next a 357 | displayString (r:rs) a next = do 358 | let Right (_, (chars, rank)) = getModel r 359 | putStr $ show (length (chars :: [Word8])) ++ "." 360 | let n = show (rank :: Word64) 361 | putStr (replicate (8 - length n) '0') 362 | putStr n 363 | putStr "\t\t" 364 | print $ map chr chars 365 | if (a+1 >= maxHits) then return () else 366 | displayString rs (a+1) next 367 | where 368 | chr = Data.Char.chr . 
fromEnum 369 | 370 | genexWith :: (?maxRepeat :: Int, Monoid a) => ([SatResult] -> Hits -> (Hits -> IO a) -> IO a) -> [[Char]] -> IO a 371 | genexWith f regexes = do 372 | let ?grp = mempty 373 | let p'lens = [ ((p', groupLens), lens) 374 | | p <- [ if r == "" then PEmpty else parse r | r <- regexes ] 375 | , let (lens, (groupLens, backRefs)) = runState (possibleLengths p) mempty 376 | , let p' = normalize backRefs p 377 | ] 378 | let ?pats = map fst p'lens 379 | let lens = IntSet.toAscList $ foldl1 IntSet.intersection (map snd p'lens) 380 | tryWith f (filter (<= maxLength) $ map toEnum lens) 0 381 | 382 | tryWith :: (?maxRepeat :: Int, ?pats :: [(Pattern, GroupLens)]) => 383 | Monoid a => ResultHandler a -> [Len] -> Hits -> IO a 384 | tryWith _ [] _ = return mempty 385 | tryWith f (len:lens) acc = if len > maxLength then return mempty else do 386 | AllSatResult (_, allRes) <- allSat $ exactMatch len 387 | f (map SatResult allRes) acc $ tryWith f lens 388 | 389 | type ResultHandler a = [SatResult] -> Hits -> (Hits -> IO a) -> IO a 390 | 391 | getStringWith :: (Model -> a) -> [SatResult] -> Hits -> (Hits -> IO [a]) -> IO [a] 392 | getStringWith _ [] a next = next a 393 | getStringWith f (r:rs) a next = do 394 | let Right (_, (chars, rank)) = getModel r 395 | rest <- if (a+1 >= maxHits) then return [] else 396 | unsafeInterleaveIO $ getStringWith f rs (a+1) next 397 | return (f (Model chars rank):rest) 398 | 399 | getString :: [SatResult] -> Hits -> (Hits -> IO [String]) -> IO [String] 400 | getString = getStringWith $ \Model{ modelChars } -> map chr modelChars 401 | where 402 | chr = Data.Char.chr . fromEnum 403 | 404 | -- Given a regex and a symbolic string, returns true if regex matches the string 405 | regexMatch :: (?maxRepeat :: Int) => [[Char]] -> Str -> Symbolic SBool 406 | regexMatch regexes str = do 407 | let ?grp = mempty 408 | let p'lens = [ ((p', groupLens), lens) 409 | | p <- [ if r == "" then PEmpty else parse r | r <- regexes ] 410 | , let (lens, (groupLens, backRefs)) = runState (possibleLengths p) mempty 411 | , let p' = normalize backRefs p 412 | ] 413 | let ?pats = map fst p'lens 414 | let lens = IntSet.toAscList $ foldl1 IntSet.intersection (map snd p'lens) 415 | initialFlips <- mkExistVars 1 416 | captureAt <- newArray_ (Just minBound) 417 | captureLen <- newArray_ (Just minBound) 418 | let ?str = str 419 | let strLen = literal (fromIntegral (length str)) 420 | let initialStatus = Status 421 | { ok = true 422 | , pos = strLen 423 | , flips = initialFlips 424 | , captureAt = captureAt 425 | , captureLen = captureLen 426 | } 427 | runPat s (pat, groupLens) = let ?pat = pat in let ?grp = groupLens in 428 | ite (ok s &&& pos s .== strLen) 429 | (match s{ pos = 0, captureAt, captureLen }) 430 | s{ ok = false, pos = maxBound, flips = [maxBound] } 431 | let Status{ ok, pos, flips } = foldl runPat initialStatus ?pats 432 | return (bAll (.== 0) flips &&& pos .== strLen &&& ok) 433 | -------------------------------------------------------------------------------- /src/Regex/Genex/Normalize.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ImplicitParams, NamedFieldPuns, PatternGuards #-} 2 | module Regex.Genex.Normalize (normalize) where 3 | import Data.Set (toList, Set) 4 | import Text.Regex.TDFA.Pattern 5 | import Text.Regex.TDFA.ReadRegex (parseRegex) 6 | import Data.IntSet (IntSet) 7 | import qualified Data.IntSet as IntSet 8 | import qualified Data.Set as Set 9 | 10 | type BackReferences = IntSet 11 | 12 | -- | Normalize a 
regex into @strong star normal form@, as defined in the paper 13 | -- @Simplifying Regular Expressions: A Quantitative Perspective@. 14 | normalize :: BackReferences -> Pattern -> Pattern 15 | normalize refs p = black $ let ?refs = refs in simplify p 16 | 17 | nullable :: Pattern -> Bool 18 | nullable pat = case pat of 19 | PGroup _ p -> nullable p 20 | PQuest{} -> True 21 | POr ps -> any nullable ps 22 | PConcat ps -> all nullable ps 23 | PBound 0 _ _ -> True 24 | PBound _ _ _ -> False 25 | PStar{} -> True 26 | PEmpty -> True 27 | _ -> False 28 | 29 | white :: Pattern -> Pattern 30 | white pat = case pat of 31 | PQuest p -> white p 32 | PStar _ p -> white p 33 | PGroup x p -> PGroup x $ white p 34 | POr ps -> POr (map white ps) 35 | PConcat ps -> if nullable pat 36 | then POr (map white ps) 37 | else pat 38 | PPlus p -> if nullable pat 39 | then PConcat [p, white p] 40 | else pat 41 | _ -> pat 42 | 43 | black :: Pattern -> Pattern 44 | black pat = case pat of 45 | POr ps -> POr (map black ps) 46 | PConcat ps -> PConcat (map black ps) 47 | PGroup x p -> PGroup x $ black p 48 | PStar x p -> PStar x $ white (black p) 49 | PPlus p -> PConcat [p, PStar (nullable p) (white $ black p)] 50 | PBound 0 Nothing p -> PStar (nullable p) (white $ black p) 51 | PBound x Nothing p -> PConcat [PBound x (Just x) p, PStar (nullable p) (white $ black p)] 52 | PBound x y p -> PBound x y $ black p 53 | PQuest p -> if nullable p 54 | then black p 55 | else PQuest $ black p 56 | _ -> pat 57 | 58 | _parse :: String -> Pattern 59 | _parse r = case parseRegex r of 60 | Right (pattern, _) -> pattern 61 | Left x -> error $ show x 62 | 63 | foldChars :: (Set Char, [Pattern]) -> Pattern -> (Set Char, [Pattern]) 64 | foldChars (cset, rest) pat = case pat of 65 | PChar { getPatternChar = ch } -> (Set.insert ch cset, rest) 66 | PAny {getPatternSet = PatternSet (Just cset') _ _ _} -> (Set.union cset cset', rest) 67 | _ -> (cset, pat:rest) 68 | 69 | simplify :: (?refs :: BackReferences) => Pattern -> Pattern 70 | simplify pat = case pat of 71 | PGroup (Just idx) p -> if idx `IntSet.member` ?refs then PGroup (Just idx) (simplify p) else simplify p 72 | PGroup _ p -> simplify p 73 | PQuest p -> case simplify p of 74 | PEmpty -> PEmpty 75 | p' -> PQuest p' 76 | PAny {getPatternSet = pset, getDoPa} -> case pset of 77 | PatternSet (Just cset) _ _ _ -> case toList cset of 78 | [ch] -> PChar { getPatternChar = ch, getDoPa } 79 | _ -> pat 80 | _ -> pat 81 | POr [] -> PEmpty 82 | POr [p] -> simplify p 83 | POr ps -> let ps' = map simplify ps in 84 | case foldl foldChars (Set.empty, []) ps' of 85 | (cset, rest) 86 | | null rest -> anySet 87 | | Set.null cset -> POr rest 88 | | [r] <- rest -> POr [anySet, r] 89 | | otherwise -> POr [anySet, POr rest] 90 | where 91 | anySet = case Set.size cset of 92 | 1 -> PChar { getPatternChar = Set.findMin cset, getDoPa = toEnum 0 } 93 | _ -> PAny { getPatternSet = PatternSet (Just cset) Nothing Nothing Nothing, getDoPa = toEnum 0 } 94 | PConcat [] -> PEmpty 95 | PConcat [p] -> simplify p 96 | PConcat ps -> case concatMap (fromConcat . 
simplify) ps of 97 | [] -> PEmpty 98 | ps' -> PConcat ps' 99 | where 100 | fromConcat (PConcat ps') = ps' 101 | fromConcat PEmpty = [] 102 | fromConcat p = [p] 103 | PBound low (Just high) p 104 | | high == low -> simplify $ PConcat (replicate low (simplify p)) 105 | PBound low high p -> PBound low high (simplify p) 106 | PPlus p -> PPlus (simplify p) 107 | PStar x p -> PStar x (simplify p) 108 | _ -> pat 109 | 110 | -------------------------------------------------------------------------------- /src/Regex/Genex/Pure.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecordWildCards, NamedFieldPuns #-} 2 | module Regex.Genex.Pure (genexPure) where 3 | import qualified Data.Text as T 4 | import qualified Data.IntSet as IntSet 5 | import qualified Data.Set as Set 6 | import Data.List (intersect, (\\)) 7 | import Control.Monad 8 | import Control.Monad.Stream 9 | import qualified Control.Monad.Stream as Stream 10 | import Regex.Genex.Normalize (normalize) 11 | import Debug.Trace 12 | import Text.Regex.TDFA.Pattern 13 | import Text.Regex.TDFA.ReadRegex (parseRegex) 14 | import Control.Monad.State 15 | import Control.Applicative 16 | 17 | parse :: String -> Pattern 18 | parse r = case parseRegex r of 19 | Right (pattern, _) -> pattern 20 | Left x -> error $ show x 21 | 22 | genexPure :: [String] -> [String] 23 | genexPure = map T.unpack . foldl1 intersect . map (Stream.runStream . run . normalize IntSet.empty . parse) 24 | 25 | maxRepeat :: Int 26 | maxRepeat = 10 27 | 28 | each = foldl1 (<|>) . map return 29 | 30 | run :: Pattern -> Stream T.Text 31 | run p = case p of 32 | PEmpty -> pure T.empty 33 | PChar{..} -> isChar getPatternChar 34 | PAny {getPatternSet = PatternSet (Just cset) _ _ _} -> each $ map T.singleton $ Set.toList cset 35 | PAnyNot {getPatternSet = PatternSet (Just cset) _ _ _} -> chars $ notChars $ concatMap expandEscape $ Set.toList cset 36 | PQuest p -> pure T.empty <|> run p 37 | PPlus p -> run $ PBound 1 Nothing p 38 | PStar _ p -> run $ PBound 0 Nothing p 39 | PBound low high p -> do 40 | n <- each [low..maybe (low+maxRepeat) id high] 41 | fmap T.concat . sequence $ replicate n (run p) 42 | PConcat ps -> fmap T.concat . suspended . sequence $ map run ps 43 | POr xs -> foldl1 mplus $ map run xs 44 | PDot{} -> chars $ notChars [] 45 | PEscape {..} -> chars $ expandEscape getPatternChar 46 | _ -> error $ show p 47 | where 48 | isChar = return . T.singleton 49 | chars = each . map T.singleton 50 | notChars = ([' '..'~'] \\) 51 | expandEscape ch = case ch of 52 | 'n' -> "\n" 53 | 't' -> "\t" 54 | 'r' -> "\r" 55 | 'f' -> "\f" 56 | 'a' -> "\a" 57 | 'e' -> "\ESC" 58 | 'd' -> ['0'..'9'] 59 | 'w' -> ['0'..'9'] ++ '_' : ['a'..'z'] ++ ['A'..'Z'] 60 | 's' -> "\9\32" 61 | 'D' -> notChars $ ['0'..'9'] 62 | 'W' -> notChars $ ['0'..'9'] ++ '_' : ['a'..'z'] ++ ['A'..'Z'] 63 | 'S' -> notChars "\9\32" 64 | ch -> [ch] 65 | --------------------------------------------------------------------------------