├── .gitignore ├── LICENSE ├── README.md ├── img └── sturgeon.png ├── project.clj ├── src └── miner │ ├── strgen.cljc │ └── strgen │ └── impl.cljc └── test └── miner └── test_strgen.cljc /.gitignore: -------------------------------------------------------------------------------- 1 | pom.xml 2 | pom.xml.asc 3 | *jar 4 | /lib/ 5 | /classes/ 6 | /target/ 7 | /checkouts/ 8 | .lein-deps-sum 9 | .lein-repl-history 10 | .lein-plugins/ 11 | .lein-failures 12 | .nrepl-port 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Eclipse Public License - v 1.0 2 | 3 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 4 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 5 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 6 | 7 | 1. DEFINITIONS 8 | 9 | "Contribution" means: 10 | 11 | a) in the case of the initial Contributor, the initial code and documentation 12 | distributed under this Agreement, and 13 | b) in the case of each subsequent Contributor: 14 | i) changes to the Program, and 15 | ii) additions to the Program; 16 | 17 | where such changes and/or additions to the Program originate from and are 18 | distributed by that particular Contributor. A Contribution 'originates' from 19 | a Contributor if it was added to the Program by such Contributor itself or 20 | anyone acting on such Contributor's behalf. Contributions do not include 21 | additions to the Program which: (i) are separate modules of software 22 | distributed in conjunction with the Program under their own license 23 | agreement, and (ii) are not derivative works of the Program. 24 | 25 | "Contributor" means any person or entity that distributes the Program. 26 | 27 | "Licensed Patents" mean patent claims licensable by a Contributor which are 28 | necessarily infringed by the use or sale of its Contribution alone or when 29 | combined with the Program. 30 | 31 | "Program" means the Contributions distributed in accordance with this Agreement. 32 | 33 | "Recipient" means anyone who receives the Program under this Agreement, 34 | including all Contributors. 35 | 36 | 2. GRANT OF RIGHTS 37 | a) Subject to the terms of this Agreement, each Contributor hereby grants 38 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 39 | reproduce, prepare derivative works of, publicly display, publicly perform, 40 | distribute and sublicense the Contribution of such Contributor, if any, and 41 | such derivative works, in source code and object code form. 42 | b) Subject to the terms of this Agreement, each Contributor hereby grants 43 | Recipient a non-exclusive, worldwide, royalty-free patent license under 44 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 45 | transfer the Contribution of such Contributor, if any, in source code and 46 | object code form. This patent license shall apply to the combination of the 47 | Contribution and the Program if, at the time the Contribution is added by 48 | the Contributor, such addition of the Contribution causes such combination 49 | to be covered by the Licensed Patents. The patent license shall not apply 50 | to any other combinations which include the Contribution. No hardware per 51 | se is licensed hereunder. 52 | c) Recipient understands that although each Contributor grants the licenses to 53 | its Contributions set forth herein, no assurances are provided by any 54 | Contributor that the Program does not infringe the patent or other 55 | intellectual property rights of any other entity. Each Contributor 56 | disclaims any liability to Recipient for claims brought by any other entity 57 | based on infringement of intellectual property rights or otherwise. As a 58 | condition to exercising the rights and licenses granted hereunder, each 59 | Recipient hereby assumes sole responsibility to secure any other 60 | intellectual property rights needed, if any. For example, if a third party 61 | patent license is required to allow Recipient to distribute the Program, it 62 | is Recipient's responsibility to acquire that license before distributing 63 | the Program. 64 | d) Each Contributor represents that to its knowledge it has sufficient 65 | copyright rights in its Contribution, if any, to grant the copyright 66 | license set forth in this Agreement. 67 | 68 | 3. REQUIREMENTS 69 | 70 | A Contributor may choose to distribute the Program in object code form under its 71 | own license agreement, provided that: 72 | 73 | a) it complies with the terms and conditions of this Agreement; and 74 | b) its license agreement: 75 | i) effectively disclaims on behalf of all Contributors all warranties and 76 | conditions, express and implied, including warranties or conditions of 77 | title and non-infringement, and implied warranties or conditions of 78 | merchantability and fitness for a particular purpose; 79 | ii) effectively excludes on behalf of all Contributors all liability for 80 | damages, including direct, indirect, special, incidental and 81 | consequential damages, such as lost profits; 82 | iii) states that any provisions which differ from this Agreement are offered 83 | by that Contributor alone and not by any other party; and 84 | iv) states that source code for the Program is available from such 85 | Contributor, and informs licensees how to obtain it in a reasonable 86 | manner on or through a medium customarily used for software exchange. 87 | 88 | When the Program is made available in source code form: 89 | 90 | a) it must be made available under this Agreement; and 91 | b) a copy of this Agreement must be included with each copy of the Program. 92 | Contributors may not remove or alter any copyright notices contained within 93 | the Program. 94 | 95 | Each Contributor must identify itself as the originator of its Contribution, if 96 | any, in a manner that reasonably allows subsequent Recipients to identify the 97 | originator of the Contribution. 98 | 99 | 4. COMMERCIAL DISTRIBUTION 100 | 101 | Commercial distributors of software may accept certain responsibilities with 102 | respect to end users, business partners and the like. While this license is 103 | intended to facilitate the commercial use of the Program, the Contributor who 104 | includes the Program in a commercial product offering should do so in a manner 105 | which does not create potential liability for other Contributors. Therefore, if 106 | a Contributor includes the Program in a commercial product offering, such 107 | Contributor ("Commercial Contributor") hereby agrees to defend and indemnify 108 | every other Contributor ("Indemnified Contributor") against any losses, damages 109 | and costs (collectively "Losses") arising from claims, lawsuits and other legal 110 | actions brought by a third party against the Indemnified Contributor to the 111 | extent caused by the acts or omissions of such Commercial Contributor in 112 | connection with its distribution of the Program in a commercial product 113 | offering. The obligations in this section do not apply to any claims or Losses 114 | relating to any actual or alleged intellectual property infringement. In order 115 | to qualify, an Indemnified Contributor must: a) promptly notify the Commercial 116 | Contributor in writing of such claim, and b) allow the Commercial Contributor to 117 | control, and cooperate with the Commercial Contributor in, the defense and any 118 | related settlement negotiations. The Indemnified Contributor may participate in 119 | any such claim at its own expense. 120 | 121 | For example, a Contributor might include the Program in a commercial product 122 | offering, Product X. That Contributor is then a Commercial Contributor. If that 123 | Commercial Contributor then makes performance claims, or offers warranties 124 | related to Product X, those performance claims and warranties are such 125 | Commercial Contributor's responsibility alone. Under this section, the 126 | Commercial Contributor would have to defend claims against the other 127 | Contributors related to those performance claims and warranties, and if a court 128 | requires any other Contributor to pay any damages as a result, the Commercial 129 | Contributor must pay those damages. 130 | 131 | 5. NO WARRANTY 132 | 133 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN 134 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR 135 | IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, 136 | NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each 137 | Recipient is solely responsible for determining the appropriateness of using and 138 | distributing the Program and assumes all risks associated with its exercise of 139 | rights under this Agreement , including but not limited to the risks and costs 140 | of program errors, compliance with applicable laws, damage to or loss of data, 141 | programs or equipment, and unavailability or interruption of operations. 142 | 143 | 6. DISCLAIMER OF LIABILITY 144 | 145 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 146 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 147 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST 148 | PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 149 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 150 | OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS 151 | GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 152 | 153 | 7. GENERAL 154 | 155 | If any provision of this Agreement is invalid or unenforceable under applicable 156 | law, it shall not affect the validity or enforceability of the remainder of the 157 | terms of this Agreement, and without further action by the parties hereto, such 158 | provision shall be reformed to the minimum extent necessary to make such 159 | provision valid and enforceable. 160 | 161 | If Recipient institutes patent litigation against any entity (including a 162 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 163 | (excluding combinations of the Program with other software or hardware) 164 | infringes such Recipient's patent(s), then such Recipient's rights granted under 165 | Section 2(b) shall terminate as of the date such litigation is filed. 166 | 167 | All Recipient's rights under this Agreement shall terminate if it fails to 168 | comply with any of the material terms or conditions of this Agreement and does 169 | not cure such failure in a reasonable period of time after becoming aware of 170 | such noncompliance. If all Recipient's rights under this Agreement terminate, 171 | Recipient agrees to cease use and distribution of the Program as soon as 172 | reasonably practicable. However, Recipient's obligations under this Agreement 173 | and any licenses granted by Recipient relating to the Program shall continue and 174 | survive. 175 | 176 | Everyone is permitted to copy and distribute copies of this Agreement, but in 177 | order to avoid inconsistency the Agreement is copyrighted and may only be 178 | modified in the following manner. The Agreement Steward reserves the right to 179 | publish new versions (including revisions) of this Agreement from time to time. 180 | No one other than the Agreement Steward has the right to modify this Agreement. 181 | The Eclipse Foundation is the initial Agreement Steward. The Eclipse Foundation 182 | may assign the responsibility to serve as the Agreement Steward to a suitable 183 | separate entity. Each new version of the Agreement will be given a 184 | distinguishing version number. The Program (including Contributions) may always 185 | be distributed subject to the version of the Agreement under which it was 186 | received. In addition, after a new version of the Agreement is published, 187 | Contributor may elect to distribute the Program (including its Contributions) 188 | under the new version. Except as expressly stated in Sections 2(a) and 2(b) 189 | above, Recipient receives no rights or licenses to the intellectual property of 190 | any Contributor under this Agreement, whether expressly, by implication, 191 | estoppel or otherwise. All rights in the Program not expressly granted under 192 | this Agreement are reserved. 193 | 194 | This Agreement is governed by the laws of the State of New York and the 195 | intellectual property laws of the United States of America. No party to this 196 | Agreement will bring a legal action under this Agreement more than one year 197 | after the cause of action arose. Each party waives its rights to a jury trial in 198 | any resulting litigation. 199 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # strgen 2 | 3 | A Clojure library with a test.check generator that generates strings from regular 4 | expressions. Use `miner.strgen/string-generator` with [test.check][tc] and 5 | [clojure.spec][cs]. 6 | 7 | There is also a case-insensitive variant called 8 | `miner.strgen/case-insensitive-string-generator` which generates strings with mixed case 9 | even though they were not explicitly represented as such in the regex. This feature is 10 | somewhat analogous to setting the `CASE_INSENSITIVE` flag with Java regular expressions. 11 | 12 | The `clojure.spec` library is new in Clojure 1.9. 13 | 14 | [tc]: https://github.com/clojure/test.check "test.check" 15 | [cs]: http://clojure.org/guides/spec 16 | 17 | ![strgen](img/sturgeon.png) 18 | 19 | 20 | ## Version 21 | 22 | \[com.velisco/strgen "0.2.5"] 23 | 24 | [![strgen on Clojars][shield]][st] 25 | 26 | [latest]: https://clojars.org/com.velisco/strgen/latest-version.svg "strgen on Clojars" 27 | [shield]: https://img.shields.io/clojars/v/com.velisco/strgen.svg "strgen on Clojars" 28 | [st]: https://clojars.org/com.velisco/strgen "strgen on Clojars" 29 | 30 | 31 | ## Usage 32 | 33 | ```clojure 34 | (require '[miner.strgen :as sg]) 35 | (require '[clojure.spec.alpha :as s]) 36 | (require '[clojure.test.check.generators :as gen]) 37 | 38 | (gen/sample (sg/string-generator #"[A-Z]{2,4}")) 39 | 40 | ;;=> ("ZOHX" "ZOXZ" "INX" "JO" "MRMZ" "TO" "PEHM" "YNK" "FJ" "JWH") 41 | 42 | (s/def ::foobar (let [re #"fo+(bar)?"] 43 | (s/spec (s/and string? #(re-matches re %)) 44 | :gen #(sg/string-generator re)))) 45 | 46 | (s/exercise ::foobar) 47 | 48 | ;;=> (["fo" "fo"] ["fobar" "fobar"] ["fo" "fo"] ["foo" "foo"] ["foooo" "foooo"] 49 | ;; ["fooo" "fooo"] ["fo" "fo"] ["foobar" "foobar"] ["fooooobar" "fooooobar"] 50 | ;; ["fooooobar" "fooooobar"]) 51 | 52 | 53 | (gen/sample (sg/case-insensitive-string-generator #"fo+ba[rz]")) 54 | 55 | ;;=> ("FobAz" "FOBAR" "fOBaZ" "FOOBAR" "fooBaZ" "FoOOoObar" "foooBAr" "foooObAZ" "fOOOoObAz" 56 | ;; "fooooooooobar") 57 | 58 | ``` 59 | 60 | ## Limitations 61 | 62 | Not all Java [regular expressions][re] are supported. The basics work, including: 63 | x* y? z+. [abc] [a-z] [^a] \n (a|b) \w \W \d \D \s \S x{N} x{N,} x{M,N}. ^x$ ignores the leading 64 | ^ and trailing $ as they generate no characters. (Capture groups) and \1 style back 65 | references are not supported. Character groups like [:alnum:] are not supported. All the 66 | other fancy flags, quantifiers, character classes, etc. are unsupported. In summary, if I 67 | couldn't use the regex feature without looking it up, I didn't support it. If people ask 68 | for something, I might work on it. 69 | 70 | [re]: http://en.wikipedia.org/wiki/Regular_expression 71 | 72 | When generating X-or-more items for regular expressions such as #"x*" or #"y+", the 73 | generator limits the number of items to a reasonably small count. You can control this with 74 | an optional second arg `or-more-limit` (an integer, default 9) when calling 75 | `string-generator`. 76 | 77 | 78 | ## Related Projects 79 | 80 | If you need better support for Java regular expressions when generating Strings, you should 81 | consider using the [test.chuck][chuck] library which provides the `string-from-regex` 82 | generator. 83 | 84 | [chuck]: https://github.com/gfredericks/test.chuck "test.chuck" 85 | 86 | 87 | ## License 88 | 89 | Copyright © 2016-2023 Stephen E. Miner 90 | 91 | Distributed under the Eclipse Public License, same as Clojure. 92 | 93 | -------------------------------------------------------------------------------- /img/sturgeon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miner/strgen/ec4bdea1d7add8ea61c41d0eb57fd8f79720eaf5/img/sturgeon.png -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject com.velisco/strgen "0.2.5" 2 | :description "String generator from regular expressions, for use with Clojure test.check and spec" 3 | :url "https://github.com/miner/strgen" 4 | :deploy-repositories {"releases" :clojars} 5 | :license {:name "Eclipse Public License" 6 | :url "http://www.eclipse.org/legal/epl-v10.html"} 7 | :dependencies [[org.clojure/clojure "1.11.4"] 8 | [org.clojure/clojurescript "1.11.132"] 9 | [org.clojure/test.check "1.1.1"]] 10 | ) 11 | 12 | -------------------------------------------------------------------------------- /src/miner/strgen.cljc: -------------------------------------------------------------------------------- 1 | (ns miner.strgen 2 | (:require [miner.strgen.impl :as impl] 3 | [clojure.test.check.generators :as gen])) 4 | 5 | (defn string-generator 6 | "Returns a test.check generator that generates strings matching the given regular 7 | expression `regex`. (Fancy flags and POSIX extensions are not suppored; see the doc for 8 | more information about the supported regular expression syntax.) The optional 9 | `or-more-limit` controls the maximum numbers of elements that are generated when matching 10 | a potentially unbounded regex (such as #\"x*\" or #\"y+\"). The default is 9." 11 | 12 | ([regex] 13 | (impl/string-generator regex)) 14 | 15 | ([regex or-more-limit] 16 | (impl/string-generator regex or-more-limit))) 17 | 18 | 19 | (defn case-insensitive-string-generator 20 | "Like `string-generator` but case-insensitive so it generates a mix of upper and lowercase 21 | characters for the given regex." 22 | 23 | ([regex] 24 | (gen/bind (string-generator regex) impl/gen-case-insensitive)) 25 | 26 | ([regex or-more-limit] 27 | (gen/bind (string-generator regex or-more-limit) impl/gen-case-insensitive))) 28 | -------------------------------------------------------------------------------- /src/miner/strgen/impl.cljc: -------------------------------------------------------------------------------- 1 | (ns miner.strgen.impl 2 | (:require [clojure.set :as set] 3 | [clojure.string :as str] 4 | #?(:cljs cljs.reader) 5 | [clojure.test.check.generators :as gen])) 6 | 7 | ;; All of this is considered private 8 | 9 | ;; Minimal regex features 10 | ;; *?+ . [abc] [a-z] [^a] \n (a|b) \w \W \d \D \s \S 11 | ;; x{N} x{N,} x{N,M} -- number of matches 12 | ;; ^x$ works by ignoring ^ as first, and $ as last 13 | 14 | ;; not supported: \1, (capture group), [:alnum:], fancy flags, fancy quantifiers 15 | ;; http://en.wikipedia.org/wiki/Regular_expression 16 | 17 | ;; Try test.chuck for a better string-from-regex-generator, but it has some dependecies that 18 | ;; will not allow it to be used in a contrib library. 19 | ;; https://github.com/gfredericks/test.chuck 20 | 21 | ;; For regex * + {N,} there is technically no limit to how many items might match "or 22 | ;; more". For purposes of generating strings, we limit the or-more items to 23 | ;; *or-more-limit*. 24 | 25 | ;; Note: the intention is to capture the value of *or-more-limit* at the time of parsing the 26 | ;; regex -- that is, when `string-generator` is called with a second argument for 27 | ;; `or-more-limit`. Binding *or-more-limit* should not affect the results of executing a 28 | ;; saved generator at a later time. 29 | 30 | (def ^:dynamic *or-more-limit* 9) 31 | 32 | (declare parse-chars) 33 | 34 | (defn slash [c] 35 | (case c 36 | \d '(:digit) 37 | \D '(:not-digit) 38 | \w '(:word) 39 | \W '(:not-word) 40 | \s '(:space) 41 | \S '(:not-space) 42 | \t '(:tab) 43 | \n '(:newline) 44 | \r '(:return) 45 | ;; all the special characters in `parse-chars` should be literals after a backslash 46 | (\[ \] \* \+ \. \? \\ \( \) \/ \$ \^ \| \{ \}) c 47 | (throw (ex-info (str "Unsupported backslash char " c) {:unsupported-backslash c})))) 48 | 49 | (defn parse-set-contents [cs result] 50 | ;;(println "parse-set-contents " result " " (first cs)) 51 | (case (first cs) 52 | nil (throw (ex-info "Unterminated [set]" {:error :unterminated-set :partial result})) 53 | \] (if (not (seq result)) 54 | (recur (rest cs) (conj result \])) 55 | [(list* :set result) (rest cs)]) 56 | \- (if (or (not (seq result)) (= \] (second cs))) 57 | (recur (rest cs) (conj result \-)) 58 | (recur (rest (rest cs)) (conj (pop result) (list :btw (peek result) (second cs))))) 59 | \\ (if (= \] (second cs)) 60 | (recur (rest cs) (conj result \\)) 61 | (recur (rest (rest cs)) (conj result (slash (second cs))))) 62 | (recur (rest cs) (conj result (first cs))))) 63 | 64 | ;; already consumed first [ 65 | (defn parse-set [cs] 66 | (case (first cs) 67 | \^ (let [[setexp rst] (parse-set-contents (rest cs) [])] 68 | [(list* :inverted (rest setexp)) rst]) 69 | (parse-set-contents cs []))) 70 | 71 | (defn read-digits [ds] 72 | (when (seq ds) 73 | #?(:clj (Long/parseLong (apply str ds)) 74 | :cljs (cljs.reader/read-string (apply str ds))))) 75 | 76 | ;; parse {N,M} where { is already consumed 77 | ;; but ,M is optional 78 | ;; Note subtle distinction between exactly {N} no comma, and N-or-more {N,} 79 | ;; For purposes of gen, we cap unspecified N-or-more at (+ N *or-more-limit*) 80 | 81 | ;; returns vector of [N, M (possibly nil), rest-of-cs] 82 | (defn parse-times [cs] 83 | (loop [digits [] n nil comma false cs cs] 84 | (case (first cs) 85 | \} (if n 86 | [n (when comma (or (read-digits digits) (+ n *or-more-limit*))) (rest cs)] 87 | [(read-digits digits) nil (rest cs)]) 88 | \space (recur digits n comma (rest cs)) 89 | \, (recur [] (read-digits digits) true (rest cs)) 90 | (\0 \1 \2 \3 \4 \5 \6 \7 \8 \9) (recur (conj digits (first cs)) n comma (rest cs))))) 91 | 92 | ;; special chars for `parse-chars` should be literals for `slash` 93 | (defn parse-chars 94 | ([cs] (parse-chars cs [] [])) 95 | ([cs group result] 96 | ;; (println group " " result " " (first cs)) 97 | (case (first cs) 98 | nil (if (empty? result) group (conj result group)) 99 | \^ (if (and (empty? result) (empty? group)) 100 | ;; ignore, but only at start of regex 101 | (recur (rest cs) group result) 102 | (throw (ex-info "Unexpected ^ found" 103 | {:error :caret 104 | :partial (if (empty? result) group (conj result group)) 105 | :remaining (apply str cs)}))) 106 | \$ (if (empty? (rest cs)) 107 | ;; ignore, but only at end of regex 108 | (recur (rest cs) group result) 109 | (throw (ex-info "Unexpected $ found" 110 | {:error :dollar 111 | :partial (if (empty? result) group (conj result group)) 112 | :remaining (apply str cs)}))) 113 | \( (recur (rest cs) [] (conj result group)) 114 | \) (if (empty? result) 115 | (throw (ex-info "Missing ( for group" 116 | {:error :missing-open 117 | :partial group 118 | :remaining (apply str cs)})) 119 | (recur (rest cs) (conj (peek result) group) (pop result))) 120 | \[ (let [[setexp rst] (parse-set (rest cs))] (recur rst (conj group setexp) result)) 121 | \. (recur (rest cs) (conj group '(:any)) result) 122 | ;; :alt is temporarily inserted in place, to be regrouped later 123 | \| (recur (rest cs) (conj group :alt) result) 124 | \* (recur (rest cs) (conj (pop group) (list :* (peek group))) result) 125 | \+ (recur (rest cs) (conj (pop group) (list :+ (peek group))) result) 126 | \? (recur (rest cs) (conj (pop group) (list :? (peek group))) result) 127 | \{ (let [[n m rst] (parse-times (rest cs))] 128 | (recur rst (conj (pop group) (list :times (peek group) n m)) result)) 129 | \\ (recur (rest (rest cs)) (conj group (slash (second cs))) result) 130 | (recur (rest cs) (conj group (first cs)) result)))) 131 | 132 | 133 | ;; Hack alert! regroup-alt walks throught the initial parse and figures out how to regroup 134 | ;; the infix :alt markers into sexp-style prefix notation. Yes, it should have been done 135 | ;; that way from the start, but I couldn't figure out how to keep my groups straight in a 136 | ;; single pass. So we have to live with this. 137 | 138 | ;; pre mid post refer to the alt groups [pre (* :alt mid) :alt post] 139 | ;; mid is (or nil [[x]+]) -- multiple groups in mid 140 | ;; post is (nil or [x]) -- nil means no :alt has been seen yet 141 | ;; keywords are used as special groupings or operators in the first position of a list (not vector) 142 | (defn regroup-alt [coll] 143 | ;;DEBUG 144 | ;(println "REGROUP ALT:" coll) 145 | 146 | (loop [cs coll pre [] mid nil post nil] 147 | (if post 148 | (cond (empty? cs) (list* :alt pre (conj mid post)) 149 | (coll? (first cs)) (recur (rest cs) pre mid (conj post (regroup-alt (first cs)))) 150 | (= :alt (first cs)) (recur (rest cs) pre (conj mid post) []) 151 | :else (recur (rest cs) pre mid (conj post (first cs)))) 152 | (cond (empty? cs) (if (keyword? (first pre)) (seq pre) pre) 153 | (coll? (first cs)) (recur (rest cs) (conj pre (regroup-alt (first cs))) nil nil) 154 | (= :alt (first cs)) (recur (rest cs) pre [] []) 155 | :else (recur (rest cs) (conj pre (first cs)) nil nil))))) 156 | 157 | ;; SEM -- probably don't need to regroup alts within (:set ...) or (:inverted ...), etc. 158 | ;; really only within vectors I think 159 | 160 | (defn regex-seq [regex] 161 | (if (string? regex) 162 | (seq regex) 163 | #?(:clj (seq (str regex)) 164 | :cljs (drop-last (rest (seq (str regex))))))) 165 | 166 | ;; regex can be either a string or a regex 167 | (defn parse [regex] 168 | (try 169 | (regroup-alt (parse-chars (regex-seq regex))) 170 | #?(:clj (catch clojure.lang.ExceptionInfo e (throw e))) 171 | #?(:clj (catch Exception e (throw (ex-info (str "Confused by regular expression: " regex) 172 | {:failed regex} 173 | e)))) 174 | #?(:cljs (catch :default e (throw (ex-info (str "Confused by regular expression: " regex) 175 | {:failed regex} 176 | e)))) 177 | )) 178 | 179 | (declare tree->generator) 180 | 181 | (defn ascii [ch] 182 | #?(:clj (long ch) 183 | :cljs (.charCodeAt ch 0))) 184 | 185 | (defn between [ch-begin ch-end] 186 | (set (map char (range (ascii ch-begin) (inc (ascii ch-end)))))) 187 | 188 | ;; all these char ranges need double checking 189 | (def ^:private digits #{\0 \1 \2 \3 \4 \5 \6 \7 \8 \9}) 190 | (def ^:private atoz (between \a \z)) 191 | (def ^:private AtoZ (between \A \Z)) 192 | (def ^:private space (set (seq " \t\n\r"))) 193 | (def ^:private punctuation (set (seq ":#$%^&*()-+=!@~`;'?/.|\\[]{},<>\""))) 194 | (def ^:private underscore (set (seq "_"))) 195 | 196 | (def ^:private word (set/union atoz AtoZ digits underscore)) 197 | (def ^:private not-word (set/union space punctuation)) 198 | (def ^:private not-digits (set/union space punctuation atoz AtoZ underscore)) 199 | (def ^:private not-space (set/union word punctuation)) 200 | 201 | (def ^:private all-chars (set/union word punctuation space)) 202 | 203 | (defn charset-seq [tree] 204 | (if-not (seq? tree) 205 | (set tree) 206 | (case (first tree) 207 | :digit digits 208 | :not-digit not-digits 209 | :word word 210 | :not-word not-word 211 | :space space 212 | :not-space not-space 213 | :tab #{\t} 214 | :newline #{\n} 215 | :return #{\r} 216 | :btw (apply between (rest tree)) 217 | (set tree)))) 218 | 219 | (defn inverted [trees] 220 | (apply set/difference all-chars (set (filter char? trees)) (map charset-seq (filter seq? trees)))) 221 | 222 | (defn charset [trees] 223 | (apply set/union (set (filter char? trees)) (map charset-seq (filter seq? trees)))) 224 | 225 | (defn seq->generator [tree] 226 | (case (first tree) 227 | :any gen/char-ascii 228 | :times (if-let [m (nth tree 3)] 229 | ;; nil m means exactly N 230 | (gen/vector (tree->generator (second tree)) (nth tree 2) m) 231 | (gen/vector (tree->generator (second tree)) (nth tree 2))) 232 | :* (let [limit *or-more-limit*] 233 | (gen/sized (fn [n] (gen/vector (tree->generator (second tree)) 0 (min n limit))))) 234 | :+ (let [limit *or-more-limit*] 235 | (gen/sized (fn [n] (gen/vector (tree->generator (second tree)) 236 | 1 (max 1 (min n limit)))))) 237 | :? (gen/one-of [(gen/return "") (tree->generator (second tree))]) 238 | :alt (gen/one-of (map tree->generator (rest tree))) 239 | :set (gen/elements (charset (rest tree))) 240 | :inverted (gen/elements (inverted (rest tree))) 241 | :btw (gen/elements (apply between (rest tree))) 242 | :digit (gen/elements digits) 243 | :not-digit (gen/elements not-digits) 244 | :word (gen/elements word) 245 | :not-word (gen/elements not-word) 246 | :space (gen/elements space) 247 | :not-space (gen/elements not-space) 248 | :tab (gen/return \t) 249 | :newline (gen/return \n) 250 | :return (gen/return \r))) 251 | 252 | 253 | ;; This is a bit hairy. It's trying to group runs of single chars together to make a string 254 | ;; when possible. Also, handles special case of single item as a single generator rather 255 | ;; than a tuple as used in the general case. 256 | 257 | (defn vec->generator [trees] 258 | (loop [xs trees cs [] gens []] 259 | (cond (empty? xs) (let [gens (if (empty? cs) gens (conj gens (gen/return (apply str cs))))] 260 | (cond (empty? gens) (gen/return "") 261 | (= (count gens) 1) (first gens) 262 | :else (apply gen/tuple gens))) 263 | (char? (first xs)) (recur (rest xs) (conj cs (first xs)) gens) 264 | :else (recur (rest xs) [] (if (empty? cs) 265 | (conj gens (tree->generator (first xs))) 266 | (conj gens (gen/return (apply str cs)) 267 | (tree->generator (first xs)))))))) 268 | 269 | (defn tree->generator [tree] 270 | (cond (vector? tree) (vec->generator tree) 271 | (seq? tree) (seq->generator tree) 272 | (char? tree) (gen/return tree) 273 | (string? tree) (gen/return tree) 274 | :else (throw (ex-info (str "Unimplemented generator for " (pr-str tree)) 275 | {:unimplemented tree})))) 276 | 277 | ;; this is the main function needed for the public side 278 | (defn string-generator 279 | "Returns a test.check generator that generates strings matching the given regular 280 | expression `regex`. (Fancy flags and POXIX extensions are not suppored; see the doc for 281 | more information about the supported regular expression syntax.) The optional 282 | `or-more-limit` controls the maximum numbers of elements that are generated when matching 283 | a potentially unbounded regex (such as #\"x*\" or #\"y+\"). The default is 9." 284 | 285 | ([regex] 286 | (gen/fmap #(if (coll? %) (apply str (flatten %)) (str %)) 287 | (tree->generator (parse regex)))) 288 | 289 | ([regex or-more-limit] 290 | (binding [*or-more-limit* or-more-limit] 291 | (string-generator regex)))) 292 | 293 | ;; also used for the public side, to implement case-insensitive generation 294 | (defn gen-case-insensitive [s] 295 | (let [ups (str/upper-case s) 296 | lows (str/lower-case s)] 297 | (gen/one-of [(gen/elements [s (str/capitalize s) ups lows]) 298 | (gen/fmap (fn [bs] 299 | (str/join (map (fn [b low up] (if b up low)) bs lows ups))) 300 | (gen/vector gen/boolean (count s)))]))) 301 | -------------------------------------------------------------------------------- /test/miner/test_strgen.cljc: -------------------------------------------------------------------------------- 1 | (ns miner.test-strgen 2 | (:require 3 | #?(:cljs [cljs.test :refer-macros [deftest is testing run-tests]] 4 | :clj [clojure.test :refer [deftest is testing]]) 5 | [clojure.test.check.generators :as gen] 6 | [clojure.spec.alpha :as s] 7 | [clojure.string :as str] 8 | [miner.strgen :as sg])) 9 | 10 | ;; https://github.com/bensu/doo -- need a test runner for CLJS 11 | 12 | (def ^:dynamic *exercise-limit* 5000) 13 | 14 | (def regexes [#"f.o" 15 | #"foo/bar" 16 | #"f.*o+" 17 | #":k[a-z]o" 18 | #":k[a-z]/f\d*o+" 19 | #"s[a-z]o" 20 | #"s[a-z]/f\d*o+" 21 | #"(foo|bar|(baz+|quux?){2})+a?b+" 22 | #"((s[a-z]*)|\d+)(x[a-j]y|y[^-A-Za-z]z|pq|PQ)\w@[^A-Zaz]" 23 | ;; email example from spec guide 24 | #"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$" 25 | #"\.\?\w\[\][-.?]" 26 | #"[^\s]" 27 | #"[^\S]" 28 | #"[a\sb]" 29 | #"\{\|\}" 30 | #"[^\s/]+/[^\s/]+" 31 | #"^f\^*bar\$?x$" 32 | #"^(https?|ftp)://[^\s/$.?#].[^\s]*$" ]) 33 | 34 | 35 | 36 | (defn test-re 37 | ([re] (test-re re *exercise-limit*)) 38 | ([re limit] 39 | (doseq [x (gen/sample (sg/string-generator re) limit)] 40 | (is (re-matches re x))))) 41 | 42 | 43 | (deftest show-info 44 | (testing "Show test info" 45 | (println) 46 | (println " ** Test Info **") 47 | (println " StrGen" (nth (clojure.edn/read-string (slurp "project.clj")) 2)) 48 | (println " Clojure" (clojure-version)) 49 | (println " Java" (System/getProperty "java.version")) 50 | (println) 51 | true)) 52 | 53 | (deftest gen-regexes 54 | (doseq [re regexes] 55 | (test-re re))) 56 | 57 | 58 | (defn test-spec-re 59 | ([re] (test-re re *exercise-limit*)) 60 | ([re limit] 61 | (doseq [[r c] (s/exercise (s/spec (s/and string? #(re-matches re %)) 62 | :gen #(sg/string-generator re)) 63 | limit)] 64 | (is (= r c))))) 65 | 66 | 67 | (deftest spec-regexes 68 | (doseq [re regexes] 69 | (test-spec-re re))) 70 | 71 | 72 | ;; SEM FIXME: Not a complete test as we convert everything back to lowercase before 73 | ;; matching. Could break if we changed `regexs` to require uppercase. We should also 74 | ;; validate that the generator yields sensible mixed case results. 75 | (defn test-spec-re-case-insensitive 76 | ([re] (test-re re *exercise-limit*)) 77 | ([re limit] 78 | (doseq [[r c] (s/exercise (s/spec (s/and string? #(re-matches re (str/lower-case %))) 79 | :gen #(sg/case-insensitive-string-generator re)) 80 | limit)] 81 | (is (= r c))))) 82 | 83 | (deftest spec-regexes-case-insensitive 84 | (doseq [re regexes] 85 | (test-spec-re-case-insensitive re))) 86 | 87 | --------------------------------------------------------------------------------