├── deps.edn ├── .gitignore ├── .github └── workflows │ └── clojure.yml ├── project.clj ├── LICENSE ├── README.md └── src └── globber └── glob.clj /deps.edn: -------------------------------------------------------------------------------- 1 | {:paths ["src"] 2 | :deps {org.clojure/clojure {:mvn/version "1.11.1"}}} 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | .hgignore 11 | .hg/ 12 | -------------------------------------------------------------------------------- /.github/workflows/clojure.yml: -------------------------------------------------------------------------------- 1 | name: Clojure CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - name: Run tests 12 | run: lein with-profile test do cljfmt check, test 13 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject spootnik/globber "0.4.3-SNAPSHOT" 2 | :description "globber: globbing searches in clojure" 3 | :url "https://github.com/pyr/globber" 4 | :license {:name "MIT License"} 5 | :dependencies [[org.clojure/clojure "1.11.1"]] 6 | :deploy-repositories [["releases" :clojars] ["snapshots" :clojars]] 7 | :profiles {:dev {:plugins [[lein-ancient "0.7.0"]] 8 | :global-vars {*warn-on-reflection* true}} 9 | :test {:plugins [[lein-difftest "2.0.0"] 10 | [lein-cljfmt "0.9.0"]] 11 | :global-vars {*warn-on-reflection* true} 12 | :pedantic? :abort}}) 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015 Pierre-Yves Ritschard 2 | 3 | Permission to use, copy, modify, and distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | globber: globbing searches in clojure 2 | ===================================== 3 | 4 | ### Installation 5 | 6 | ```clojure 7 | [spootnik/globber "0.4.1"] 8 | ``` 9 | 10 | ### Usage 11 | 12 | Perform globbing, matching the supplied `expression` against 13 | the list of supplied `candidates`. 14 | 15 | - `candidates` is a collection of strings and must be seqable. 16 | - `expression` adheres mostly to the bash notion of globbing 17 | 18 | Globbing syntax: 19 | 20 | - Any stray character is matched exactly 21 | - Wildcards ('*') mean any number (including zero) of arbitrary chars 22 | - Anyones ('?') mean a single arbitrary char 23 | - Character classes are enclosed in square brackets and may contain 24 | arbitrary list of characters to match. If a character class begins 25 | with a ('!') or ('^') character, the class will be negated, i.e: 26 | will only match characters absent from the class. Empty charclasses, 27 | i.e: ('[]'), ('[!]'), and ('[^]') match their representation, not 28 | their content. 29 | - Trees match any of their branches. Trees are delimited by curly 30 | brackets and content are separated by commas. Empty trees, i.e: 31 | ('{}'), ('{,}'), ('{,,}') match their representation, not their 32 | content. 33 | 34 | ### Examples 35 | 36 | ```clojure 37 | (glob \"foobar\" [\"foobar\"]) ;; => (\"foobar\") 38 | (glob \"fo[a-z]\" [\"foobar\"]) ;; => (\"foobar\") 39 | ``` 40 | -------------------------------------------------------------------------------- /src/globber/glob.clj: -------------------------------------------------------------------------------- 1 | (ns globber.glob 2 | "Globbing searches in clojure." 3 | (:require [clojure.string :as str])) 4 | 5 | (defn char->token 6 | "Our simple lexer, understands escaped characters." 7 | [[prev c]] 8 | (if (= prev \\) 9 | c 10 | (case c 11 | \\ nil 12 | \* :wc ;; wildcard 13 | \{ :oc ;; open curly 14 | \} :cc ;; close curly 15 | \[ :ob ;; open bracket 16 | \] :cb ;; close bracket 17 | \? :ao ;; any one char 18 | c))) 19 | 20 | (defn expr->tokens 21 | "Looks at characters two at a time to yield a list of tokens." 22 | [expr] 23 | (->> (conj (seq expr) nil) 24 | (partition 2 1) 25 | (map char->token) 26 | (remove nil?))) 27 | 28 | (defn make-class 29 | "Build a list of all characters between `beg` and `end`" 30 | [beg end] 31 | (let [bi (int beg) 32 | ei (int end)] 33 | (when (> bi ei) 34 | (throw (ex-info "invalid character class" {:begin beg :end end}))) 35 | (for [x (range bi (inc ei))] (str (char x))))) 36 | 37 | (defn negated-cc? 38 | "Is this representing a negated character class?" 39 | [ast] 40 | (and (set? ast) (:negated? (meta ast)))) 41 | 42 | (defn close-charclass 43 | "We're finished running through a class. 44 | Yield a set with some additional metadata to 45 | indicate whether the class is negated. 46 | 47 | This also handles edge-cases (zero or single element 48 | classes)." 49 | [cc nchar] 50 | (let [ccount (count cc)] 51 | (cond (and nchar (zero? ccount)) (format "[%s]" nchar) 52 | nchar (with-meta (set cc) {:negated? true}) 53 | (zero? ccount) "[]" 54 | (= 1 (count cc)) (first cc) 55 | :else (with-meta (set cc) {:negated? false})))) 56 | 57 | (defn scan-charclass 58 | "Eat tokens until the character class is closed." 59 | [tokens] 60 | (let [[tokens nchar] (if (#{\! \^} (first tokens)) 61 | [(rest tokens) (first tokens)] 62 | [tokens false])] 63 | (loop [tokens tokens 64 | charclass []] 65 | (if (and (>= (count tokens) 3) 66 | (= (second tokens) \-)) 67 | (recur 68 | (drop 3 tokens) 69 | (vec (concat charclass (make-class (first tokens) (nth tokens 2))))) 70 | (let [[c & tokens] tokens] 71 | (case c 72 | nil (throw (ex-info "invalid parser state: unclosed charclass" {})) 73 | :wc (recur tokens (conj charclass "*")) 74 | :ao (recur tokens (conj charclass "?")) 75 | :oc (recur tokens (conj charclass "{")) 76 | :cc (recur tokens (conj charclass "}")) 77 | :ab (recur tokens (conj charclass "{")) 78 | :cb [(close-charclass charclass nchar) tokens] 79 | (recur tokens (conj charclass (str c))))))))) 80 | 81 | (defn ast-type 82 | "Determine if the ast can be unrolled to a string, can 83 | be exploded to a set of strings or needs to run through 84 | our algorithm." 85 | [ast] 86 | (cond 87 | (vector? ast) (reduce max 0 (map ast-type ast)) 88 | (negated-cc? ast) 2 89 | (set? ast) (reduce max 1 (map ast-type (seq ast))) 90 | (= :wc ast) 2 91 | (= :ao ast) 2 92 | :else 0)) 93 | 94 | (def ast-typenames 95 | "Yield a keyword name for the AST type" 96 | {0 :string 97 | 1 :explodable 98 | 2 :compound}) 99 | 100 | ;; These are cross-referenced. 101 | (declare tokens->ast) 102 | (declare scan-branches) 103 | 104 | (defn close-branch 105 | "We succesfuly parsed a branch, now yield 106 | the best possible AST for it." 107 | [branch] 108 | (let [raw? (fn [e] (or (char? e) (keyword? e))) 109 | parsed? (complement raw?)] 110 | (loop [tokens branch 111 | ast []] 112 | (if-let [token (first tokens)] 113 | (if (raw? token) 114 | (let [ast-tokens (take-while raw? tokens)] 115 | (recur (drop-while raw? tokens) 116 | (conj ast (tokens->ast ast-tokens)))) 117 | (let [sub-asts (take-while parsed? tokens)] 118 | (recur (drop-while parsed? tokens) (vec (concat ast sub-asts))))) 119 | (if (= 1 (count ast)) (first ast) ast))))) 120 | 121 | (defn scan-branch 122 | "Eat tokens until the end of a branch." 123 | [tokens] 124 | (loop [[c & tokens] tokens 125 | branch []] 126 | (case c 127 | nil (throw (ex-info "invalid parser state: unclosed branch" {})) 128 | \, [(close-branch branch) tokens true] 129 | :cc [(close-branch branch) tokens false] 130 | :oc (let [[sub-ast trail] (scan-branches tokens)] 131 | (recur trail (conj branch sub-ast))) 132 | (recur tokens (conj branch c))))) 133 | 134 | (defn scan-branches 135 | "A tree has started, scan all its branches and yield 136 | the best possible AST for it." 137 | [tokens] 138 | (loop [tokens tokens 139 | branches []] 140 | (let [[branch tokens more?] (scan-branch tokens) 141 | bcount (count branches)] 142 | (if more? 143 | (recur tokens (conj branches branch)) 144 | (cond 145 | (and (= 0 bcount) (empty? branch) (not more?)) 146 | ["{}" tokens] 147 | 148 | (and (= 0 bcount) (not more?)) 149 | [branch tokens] 150 | 151 | (every? empty? (conj branches branch)) 152 | [(format "{%s}" (reduce str (repeat bcount ","))) tokens] 153 | :else 154 | [(set (conj branches branch)) tokens]))))) 155 | 156 | (defn tokens->ast 157 | "Transform a list of tokens to an AST, if possible." 158 | [tokens] 159 | (loop [tokens tokens 160 | stack []] 161 | (if (seq tokens) 162 | (if-let [s (seq (take-while char? tokens))] 163 | (recur (seq (drop-while char? tokens)) 164 | (conj stack (apply str s))) 165 | (case (first tokens) 166 | :ob (let [[charclass trail] (scan-charclass (rest tokens))] 167 | (recur trail (conj stack charclass))) 168 | :ao (recur (rest tokens) (conj stack :ao)) 169 | :wc (recur (rest tokens) (conj stack :wc)) 170 | :oc (let [[branches trail] (scan-branches (rest tokens))] 171 | (recur trail (conj stack branches))) 172 | :cb (throw (ex-info "invalid parser state: dangling bracket" {:tokens tokens})) 173 | :cc (throw (ex-info "invalid parser state: dangling curly" {:tokens tokens})) 174 | (throw (ex-info "invalid parser state: unhandled char" {:tokens tokens})))) 175 | (if (= 1 (count stack)) (first stack) stack)))) 176 | 177 | (defn merge-ast 178 | "Merge two non-compound ASTs by yielding all possible combinations." 179 | ([] 180 | []) 181 | ([left right] 182 | (let [factor (* (count left) (count right))] 183 | (cond (empty? left) right 184 | (empty? right) left 185 | :else (for [l left r right] (str l r)))))) 186 | 187 | (defn explode-ast 188 | "Explode a non-compound AST to all its possible combinations." 189 | [ast] 190 | (cond (string? ast) (list ast) 191 | (negated-cc? ast) (throw (ex-info "cannot explode negated class" {})) 192 | (set? ast) (mapcat explode-ast ast) 193 | (sequential? ast) (reduce merge-ast (map explode-ast ast)))) 194 | 195 | (defn stringify-ast 196 | "Reduce a string-only AST to its equivalent string" 197 | [ast] 198 | (if (sequential? ast) 199 | (reduce str (map stringify-ast ast)) 200 | ast)) 201 | 202 | (defn partition-compound-ast 203 | "Partition a compound AST into a list of (eaters, ast) tuples where 204 | eaters is a list of eager character eaters and a potentially explodable 205 | AST" 206 | [tokens] 207 | (loop [eaters [] 208 | ast [] 209 | partitions [] 210 | [token & tokens] tokens] 211 | (cond 212 | (nil? token) (if (or (seq eaters) (seq ast)) 213 | (conj partitions [(seq eaters) (seq ast)]) 214 | partitions) 215 | (negated-cc? token) (if (seq ast) 216 | (recur [] 217 | [] 218 | (-> partitions 219 | (conj [(seq eaters) ast]) 220 | (conj [nil token])) 221 | tokens) 222 | (recur [] 223 | [] 224 | (conj partitions [(seq eaters) token]) 225 | tokens)) 226 | (#{:ao :wc} token) (if (seq ast) 227 | (recur [token] 228 | [] 229 | (conj partitions [(seq eaters) (seq ast)]) 230 | tokens) 231 | (recur (conj eaters token) 232 | ast 233 | partitions 234 | tokens)) 235 | :else (recur eaters (conj ast token) partitions tokens)))) 236 | 237 | (defn explode-compound-ast 238 | "Transform a partitioned compound AST by exploding eligible sub-ASTs" 239 | [partitions] 240 | (vec 241 | (for [[eaters ast] partitions] 242 | [eaters (if (negated-cc? ast) ast (explode-ast ast))]))) 243 | 244 | (defn all-indices 245 | "Find all indices of a substring in a given string" 246 | [s pat] 247 | (loop [res nil 248 | i (.lastIndexOf (str s) (str pat))] 249 | (cond (neg? i) res 250 | (zero? i) (conj res i) 251 | :else (recur (conj res i) 252 | (.lastIndexOf (str s) (str pat) (long (dec i))))))) 253 | 254 | (defn find-pattern 255 | "Match a string against a list of eaters and a pattern" 256 | [pos candidate s eaters pattern] 257 | (let [wc? (some #{:wc} eaters) 258 | minpos (count (filter #{:ao} eaters))] 259 | (for [match (all-indices s pattern) 260 | :when ((if wc? >= =) match minpos)] 261 | [(+ match pos (count pattern)) 262 | candidate 263 | (= (count candidate) 264 | (+ match pos (count pattern)))]))) 265 | 266 | (defn filter-compound-partition 267 | "Match a list of candidates against a single partition of 268 | a compound AST. When a match occurs, yield a list of 269 | the new candidates this may have generated and if they 270 | are eligible as a terminal match." 271 | [[eaters patterns] [pos candidate]] 272 | (let [s (.substring (str candidate) (str pos)) 273 | minpos (if eaters (count (filter #{:ao} eaters)) 0) 274 | wc? (boolean (seq (filter #{:wc} eaters))) 275 | exact? (fn [i] (= i (count candidate)))] 276 | (cond 277 | (and (nil? eaters) (negated-cc? patterns)) 278 | (when-not ((set patterns) (str (first s))) 279 | [[(inc pos) candidate (exact? (inc pos))]]) 280 | 281 | (negated-cc? patterns) 282 | (let [comp (if wc? >= =)] 283 | (for [[i c] (map-indexed vector (map str (seq s))) 284 | :when (and (not ((set patterns) c)) 285 | (comp i minpos))] 286 | [(+ pos 1 i) candidate (exact? (+ pos 1 i))])) 287 | 288 | (nil? eaters) ;; first partition 289 | (for [prefix patterns 290 | :let [pcount (count prefix)] 291 | :when (.startsWith s prefix)] 292 | [(+ pos pcount) candidate (exact? (+ pos pcount))]) 293 | 294 | (nil? patterns) ;; last partition 295 | (when ((if wc? >= =) (count s) minpos) 296 | [[(+ pos (if wc? (count s) minpos)) 297 | candidate 298 | (if wc? 299 | true 300 | (exact? (+ pos minpos)))]]) 301 | 302 | :else 303 | (->> patterns 304 | (mapcat (partial find-pattern pos candidate s eaters)) 305 | (remove nil?))))) 306 | 307 | (defn filter-compound-ast 308 | "Given a compound AST, yield the set of matching candidates." 309 | [partitions candidates] 310 | (let [candidates (map #(vector 0 % false) candidates)] 311 | (loop [[p & partitions] partitions 312 | candidates candidates] 313 | (if p 314 | (recur partitions 315 | (mapcat (partial filter-compound-partition p) candidates)) 316 | (->> candidates 317 | (filter (fn [[_ _ exact?]] exact?)) 318 | (map second) 319 | (set)))))) 320 | (defn glob 321 | "Perform globbing, matching the supplied `expression` against 322 | the list of supplied `candidates`. 323 | 324 | - `candidates` is a collection of strings and must be seqable. 325 | - `expression` adheres mostly to the bash notion of globbing 326 | 327 | Globbing syntax: 328 | 329 | - Any stray character is matched exactly 330 | - Wildcards ('*') mean any number (including zero) of arbitrary chars 331 | - Anyones ('?') mean a single arbitrary char 332 | - Character classes are enclosed in square brackets and may contain 333 | arbitrary list of characters to match. If a character class begins 334 | with a ('!') or ('^') character, the class will be negated, i.e: 335 | will only match characters absent from the class. Empty charclasses, 336 | i.e: ('[]'), ('[!]'), and ('[^]') match their representation, not 337 | their content. 338 | - Trees match any of their branches. Trees are delimited by curly 339 | brackets and content are separated by commas. Empty trees, i.e: 340 | ('{}'), ('{,}'), ('{,,}') match their representation, not their 341 | content. 342 | 343 | Examples: 344 | 345 | (glob \"foobar\" [\"foobar\"]) ;; => (\"foobar\") 346 | (glob \"fo[a-z]\" [\"foobar\"]) ;; => (\"foobar\") 347 | 348 | " 349 | [expression candidates] 350 | (let [ast (-> expression expr->tokens tokens->ast) 351 | type (get ast-typenames (ast-type ast))] 352 | (case type 353 | :string (filter (partial = (stringify-ast ast)) candidates) 354 | :explodable (filter (set (explode-ast ast)) candidates) 355 | (let [partitions (cond 356 | (sequential? ast) 357 | (-> ast partition-compound-ast explode-compound-ast) 358 | 359 | (#{:ao :wc} ast) 360 | [[(list ast) nil]] 361 | 362 | :else 363 | [[nil ast]])] 364 | (filter-compound-ast partitions candidates))))) 365 | --------------------------------------------------------------------------------