├── .gitignore ├── tests.edn ├── project.clj ├── src └── html5_walker │ ├── core.clj │ └── walker.clj ├── test └── html5_walker │ ├── walker_test.clj │ └── core_test.clj └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /.nrepl-port 2 | /target 3 | /pom.xml 4 | /pom.xml.asc 5 | -------------------------------------------------------------------------------- /tests.edn: -------------------------------------------------------------------------------- 1 | #kaocha/v1 2 | {:plugins [:noyoda.plugin/swap-actual-and-expected] 3 | :tests [{:id :unit 4 | :source-paths ["src" "test-data"] 5 | :focus-meta [:focus]}]} 6 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject html5-walker "2023.11.21" 2 | :description "Search and replace html5." 3 | :url "https://github.com/magnars/html5-walker" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[ch.digitalfondue.jfiveparse/jfiveparse "0.9.0"]] 7 | :profiles {:dev {:dependencies [[org.clojure/clojure "1.10.1"] 8 | [lambdaisland/kaocha "0.0-529"] 9 | [kaocha-noyoda "2019-06-03"]] 10 | :aliases {"kaocha" ["run" "-m" "kaocha.runner"]}}}) 11 | -------------------------------------------------------------------------------- /src/html5_walker/core.clj: -------------------------------------------------------------------------------- 1 | (ns html5-walker.core 2 | (:require [html5-walker.walker :as walker])) 3 | 4 | (defn enforce-child-selectors 5 | "Preserves the old behavior where [:div :a] enforced a child relationship." 6 | [path] 7 | (->> (partition-all 2 1 path) 8 | (remove (comp #{">"} #(some-> % name) first)) 9 | (mapcat 10 | (fn [[element descendant]] 11 | (if (nil? descendant) 12 | [element] 13 | [element '>]))))) 14 | 15 | (defn ^:export replace-in-document [html path->f] 16 | (->> (for [[path f] path->f] 17 | [(enforce-child-selectors path) f]) 18 | (into {}) 19 | (walker/replace-in-document html))) 20 | 21 | (defn ^:export replace-in-fragment [html path->f] 22 | (->> (for [[path f] path->f] 23 | [(enforce-child-selectors path) f]) 24 | (into {}) 25 | (walker/replace-in-fragment html))) 26 | 27 | (defn ^:export find-nodes [html path] 28 | (walker/find-nodes html (enforce-child-selectors path))) 29 | -------------------------------------------------------------------------------- /test/html5_walker/walker_test.clj: -------------------------------------------------------------------------------- 1 | (ns html5-walker.walker-test 2 | (:require [clojure.test :refer [deftest is testing]] 3 | [html5-walker.walker :as sut])) 4 | 5 | (deftest selector-test 6 | (testing "implicit descendant selector" 7 | (is (= (map #(.getAttribute % "href") 8 | (sut/find-nodes 9 | "
Hello! 10 | 11 | 12 | " 13 | '[div.bar a])) 14 | ["barn"]))) 15 | 16 | (testing "child selector does not match any descendant" 17 | (is (= (map #(.getAttribute % "href") 18 | (sut/find-nodes 19 | "Hello! 20 | 21 | 22 | " 23 | '[div.bar > a])) 24 | []))) 25 | 26 | (testing "explicit child selector" 27 | (is (= (map #(.getAttribute % "href") 28 | (sut/find-nodes 29 | "Hello! 30 | 31 | 32 | " 33 | '[div.bar > div > a])) 34 | ["barn"])))) 35 | -------------------------------------------------------------------------------- /src/html5_walker/walker.clj: -------------------------------------------------------------------------------- 1 | (ns html5-walker.walker 2 | (:require [clojure.string :as str]) 3 | (:import (ch.digitalfondue.jfiveparse Element Parser Selector))) 4 | 5 | (def prefix->kind 6 | {nil :element 7 | "#" :id 8 | "." :class 9 | "[" :attr}) 10 | 11 | (defn parse-selector 12 | "Breaks a CSS selector element into tag matcher, class matchers, id matcher, and 13 | attribute matchers." 14 | [selector] 15 | (->> (str/replace selector #":(first|last)-child" "") 16 | (re-seq #"([#\.\[])?([a-z0-9\-\:]+)(?:(.?=)([a-z0-9\-\:]+)])?") 17 | (map #(into [(prefix->kind (second %))] (remove nil? (drop 2 %)))) 18 | (concat (->> (re-seq #":((?:first|last)-child)" selector) 19 | (map (comp vector keyword second)))))) 20 | 21 | (comment 22 | 23 | (parse-selector "div#content.text[property=og:image].mobile[style][data-test~=bla]:first-child") 24 | (parse-selector "div:first-child") 25 | (parse-selector ":first-child") 26 | (parse-selector "[property]") 27 | 28 | ) 29 | 30 | (defn- match-path-fragment [selector element] 31 | (reduce 32 | (fn [s [kind m comparator v]] 33 | (case kind 34 | :element (.element s m) 35 | :id (.id s m) 36 | :class (.hasClass s m) 37 | :attr (case comparator 38 | "=" (.attrValEq s m v) 39 | "*=" (.attrValContains s m v) 40 | "$=" (.attrValEndWith s m v) 41 | "~=" (.attrValInList s m v) 42 | "^=" (.attrValStartWith s m v) 43 | nil (.attr s m)) 44 | :first-child (.isFirstChild s) 45 | :last-child (.isLastChild s))) 46 | selector 47 | (parse-selector (name element)))) 48 | 49 | (defn make-descendants-explicit 50 | "Walks a path and returns pairs of [descendant element] where descendant is 51 | either `:descendant` or `:child`, describing the desired relationship to the 52 | previous path element. `descendant` will be `nil` for the first element. `:>` 53 | creates a `:child` relationship between two elements, while elements that 54 | don't have an explicit relationship (e.g. `[:div :a]`) will have a 55 | `:descendant` interposed between them." 56 | [path] 57 | (->> (partition-all 2 1 path) 58 | (remove (comp #{">"} #(some-> % name) first)) 59 | (mapcat 60 | (fn [[element descendant]] 61 | (cond 62 | (nil? descendant) 63 | [element] 64 | 65 | (= ">" (name descendant)) 66 | [element :child] 67 | 68 | :else 69 | [element :descendant]))) 70 | (into [nil]) 71 | (partition 2))) 72 | 73 | (defn create-matcher [path] 74 | (let [path (make-descendants-explicit path)] 75 | (.toMatcher 76 | (reduce (fn [selector [descendant element-kw]] 77 | (-> (case descendant 78 | :descendant (.withDescendant selector) 79 | :child (.withChild selector)) 80 | (match-path-fragment element-kw))) 81 | (-> (Selector/select) 82 | (match-path-fragment (second (first path)))) 83 | (next path))))) 84 | 85 | (defn ^:export replace-in-document [html path->f] 86 | (let [doc (.parse (Parser.) html)] 87 | (doseq [[path f] path->f] 88 | (doseq [node (.getAllNodesMatching doc (create-matcher path))] 89 | (f node))) 90 | (str (re-find #"^]+>" html) 91 | (.getOuterHTML (.getDocumentElement doc))))) 92 | 93 | (defn ^:export replace-in-fragment [html path->f] 94 | (let [el (first (.parseFragment (Parser.) (Element. "div") (str "