├── .gitignore ├── LICENSE ├── README.md ├── css3selectors.nimble ├── src ├── css3selectors.nim └── css3selectors │ ├── css_selectors.nim │ └── dom_utils.nim └── tests ├── config.nims └── test.nim /.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.exe -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Leon Lysak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CSS3Selectors 2 | 3 | A Nim CSS Selectors library for the WHATWG standard compliant Chame HTML parser. 
Query HTML using CSS selectors with Nim just like you can with JavaScript's `document.querySelector`/`document.querySelectorAll`. 4 | 5 | > **CSS3Selectors** was created largely off the back of GULPF's [Nimquery](https://github.com/GULPF/nimquery/) library. Rather than using Nim's `htmlparser`, which is currently unreliable to scrape wild HTML, we leverage the [Chame HTML parser](https://git.sr.ht/~bptato/chame). 6 | 7 | **CSS3Selectors** is almost fully compliant with the [CSS3 Selectors standard](https://www.w3.org/TR/selectors-3/). The exceptions: 8 | 9 | - :root, :lang(...), :enabled, :disabled 10 | - :link, ::first-line, ::first-letter, :visited 11 | - :active, ::before, ::after, :hover, 12 | - :focus, :target, :checked, 13 | 14 | Those selectors were not implemented because they didn't make much sense in the situations where `Nimquery` was useful. 15 | 16 | While this library has been rigorously stress-tested there still may be bugs. Please report any you encounter in the wild :) 17 | 18 | ## Installation 19 | 20 | Install from nimble: `nimble install css3selectors` 21 | 22 | Alternatively clone via git: `git clone https://github.com/Niminem/CSS3Selectors` 23 | 24 | ## Usage 25 | 26 | ```nim 27 | import std/streams 28 | import pkg/chame/minidom 29 | import css3selectors 30 | 31 | let html = """ 32 | 33 | 34 |
1
37 |2
38 |3
39 |4
40 | 41 | 42 | """ 43 | let document = Node(parseHtml(newStringStream(html))) 44 | let elements = document.querySelectorAll("p:nth-child(odd)") 45 | echo elements # @[1
,3
] 46 | 47 | let htmlFragment = parseHTMLFragment("1
127 |2
128 |3
129 |4
130 | 131 | 132 | """ 133 | let document = Node(parseHtml(newStringStream(html))) 134 | let options = DefaultQueryOptions - { optSimpleNot } 135 | let elements = document.querySelectorAll("p:not(.maybe-skip:nth-child(even))", options) 136 | echo elements 137 | # @[1
,3
,4
] 138 | ``` 139 | 140 | ## TODO 141 | - Add more helper procs like those we see in [`std/xmltree`](https://nim-lang.org/docs/xmltree.html) for easier DOM parsing (ex: [`innerText()`](https://nim-lang.org/docs/xmltree.html#innerText%2CXmlNode)). We may want to move these into another library over time. 142 | -------------------------------------------------------------------------------- /css3selectors.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.1.0" 4 | author = "Niminem" 5 | description = "A Nim CSS Selectors library for the WHATWG standard compliant Chame HTML parser. Query HTML using CSS selectors with Nim just like you can with JavaScript." 6 | license = "MIT" 7 | srcDir = "src" 8 | 9 | # Dependencies 10 | requires "nim >= 2.0.0" 11 | requires "https://git.sr.ht/~bptato/chame" 12 | -------------------------------------------------------------------------------- /src/css3selectors.nim: -------------------------------------------------------------------------------- 1 | import css3selectors/[dom_utils, css_selectors] 2 | export dom_utils, css_selectors 3 | -------------------------------------------------------------------------------- /src/css3selectors/css_selectors.nim: -------------------------------------------------------------------------------- 1 | # Spec: https://www.w3.org/TR/css3-selectors/ 2 | 3 | import std/[strutils, unicode, math, parseutils, sets] 4 | import pkg/[chame/minidom] 5 | import dom_utils 6 | 7 | const DEBUG = false 8 | 9 | type 10 | ParseError* = object of ValueError 11 | 12 | TokenKind = enum 13 | tkInvalid 14 | 15 | tkBracketStart, tkBracketEnd 16 | tkParam 17 | tkComma 18 | 19 | # NOTE: These are handled the same in some contexts, but they 20 | # are different. `tkIdentifier` can only contain a very specific 21 | # subset of characters, but tkString can contain anything. 
22 | # This means that both `#foo%` and `[id=foo%]` is invalid, 23 | # but not `[id="foo%"]` or `#foo\%`. 24 | tkIdentifier, tkString 25 | 26 | tkClass, tkId, tkElement 27 | 28 | tkCombinatorDescendents, tkCombinatorChildren 29 | tkCombinatorNextSibling, tkCombinatorSiblings 30 | 31 | tkAttributeExact # [attr=...] 32 | tkAttributeItem # [attr~=...] 33 | tkAttributePipe # [attr|=...] 34 | tkAttributeExists # [attr] 35 | tkAttributeStart # [attr^=...] 36 | tkAttributeEnd # [attr$=...] 37 | tkAttributeSubstring # [attr*=...] 38 | 39 | tkPseudoNthChild, tkPseudoNthLastChild 40 | tkPseudoNthOfType, tkPseudoNthLastOfType 41 | 42 | tkPseudoFirstOfType, tkPseudoLastOfType 43 | tkPseudoOnlyChild, tkPseudoOnlyOfType, tkPseudoEmpty 44 | tkPseudoFirstChild, tkPseudoLastChild 45 | 46 | tkPseudoNot 47 | 48 | tkEoi # End of input 49 | 50 | Token = object 51 | kind: TokenKind 52 | value: string 53 | 54 | const AttributeKinds = { 55 | tkAttributeExact, tkAttributeItem, 56 | tkAttributePipe, tkAttributeExists, 57 | tkAttributeStart, tkAttributeEnd, 58 | tkAttributeSubstring 59 | } 60 | 61 | const NthKinds = { 62 | tkPseudoNthChild, tkPseudoNthLastChild, 63 | tkPseudoNthOfType, tkPseudoNthLastOfType 64 | } 65 | 66 | type 67 | Demand = object 68 | case kind: Tokenkind 69 | of AttributeKinds: 70 | attrName, attrValue: string 71 | of NthKinds: 72 | a, b: int 73 | of tkPseudoNot: 74 | notQuery: QueryPart 75 | of tkElement: 76 | element: string 77 | else: discard 78 | 79 | Combinator = enum 80 | cmDescendants = tkCombinatorDescendents, 81 | cmChildren = tkCombinatorChildren, 82 | cmNextSibling = tkCombinatorNextSibling, 83 | cmSiblings = tkCombinatorSiblings, 84 | cmRoot # Special case for the first query 85 | 86 | QueryOption* = enum 87 | optUniqueIds ## Assume unique id's or not 88 | optUnicodeIdentifiers ## Allow non-ascii in identifiers (e.g `#exämple`) 89 | optSimpleNot ## Only allow simple selectors as the argument 90 | ## for ":not". 
Combinators and/or commas are not 91 | ## allowed even if this option is excluded. 92 | 93 | Lexer = object 94 | input: string 95 | pos: int 96 | options: set[QueryOption] 97 | current, next: Token 98 | 99 | Query* = object ## Represents a parsed query. 100 | subqueries: seq[seq[QueryPart]] 101 | options: set[QueryOption] 102 | queryStr: string ## The original input string 103 | 104 | QueryPart = object 105 | demands: seq[Demand] 106 | combinator: Combinator 107 | 108 | # Used during the search to keep track which parts of the subqueries 109 | # have already been matched. 110 | NodeWithContext = object 111 | # We need access to the siblings of the node 112 | # which we get through the parent. 113 | parent: Node 114 | # `index` is the position in `parent.childList`, 115 | # elementIndex is the index when only counting elements 116 | # (not text nodes etc). 117 | index, elementIndex: int 118 | searchStates: HashSet[(int, int)] 119 | 120 | const DefaultQueryOptions* = {optUniqueIds, optUnicodeIdentifiers, 121 | optSimpleNot} 122 | const NimqueryDefaultOptions* {.deprecated.} = DefaultQueryOptions 123 | 124 | const Identifiers = Letters + Digits + {'-', '_', '\\'} 125 | # NOTE: This is not the same as `strutils.Whitespace`. 126 | # These values are defined by spec. 
const CssWhitespace = {'\x20', '\x09', '\x0A', '\x0D', '\x0C'}
# Characters that make a preceding whitespace character insignificant.
# ',' is included so that "a , b" is not lexed as "a <descendant> , b".
const Combinators = CssWhitespace + {'+', '~', '>', ','}

# Pseudo classes that take no argument list.
const PseudoNoParamsKinds = {
  tkPseudoFirstOfType, tkPseudoLastOfType,
  tkPseudoOnlyChild, tkPseudoOnlyOfType,
  tkPseudoEmpty, tkPseudoFirstChild,
  tkPseudoLastChild
}

# Pseudo classes that must be followed by a `tkParam` token.
const PseudoParamsKinds = NthKinds + {tkPseudoNot}

const CombinatorKinds = {
  tkCombinatorChildren, tkCombinatorDescendents,
  tkCombinatorNextSibling, tkCombinatorSiblings
}

template log(x: varargs[untyped]) =
  ## Debug output; compiled out unless `DEBUG` is true.
  when DEBUG:
    debugEcho x

func safeCharCompare(str: string, idx: int, cs: set[char]): bool {.inline.} =
  ## Returns true if `idx` is a valid index of `str` and `str[idx]` is in `cs`.
  if idx > high(str): return false
  if idx < low(str): return false
  return str[idx] in cs

func safeCharCompare(str: string, idx: int, c: char): bool {.inline.} =
  ## Single-character convenience overload.
  return str.safeCharCompare(idx, {c})

func node(pair: NodeWithContext): Node =
  ## The node that `pair` refers to, i.e. `parent.childList[index]`.
  return pair.parent.childList[pair.index]

func attrComparerString(kind: TokenKind): string =
  ## Textual operator for an attribute selector kind ("" for `[attr]`).
  case kind
  of tkAttributeExact: return "="
  of tkAttributeItem: return "~="
  of tkAttributePipe: return "|="
  of tkAttributeExists: return ""
  of tkAttributeStart: return "^="
  of tkAttributeEnd: return "$="
  of tkAttributeSubstring: return "*="
  else: raiseAssert "Invalid attr kind: " & $kind

func newUnexpectedCharacterException(s: string): ref ParseError =
  return newException(ParseError, "Unexpected character: '" & s & "'")

func newUnexpectedCharacterException(c: char): ref ParseError =
  newUnexpectedCharacterException($c)

func initNotDemand(notQuery: QueryPart): Demand =
  result = Demand(kind: tkPseudoNot, notQuery: notQuery)

func initElementDemand(element: string): Demand =
  result = Demand(kind: tkElement, element: element)

func initPseudoDemand(kind: TokenKind): Demand =
  result = Demand(kind: kind)

func initAttributeDemand(kind: TokenKind, name, value: string): Demand =
  ## `kind` must be one of `AttributeKinds`.
  case kind
  of AttributeKinds:
    result = Demand(kind: kind, attrName: name, attrValue: value)
  else:
    raiseAssert "invalid kind: " & $kind

func initNthChildDemand(kind: TokenKind, a, b: int): Demand =
  ## `kind` must be one of `NthKinds`; `a` and `b` are the `an+b` coefficients.
  case kind
  of NthKinds:
    result = Demand(kind: kind, a: a, b: b)
  else:
    raiseAssert "invalid kind: " & $kind

func `$`(demand: Demand): string {.raises: [].} =
  ## Debug representation of a demand in selector-like syntax.
  case demand.kind:
  of AttributeKinds:
    if demand.kind == tkAttributeExists:
      result = "[" & demand.attrName & "]"
    else:
      result = "[" & demand.attrName & demand.kind.attrComparerString &
        "'" & demand.attrValue & "']"
  of tkPseudoNot:
    result = ":" & $demand.kind & "(" & $demand.notQuery & ")"
  of NthKinds:
    result = ":" & $demand.kind & "(" & $demand.a & "n, " & $demand.b & ")"
  of PseudoNoParamsKinds:
    result = ":" & $demand.kind
  of tkElement:
    result = demand.element
  else:
    result = $demand.kind

func `==`(d1, d2: Demand): bool =
  ## Structural equality. Demands of different kinds are never equal.
  if d1.kind != d2.kind: return false
  case d1.kind
  of AttributeKinds:
    return d1.attrName == d2.attrName and d1.attrValue == d2.attrValue
  of NthKinds:
    # Both coefficients have to agree (previously `a` was compared to `b`).
    return d1.a == d2.a and d1.b == d2.b
  of tkPseudoNot:
    return d1.notQuery == d2.notQuery
  of tkElement:
    return d1.element == d2.element
  else:
    # Kinds without payload (e.g. `:empty`) are equal when the kinds match.
    return true

iterator siblings(pair: NodeWithContext,
                  startAtIndex = 0): Element =
  ## Yields the element children of `pair.parent`, starting at child index
  ## `startAtIndex`. The node of `pair` itself is included when in range.
  if pair.parent != nil:
    var idx = startAtIndex
    while idx < pair.parent.childList.len:
      let node = pair.parent.childList[idx]
      if node of Element:
        yield Element(node)
      idx.inc

func initToken(kind: TokenKind, value: string = ""): Token =
  return Token(kind: kind, value: value)

func initQueryPart(demands: seq[Demand], combinator: Combinator): QueryPart =
  return QueryPart(demands: demands, combinator: combinator)

func canFindMultiple(q: seq[QueryPart], options: set[QueryOption]): bool =
  ## Returns true if the subquery ``q`` can match multiple elements.
  ##
  ## Only an exact id match (`#foo` / `[id=foo]`) under `optUniqueIds` is
  ## treated as unique. Other operators on `id` (e.g. `[id^=foo]`) and
  ## positional pseudo classes (e.g. `div > p:first-child`, one match per
  ## parent) can match several elements, so they must not stop the search.
  let lastPart = q[^1]
  if optUniqueIds in options:
    for demand in lastPart.demands:
      if demand.kind == tkAttributeExact and demand.attrName == "id":
        return false
  return true

func `$`*(q: Query): string =
  ## Returns the original input string used to construct the query
  result = q.queryStr

func isValidNotQuery(q: Query, options: set[QueryOption]): bool =
  ## A `:not(...)` argument must be one subquery with one part; with
  ## `optSimpleNot` that part may only carry a single demand.
  return
    q.subqueries.len == 1 and
    q.subqueries[0].len == 1 and
    (q.subqueries[0][0].demands.len == 1 or not (optSimpleNot in options))

func readEscape(input: string, idx: var int, buffer: var string) =
  ## Reads the escape sequence starting at the backslash `input[idx]`, appends
  ## the decoded text to `buffer` and leaves `idx` after the sequence.
  ## Raises `ParseError` if the input ends right after the backslash or the
  ## escaped character is a line feed, carriage return or form feed.
  assert input[idx] == '\\'
  idx.inc

  if idx > high(input):
    raise newException(ParseError, "Unexpected end of input after '\\'")

  # Linefeed, carriage return and form feed can't be escaped.
  if input[idx] in {'\x0C', '\x0D', '\x0A'}:
    raise newUnexpectedCharacterException(input[idx])

  # No special handling is required for these.
  # E.g '\n' means 'n', not 'newline'.
  if input[idx] notin HexDigits:
    # FIXME: Should this read a grapheme instead of a rune? I don't know
    let runeStr = input.runeAt(idx).toUTF8
    buffer.add runeStr
    idx.inc runeStr.len

  else:
    var hexStr = ""

    # At most six hex digits; stop at the end of the input.
    while idx < input.len and input[idx] in HexDigits and hexStr.len < 6:
      hexStr.add input[idx]
      idx.inc

    # Skip whitespace after hex input
    if idx < input.len and input[idx] in CssWhitespace:
      idx.inc

    try:
      let runeStr = hexStr.parseHexInt.Rune.toUTF8
      buffer.add runeStr
    except ValueError:
      raiseAssert "Can't happen"

func readStringLiteral(input: string, idx: var int, buffer: var string) =
  ## Reads a quoted string starting at the quote `input[idx]`, appending the
  ## unescaped content to `buffer`. `idx` ends after the closing quote.
  ## Raises `ParseError` if the closing quote is missing.
  assert input[idx] in {'\'', '"'}

  let quote = input[idx]
  idx.inc

  while true:
    # Checked before every read so that e.g. `[a="` cannot index past the end.
    if idx > high(input):
      raise newException(ParseError, "Non-terminated string")
    if input[idx] == quote:
      break
    if input[idx] == '\\':
      readEscape(input, idx, buffer)
    else:
      buffer.add input[idx]
      idx.inc

  idx.inc

func readIdentifier(input: string, idx: var int, buffer: var string) =
  ## Reads an identifier that may contain non-ASCII characters, appending it
  ## to `buffer`. Raises `ParseError` when the identifier starts with '_',
  ## a digit, or '-' followed by '-' or a digit.
  const intIdentifiers = {
    'a' .. 'z', 'A' .. 'Z',
    '0' .. '9',
    '-', '_', '\\'
  }

  # The offending character is the first one here ...
  if input[idx] == '_' or input[idx] in Digits:
    raise newUnexpectedCharacterException(input[idx])
  # ... and the second one here (safeCharCompare guarantees it exists).
  if input[idx] == '-' and input.safeCharCompare(idx + 1, {'-'} + Digits):
    raise newUnexpectedCharacterException(input[idx + 1])

  func isValidIdentifier(rune: Rune): bool =
    let code = rune.int32
    # Only convert to `char` inside the ASCII range; larger code points
    # do not fit into a `char`.
    if code >= 0 and code <= 0x7F:
      return char(code) in intIdentifiers
    # Spec: https://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
    return code >= 0x00A0

  while idx < input.len:
    # NOTE: `idx` is the byte offset of input, so `runeAt(idx)` is correct.
    let rune = input.runeAt(idx)

    if not isValidIdentifier(rune):
      break

    if rune == '\\'.Rune:
      readEscape(input, idx, buffer)
    else:
      let unicodeCh = $rune
      idx.inc unicodeCh.len
      buffer.add unicodeCh

func readIdentifierAscii(input: string, idx: var int, buffer: var string) =
  ## ASCII-only variant of `readIdentifier`, used when
  ## `optUnicodeIdentifiers` is not set.
  if input[idx] == '-' and input.safeCharCompare(idx + 1, {'-'} + Digits):
    raise newUnexpectedCharacterException(input[idx + 1])

  # The bounds check has to come first, otherwise an identifier at the very
  # end of the input indexes one past the last character.
  while idx < input.len and input[idx] in Identifiers:
    if input[idx] == '\\':
      readEscape(input, idx, buffer)
    else:
      buffer.add input[idx]
      idx.inc

func readParams(input: string, idx: var int, buffer: var string) =
  ## Reads a parenthesised argument list starting at the '(' `input[idx]`.
  ## The raw text between the outer parentheses is appended to `buffer`
  ## (nested parentheses and quoted strings are copied as-is) and `idx` ends
  ## after the closing ')'. Raises `ParseError` if the list is not closed.
  # Fragile, ugly, ok
  var paramContextCount = 0
  var dblQuoteStringContext = false
  var sglQuoteStringContext = false

  template requireMore() =
    if idx > high(input):
      raise newException(ParseError,
        "Non-terminated pseudo argument list")

  idx.inc
  requireMore()

  while input[idx] != ')' or paramContextCount > 0 or
      dblQuoteStringContext or sglQuoteStringContext:
    if input[idx] == '"' and not sglQuoteStringContext:
      dblQuoteStringContext = not dblQuoteStringContext

    if input[idx] == '\'' and not dblQuoteStringContext:
      sglQuoteStringContext = not sglQuoteStringContext

    if input[idx] == '(' and not dblQuoteStringContext and
        not sglQuoteStringContext:
      paramContextCount.inc

    if input[idx] == ')' and not dblQuoteStringContext and
        not sglQuoteStringContext:
      paramContextCount.dec

    # Copy an escape together with the character that follows it.
    if input[idx] == '\\':
      buffer.add input[idx]
      idx.inc
      requireMore()

    buffer.add input[idx]
    idx.inc
    requireMore()

  idx.inc

func parsePseudoNthArguments(input: string): tuple[a: int, b: int] =
  ## Parses the argument of an `:nth-*` pseudo class (`odd`, `even`, `b`,
  ## `an` or `an+b`) into its coefficients. Raises `ParseError` on malformed
  ## or overflowing input.
  var buffer = ""
  var idx = 0
  idx.inc skipWhile(input, CssWhitespace, idx)

  template takeInt: int =
    var v: int
    try:
      v = buffer.parseInt
      buffer = ""
    # NOTE: This branch can only be taken in case of overflow
    except ValueError as err:
      raise newException(ParseError, err.msg)
    v

  if idx + 2 < input.len and input[idx..idx+2].cmpIgnoreCase("odd") == 0:
    result = (2, 1)
    idx.inc 3
  elif idx + 3 < input.len and input[idx..idx+3].cmpIgnoreCase("even") == 0:
    result = (2, 0)
    idx.inc 4
  else:
    if idx < input.len and input[idx] in {'+', '-'}:
      buffer.add input[idx]
      idx.inc
    if idx >= input.len:
      raise newException(ParseError, "Invalid parameter for ':nth-*'")
    if input[idx] notin Digits:
      # A bare "n", "+n" or "-n" has an implicit coefficient of 1.
      buffer.add "1"
    while idx < input.len and input[idx] in Digits:
      buffer.add input[idx]
      idx.inc
    if idx < input.len and input[idx] in {'n', 'N'}:
      idx.inc
      result.a = takeInt()
      idx.inc skipWhile(input, CssWhitespace, idx)
      if idx < input.len and input[idx] in {'+', '-'}:
        buffer.add input[idx]
        idx.inc
        idx.inc skipWhile(input, CssWhitespace, idx)
        # "2n+" with nothing after the sign must fail as a parse error,
        # not by indexing past the end of the input.
        if idx >= input.len:
          raise newException(ParseError, "Invalid parameter for ':nth-*'")
        if input[idx] notin Digits:
          raise newUnexpectedCharacterException(input[idx])
        while idx < input.len and input[idx] in Digits:
          buffer.add input[idx]
          idx.inc
        result.b = takeInt()
      else:
        discard # done, only "a" was specified
    else:
      result.b = takeInt()

  idx.inc skipWhile(input, CssWhitespace, idx)
  if idx <= input.high:
    raise newUnexpectedCharacterException(input[idx])

func initPseudoToken(str: string): Token =
  ## Maps a lower-cased pseudo selector (including the leading ':') to its
  ## token. Raises `ParseError` for unknown pseudo selectors.
  let kind = case str
    of ":empty": tkPseudoEmpty
    of ":only-child": tkPseudoOnlyChild
    of ":only-of-type": tkPseudoOnlyOfType
    of ":first-child": tkPseudoFirstChild
    of ":last-child": tkPseudoLastChild
    of ":last-of-type": tkPseudoLastOfType
    of ":first-of-type": tkPseudoFirstOfType
    of ":not": tkPseudoNot
    of ":nth-child": tkPseudoNthChild
    of ":nth-last-child": tkPseudoNthLastChild
    of ":nth-of-type": tkPseudoNthOfType
    of ":nth-last-of-type": tkPseudoNthLastOfType
    else:
      raise newException(ParseError, "Unknown pseudo selector: " & str)
  result = initToken(kind)

func isFinishedSimpleSelector(prev: Token, prevPrev: Token): bool =
  # Checks if the last two tokens represents the end of a simple selector.
  # This is needed to determine if a space is significant or not.
  if prev.kind in {tkBracketEnd, tkParam, tkElement} + PseudoNoParamsKinds:
    return true
  if prev.kind == tkIdentifier and prevPrev.kind in {tkClass, tkId}:
    return true

proc forward(lexer: var Lexer) =
  ## Advances the lexer by one token: `current` becomes the old `next` and
  ## `next` becomes the newly lexed token (`tkEoi` at the end of the input).
  ## Insignificant whitespace and a bare `*` produce no token; the lexer
  ## then continues with the following character.
  if lexer.pos > lexer.input.high:
    lexer.current = lexer.next
    lexer.next = initToken(tkEoi)
    return

  let ch = lexer.input[lexer.pos]
  var skip = false
  var token: Token
  log "char: '" & ch & "'"

  case ch:

  of {'"', '\''}:
    var buffer = ""
    readStringLiteral(lexer.input, lexer.pos, buffer)
    token = initToken(tkString, buffer)

  of CssWhitespace:
    # Whitespace is a descendant combinator only directly after a finished
    # simple selector and when no explicit combinator or comma follows.
    if lexer.pos + 1 < lexer.input.len and
        lexer.input[lexer.pos + 1] notin Combinators and
        isFinishedSimpleSelector(lexer.next, lexer.current):
      token = initToken(tkCombinatorDescendents)
    else:
      skip = true

    lexer.pos.inc

  of '~':
    if lexer.input.safeCharCompare(lexer.pos + 1, '='):
      token = initToken(tkAttributeItem)
      lexer.pos.inc 2
    else:
      token = initToken(tkCombinatorSiblings)
      lexer.pos.inc

  of '+':
    token = initToken(tkCombinatorNextSibling)
    lexer.pos.inc

  of '>':
    token = initToken(tkCombinatorChildren)
    lexer.pos.inc

  of '[':
    token = initToken(tkBracketStart)
    lexer.pos.inc

  of ']':
    token = initToken(tkBracketEnd)
    lexer.pos.inc

  of ':':
    var buffer = ""
    buffer.add ch
    lexer.pos.inc
    while lexer.pos <= lexer.input.high and
        lexer.input[lexer.pos] in Identifiers:
      buffer.add lexer.input[lexer.pos]
      lexer.pos.inc

    token = initPseudoToken(buffer.toLowerAscii)

  of '#':
    lexer.pos.inc
    token = initToken(tkId)

  of '.':
    lexer.pos.inc
    token = initToken(tkClass)

  of '*':
    if lexer.input.safeCharCompare(lexer.pos + 1, '='):
      token = initToken(tkAttributeSubstring)
      lexer.pos.inc 2
    else:
      lexer.pos.inc
      # No need to emit since tkUniversal matches everything?
      # token = initToken(tkUniversal)
      skip = true

  of '(':
    var buffer = ""
    readParams(lexer.input, lexer.pos, buffer)
    token = initToken(tkParam, buffer)

  of '=':
    token = initToken(tkAttributeExact)
    lexer.pos.inc

  # For '|', '^' and '$' a missing '=' leaves `token` as tkInvalid, which is
  # reported as an unexpected character below.
  of '|':
    if lexer.input.safeCharCompare(lexer.pos + 1, '='):
      token = initToken(tkAttributePipe)
      lexer.pos.inc 2

  of '^':
    if lexer.input.safeCharCompare(lexer.pos + 1, '='):
      token = initToken(tkAttributeStart)
      lexer.pos.inc 2

  of '$':
    if lexer.input.safeCharCompare(lexer.pos + 1, '='):
      token = initToken(tkAttributeEnd)
      lexer.pos.inc 2

  of ',':
    token = initToken(tkComma)
    lexer.pos.inc

  else:
    var buffer = ""
    if optUnicodeIdentifiers in lexer.options:
      readIdentifier(lexer.input, lexer.pos, buffer)
    else:
      readIdentifierAscii(lexer.input, lexer.pos, buffer)

    if buffer.len == 0:
      let rune = lexer.input.runeAt(lexer.pos)
      raise newUnexpectedCharacterException($rune)

    # An identifier at the start of a compound selector is an element name.
    if lexer.next.kind in CombinatorKinds + {tkComma, tkInvalid}:
      token = initToken(tkElement, buffer.toLowerAscii)
    else:
      token = initToken(tkIdentifier, buffer)

  if not skip:
    if token.kind == tkInvalid:
      raise newUnexpectedCharacterException(ch)

    # TODO: It might be wise to perform some validation here.
    # e.g tkParam is only valid after tkPseudoNot tkPseudoNth*
    lexer.current = lexer.next
    lexer.next = token
  else:
    lexer.forward

proc initLexer(input: string, options: set[QueryOption]): Lexer =
  ## Creates a lexer over the stripped `input` and primes `current`/`next`.
  # TODO: Get rid of strip
  result.input = strutils.strip(input)
  result.pos = 0
  result.options = options
  forward(result)
  forward(result)

proc eat(lexer: var Lexer, kind: set[TokenKind]): Token =
  ## Advances if the upcoming token is one of `kind` and returns it;
  ## raises `ParseError` (with an empty message) otherwise.
  if lexer.next.kind notin kind:
    raise newException(ParseError, "")
  lexer.forward()
  result = lexer.current

proc eat(lexer: var Lexer, kind: TokenKind): Token {.inline.} =
  lexer.eat({kind})

func hasAttr(node: Node, attr: string): bool {.inline.} =
  ## True if `node` is an element that carries the attribute `attr`.
  if not (node of Element): return false
  let
    e = Element(node)
    attrs = e.getAttrs()
  return (e.attrs.len > 0) and attr in attrs

func validateNth(a, b, nSiblings: int): bool =
  ## Checks whether an element with `nSiblings` counted siblings on the
  ## relevant side satisfies `an+b`, i.e. whether `nSiblings + 1 == a*n + b`
  ## for some integer `n >= 0`.
  if a == 0:
    return nSiblings == b - 1
  let n = (nSiblings - (b - 1)) / a
  return n.floor == n and n >= 0

func satisfies(pair: NodeWithContext, demands: seq[Demand]): bool {.raises: [], gcsafe.}

func satisfies(pair: NodeWithContext, demand: Demand): bool =
  ## Returns true if the node of `pair` is an element that fulfils `demand`.

  if not (pair.node of Element): return false
  let node = Element(pair.node)

  case demand.kind
  of tkAttributeExists:
    return node.hasAttr(demand.attrName)

  of tkAttributeItem:
    return node.hasAttr(demand.attrName) and
      (demand.attrValue.len > 0) and
      demand.attrValue in node.getAttr(demand.attrName).split(CssWhitespace)

  # Matches the exact value or the value followed by '-'.
  # Empty attrValue is allowed,
  # and will match any value starting with '-'
  of tkAttributePipe:
    if not node.hasAttr(demand.attrName):
      return false
    let value = node.getAttr(demand.attrName)
    return value == demand.attrValue or
      value.startsWith(demand.attrValue & "-")

  of tkAttributeExact:
    # `[attr=""]` must not match elements that lack the attribute.
    # NOTE(review): assumes `getAttr` yields "" for a missing attribute;
    # confirm against dom_utils.
    return node.getAttr(demand.attrName) == demand.attrValue and
      (demand.attrValue.len > 0 or node.hasAttr(demand.attrName))

  of tkAttributeStart:
    return demand.attrValue.len > 0 and
      node.getAttr(demand.attrName).startsWith(demand.attrValue)

  of tkAttributeEnd:
    return demand.attrValue.len > 0 and
      node.getAttr(demand.attrName).endsWith(demand.attrValue)

  of tkAttributeSubstring:
    # The demanded value has to occur inside the attribute value.
    return demand.attrValue.len > 0 and
      demand.attrValue in node.getAttr(demand.attrName)

  of tkElement:
    return $node.tagType() == demand.element

  of tkPseudoEmpty:
    return node.childList.len == 0

  of tkPseudoOnlyChild:
    for sibling in pair.siblings:
      if sibling != node:
        return false
    return true

  of tkPseudoOnlyOfType:
    for sibling in pair.siblings:
      if sibling != node and sibling.tagType() == node.tagType():
        return false
    return true

  of tkPseudoFirstChild:
    return pair.elementIndex == 0

  of tkPseudoLastChild:
    # Any element sibling after this node disqualifies it.
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      return false
    return true

  of tkPseudoFirstOfType:
    # Decided by the first sibling with the same tag.
    for sibling in pair.siblings:
      if sibling.tagType() == node.tagType():
        return sibling == node

  of tkPseudoLastOfType:
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      if sibling.tagType() == node.tagType():
        return false
    return true

  of tkPseudoNot:
    return not pair.satisfies(demand.notQuery.demands)

  of tkPseudoNthChild:
    return validateNth(demand.a, demand.b, pair.elementIndex)

  of tkPseudoNthLastChild:
    var nSiblingsAfter = 0
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      nSiblingsAfter.inc
    return validateNth(demand.a, demand.b, nSiblingsAfter)

  of tkPseudoNthOfType:
    var nSiblingsOfTypeBefore = 0
    for sibling in pair.siblings:
      if sibling == node:
        break
      elif sibling.tagType() == node.tagType():
        nSiblingsOfTypeBefore.inc

    return validateNth(demand.a, demand.b, nSiblingsOfTypeBefore)

  of tkPseudoNthLastOfType:
    var nSiblingsOfTypeAfter = 0
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      if sibling.tagType() == node.tagType():
        nSiblingsOfTypeAfter.inc

    return validateNth(demand.a, demand.b, nSiblingsOfTypeAfter)
  else:
    raiseAssert "Invalid demand: " & $demand

func satisfies(pair: NodeWithContext, demands: seq[Demand]): bool =
  ## True if the node of `pair` fulfils every demand (vacuously true for none).
  for demand in demands:
    if not pair.satisfies(demand):
      return false
  return true

func exec*(query: Query, node: Node, single: bool): seq[Element] =
  ## Execute an already parsed query. If `single = true`,
  ## it will never return more than one element.
  ##
  ## Only the children of `node` and their descendants are searched.
  ## An element is returned once even if several comma separated
  ## subqueries match it.

  var initialStates = initHashSet[(int, int)]()
  for idx in 0 ..< query.subqueries.len:
    initialStates.incl (idx, 0)

  # Start at the first *element* child so that `elementIndex` is correct for
  # root-level elements, and so that a root without element children (or
  # without any children) is handled without indexing an empty child list.
  var firstIdx = 0
  while firstIdx < node.childList.len and
      not (node.childList[firstIdx] of Element):
    firstIdx.inc
  if firstIdx >= node.childList.len:
    return

  var stack = @[NodeWithContext(parent: node, index: firstIdx,
                                elementIndex: 0, searchStates: initialStates)]
  # Certain queries (e.g queries ending with an id selector) can be eliminated
  # and doesn't need to be checked anymore after the first match.
  # These seqs are mapped to the subqueries by index.
  var subqueryCanBeEliminated = newSeq[bool](query.subqueries.len)
  var subqueryIsEliminated = newSeq[bool](query.subqueries.len)

  for idx, subquery in query.subqueries:
    subqueryCanBeEliminated[idx] = not canFindMultiple(subquery, query.options)

  while stack.len > 0:
    let entry = stack.pop()

    # Search states that should be forwarded to children
    var forChildren = initHashSet[(int, int)]()
    # Search states that should be forwarded to siblings
    var forSiblings = initHashSet[(int, int)]()
    # Every element is visited once, so a per-entry flag is enough to keep
    # the result free of duplicates.
    var alreadyAdded = false

    for searchState in entry.searchStates:
      if subqueryIsEliminated[searchState[0]]:
        continue

      let subquery = query.subqueries[searchState[0]]
      let subqueryPart = subquery[searchState[1]]

      if subqueryPart.combinator == cmDescendants or
          subqueryPart.combinator == cmRoot:
        forChildren.incl searchState
        forSiblings.incl searchState
      elif subqueryPart.combinator == cmSiblings or
          subqueryPart.combinator == cmChildren:
        forSiblings.incl searchState

      if entry.satisfies(subqueryPart.demands):
        if searchState[1] + 1 == subquery.len:
          if not alreadyAdded:
            result.add Element(entry.node)
            alreadyAdded = true
          if single:
            return
          if subqueryCanBeEliminated[searchState[0]]:
            subqueryIsEliminated[searchState[0]] = true
        else:
          let nextSubqueryPart = subquery[searchState[1] + 1]
          if nextSubqueryPart.combinator == cmChildren or
              nextSubqueryPart.combinator == cmDescendants:
            forChildren.incl (searchState[0], searchState[1] + 1)
          elif nextSubqueryPart.combinator == cmNextSibling or
              nextSubqueryPart.combinator == cmSiblings:
            forSiblings.incl (searchState[0], searchState[1] + 1)

    # Below results in a depth first search.

    # Add next sibling to stack
    if entry.parent != nil:
      var idx = entry.index + 1
      while idx < entry.parent.childList.len and
          not (entry.parent.childList[idx] of Element):
        idx.inc
      if idx < entry.parent.childList.len:
        stack.add NodeWithContext(
          parent: entry.parent,
          index: idx,
          elementIndex: entry.elementIndex + 1,
          searchStates: forSiblings)

    # Add first child to stack
    if entry.node.childList.len > 0:
      var idx = 0
      while idx < entry.node.childList.len and
          not (entry.node.childList[idx] of Element):
        idx.inc
      if idx < entry.node.childList.len:
        stack.add NodeWithContext(
          parent: entry.node,
          index: idx,
          elementIndex: 0,
          searchStates: forChildren)

func parseHtmlQuery*(queryString: string,
                     options: set[QueryOption] = DefaultQueryOptions): Query
                     {.raises: [ParseError].} =
  ## Parses a query for later use.
  ## Raises `ParseError` if parsing of `queryString` fails.
  result.queryStr = queryString
  var parts = newSeq[QueryPart]()
  var demands = newSeq[Demand]()
  var lexer = initLexer(queryString, options)
  var combinator = cmRoot

  try:
    while true:
      case lexer.current.kind

      of tkClass:
        demands.add initAttributeDemand(tkAttributeItem, "class",
          lexer.eat(tkIdentifier).value)

      of tkId:
        demands.add initAttributeDemand(tkAttributeExact, "id",
          lexer.eat(tkIdentifier).value)

      of tkElement:
        demands.add initElementDemand(lexer.current.value)

      of tkBracketStart:
        let f = lexer.eat(tkIdentifier)
        let nkind = lexer.next.kind
        case nkind
        of AttributeKinds - {tkAttributeExists}:
          discard lexer.eat(nkind)
          let v = lexer.eat({tkIdentifier, tkString})
          demands.add initAttributeDemand(nkind, f.value, v.value)
          discard lexer.eat(tkBracketEnd)
        of tkBracketEnd:
          demands.add initAttributeDemand(tkAttributeExists,
            f.value, "")
          discard lexer.eat(tkBracketEnd)
        else:
          raise newException(ParseError, "")

      of PseudoNoParamsKinds:
        demands.add initPseudoDemand(lexer.current.kind)

      of PseudoParamsKinds:
        let pseudoKind = lexer.current.kind
        let params = lexer.eat(tkParam)
        case pseudoKind
        of tkPseudoNot:
          # Not the cleanest way to this, but eh
          let notQuery = parseHtmlQuery(params.value, options)

          if not notQuery.isValidNotQuery(options):
            raise newException(ParseError,
              ":not argument must be a simple selector, but " &
              "was '" & params.value & "'")

          demands.add initNotDemand(notQuery.subqueries[0][0])
        of NthKinds:
          let (a, b) = parsePseudoNthArguments(params.value)
          demands.add initNthChildDemand(pseudoKind, a, b)
        else: doAssert(false) # can't happen

      of CombinatorKinds:
        # A combinator closes the current part and applies to the next one.
        parts.add initQueryPart(demands, combinator)
        demands = @[]
        combinator = lexer.current.kind.ord.Combinator

      of tkComma:
        # A comma closes the current subquery.
        parts.add initQueryPart(demands, combinator)
        result.subqueries.add parts
        demands = @[]
        parts = @[]
        combinator = cmRoot

      of tkIdentifier, tkString, tkBracketEnd,
          tkParam, tkInvalid, AttributeKinds:
        raise newException(ParseError, "")

      of tkEoi:
        break

      lexer.forward()
  except ParseError as err:
    let msg =
      if err.msg == "":
        "Failed to parse CSS query '" & queryString & "'"
      else:
        "Failed to parse CSS query '" & queryString & "': " & err.msg
    raise newException(ParseError, msg)

  parts.add initQueryPart(demands, combinator)
  result.subqueries.add parts
  result.options = options

  log "\ninput: \n" & queryString

func querySelector*(root: Node, queryString: string,
                    options: set[QueryOption] = DefaultQueryOptions): Element
                    {.raises: [ParseError].} =
  ## Get the first element matching `queryString`,
  ## or `nil` if no such element exists.
  ## Raises `ParseError` if parsing of `queryString` fails.
  let query = parseHtmlQuery(queryString, options)
  let lst = query.exec(root, single = true)
  if lst.len > 0:
    lst[0]
  else:
    nil

func querySelector*(root: seq[Node], queryString: string, # MADE BY ME (supporting HTML Fragment)
                    options: set[QueryOption] = DefaultQueryOptions): Element
                    {.raises: [ParseError].} =
  ## Get the first element matching `queryString`,
  ## or `nil` if no such element exists.
  ## Raises `ParseError` if parsing of `queryString` fails.
  let query = parseHtmlQuery(queryString, options)
  let lst = query.exec(root.makeElemRoot(), single = true)
  if lst.len > 0:
    lst[0]
  else:
    nil

func querySelectorAll*(root: Node, queryString: string,
                       options: set[QueryOption] = DefaultQueryOptions):
                       seq[Element] {.raises: [ParseError].} =
  ## Get all elements matching `queryString`.
  ## Raises `ParseError` if parsing of `queryString` fails.
  let query = parseHtmlQuery(queryString, options)
  result = query.exec(root, single = false)

func querySelectorAll*(root: seq[Node], queryString: string,
                       options: set[QueryOption] = DefaultQueryOptions):
                       seq[Element] {.raises: [ParseError].} =
  ## Get all elements matching `queryString`.
  ## Raises `ParseError` if parsing of `queryString` fails.
  let query = parseHtmlQuery(queryString, options)
  result = query.exec(root.makeElemRoot(), single = false)

--------------------------------------------------------------------------------
/src/css3selectors/dom_utils.nim:
--------------------------------------------------------------------------------
import std/sugar
import pkg/[chame/minidom]

func escapeText(s: string, attribute_mode = false): string =
  ## Escapes `s` for serialisation as HTML. '&' and the UTF-8 encoded
  ## no-break space (0xC2 0xA0) are always replaced; '"' only in
  ## `attribute_mode`; '<' and '>' only outside of it.
  var nbsp_mode = false
  var nbsp_prev: char
  for c in s:
    if nbsp_mode:
      # The previous byte was 0xC2: decide whether it started a no-break space.
      if c == char(0xA0):
        result &= "&nbsp;"
      else:
        result &= nbsp_prev & c
      nbsp_mode = false
    elif c == '&':
      result &= "&amp;"
    elif c == char(0xC2):
      nbsp_mode = true
      nbsp_prev = c
    elif attribute_mode and c == '"':
      result &= "&quot;"
    elif not attribute_mode and c == '<':
      result &= "&lt;"
    elif not attribute_mode and c == '>':
      result &= "&gt;"
    else:
      result &= c
  # Do not drop a held-back 0xC2 when it was the last byte of the input.
  if nbsp_mode:
    result &= nbsp_prev

28 | func `$`*(node: Node): string = 29 | if node of Element: 30 | let element = Element(node) 31 | var x = "" 32 | if element.namespace == Namespace.SVG: 33 | x = "svg " 34 | elif
element.namespace == Namespace.MATHML: 35 | x = "math " 36 | result = "<" & x & element.localNameStr 37 | for k, v in element.attrsStr: 38 | result &= ' ' & k & "=\"" & v.escapeText(true) & "\"" 39 | result &= ">" 40 | for node in element.childList: 41 | result &= $node 42 | result &= "" & x & element.localNameStr & ">" 43 | elif node of Text: 44 | let text = Text(node) 45 | result = text.data.escapeText() 46 | elif node of Comment: 47 | result = "" 48 | elif node of DocumentType: 49 | result = "" 50 | elif node of Document: 51 | result = "Node of Document" 52 | else: 53 | assert false 54 | 55 | func makeElemRoot*(list: seq[Node]): Element = 56 | result = Element() 57 | for node in list: 58 | if node of Element: 59 | result.childList.add(node) 60 | else: 61 | for n in node.childList: # probably a hack but whatever.. for now ;) 62 | if n of Element: 63 | result.childList.add(n) 64 | break 65 | 66 | #func getAttr*(e: Element; key: string): string {.inline} = 67 | # let factory = e.document.factory 68 | # for attr in e.attrs: 69 | # if factory.atomToStr(attr.name) == key: return attr.value 70 | # return "" 71 | 72 | func getAttr*(e: Element; key: string): string {.inline} = 73 | let factory = e.document.factory 74 | let atomizedKey = factory.strToAtom(key) 75 | for attr in e.attrs: 76 | if attr.name == atomizedKey: return attr.value 77 | return "" 78 | 79 | func getAttrs*(e: Element): seq[string] {.inline.} = 80 | let factory = e.document.factory 81 | result = collect(for attr in e.attrs: factory.atomToStr(attr.name)) 82 | -------------------------------------------------------------------------------- /tests/config.nims: -------------------------------------------------------------------------------- 1 | switch("path", "$projectDir/../src") 2 | -------------------------------------------------------------------------------- /tests/test.nim: -------------------------------------------------------------------------------- 1 | import std/[unittest, streams] 2 | import 
pkg/chame/minidom 3 | import css3selectors 4 | 5 | const html = """ 6 | 7 | 8 | 9 |189 | 190 | 191 | 192 |
1
213 |2
214 |3
215 |4
216 |1
,2
,3
,4
]") 568 | 569 | test "issue11": 570 | let fragment = parseHTMLFragment(""" 571 |