├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── changelog.md ├── nimquery.nim ├── nimquery.nimble └── tests ├── incltests.nim └── tests.nim /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the action will run. Triggers the workflow on push or pull request 6 | # events but only for the master branch 7 | on: 8 | push: 9 | branches: [ master ] 10 | pull_request: 11 | branches: [ master ] 12 | 13 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 14 | jobs: 15 | # This workflow contains a single job called "build" 16 | build: 17 | # The type of runner that the job will run on 18 | runs-on: ubuntu-latest 19 | 20 | strategy: 21 | matrix: 22 | nim: [ 'devel', 'stable' ] 23 | 24 | # Steps represent a sequence of tasks that will be executed as part of the job 25 | steps: 26 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 27 | - uses: actions/checkout@v2 28 | 29 | - name: Setup Nim environment 30 | uses: jiro4989/setup-nim-action@v1 31 | with: 32 | nim-version: ${{ matrix.nim }} 33 | 34 | - run: nimble test -Y 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nimcache/ 2 | nimsuggest.log 3 | .vscode/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017, Oscar Nihlgård 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the 
rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nimquery  2 | A library for querying HTML using CSS selectors, like JavaScripts `document.querySelector`/`document.querySelectorAll`. 3 | 4 | ## Installation 5 | 6 | Nimquery is available on Nimble: 7 | ``` 8 | nimble install nimquery 9 | ``` 10 | 11 | ## Usage 12 | ```nim 13 | from xmltree import `$` 14 | from htmlparser import parseHtml 15 | import nimquery 16 | 17 | let html = """ 18 | 19 | 20 |
1
23 |2
24 |3
25 |4
26 | 27 | 28 | """ 29 | let xml = parseHtml(html) 30 | let elements = xml.querySelectorAll("p:nth-child(odd)") 31 | echo elements 32 | # => @[1
,3
] 33 | ``` 34 | 35 | ## API 36 | 37 | ```nim 38 | proc querySelectorAll*(root: XmlNode, 39 | queryString: string, 40 | options: set[QueryOption] = DefaultQueryOptions): seq[XmlNode] 41 | ``` 42 | Get all elements matching `queryString`. 43 | Raises `ParseError` if parsing of `queryString` fails. 44 | See [Options](#options) for information about the `options` parameter. 45 | 46 | - - - 47 | 48 | ```nim 49 | proc querySelector*(root: XmlNode, 50 | queryString: string, 51 | options: set[QueryOption] = DefaultQueryOptions): XmlNode 52 | ``` 53 | Get the first element matching `queryString`, or `nil` if no such element exists. 54 | Raises `ParseError` if parsing of `queryString` fails. 55 | See [Options](#options) for information about the `options` parameter. 56 | 57 | - - - 58 | 59 | ```nim 60 | proc parseHtmlQuery*(queryString: string, 61 | options: set[QueryOption] = DefaultQueryOptions): Query 62 | ``` 63 | Parses a query for later use. 64 | Raises `ParseError` if parsing of `queryString` fails. 65 | See [Options](#options) for information about the `options` parameter. 66 | 67 | - - - 68 | 69 | ```nim 70 | proc exec*(query: Query, 71 | root: XmlNode, 72 | single: bool): seq[XmlNode] 73 | ``` 74 | Execute an already parsed query. If `single = true`, it will never return more than one element. 75 | 76 | ### Options 77 | The `QueryOption` enum contains flags for configuring the behavior when parsing/searching: 78 | 79 | - `optUniqueIds`: Indicates if id attributes should be assumed to be unique. 80 | - `optSimpleNot`: Indicates if only simple selectors are allowed as an argument to the `:not(...)` pseudo-class. Note that combinators are not allowed in the argument even if this flag is excluded. 81 | - `optUnicodeIdentifiers`: Indicates if unicode characters are allowed inside identifiers. Doesn't affect strings where unicode is always allowed. 
82 | 83 | The default options are defined as `const DefaultQueryOptions* = { optUniqueIds, optUnicodeIdentifiers, optSimpleNot }`. 84 | 85 | Below is an example of using the options parameter to allow a complex `:not(...)` selector. 86 | 87 | ```nim 88 | import xmltree 89 | import htmlparser 90 | import streams 91 | import nimquery 92 | 93 | let html = """ 94 | 95 | 96 |1
99 |2
100 |3
101 |4
102 | 103 | 104 | """ 105 | let xml = parseHtml(newStringStream(html)) 106 | let options = DefaultQueryOptions - { optSimpleNot } 107 | let elements = xml.querySelectorAll("p:not(.maybe-skip:nth-child(even))", options) 108 | echo elements 109 | # => @[1
,3
,4
] 110 | ``` 111 | 112 | ## Unsupported selectors 113 | Nimquery supports all [CSS3 selectors](https://www.w3.org/TR/css3-selectors) except the following: `:root`, `:link`, `:visited`, `:active`, `:hover`, `:focus`, `:target`, `:lang(...)`, `:enabled`, `:disabled`, `:checked`, `::first-line`, `::first-letter`, `::before`, `::after`. These selectors will not be implemented because they don't make much sense in the situations where Nimquery is useful. 114 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | Version 1.2.2 (2019-06-30) 2 | ============= 3 | - Fixed compatibility with Nim v0.20.0 4 | 5 | Version 1.2.1 (2019-04-18) 6 | ============= 7 | - Fixed #3 8 | 9 | Version 1.2.0 (2019-01-23) 10 | ============= 11 | - Improved error handling: 12 | * `ParseError` now inherits from `ValueError` instead of `Exception` 13 | * `ParseError` is now the only catchable exception that will be raised by exported Nimquery procs (this was already the documented behavior, but wasn't true for some edge cases). 14 | * Exception messages now always contain the full query that caused the exception. 15 | - CSS pseudo selectors are now case insensitive. 16 | - Now requires Nim 0.19.2 or later. 17 | - Much more strict and correct parsing for the `:nth-*` family of pseudo selectors. 
-------------------------------------------------------------------------------- /nimquery.nim: -------------------------------------------------------------------------------- 1 | # Spec: https://www.w3.org/TR/css3-selectors/ 2 | 3 | import std / [xmltree, strutils, strtabs, unicode, math, parseutils, sets] 4 | 5 | const DEBUG = false 6 | 7 | type 8 | ParseError* = object of ValueError 9 | 10 | TokenKind = enum 11 | tkInvalid 12 | 13 | tkBracketStart, tkBracketEnd 14 | tkParam 15 | tkComma 16 | 17 | # NOTE: These are handled the same in some contexts, but they 18 | # are different. `tkIdentifier` can only contain a very specific 19 | # subset of characters, but tkString can contain anything. 20 | # This means that both `#foo%` and `[id=foo%]` is invalid, 21 | # but not `[id="foo%"]` or `#foo\%`. 22 | tkIdentifier, tkString 23 | 24 | tkClass, tkId, tkElement 25 | 26 | tkCombinatorDescendents, tkCombinatorChildren 27 | tkCombinatorNextSibling, tkCombinatorSiblings 28 | 29 | tkAttributeExact # [attr=...] 30 | tkAttributeItem # [attr~=...] 31 | tkAttributePipe # [attr|=...] 32 | tkAttributeExists # [attr] 33 | tkAttributeStart # [attr^=...] 34 | tkAttributeEnd # [attr$=...] 35 | tkAttributeSubstring # [attr*=...] 
36 | 37 | tkPseudoNthChild, tkPseudoNthLastChild 38 | tkPseudoNthOfType, tkPseudoNthLastOfType 39 | 40 | tkPseudoFirstOfType, tkPseudoLastOfType 41 | tkPseudoOnlyChild, tkPseudoOnlyOfType, tkPseudoEmpty 42 | tkPseudoFirstChild, tkPseudoLastChild 43 | 44 | tkPseudoNot 45 | 46 | tkEoi # End of input 47 | 48 | Token = object 49 | kind: TokenKind 50 | value: string 51 | 52 | const AttributeKinds = { 53 | tkAttributeExact, tkAttributeItem, 54 | tkAttributePipe, tkAttributeExists, 55 | tkAttributeStart, tkAttributeEnd, 56 | tkAttributeSubstring 57 | } 58 | 59 | const NthKinds = { 60 | tkPseudoNthChild, tkPseudoNthLastChild, 61 | tkPseudoNthOfType, tkPseudoNthLastOfType 62 | } 63 | 64 | type 65 | Demand = object 66 | case kind: Tokenkind 67 | of AttributeKinds: 68 | attrName, attrValue: string 69 | of NthKinds: 70 | a, b: int 71 | of tkPseudoNot: 72 | notQuery: QueryPart 73 | of tkElement: 74 | element: string 75 | else: discard 76 | 77 | Combinator = enum 78 | cmDescendants = tkCombinatorDescendents, 79 | cmChildren = tkCombinatorChildren, 80 | cmNextSibling = tkCombinatorNextSibling, 81 | cmSiblings = tkCombinatorSiblings, 82 | cmRoot # Special case for the first query 83 | 84 | QueryOption* = enum 85 | optUniqueIds ## Assume unique id's or not 86 | optUnicodeIdentifiers ## Allow non-ascii in identifiers (e.g `#exämple`) 87 | optSimpleNot ## Only allow simple selectors as the argument 88 | ## for ":not". Combinators and/or commas are not 89 | ## allowed even if this option is excluded. 90 | 91 | Lexer = object 92 | input: string 93 | pos: int 94 | options: set[QueryOption] 95 | current, next: Token 96 | 97 | Query* = object ## Represents a parsed query. 98 | subqueries: seq[seq[QueryPart]] 99 | options: set[QueryOption] 100 | queryStr: string ## The original input string 101 | 102 | QueryPart = object 103 | demands: seq[Demand] 104 | combinator: Combinator 105 | 106 | # Used during the search to keep track which parts of the subqueries 107 | # have already been matched. 
func safeCharCompare(str: string, idx: int, cs: set[char]): bool {.inline.} =
  ## Bounds-checked membership test: true only when `idx` is a valid index
  ## into `str` and the character stored there is a member of `cs`.
  ## Any out-of-range index (negative or past the end) yields `false`.
  (idx >= low(str) and idx <= high(str)) and str[idx] in cs

func safeCharCompare(str: string, idx: int, c: char): bool {.inline.} =
  ## Single-character convenience overload of the set based variant.
  safeCharCompare(str, idx, {c})
func `==`(d1, d2: Demand): bool =
  ## Structural equality for demands: two demands are equal when they have
  ## the same kind and the payload belonging to that kind is equal.
  ##
  ## Raises `Exception` when asked to compare two demands whose (shared) kind
  ## is not a kind a demand is ever built with.
  if d1.kind != d2.kind: return false
  case d1.kind
  of AttributeKinds:
    return d1.attrName == d2.attrName and d1.attrValue == d2.attrValue
  of NthKinds:
    # Both coefficients of `an + b` have to agree; comparing `a` against
    # the other demand's `b` would mix up the two fields.
    return d1.a == d2.a and d1.b == d2.b
  of tkPseudoNot:
    return d1.notQuery == d2.notQuery
  of tkElement:
    return d1.element == d2.element
  of PseudoNoParamsKinds:
    # These kinds carry no payload, so an equal kind is all there is
    # to compare.
    return true
  else:
    raise newException(Exception, "Invalid demand kind: " & $d1.kind)
func isValidNotQuery(q: Query, options: set[QueryOption]): bool =
  ## Checks whether the parsed query `q` is acceptable as a `:not(...)`
  ## argument: it must consist of exactly one subquery holding exactly one
  ## query part. When `optSimpleNot` is part of `options`, that part must
  ## additionally carry exactly one demand.
  if q.subqueries.len != 1:
    return false
  if q.subqueries[0].len != 1:
    return false
  if optSimpleNot notin options:
    return true
  return q.subqueries[0][0].demands.len == 1
func readStringLiteral(input: string, idx: var int, buffer: var string) =
  ## Reads a quoted string literal from `input`.
  ##
  ## On entry `idx` must point at the opening quote (`'` or `"`). The
  ## unescaped content is appended to `buffer`, and on return `idx` points
  ## just past the matching closing quote. Backslash escapes are delegated
  ## to `readEscape`.
  ##
  ## Raises `ParseError` if the end of `input` is reached before the closing
  ## quote is found.
  assert input[idx] in {'\'', '"'}

  let quote = input[idx]
  idx.inc

  # The bounds check comes first so that input ending directly after the
  # opening quote is reported as a parse error instead of indexing past the
  # end of the string.
  while idx <= high(input) and input[idx] != quote:
    if input[idx] == '\\':
      readEscape(input, idx, buffer)
    else:
      buffer.add input[idx]
      idx.inc

  if idx > high(input):
    raise newException(ParseError, "Non-terminated string")

  # Step over the closing quote.
  idx.inc
func readIdentifierAscii(input: string, idx: var int, buffer: var string) =
  ## Reads an ASCII-only identifier from `input` starting at `idx`.
  ##
  ## Characters from `Identifiers` are appended to `buffer` and `idx` is
  ## advanced past them; backslash escapes are delegated to `readEscape`.
  ## Reading stops at the first character that is not part of `Identifiers`
  ## or at the end of `input`, whichever comes first.
  ##
  ## Raises `ParseError` if the identifier starts with `-` followed by
  ## another `-` or a digit.
  if input[idx] == '-' and input.safeCharCompare(idx + 1, {'-'} + Digits):
    raise newUnexpectedCharacterException(input[idx + 1])

  # The length check has to be evaluated before the character is read,
  # otherwise an identifier that runs to the very end of `input` indexes
  # one past the last character.
  while idx < input.len and input[idx] in Identifiers:
    if input[idx] == '\\':
      readEscape(input, idx, buffer)
    else:
      buffer.add input[idx]
      idx.inc
func initPseudoToken(str: string): Token =
  ## Maps the textual form of a pseudo selector (including the leading
  ## colon, e.g. `":first-child"`) to a token of the matching kind with an
  ## empty value. The comparison is exact, so callers are expected to pass
  ## the name in the same lower-case spelling used below.
  ##
  ## Raises `ParseError` for any name that is not listed.
  var pseudoKind: TokenKind
  case str
  of ":first-child": pseudoKind = tkPseudoFirstChild
  of ":last-child": pseudoKind = tkPseudoLastChild
  of ":only-child": pseudoKind = tkPseudoOnlyChild
  of ":first-of-type": pseudoKind = tkPseudoFirstOfType
  of ":last-of-type": pseudoKind = tkPseudoLastOfType
  of ":only-of-type": pseudoKind = tkPseudoOnlyOfType
  of ":empty": pseudoKind = tkPseudoEmpty
  of ":not": pseudoKind = tkPseudoNot
  of ":nth-child": pseudoKind = tkPseudoNthChild
  of ":nth-last-child": pseudoKind = tkPseudoNthLastChild
  of ":nth-of-type": pseudoKind = tkPseudoNthOfType
  of ":nth-last-of-type": pseudoKind = tkPseudoNthLastOfType
  else:
    raise newException(ParseError, "Unknown pseudo selector: " & str)
  return Token(kind: pseudoKind, value: "")
| 536 | of ':': 537 | var buffer = "" 538 | buffer.add ch 539 | lexer.pos.inc 540 | while lexer.pos <= lexer.input.high and 541 | lexer.input[lexer.pos] in Identifiers: 542 | buffer.add lexer.input[lexer.pos] 543 | lexer.pos.inc 544 | 545 | token = initPseudoToken(buffer.toLowerAscii) 546 | 547 | of '#': 548 | lexer.pos.inc 549 | token = initToken(tkId) 550 | 551 | of '.': 552 | lexer.pos.inc 553 | token = initToken(tkClass) 554 | 555 | of '*': 556 | if lexer.input.safeCharCompare(lexer.pos + 1, '='): 557 | token = initToken(tkAttributeSubstring) 558 | lexer.pos.inc 2 559 | else: 560 | lexer.pos.inc 561 | # No need to emit since tkUniversal matches everything? 562 | # token = initToken(tkUniversal) 563 | skip = true 564 | 565 | of '(': 566 | var buffer = "" 567 | readParams(lexer.input, lexer.pos, buffer) 568 | token = initToken(tkParam, buffer) 569 | 570 | of '=': 571 | token = initToken(tkAttributeExact) 572 | lexer.pos.inc 573 | 574 | of '|': 575 | if lexer.input.safeCharCompare(lexer.pos + 1, '='): 576 | token = initToken(tkAttributePipe) 577 | lexer.pos.inc 2 578 | 579 | of '^': 580 | if lexer.input.safeCharCompare(lexer.pos + 1, '='): 581 | token = initToken(tkAttributeStart) 582 | lexer.pos.inc 2 583 | 584 | of '$': 585 | if lexer.input.safeCharCompare(lexer.pos + 1, '='): 586 | token = initToken(tkAttributeEnd) 587 | lexer.pos.inc 2 588 | 589 | of ',': 590 | token = initToken(tkComma) 591 | lexer.pos.inc 592 | 593 | else: 594 | var buffer = "" 595 | if optUnicodeIdentifiers in lexer.options: 596 | readIdentifier(lexer.input, lexer.pos, buffer) 597 | else: 598 | readIdentifierAscii(lexer.input, lexer.pos, buffer) 599 | 600 | if buffer.len == 0: 601 | let rune = lexer.input.runeAt(lexer.pos) 602 | raise newUnexpectedCharacterException($rune) 603 | 604 | if lexer.next.kind in CombinatorKinds + {tkComma, tkInvalid}: 605 | token = initToken(tkElement, buffer.toLowerAscii) 606 | else: 607 | token = initToken(tkIdentifier, buffer) 608 | 609 | if not skip: 610 | if 
func satisfies(pair: NodeWithContext, demands: seq[Demand]): bool
               {.raises: [], gcsafe.}

func satisfies(pair: NodeWithContext, demand: Demand): bool =
  ## Returns true if the element referenced by `pair` fulfils the single
  ## `demand`.
  ##
  ## Attribute demands are checked against the element's attributes,
  ## `tkElement` against its tag, and the structural pseudo classes against
  ## the element siblings reachable through `pair.parent`.
  ## A demand kind that is not handled below trips an assertion.
  let node = pair.node

  case demand.kind
  of tkAttributeExists:
    return node.hasAttr(demand.attrName)

  of tkAttributeItem:
    return node.hasAttr(demand.attrName) and
      (demand.attrValue.len > 0) and
      demand.attrValue in node.attr(demand.attrName).split(CssWhitespace)

  # Empty attrValue is allowed,
  # and will match any value starting with '-'
  of tkAttributePipe:
    return node.hasAttr(demand.attrName) and
      demand.attrValue == node.attr(demand.attrName).split("-")[0]

  of tkAttributeExact:
    # `attr` yields "" for a missing attribute, so the attribute has to be
    # checked for existence or `[foo=""]` would match elements without `foo`.
    return node.hasAttr(demand.attrName) and
      node.attr(demand.attrName) == demand.attrValue

  of tkAttributeStart:
    return demand.attrValue.len > 0 and
      node.attr(demand.attrName).startsWith(demand.attrValue)

  of tkAttributeEnd:
    return demand.attrValue.len > 0 and
      node.attr(demand.attrName).endsWith(demand.attrValue)

  of tkAttributeSubstring:
    # `x in y` means "y contains x": the requested value must occur inside
    # the attribute's value, not the other way around.
    return demand.attrValue.len > 0 and
      demand.attrValue in node.attr(demand.attrName)

  of tkElement:
    return node.tag == demand.element

  of tkPseudoEmpty:
    return node.len == 0

  of tkPseudoOnlyChild:
    for sibling in pair.siblings:
      if sibling != node:
        return false
    return true

  of tkPseudoOnlyOfType:
    for sibling in pair.siblings:
      if sibling != node and sibling.tag == node.tag:
        return false
    return true

  of tkPseudoFirstChild:
    return pair.elementIndex == 0

  of tkPseudoLastChild:
    # Any element sibling after this one means it is not the last child.
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      return false
    return true

  of tkPseudoFirstOfType:
    # The first sibling sharing the tag decides the outcome.
    for sibling in pair.siblings:
      if sibling.tag == node.tag:
        return sibling == node

  of tkPseudoLastOfType:
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      if sibling.tag == node.tag:
        return false
    return true

  of tkPseudoNot:
    return not pair.satisfies(demand.notQuery.demands)

  of tkPseudoNthChild:
    return validateNth(demand.a, demand.b, pair.elementIndex)

  of tkPseudoNthLastChild:
    var nSiblingsAfter = 0
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      nSiblingsAfter.inc
    return validateNth(demand.a, demand.b, nSiblingsAfter)

  of tkPseudoNthOfType:
    var nSiblingsOfTypeBefore = 0
    for sibling in pair.siblings:
      if sibling == node:
        break
      elif sibling.tag == node.tag:
        nSiblingsOfTypeBefore.inc

    return validateNth(demand.a, demand.b, nSiblingsOfTypeBefore)

  of tkPseudoNthLastOfType:
    var nSiblingsOfTypeAfter = 0
    for sibling in pair.siblings(startAtIndex = pair.index + 1):
      if sibling.tag == node.tag:
        nSiblingsOfTypeAfter.inc

    return validateNth(demand.a, demand.b, nSiblingsOfTypeAfter)
  else:
    raiseAssert "Invalid demand: " & $demand

func satisfies(pair: NodeWithContext, demands: seq[Demand]): bool =
  ## Returns true if the element referenced by `pair` fulfils every demand
  ## in `demands` (and therefore also when `demands` is empty).
  for demand in demands:
    if not pair.satisfies(demand):
      return false
  return true
768 | var subqueryCanBeEliminated = newSeq[bool](query.subqueries.len) 769 | var subqueryIsEliminated = newSeq[bool](query.subqueries.len) 770 | 771 | for idx, subquery in query.subqueries: 772 | subqueryCanBeEliminated[idx] = not canFindMultiple(subquery, query.options) 773 | 774 | while stack.len > 0: 775 | var entry = stack.pop() 776 | 777 | # Search states that should be forwarded to children 778 | var forChildren = initHashSet[(int, int)]() 779 | # Search states that should be forwarded to siblings 780 | var forSiblings = initHashSet[(int, int)]() 781 | 782 | for searchState in entry.searchStates: 783 | if subqueryIsEliminated[searchState[0]]: 784 | continue 785 | 786 | let subquery = query.subqueries[searchState[0]] 787 | let subqueryPart = subquery[searchState[1]] 788 | 789 | if subqueryPart.combinator == cmDescendants or subqueryPart.combinator == cmRoot: 790 | forChildren.incl searchState 791 | forSiblings.incl searchState 792 | elif subqueryPart.combinator == cmSiblings or subqueryPart.combinator == cmChildren: 793 | forSiblings.incl searchState 794 | 795 | if entry.satisfies(subqueryPart.demands): 796 | if searchState[1] + 1 == subquery.len: 797 | result.add entry.node 798 | if single: 799 | return 800 | if subqueryCanBeEliminated[searchState[0]]: 801 | subqueryIsEliminated[searchState[0]] = true 802 | else: 803 | let nextSubqueryPart = subquery[searchState[1] + 1] 804 | if nextSubqueryPart.combinator == cmChildren or nextSubqueryPart.combinator == cmDescendants: 805 | forChildren.incl (searchState[0], searchState[1] + 1) 806 | elif nextSubqueryPart.combinator == cmNextSibling or nextSubqueryPart.combinator == cmSiblings: 807 | forSiblings.incl (searchState[0], searchState[1] + 1) 808 | 809 | # Below results in a depth first search. 
810 | 811 | # Add next sibling to stack 812 | if entry.parent != nil: 813 | var idx = entry.index + 1 814 | while idx < entry.parent.len and entry.parent[idx].kind != xnElement: 815 | idx.inc 816 | if idx < entry.parent.len: 817 | stack.add NodeWithContext( 818 | parent: entry.parent, 819 | index: idx, 820 | elementIndex: entry.elementIndex + 1, 821 | searchStates: forSiblings) 822 | 823 | # Add first child to stack 824 | if entry.node.len > 0: 825 | var idx = 0 826 | while idx < entry.node.len and entry.node[idx].kind != xnElement: 827 | idx.inc 828 | if idx < entry.node.len: 829 | stack.add NodeWithContext( 830 | parent: entry.node, 831 | index: idx, 832 | elementIndex: 0, 833 | searchStates: forChildren) 834 | 835 | func parseHtmlQuery*(queryString: string, 836 | options: set[QueryOption] = DefaultQueryOptions): Query 837 | {.raises: [ParseError].} = 838 | ## Parses a query for later use. 839 | ## Raises `ParseError` if parsing of `queryString` fails. 840 | result.queryStr = queryString 841 | var parts = newSeq[QueryPart]() 842 | var demands = newSeq[Demand]() 843 | var lexer = initLexer(queryString, options) 844 | var combinator = cmRoot 845 | 846 | try: 847 | while true: 848 | case lexer.current.kind 849 | 850 | of tkClass: 851 | demands.add initAttributeDemand(tkAttributeItem, "class", 852 | lexer.eat(tkIdentifier).value) 853 | 854 | of tkId: 855 | demands.add initAttributeDemand(tkAttributeExact, "id", 856 | lexer.eat(tkIdentifier).value) 857 | 858 | of tkElement: 859 | demands.add initElementDemand(lexer.current.value) 860 | 861 | of tkBracketStart: 862 | let f = lexer.eat(tkIdentifier) 863 | let nkind = lexer.next.kind 864 | case nkind 865 | of AttributeKinds - {tkAttributeExists}: 866 | discard lexer.eat(nkind) 867 | let v = lexer.eat({tkIdentifier, tkString}) 868 | demands.add initAttributeDemand(nkind, f.value, v.value) 869 | discard lexer.eat(tkBracketEnd) 870 | of tkBracketEnd: 871 | demands.add initAttributeDemand(tkAttributeExists, 872 | f.value, 
"") 873 | discard lexer.eat(tkBracketEnd) 874 | else: 875 | raise newException(ParseError, "") 876 | 877 | of PseudoNoParamsKinds: 878 | demands.add initPseudoDemand(lexer.current.kind) 879 | 880 | of PseudoParamsKinds: 881 | let pseudoKind = lexer.current.kind 882 | let params = lexer.eat(tkParam) 883 | case pseudoKind 884 | of tkPseudoNot: 885 | # Not the cleanest way to this, but eh 886 | let notQuery = parseHtmlQuery(params.value, options) 887 | 888 | if not notQuery.isValidNotQuery(options): 889 | raise newException(ParseError, 890 | ":not argument must be a simple selector, but " & 891 | "was '" & params.value & "'") 892 | 893 | demands.add initNotDemand(notQuery.subqueries[0][0]) 894 | of NthKinds: 895 | let (a, b) = parsePseudoNthArguments(params.value) 896 | demands.add initNthChildDemand(pseudoKind, a, b) 897 | else: doAssert(false) # can't happen 898 | 899 | of CombinatorKinds: 900 | parts.add initQueryPart(demands, combinator) 901 | demands = @[] 902 | combinator = lexer.current.kind.ord.Combinator 903 | 904 | of tkComma: 905 | parts.add initQueryPart(demands, combinator) 906 | result.subqueries.add parts 907 | demands = @[] 908 | parts = @[] 909 | combinator = cmRoot 910 | 911 | of tkIdentifier, tkString, tkBracketEnd, 912 | tkParam, tkInvalid, AttributeKinds: 913 | raise newException(ParseError, "") 914 | 915 | of tkEoi: 916 | break 917 | 918 | lexer.forward() 919 | except ParseError as err: 920 | let msg = 921 | if err.msg == "": 922 | "Failed to parse CSS query '" & queryString & "'" 923 | else: 924 | "Failed to parse CSS query '" & queryString & "': " & err.msg 925 | raise newException(ParseError, msg) 926 | 927 | parts.add initQuerypart(demands, combinator) 928 | result.subqueries.add parts 929 | result.options = options 930 | 931 | log "\ninput: \n" & queryString 932 | 933 | func querySelector*(root: XmlNode, queryString: string, 934 | options: set[QueryOption] = DefaultQueryOptions): XmlNode 935 | {.raises: [ParseError].} = 936 | ## Get the 
first element matching `queryString`, 937 | ## or `nil` if no such element exists. 938 | ## Raises `ParseError` if parsing of `queryString` fails. 939 | let query = parseHtmlQuery(queryString, options) 940 | let lst = query.exec(root, single = true) 941 | if lst.len > 0: 942 | lst[0] 943 | else: 944 | nil 945 | 946 | func querySelectorAll*(root: XmlNode, queryString: string, 947 | options: set[QueryOption] = DefaultQueryOptions): 948 | seq[XmlNode] {.raises: [ParseError].} = 949 | ## Get all elements matching `queryString`. 950 | ## Raises `ParseError` if parsing of `queryString` fails. 951 | let query = parseHtmlQuery(queryString, options) 952 | result = query.exec(root, single = false) 953 | -------------------------------------------------------------------------------- /nimquery.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "2.0.1" 4 | author = "Oscar Nihlgård" 5 | description = "Library for querying HTML using CSS-selectors (like JavaScripts document.querySelector)" 6 | license = "MIT" 7 | 8 | skipDirs = @["tests"] 9 | 10 | requires "nim >= 0.20.0" 11 | 12 | task test, "Run the tests": 13 | exec "nim c -r tests/incltests" 14 | rmFile "tests/incltests" 15 | exec "nim c -r tests/tests" 16 | rmFile "tests/tests" 17 | -------------------------------------------------------------------------------- /tests/incltests.nim: -------------------------------------------------------------------------------- 1 | import std / unittest 2 | include ../nimquery.nim 3 | 4 | # NOTE: CSS selectprs are case insensitive! 
5 | 6 | test "parsePseudoNthArguments": 7 | check parsePseudoNthArguments("odd") == (2, 1) 8 | check parsePseudoNthArguments("ODD") == (2, 1) 9 | check parsePseudoNthArguments(" odd ") == (2, 1) 10 | check parsePseudoNthArguments("even") == (2, 0) 11 | check parsePseudoNthArguments("1n + 1") == (1, 1) 12 | check parsePseudoNthArguments("2n+0") == (2, 0) 13 | check parsePseudoNthArguments(" 2n + 0") == (2, 0) 14 | check parsePseudoNthArguments("2n") == (2, 0) 15 | check parsePseudoNthArguments("n + 0") == (1, 0) 16 | check parsePseudoNthArguments("-n + 1") == (-1, 1) 17 | check parsePseudoNthArguments("1") == (0, 1) 18 | check parsePseudoNthArguments("0n") == (0, 0) 19 | 20 | expect(ParseError): 21 | discard parsePseudoNthArguments("1 + 1") 22 | expect(ParseError): 23 | discard parsePseudoNthArguments("1 +") -------------------------------------------------------------------------------- /tests/tests.nim: -------------------------------------------------------------------------------- 1 | import std / [unittest, xmltree, streams, htmlparser, strtabs] 2 | import ../nimquery 3 | 4 | const html = """ 5 | 6 | 7 | 8 |188 | 189 | 190 | 191 |
1
212 |2
213 |3
214 |4
215 |1
,2
,3
,4
]") 565 | 566 | test "issue11": 567 | let xml = parseHtml(newStringStream(""" 568 |