├── .gitignore ├── Sources └── SwiftHTMLParser │ ├── Helpers │ ├── RegexError.swift │ └── RegexHelper.swift │ ├── Parser │ ├── Tags │ │ ├── XMLTags.swift │ │ ├── SVGTags.swift │ │ └── KnownHTMLTags.swift │ ├── ParseFormat.swift │ ├── Models │ │ ├── Nodes │ │ │ ├── Node.swift │ │ │ ├── NodeType.swift │ │ │ ├── DocumentTypeNode.swift │ │ │ ├── TextNode.swift │ │ │ ├── Comment.swift │ │ │ ├── CData.swift │ │ │ └── Element.swift │ │ ├── Attribute.swift │ │ └── Tag.swift │ ├── ParseError.swift │ ├── LookaheadValidator.swift │ ├── CDATAParser.swift │ ├── HTMLParser.swift │ ├── CommentParser.swift │ ├── ScriptParser.swift │ ├── AttributeParser.swift │ └── TagParser.swift │ ├── ProjectConfig.swift │ ├── Extensions │ ├── CharacterExtensions.swift │ ├── StringExtensions.swift │ └── Appendable.swift │ └── Traverser │ ├── Selectors │ ├── NodeSelectors │ │ ├── CDataSelector.swift │ │ ├── TextNodeSelector.swift │ │ ├── CommentSelector.swift │ │ └── ElementSelector.swift │ ├── NodeSelector.swift │ ├── AttributeSelector.swift │ ├── SelectorBuilders │ │ ├── PositionIntSelectorBuilder.swift │ │ ├── IdStringSelectorBuilder.swift │ │ ├── TextStringSelectorBuilder.swift │ │ ├── ValueStringSelectorBuilder.swift │ │ └── TagNameStringSelectorBuilder.swift │ ├── IntSelector.swift │ ├── StringSelector.swift │ └── ClassSelector.swift │ └── HTMLTraverser.swift ├── Dockerfile ├── Tests ├── TestFiles │ ├── Mock │ │ ├── Elements │ │ │ ├── element-unclosed-end-tag.html │ │ │ ├── elements-simple.html │ │ │ ├── elemnent-stray-end-html-tag.html │ │ │ ├── empty-element.html │ │ │ ├── elemnent-stray-end-tag.html │ │ │ ├── elements-quotes.html │ │ │ └── element-name-on-new-line.html │ │ ├── Attributes │ │ │ ├── attributes-tabs.html │ │ │ ├── attributes-quotes.html │ │ │ ├── attributes-simple.html │ │ │ └── attributes-multiple-value-class.html │ │ ├── Javascript │ │ │ ├── javascript-simple.html │ │ │ ├── javascript-comments.html │ │ │ ├── javascript-quotes.html │ │ │ └── 
javascript-quotes-with-escape-characters.html │ │ ├── Comments │ │ │ ├── conditional-comments-salvageable.html │ │ │ ├── declarations.html │ │ │ └── comments.html │ │ ├── SVG │ │ │ └── svg-simple.html │ │ └── Documentation │ │ │ └── simple.html │ ├── TestFileURLs.swift │ └── RealWorld │ │ └── weather-forcast.xml └── SwiftHTMLParserTests │ ├── TestHelper.swift │ ├── AppendableTests.swift │ ├── SVGParserTests.swift │ ├── CommentParserTests.swift │ ├── DocumentationTests.swift │ ├── JavascriptParserTests.swift │ ├── PerformanceTests.swift │ ├── AttributeParserTests.swift │ ├── ElementTraverserTests.swift │ └── ElementTests.swift ├── Package.swift ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /.build 3 | /Packages 4 | /*.xcodeproj 5 | /.swiftpm 6 | -------------------------------------------------------------------------------- /Sources/SwiftHTMLParser/Helpers/RegexError.swift: -------------------------------------------------------------------------------- 1 | // 2 | // RegexError.swift 3 | // SwiftHTMLParser 4 | // 5 | // Created by Reid Nantes on 2018-12-05. 6 | // 7 | 8 | import Foundation 9 | -------------------------------------------------------------------------------- /Sources/SwiftHTMLParser/Parser/Tags/XMLTags.swift: -------------------------------------------------------------------------------- 1 | // 2 | // XMLTags.swift 3 | // SwiftHTMLParser 4 | // 5 | // Created by Reid Nantes on 2018-12-13. 6 | // 7 | 8 | import Foundation 9 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 1 2 | FROM vapor/swift:5.1-bionic 3 | # 2 4 | WORKDIR /package 5 | # 3 6 | COPY . 
/// Markup dialects a caller can name when parsing.
public enum ParseFormat {
    case html, xml, svg
}

/// Static debugging switches; both are off.
struct ProjectConfig {

    // for Debugging
    static let shouldPrintTags = false
    static let shouldPrintWarnings = false
}

/// Requirements shared by every node type: a kind tag plus the
/// start and end positions of the node in the source string.
public protocol Node {
    var nodeType: NodeType { get }
    var startIndex: String.Index { get }
    var endIndex: String.Index { get }
}

This is a Heading

8 |

This is the first paragraph.

9 | 10 |
11 |
12 |

This is the second paragraph.

13 | 14 | 15 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Elements/elements-simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 |

This is a Heading

8 |

This is the first paragraph.

9 |

This is the second paragraph.

10 |

This is the third paragraph.

11 | 12 | 13 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Elements/elemnent-stray-end-html-tag.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 |

This is a Heading

8 |

This is the first paragraph.

9 | 10 |
11 |

This is the second paragraph.

12 |
/// Integer tag identifying the kind of a node.
///
/// NOTE(review): the raw values look like the DOM `Node.nodeType`
/// constants — confirm before relying on that correspondence.
public enum NodeType: Int {
    // 1...4 are consecutive, so only the first raw value is spelled out
    case element = 1, attribute, text, CDATASection
    case comment = 8
    case documentType = 10
}

This is a Heading

8 |

This is a paragraph.

9 | 10 |
11 | hello12344 12 |
13 | 14 | 15 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Elements/elemnent-stray-end-tag.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 |

This is a Heading

8 |

This is the first paragraph.

9 | 10 |
11 |

This is the second paragraph.

12 |
13 | 14 | 15 |
/// Node tagged `.documentType`, carrying a name and its source range.
struct DocumentTypeNode: Node {
    public let nodeType: NodeType = .documentType
    public var startIndex: String.Index
    public var endIndex: String.Index
    public var name: String
}

This is a Heading

8 |

'John "ShotGun" Nelson'

9 |

"John 'ShotGun' Nelson"

10 |

It's alright

11 |

I love the " (double Quote) character

12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Attributes/attributes-tabs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 |

This is a Heading

8 |

This is a paragraph.

9 | 10 | /some/other/img.png 16 | 17 |
18 | First name:
19 | 20 |
struct TestHelper {
    /// Reads the file at `fileURL` as UTF-8 and hands the text to `HTMLParser.parse`.
    ///
    /// - Parameter fileURL: location of the HTML file to load.
    /// - Returns: the node array produced by the parser.
    /// - Throws: any error raised while reading the file or parsing its contents.
    static func openFileAndParseHTML(fileURL: URL) throws -> [Node] {
        let source = try String(contentsOf: fileURL, encoding: .utf8)
        return try HTMLParser.parse(source)
    }
}

This is a Heading

8 |

This is a paragraph.

9 | 10 | 11 | 12 |

This is a demonstration.

extension Character {
    /// True when the receiver equals at least one element of `characters`
    /// (always false for an empty array).
    func isEqualToOneOf(characters: [Character]) -> Bool {
        return characters.contains(self)
    }

    /// Logical negation of `isEqualToOneOf(characters:)`.
    func isNotEqualToOneOf(characters: [Character]) -> Bool {
        return !characters.contains(self)
    }
}

This is a Heading

8 | 9 |

10 | This is the first paragraph. 11 |

12 |

13 | This is the second paragraph. 14 |

15 | 16 | Girl with a jacket 17 | 18 | 19 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Comments/conditional-comments-salvageable.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | Test Simple Title 9 | 10 | 11 |

This is a heading

12 | 13 | 16 |
17 | 18 |

Above conditional comments incorect, should ignore div

19 | 20 | 21 |
hello world
22 | 23 |

This is shown in chrome

/// Node tagged `.text`, holding a source range and the text found there.
public struct TextNode: Node {
    public let nodeType: NodeType = .text
    public var startIndex: String.Index
    public var endIndex: String.Index
    public var text: String

    /// Stores the range unchanged and `text` with leading/trailing
    /// whitespace and newlines removed.
    init (startIndex: String.Index, endIndex: String.Index, text: String) {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        self.text = trimmed
        self.startIndex = startIndex
        self.endIndex = endIndex
    }
}

This is a Heading

13 |

This is a paragraph.

14 | 15 | here is even more inner text 16 | 17 | 18 | 19 |
This is a div
20 | 21 | This inner text is getting out of control 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/SVG/svg-simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 |

This is a Heading

8 |

This is a paragraph.

9 | 10 | 11 | 12 |

This is a demonstration.

13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Attributes/attributes-simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 |

This is a Heading

8 |

This is a paragraph.

9 | 10 | This is a link 11 | This is an alternate link 12 | 13 |
This is a div with an empty attribute
14 | 15 |
16 | First name:
17 | 18 |
19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Javascript/javascript-comments.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 |

This is a Heading

8 |

This is a paragraph.

9 | 10 | 11 | 12 |

This is a demonstration.

/// Errors the parser can raise.
enum ParseError: Error {
    // tag lookup
    case tagNotFound, tagNameNotFound, invalidTag, openingTagNotFound
    case canNotFindClosingTagWithoutAnyOpenedTags
    case closingTagNotFound(String)
    case attributeNotFound
    case closingTagNameDoesNotMatchOpeningTagName(erroredTag: Tag)

    // input ended before a construct was closed
    case endOfFileReachedBeforeClosingTagFound
    case endOfFileReachedBeforeScriptClosingTagFound
    case endOfFileReachedBeforeCommentCloseFound
    case endOfFileReachedBeforeCDATACloseFound

    case invalidCDATA
}

This is a Heading

8 | 9 |

10 | This is the first paragraph. 11 |

12 |

13 | This is the second paragraph. 14 |

15 |

16 | This is the third paragraph. 17 |

18 |

19 | 20 | This is the fourth paragraph. 21 |

/// Selector that matches `CData` nodes by their text.
public final class CDataSelector: NodeSelector, TextStringSelectorBuilder {
    private(set) public var position = IntSelector()
    private(set) public var text = StringSelector()

    // public init
    public init() {}

    /// Returns false when `node` is not a `CData`, or when its text is
    /// rejected by `text`. `position` is not consulted here.
    public func testAgainst(_ node: Node) -> Bool {
        guard let cdata = node as? CData else { return false }
        // compared against `false` exactly as before; StringSelector is declared elsewhere
        return text.testAgainst(cdata.text) != false
    }
}

/// Selector that matches `TextNode` nodes by their text.
public final class TextNodeSelector: NodeSelector, TextStringSelectorBuilder {
    private(set) public var text = StringSelector()
    private(set) public var position = IntSelector()

    // public init
    public init() {}

    /// Returns false when `node` is not a `TextNode`, or when its text is
    /// rejected by `text`. `position` is not consulted here.
    public func testAgainst(_ node: Node) -> Bool {
        guard let textNode = node as? TextNode else { return false }
        return text.testAgainst(textNode.text) != false
    }
}

/// Node tagged `.comment`: the range of the whole comment, the range of
/// its text, and the text itself.
public struct Comment: Node {
    public let nodeType: NodeType = .comment
    public var startIndex: String.Index
    public var endIndex: String.Index

    var textStartIndex: String.Index
    var textEndIndex: String.Index

    public var text: String

    /// Stores every argument unchanged.
    init (startIndex: String.Index, endIndex: String.Index, textStartIndex: String.Index, textEndIndex: String.Index, text: String) {
        // outer range
        self.startIndex = startIndex
        self.endIndex = endIndex
        // inner text range and content
        self.textStartIndex = textStartIndex
        self.textEndIndex = textEndIndex
        self.text = text
    }
}
/// Node tagged `.CDATASection`: the range of the whole section, the range
/// of its text, and the text itself.
public struct CData: Node {
    public let nodeType: NodeType = .CDATASection
    public var startIndex: String.Index
    public var endIndex: String.Index

    var textStartIndex: String.Index
    var textEndIndex: String.Index

    public var text: String

    /// Stores every argument unchanged.
    public init (startIndex: String.Index, endIndex: String.Index, textStartIndex: String.Index, textEndIndex: String.Index, text: String) {
        // outer range
        self.startIndex = startIndex
        self.endIndex = endIndex
        // inner text range and content
        self.textStartIndex = textStartIndex
        self.textEndIndex = textEndIndex
        self.text = text
    }
}
/// based on Xpath and selector
public protocol NodeSelector: AnyObject, PositionIntSelectorBuilder {

    /// Tests the selector against the given node
    func testAgainst(_ node: Node) -> Bool
}

//extension NodeSelector {
//
////    /// Matches when position is less than the given value
////    func whenPositionLessThan(_ position: Int) -> NodeSelector {
////
////    }
////
////    /// Matches when position is greater than the given value
////    func whenPositionGreaterThan(_ position: Int) -> NodeSelector {
////
////    }
//}

/// Selector that matches `Comment` nodes by their text.
public final class CommentSelector: NodeSelector, TextStringSelectorBuilder {
    private(set) public var position = IntSelector()
    private(set) public var text = StringSelector()

    // public init
    public init() {}

    /// Returns false when `node` is not a `Comment`, or when its text is
    /// rejected by `text`. `position` is not consulted here.
    public func testAgainst(_ node: Node) -> Bool {
        guard let comment = node as? Comment else { return false }
        // compared against `false` exactly as before; StringSelector is declared elsewhere
        return text.testAgainst(comment.text) != false
    }
}
final class AppendableTests: XCTestCase {

    /// Exercises `appendOrInit`, `insertOrInit` and `formUnionOrInit` on
    /// optionals that start out as nil.
    func testAppendOrIntialize() {
        // single value
        var optArray: [String]? = nil
        optArray.appendOrInit("hello appendOrInit")
        // `first` reports a failed initialisation as an assertion failure instead of trapping
        XCTAssertEqual(optArray?.first, "hello appendOrInit")

        // multiple values
        var optArray2: [String]? = nil
        optArray2.appendOrInit(contentsOf: ["sunny", "rainy", "cloudy"])
        XCTAssertEqual(optArray2?.count, 3)

        // set: the element type must be spelled out, nil gives the compiler nothing to infer it from
        var optSet: Set<String>? = nil
        optSet.insertOrInit("apple")
        optSet.formUnionOrInit(["banana", "pineapple", "cherry", "pear"])
        // one inserted member plus four distinct members from the union
        XCTAssertEqual(optSet?.count, 5)
    }
}
/// Selector for one attribute, identified by name, whose value is tested
/// with a `StringSelector`.
public class AttributeSelector: ValueStringSelectorBuilder {
    private(set) public var name: String
    private(set) public var value = StringSelector()

    public init(name: String) {
        self.name = name
    }

    /// Forwards `value` to the value selector's `withString` and returns
    /// the receiver so calls can be chained.
    public func withValue(_ value: String) -> Self {
        self.value.withString(value)
        return self
    }

    /// returns true if the element satisfies the selector
    public func testSelector(against element: Element) -> Bool {
        let candidate = element.attributeValue(for: name)
        // compared against `false` exactly as before; StringSelector is declared elsewhere
        return value.testAgainst(candidate) != false
    }

}

This is a Heading

8 |
This is text in a div
11 | Cities 16 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Comments/comments.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Simple Title 5 | 6 | 7 | 8 | this is inner text 9 | 10 | here is some more inner text 11 | 12 |

This is a Heading

13 |

This is a paragraph.

14 | 15 | here is even more inner text 16 | 17 | 21 | 22 |
This is a div
extension String {
    /// Returns the text from `afterIndex` through the character
    /// `numberOfCharacters` positions later, inclusive. When that position is
    /// at or beyond the end of the string, returns everything from
    /// `afterIndex` to the end.
    ///
    /// The misspelled name is kept so existing callers keep compiling.
    ///
    /// - Parameters:
    ///   - afterIndex: first index included in the result; must be valid for this string.
    ///   - numberOfCharacters: non-negative offset from `afterIndex` to the last included character.
    func subscring(after afterIndex: String.Index, numberOfCharacters: Int) -> String {
        // `limitedBy:` yields nil instead of trapping when the offset runs past `endIndex`
        guard let lastIndex = index(afterIndex, offsetBy: numberOfCharacters, limitedBy: endIndex),
              lastIndex < endIndex else {
            // `endIndex` is not a subscriptable position, so take the open-ended tail
            return String(self[afterIndex...])
        }

        return String(self[afterIndex...lastIndex])
    }

    /// True when `index` lies before `endIndex`.
    func encompassesIndex(_ index: String.Index) -> Bool {
        return index < endIndex
    }

    /// True when the string is empty or consists only of whitespace and newlines.
    func isEmptyOrWhitespace() -> Bool {
        // trimming an empty string stays empty, so one check covers both cases
        return trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    }
}
A target can define a module or a test suite. 18 | // Targets can depend on other targets in this package, and on products in packages which this package depends on. 19 | .target( 20 | name: "SwiftHTMLParser", 21 | dependencies: []), 22 | .target( 23 | name: "TestFiles", 24 | dependencies: [], 25 | path: "Tests/TestFiles", 26 | resources: [.copy("Mock"),.copy("RealWorld")] 27 | ), 28 | .testTarget( 29 | name: "SwiftHTMLParserTests", 30 | dependencies: ["SwiftHTMLParser", "TestFiles"]), 31 | ] 32 | ) 33 | -------------------------------------------------------------------------------- /Tests/TestFiles/Mock/Documentation/simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | This is a Simple Example 5 | 6 | 7 |

This is a Heading

8 | 9 |
10 |

This is the first paragraph.

11 |

This is the second paragraph.

12 |

This is the third paragraph.

13 |

This is the fourth paragraph.

14 |

This is the fifth paragraph.

15 | 16 |
17 |

Editor Notes

18 |
19 |
20 | 21 |
22 |
    23 |
  • This is the first citation.
  • 24 |
  • This is the second citation.
  • 25 |
  • This is the third citation.
  • 26 |
27 | 28 |
29 |

Bibliography Notes

30 |
31 |
final class SVGParserTests: XCTestCase {
    /// Loads `svg-simple.html`, parses it and checks the number of nodes
    /// returned by the parser.
    func testSVG() {
        guard let fileURL = TestFileURLs.svgTestFilesDirectoryURL?
            .appendingPathComponent("svg-simple.html") else {
                XCTFail("Could find get file URL to parse")
                return
        }

        // get html string from file; a read error and a missing string report the same failure
        guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
            XCTFail("Could not open file URL: \(fileURL)")
            return
        }

        // create object from raw html file
        guard let elementArray = try? HTMLParser.parse(htmlString) else {
            XCTFail("Could not parse HTML")
            return
        }

        XCTAssertEqual(elementArray.count, 2)
    }
}

This is a Heading

8 |

This is a paragraph.

9 | 10 | 11 | 12 |

This is a demonstration.

struct LookaheadValidator {

    /// Reports whether `source`, read from `currentIndex` onward, begins with
    /// `stringToCheckFor`, comparing character by character.
    ///
    /// - Parameters:
    ///   - source: text being scanned.
    ///   - currentIndex: position in `source` where the comparison starts.
    ///   - stringToCheckFor: expected upcoming characters.
    /// - Returns: true when every character of `stringToCheckFor` is matched;
    ///   false on the first mismatch or when `source` ends first. An empty
    ///   `stringToCheckFor` always yields true.
    func isValidLookahead(for source: String, atIndex currentIndex: String.Index, checkFor stringToCheckFor: String) -> Bool {
        // Nothing is left to read at or past `endIndex`, and `endIndex` itself
        // must never be subscripted; only an empty lookahead can match there.
        guard currentIndex < source.endIndex else {
            return stringToCheckFor.isEmpty
        }

        // element-wise comparison; false as soon as the source runs out or a character differs
        return source[currentIndex...].starts(with: stringToCheckFor)
    }
}
= Self.testFilesResourceDirectoryURL?.appendingPathComponent("Mock") 13 | 14 | static public let attributesTestFilesDirectoryURL: URL? = Self.testFilesMockDirectoryURL?.appendingPathComponent("Attributes") 15 | static public let commentsTestFilesDirectoryURL: URL? = Self.testFilesMockDirectoryURL?.appendingPathComponent("Comments") 16 | static public let documentationTestFilesDirectoryURL: URL? = Self.testFilesMockDirectoryURL?.appendingPathComponent("Documentation") 17 | static public let elementsTestFilesDirectoryURL: URL? = Self.testFilesMockDirectoryURL?.appendingPathComponent("Elements") 18 | static public let javascriptTestFilesDirectoryURL: URL? = Self.testFilesMockDirectoryURL?.appendingPathComponent("Javascript") 19 | static public let performanceTestFilesDirectoryURL: URL? = Self.testFilesMockDirectoryURL?.appendingPathComponent("Performance") 20 | static public let svgTestFilesDirectoryURL: URL? = Self.testFilesMockDirectoryURL?.appendingPathComponent("SVG") 21 | 22 | // real world 23 | static public let realWorldTestFilesDirectoryURL: URL? = Self.testFilesResourceDirectoryURL?.appendingPathComponent("RealWorld") 24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/PositionIntSelectorBuilder.swift: -------------------------------------------------------------------------------- 1 | // 2 | // PositionIntSelectorBuilder.swift 3 | // 4 | // 5 | // Created by Reid Nantes on 2019-11-03. 
// MARK: - /Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/PositionIntSelectorBuilder.swift
//
//  PositionIntSelectorBuilder.swift
//
//  Created by Reid Nantes on 2019-11-03.
//

import Foundation

/// A selector that exposes an `IntSelector` constraining the position of its target.
public protocol PositionIntSelectorBuilder {
    var position: IntSelector { get }
}

/// Chainable helpers: each one records a constraint on `position` and returns `self`.
public extension PositionIntSelectorBuilder {
    /// Matches when the position equals the given value.
    func atPosition(_ value: Int) -> Self {
        position.withValue(value)
        return self
    }

    /// Matches when the position equals any of the given values.
    func whenPositionIsAny(_ values: [Int]) -> Self {
        position.whenValueIsAny(values)
        return self
    }

    /// Matches when the position is less than the given value.
    func whenPositionIsLessThan(_ value: Int) -> Self {
        position.whenValueIsLessThan(value)
        return self
    }

    /// Matches when the position is greater than the given value.
    func whenPositionIsGreaterThan(_ value: Int) -> Self {
        position.whenValueIsGreaterThan(value)
        return self
    }

    /// Does not match when the position equals the given value.
    func whenPositionIsNot(_ value: Int) -> Self {
        position.whenValueIsNot(value)
        return self
    }

    /// Does not match when the position equals any of the given values.
    func whenPositionIsNotAny(_ values: [Int]) -> Self {
        position.whenValueIsNotAny(values)
        return self
    }
}
// MARK: - /Sources/SwiftHTMLParser/Parser/Models/Attribute.swift
//
//  Attribute.swift
//  HTMLParser
//
//  Created by Reid Nantes on 2018-02-13.
//  Copyright © 2018 Reid Nantes. All rights reserved.
//

import Foundation

/// A single attribute of a tag: its name, optional value, and where each sits in the source string.
public struct Attribute: Node {
    public var nodeType = NodeType.attribute
    public var name: String
    public var value: String?

    var nameStartIndex: String.Index
    var nameEndIndex: String.Index

    var valueStartIndex: String.Index?
    var valueEndIndex: String.Index?
    var valueStartIndexWithQuotes: String.Index?
    var valueEndIndexWithQuotes: String.Index?

    /// End of the attribute: the quoted value's end if present, otherwise the
    /// unquoted value's end, otherwise the end of the name.
    public var endIndex: String.Index {
        return valueEndIndexWithQuotes ?? valueEndIndex ?? nameEndIndex
    }

    /// Start of the attribute, which is always the start of its name.
    public var startIndex: String.Index {
        return nameStartIndex
    }

    public init(nameStartIndex: String.Index,
                nameEndIndex: String.Index,
                valueStartIndex: String.Index?,
                valueEndIndex: String.Index?,
                valueStartIndexWithQuotes: String.Index?,
                valueEndIndexWithQuotes: String.Index?,
                name: String,
                value: String?) {
        self.name = name
        self.value = value

        self.nameStartIndex = nameStartIndex
        self.nameEndIndex = nameEndIndex

        self.valueStartIndex = valueStartIndex
        self.valueEndIndex = valueEndIndex

        self.valueStartIndexWithQuotes = valueStartIndexWithQuotes
        self.valueEndIndexWithQuotes = valueEndIndexWithQuotes
    }
}
// MARK: - /Sources/SwiftHTMLParser/Parser/Tags/SVGTags.swift
//
//  SVGTags.swift
//  SwiftHTMLParser
//
//  Created by Reid Nantes on 2018-12-12.
//

import Foundation

/// Names of SVG elements. The raw value is the tag name as written in markup;
/// cases whose tag name is not a valid Swift identifier carry an explicit raw value.
enum SVGTagNames: String {
    // a
    case a, altGlyph, altGlyphDef, altGlyphItem
    case animate, animateColor, animateMotion, animateTransform

    // c - e
    case circle, clipPath
    case colorProfile = "color-profile"
    case cursor, defs, desc, ellipse

    // filter primitives
    case feBlend, feColorMatrix, feComponentTransfer, feComposite, feConvolveMatrix
    case feDiffuseLighting, feDisplacementMap, feDistantLight, feFlood
    case feFuncA, feFuncB, feFuncG, feFuncR
    case feGaussianBlur, feImage, feMerge, feMergeNode, feMorphology, feOffset
    case fePointLight, feSpecularLighting, feSpotLight, feTile, feTurbulence

    // f
    case filter, font
    case fontFace = "font-face"
    case fontFaceFormat = "font-face-format"
    case fontFaceName = "font-face-name"
    case fontFaceSRC = "font-face-src"
    case fontFaceURI = "font-face-uri"
    case foreignObject

    // g - m
    case g, glyph, glyphRef, hkern, image, line, linearGradient
    case marker, mask, metadata
    case missingGlyph = "missing-glyph"
    case mpath

    // p - s
    case path, pattern, polygon, polyline, radialGradient, rect
    case script, set, stop, style, svg
    case switchTag = "switch" // `switch` is a Swift keyword
    case symbol

    // t - v
    case text, textPath, title, tref, tspan, use, view, vkern
}
// MARK: - /Sources/SwiftHTMLParser/Extensions/Appendable.swift
//
//  Created by Reid Nantes on 2019-10-25.
//

import Foundation

/// An ordered collection that can be created empty and appended to (e.g. `Array`).
protocol Insertable: Collection {
    init()
    mutating func append(_ newElement: Element)
    mutating func append<S>(contentsOf newElements: S) where Element == S.Element, S: Sequence
}

/// A set-like collection that can be created empty and inserted into (e.g. `Set`).
protocol SetInsertable: Collection {
    init()
    mutating func insert(_ newMember: Element) -> (inserted: Bool, memberAfterInsert: Element)
    mutating func formUnion<S>(_ other: S) where Element == S.Element, S: Sequence
}

extension Array: Insertable {}
extension Set: SetInsertable {}

extension Optional where Wrapped: Insertable {
    /// Appends `newElement`, first creating an empty collection if `self` is `nil`.
    mutating func appendOrInit(_ newElement: Wrapped.Iterator.Element) {
        if self == nil {
            self = Wrapped()
        }
        // Mutate through optional chaining so the stored collection is updated in place.
        self?.append(newElement)
    }

    /// Appends every element of `newElements`, first creating an empty collection if `self` is `nil`.
    ///
    /// A `nil` receiver becomes an empty (non-nil) collection even when `newElements` is empty.
    mutating func appendOrInit<S>(contentsOf newElements: S) where Wrapped.Iterator.Element == S.Element, S: Sequence {
        if self == nil {
            self = Wrapped()
        }
        self?.append(contentsOf: newElements)
    }
}

extension Optional where Wrapped: SetInsertable {
    /// Inserts `newElement`, first creating an empty set if `self` is `nil`.
    mutating func insertOrInit(_ newElement: Wrapped.Iterator.Element) {
        if self == nil {
            self = Wrapped()
        }
        _ = self?.insert(newElement)
    }

    /// Adds every element of `other`, first creating an empty set if `self` is `nil`.
    mutating func formUnionOrInit<S>(_ other: S) where Wrapped.Iterator.Element == S.Element, S: Sequence {
        if self == nil {
            self = Wrapped()
        }
        self?.formUnion(other)
    }
}
// MARK: - /Sources/SwiftHTMLParser/Parser/CDATAParser.swift
//
//  CDATAParser.swift
//  SwiftHTMLParser
//
//  Created by Reid Nantes on 2018-12-13.
//

import Foundation

/// Delimiters of a CDATA section.
struct CDATASpecialCharacters {
    // strings
    let CDATAOpening = "<![CDATA["
    let CDATAClosing = "]]>"
}

/// Parses a single CDATA section out of a source string.
struct CDATAParser {
    fileprivate let lookaheadValidator = LookaheadValidator()
    fileprivate let specialCharacters = CDATASpecialCharacters()

    /// Parses the CDATA section that begins at `currentIndex`.
    ///
    /// - Parameters:
    ///   - source: The full source string.
    ///   - currentIndex: Index of the first character of the opening delimiter.
    /// - Returns: A `CData` whose `startIndex`/`endIndex` span the whole section
    ///   (delimiters included) and whose text is the content between the delimiters.
    ///   For an empty section the text is `""` and `textEndIndex` precedes `textStartIndex`.
    /// - Throws: `ParseError.invalidCDATA` when the opening delimiter is not at `currentIndex`;
    ///   `ParseError.endOfFileReachedBeforeCDATACloseFound` when no closing delimiter follows.
    func parse(source: String, currentIndex: String.Index) throws -> CData {
        let startIndex = currentIndex

        // validate the opening delimiter
        guard lookaheadValidator.isValidLookahead(for: source, atIndex: currentIndex,
                                                  checkFor: specialCharacters.CDATAOpening) else {
            throw ParseError.invalidCDATA
        }

        let textStartIndex = source.index(currentIndex, offsetBy: specialCharacters.CDATAOpening.count)
        var localCurrentIndex = textStartIndex

        while localCurrentIndex < source.endIndex {
            if lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                   checkFor: specialCharacters.CDATAClosing) {
                // inclusive index of the last text character
                let textEndIndex = source.index(localCurrentIndex, offsetBy: -1)
                // inclusive index of the last character of the closing delimiter
                let endIndex = source.index(localCurrentIndex, offsetBy: (specialCharacters.CDATAClosing.count - 1))

                // Slice with a half-open range ending at the closing delimiter. The closed range
                // `textStartIndex...textEndIndex` is invalid (and traps) when the section is empty,
                // because `textEndIndex` then lies before `textStartIndex`.
                let text = String(source[textStartIndex..<localCurrentIndex])

                return CData.init(startIndex: startIndex,
                                  endIndex: endIndex,
                                  textStartIndex: textStartIndex,
                                  textEndIndex: textEndIndex,
                                  text: text)
            }
            // go to next character
            localCurrentIndex = source.index(after: localCurrentIndex)
        }

        throw ParseError.endOfFileReachedBeforeCDATACloseFound
    }
}
// MARK: - /Sources/SwiftHTMLParser/Traverser/Selectors/IntSelector.swift
//
//  Created by Reid Nantes on 2019-11-03.
//

import Foundation

/// Accumulates integer constraints and tests a value against all of them.
public final class IntSelector {
    private(set) var anyValues: [Int]?
    private(set) var lessThanValues: [Int]?
    private(set) var greaterThanValues: [Int]?

    // negatives
    private(set) var notAnyValues: [Int]?
}

internal extension IntSelector {
    /// Adds `value` to the set of accepted values.
    func withValue(_ value: Int) {
        anyValues.appendOrInit(value)
    }

    /// Adds `values` to the set of accepted values.
    func whenValueIsAny(_ values: [Int]) {
        anyValues.appendOrInit(contentsOf: values)
    }

    /// Requires the tested value to be strictly less than `value`.
    func whenValueIsLessThan(_ value: Int) {
        lessThanValues.appendOrInit(value)
    }

    /// Requires the tested value to be strictly greater than `value`.
    func whenValueIsGreaterThan(_ value: Int) {
        greaterThanValues.appendOrInit(value)
    }

    // negatives

    /// Rejects `value`.
    func whenValueIsNot(_ value: Int) {
        notAnyValues.appendOrInit(value)
    }

    /// Rejects every value in `values`.
    func whenValueIsNotAny(_ values: [Int]) {
        notAnyValues.appendOrInit(contentsOf: values)
    }

    /// Returns `true` when `value` satisfies every recorded constraint.
    ///
    /// A `nil` value passes only when no positive constraint (accepted values,
    /// less-than, greater-than) has been recorded; rejected values are ignored for `nil`.
    func testAgainst(_ value: Int?) -> Bool {
        guard let candidate = value else {
            return anyValues == nil && lessThanValues == nil && greaterThanValues == nil
        }

        // must be one of the accepted values
        if let accepted = anyValues, !accepted.contains(candidate) {
            return false
        }

        // must be below every upper bound
        if let upperBounds = lessThanValues, upperBounds.contains(where: { candidate >= $0 }) {
            return false
        }

        // must be above every lower bound
        if let lowerBounds = greaterThanValues, lowerBounds.contains(where: { candidate <= $0 }) {
            return false
        }

        // must not be one of the rejected values
        if let rejected = notAnyValues, rejected.contains(candidate) {
            return false
        }

        return true
    }
}

This is a Heading

8 |

This is a paragraph.

9 | 10 | 11 | 12 |

This is a demonstration.

// MARK: - /Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/IdStringSelectorBuilder.swift
//
//  IdStringSelectorBuilder.swift
//
//  Created by Reid Nantes on 2019-11-02.
//

import Foundation

/// A selector that exposes a `StringSelector` constraining the id of its target.
public protocol IdStringSelectorBuilder {
    var id: StringSelector { get }
}

/// Chainable helpers: each one records a constraint on `id` and returns `self`.
public extension IdStringSelectorBuilder {
    /// Matches when the target's id equals the given id.
    func withId(_ id: String) -> Self {
        self.id.withString(id)
        return self
    }

    /// Matches when the target's id equals any of the given ids.
    func whenIdIsAny(_ ids: [String]) -> Self {
        id.whenStringIsAny(ids)
        return self
    }

    /// Matches when the target's id contains the given keyword.
    func containsId(_ keyword: String) -> Self {
        id.whenStringContainsAny([keyword])
        return self
    }

    /// Matches when the target's id contains any of the given keywords.
    func whenIdContainsAny(_ keywords: [String]) -> Self {
        id.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the target's id contains all of the given keywords.
    func whenIdContainsAll(_ keywords: [String]) -> Self {
        id.whenStringContainsAll(keywords)
        return self
    }

    // negatives

    /// Does not match when the target's id equals the given id.
    func whenIdIsNot(_ id: String) -> Self {
        self.id.whenStringIsNot(id)
        return self
    }

    /// Does not match when the target's id equals any of the given ids.
    func whenIdIsNotAny(_ ids: [String]) -> Self {
        id.whenStringIsNotAny(ids)
        return self
    }

    /// Does not match when the target's id contains the given keyword.
    func whenIdDoesNotContain(_ keyword: String) -> Self {
        id.whenStringDoesNotContainAny([keyword])
        return self
    }

    /// Does not match when the target's id contains any of the given keywords.
    func whenIdDoesNotContainAny(_ keywords: [String]) -> Self {
        id.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match when the target's id contains all of the given keywords.
    func whenIdDoesNotContainAll(_ keywords: [String]) -> Self {
        id.whenStringDoesNotContainAll(keywords)
        return self
    }
}
// MARK: - /Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/TextStringSelectorBuilder.swift
//
//  TextStringSelectorBuilder.swift
//
//  Created by Reid Nantes on 2019-10-31.
//

import Foundation

/// A selector that exposes a `StringSelector` constraining the text of its target.
public protocol TextStringSelectorBuilder {
    var text: StringSelector { get }
}

/// Chainable helpers: each one records a constraint on `text` and returns `self`.
public extension TextStringSelectorBuilder {
    /// Matches when the target's text equals the given value.
    func withText(_ value: String) -> Self {
        text.withString(value)
        return self
    }

    /// Matches when the target's text equals any of the given values.
    func whenTextIsAny(_ values: [String]) -> Self {
        text.whenStringIsAny(values)
        return self
    }

    /// Matches when the target's text contains the given value.
    func containingText(_ value: String) -> Self {
        text.whenStringContainsAny([value])
        return self
    }

    /// Matches when the target's text contains any of the given keywords.
    func whenTextContainsAny(_ keywords: [String]) -> Self {
        text.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the target's text contains all of the given keywords.
    func whenTextContainsAll(_ keywords: [String]) -> Self {
        text.whenStringContainsAll(keywords)
        return self
    }

    // negatives

    /// Does not match when the target's text equals the given value.
    func whenTextIsNot(_ value: String) -> Self {
        text.whenStringIsNot(value)
        return self
    }

    /// Does not match when the target's text equals any of the given values.
    func whenTextIsNotAny(_ values: [String]) -> Self {
        text.whenStringIsNotAny(values)
        return self
    }

    /// Does not match when the target's text contains the given keyword.
    func whenTextDoesNotContain(_ keyword: String) -> Self {
        text.whenStringDoesNotContainAny([keyword])
        return self
    }

    /// Does not match when the target's text contains any of the given keywords.
    func whenTextDoesNotContainAny(_ keywords: [String]) -> Self {
        text.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match when the target's text contains all of the given keywords.
    func whenTextDoesNotContainAll(_ keywords: [String]) -> Self {
        text.whenStringDoesNotContainAll(keywords)
        return self
    }
}
// MARK: - /Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/ValueStringSelectorBuilder.swift
//
//  ValueStringSelectorBuilder.swift
//
//  Created by Reid Nantes on 2019-10-31.
//

import Foundation

/// A selector that exposes a `StringSelector` constraining the value of its target.
public protocol ValueStringSelectorBuilder {
    var value: StringSelector { get }
}

/// Chainable helpers: each one records a constraint on `value` and returns `self`.
public extension ValueStringSelectorBuilder {
    /// Matches when the target's value equals the given value.
    func withValue(_ value: String) -> Self {
        self.value.withString(value)
        return self
    }

    /// Matches when the target's value equals any of the given values.
    func whenValueIsAny(_ values: [String]) -> Self {
        value.whenStringIsAny(values)
        return self
    }

    /// Matches when the target's value contains the given value.
    func containingValue(_ value: String) -> Self {
        self.value.whenStringContainsAny([value])
        return self
    }

    /// Matches when the target's value contains any of the given keywords.
    func whenValueContainsAny(_ keywords: [String]) -> Self {
        value.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the target's value contains all of the given keywords.
    func whenValueContainsAll(_ keywords: [String]) -> Self {
        value.whenStringContainsAll(keywords)
        return self
    }

    // negatives

    /// Does not match when the target's value equals the given value.
    func whenValueIsNot(_ value: String) -> Self {
        self.value.whenStringIsNot(value)
        return self
    }

    /// Does not match when the target's value equals any of the given values.
    func whenValueIsNotAny(_ values: [String]) -> Self {
        value.whenStringIsNotAny(values)
        return self
    }

    /// Does not match when the target's value contains the given keyword.
    func whenValueDoesNotContain(_ keyword: String) -> Self {
        value.whenStringDoesNotContainAny([keyword])
        return self
    }

    /// Does not match when the target's value contains any of the given keywords.
    func whenValueDoesNotContainAny(_ keywords: [String]) -> Self {
        value.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match when the target's value contains all of the given keywords.
    func whenValueDoesNotContainAll(_ keywords: [String]) -> Self {
        value.whenStringDoesNotContainAll(keywords)
        return self
    }
}
// MARK: - /Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/TagNameStringSelectorBuilder.swift
//
//  TagNameSelectorBuilder.swift
//
//  Created by Reid Nantes on 2019-11-02.
//

import Foundation

/// A selector that exposes a `StringSelector` constraining the tag name of its target.
public protocol TagNameStringSelectorBuilder {
    var tagName: StringSelector { get }
}

/// Chainable helpers: each one records a constraint on `tagName` and returns `self`.
public extension TagNameStringSelectorBuilder {
    /// Matches when the target's tag name equals the given value.
    func withTagName(_ value: String) -> Self {
        tagName.withString(value)
        return self
    }

    /// Matches when the target's tag name equals any of the given values.
    func whenTagNameIsAny(_ values: [String]) -> Self {
        tagName.whenStringIsAny(values)
        return self
    }

    /// Matches when the target's tag name contains the given value.
    func containingTagName(_ value: String) -> Self {
        tagName.whenStringContainsAny([value])
        return self
    }

    /// Matches when the target's tag name contains any of the given keywords.
    func whenTagNameContainsAny(_ keywords: [String]) -> Self {
        tagName.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the target's tag name contains all of the given keywords.
    func whenTagNameContainsAll(_ keywords: [String]) -> Self {
        tagName.whenStringContainsAll(keywords)
        return self
    }

    // negatives

    /// Does not match when the target's tag name equals the given value.
    func whenTagNameIsNot(_ value: String) -> Self {
        tagName.whenStringIsNot(value)
        return self
    }

    /// Does not match when the target's tag name equals any of the given values.
    func whenTagNameIsNotAny(_ values: [String]) -> Self {
        tagName.whenStringIsNotAny(values)
        return self
    }

    /// Does not match when the target's tag name contains the given keyword.
    func whenTagNameDoesNotContain(_ keyword: String) -> Self {
        tagName.whenStringDoesNotContainAny([keyword])
        return self
    }

    /// Does not match when the target's tag name contains any of the given keywords.
    func whenTagNameDoesNotContainAny(_ keywords: [String]) -> Self {
        tagName.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match when the target's tag name contains all of the given keywords.
    func whenTagNameDoesNotContainAll(_ keywords: [String]) -> Self {
        tagName.whenStringDoesNotContainAll(keywords)
        return self
    }
}
// MARK: - /Sources/SwiftHTMLParser/Parser/HTMLParser.swift
//
//  HTMLParser.swift
//  HTMLParser
//
//  Created by Reid Nantes on 2018-02-13.
//  Copyright © 2018 Reid Nantes. All rights reserved.
//

import Foundation

/// Entry point for parsing HTML.
public struct HTMLParser: MarkupParser {
    /// Parses `html` and returns its root-level nodes.
    /// - Throws: Whatever error the underlying element parser throws.
    public static func parse(_ html: String) throws -> [Node] {
        return try parse(pageSource: html, format: .html)
    }
}

/// Entry point for parsing XML.
public struct XMLParser: MarkupParser {
    /// Parses `xml` and returns its root-level nodes.
    /// - Throws: Whatever error the underlying element parser throws.
    public static func parse(_ xml: String) throws -> [Node] {
        return try parse(pageSource: xml, format: .xml)
    }
}


protocol MarkupParser { }

extension MarkupParser {

    /// Parses an html or xml string and outputs a node/element tree.
    ///
    /// Root elements are parsed one after another; the nodes found outside each
    /// root element are appended before the element itself.
    fileprivate static func parse(pageSource: String, format: ParseFormat) throws -> [Node] {
        var rootNodes = [Node]()
        let source = removeIEStatments(pageSource: pageSource)
        var currentIndex = source.startIndex

        while currentIndex < source.endIndex {
            let elementParser = ElementParser.init(openedTags: [])
            // Parse the pre-processed `source`: every index used here belongs to it,
            // so the parser must see the same string. Errors propagate unchanged.
            let rootElementAndOuterNodes = try elementParser.parseNextElement(pageSource: source,
                                                                              currentIndex: currentIndex,
                                                                              depth: 0,
                                                                              parseFormat: format)

            rootNodes.append(contentsOf: rootElementAndOuterNodes.outerNodes)

            // no element found: end of file reached without error
            guard let element = rootElementAndOuterNodes.element else {
                break
            }

            // continue just past the end of the root element, never beyond the end of the source
            currentIndex = source.index(element.endIndex, offsetBy: 1, limitedBy: source.endIndex) ?? source.endIndex
            rootNodes.append(element)
        }

        return rootNodes
    }

    /// Intended to remove conditional IE statements from the page source.
    ///
    /// The removal is currently disabled; the input is returned unchanged.
    static func removeIEStatments(pageSource: String) -> String {
        return pageSource
    }
}
// MARK: - /Sources/SwiftHTMLParser/Helpers/RegexHelper.swift
//
//  RegexHelper.swift
//  SwiftHTMLParser
//
//  Created by Reid Nantes on 2018-08-07.
//

import Foundation

/// Convenience wrappers around `NSRegularExpression` that work in `String.Index` ranges.
///
/// The search helpers match case-insensitively; `replaceMatches` is case-sensitive.
/// An invalid pattern is treated as "no match" rather than as an error.
struct RegexHelper {

    /// Returns the ranges of every match of `regexPattern` in `inputString`
    /// (case-insensitive); empty when the pattern is invalid or nothing matches.
    func matchRanges(for regexPattern: String, inString inputString: String) -> [Range<String.Index>] {
        guard let regex = try? NSRegularExpression(pattern: regexPattern, options: [.caseInsensitive]) else {
            return []
        }

        let searchRange = NSRange(inputString.startIndex..., in: inputString)
        // Skip (rather than trap on) a match whose bounds cannot be expressed as String indices.
        return regex.matches(in: inputString, options: [], range: searchRange)
            .compactMap { Range($0.range, in: inputString) }
    }

    /// Returns every substring of `inputString` matched by `regexPattern` (case-insensitive).
    func matches(for regexPattern: String, inString inputString: String) -> [String] {
        return matchRanges(for: regexPattern, inString: inputString).map { String(inputString[$0]) }
    }

    /// Returns the range of the first match of `regexPattern` (case-insensitive),
    /// or `nil` when the pattern is invalid or nothing matches.
    func firstMatchRange(for regexPattern: String, inString inputString: String) -> Range<String.Index>? {
        guard let regex = try? NSRegularExpression(pattern: regexPattern, options: [.caseInsensitive]) else {
            return nil
        }

        let searchRange = NSRange(inputString.startIndex..., in: inputString)
        guard let match = regex.firstMatch(in: inputString, options: [], range: searchRange) else {
            return nil
        }

        return Range(match.range, in: inputString)
    }

    /// Returns the first substring matched by `regexPattern` (case-insensitive), or `nil`.
    func firstMatch(for regexPattern: String, inString inputString: String) -> String? {
        return firstMatchRange(for: regexPattern, inString: inputString).map { String(inputString[$0]) }
    }

    /// Replaces the first match of `regexPattern` (case-insensitive) with `replacementString`,
    /// inserted literally. Returns `inputString` unchanged when there is no match.
    func replaceFirstMatch(for regexPattern: String, inString inputString: String, withString replacementString: String) -> String {
        guard let range = firstMatchRange(for: regexPattern, inString: inputString) else {
            return inputString
        }

        return inputString.replacingCharacters(in: range, with: replacementString)
    }

    /// Replaces every match of `regexPattern` (case-sensitive), using `replacementString`
    /// as an `NSRegularExpression` template. Returns `inputString` unchanged when the
    /// pattern is invalid. The result is never `nil`; the optional return type is kept
    /// for source compatibility.
    func replaceMatches(for regexPattern: String, inString inputString: String, withString replacementString: String) -> String? {
        guard let regex = try? NSRegularExpression(pattern: regexPattern, options: []) else {
            return inputString
        }

        let searchRange = NSRange(inputString.startIndex..., in: inputString)
        return regex.stringByReplacingMatches(in: inputString, options: [], range: searchRange, withTemplate: replacementString)
    }

}
// MARK: - /Sources/SwiftHTMLParser/Traverser/HTMLTraverser.swift
//
//  HTMLTraverser.swift
//
//  Created by Reid Nantes on 2018-05-27.
//  Copyright © 2018 Reid Nantes. All rights reserved.
//

import Foundation

/// Walks a parsed node tree along a path of selectors, one selector per depth level.
public struct HTMLTraverser {

    /// Returns `true` when at least one node matches the whole selector path.
    public static func hasMatchingNode(in parsedNodes: [Node], matching nodeSelectorPath: [NodeSelector]) -> Bool {
        return !findNodes(in: parsedNodes, matching: nodeSelectorPath).isEmpty
    }

    /// Returns the nodes matching the selector path that are elements.
    public static func findElements(in parsedNodes: [Node], matching nodeSelectorPath: [NodeSelector]) -> [Element] {
        return findNodes(in: parsedNodes, matching: nodeSelectorPath).compactMap { $0 as? Element }
    }

    /// Returns the nodes matching the selector path.
    ///
    /// The first selector is tested against `parsedNodes`; each following selector is
    /// tested against the child nodes of the previous level's matches. An empty path
    /// returns `parsedNodes` unchanged.
    public static func findNodes(in parsedNodes: [Node], matching nodeSelectorPath: [NodeSelector]) -> [Node] {
        // start with every node matching
        var matchingNodes = parsedNodes
        let lastSelectorIndex = nodeSelectorPath.count - 1

        for (selectorIndex, currentSelector) in nodeSelectorPath.enumerated() {
            // nothing left to narrow down
            // TODO: report which selector failed to match?
            if matchingNodes.isEmpty {
                break
            }

            // every level but the last descends into the children of its matches
            matchingNodes = getMatchesAtDepth(nodeSelector: currentSelector,
                                              nodesAtDepth: matchingNodes,
                                              shouldReturnChildrenOfMatches: selectorIndex != lastSelectorIndex)
        }

        return matchingNodes
    }

    /// Tests each node at one depth against `nodeSelector`.
    ///
    /// The position handed to the selector's `position` test counts only the nodes that
    /// already passed `nodeSelector.testAgainst`, starting at 0. Returns the accepted nodes,
    /// or the child nodes of the accepted elements when `shouldReturnChildrenOfMatches` is set
    /// (accepted non-element nodes then contribute nothing).
    private static func getMatchesAtDepth(nodeSelector: NodeSelector, nodesAtDepth: [Node], shouldReturnChildrenOfMatches: Bool) -> [Node] {
        var matchesAtDepth = [Node]()
        var currentPosition = 0

        for node in nodesAtDepth where nodeSelector.testAgainst(node) {
            if nodeSelector.position.testAgainst(currentPosition) {
                if shouldReturnChildrenOfMatches {
                    if let element = node as? Element {
                        matchesAtDepth.append(contentsOf: element.childNodes)
                    }
                } else {
                    matchesAtDepth.append(node)
                }
            }
            currentPosition += 1
        }

        return matchesAtDepth
    }

}
// MARK: - /Sources/SwiftHTMLParser/Traverser/Selectors/NodeSelectors/ElementSelector.swift
//
//  ElementSelector.swift
//
//  Created by Reid Nantes on 2019-10-29.
//

import Foundation

/// Selects element nodes by tag name, class names, attributes and child nodes.
public class ElementSelector: NodeSelector, TagNameStringSelectorBuilder, IdStringSelectorBuilder {
    private(set) public var position = IntSelector()

    // string selectors
    private(set) public var tagName = StringSelector()
    // NOTE(review): `id` is never consulted in `testAgainst(_:)`. `withId(_:)` below adds an
    // attribute selector instead, so the other `IdStringSelectorBuilder` helpers appear to
    // have no effect on matching — confirm whether that is intended.
    private(set) public var id = StringSelector()

    // className selector
    private(set) var classNameSelector = ClassSelector()

    // attribute selectors
    private(set) public var attributes: [AttributeSelector]?

    // childNode selectors
    private(set) public var childNodeSelectors: [NodeSelector]?
    private(set) public var childNodeSelectorPathsAll: [[NodeSelector]]?

    public init() {}

    /// Selects the element if it satisfies the given attribute selector.
    public func withAttribute(_ attributeSelector: AttributeSelector) -> ElementSelector {
        attributes.appendOrInit(attributeSelector)
        return self
    }

    /// Selects the element if its `id` attribute has the given value.
    public func withId(_ id: String) -> ElementSelector {
        let idAttributeSelector = AttributeSelector.init(name: "id").withValue(id)
        attributes.appendOrInit(idAttributeSelector)
        return self
    }

    /// Selects the element if some node is reachable from its child nodes along the given selector path.
    public func withChildNodeSelectorPath(_ childNodeSelectorPath: [NodeSelector]) -> Self {
        childNodeSelectorPathsAll.appendOrInit(childNodeSelectorPath)
        return self
    }

    /// Selects the element if it has a child element matching the given selector.
    public func withChildElement(_ elementSelector: ElementSelector) -> Self {
        childNodeSelectors.appendOrInit(elementSelector)
        return self
    }

    /// Selects the element if it has a child text node matching the given selector.
    public func withChildTextNode(_ textNodeSelector: TextNodeSelector) -> Self {
        childNodeSelectors.appendOrInit(textNodeSelector)
        return self
    }

    /// Selects the element if it has a child comment node matching the given selector.
    public func withChildCommentNode(_ commentNodeSelector: CommentSelector) -> Self {
        childNodeSelectors.appendOrInit(commentNodeSelector)
        return self
    }

    /// Selects the element if it has a child CDATA node matching the given selector.
    public func withChildCDataNode(_ cDataSelector: CDataSelector) -> Self {
        childNodeSelectors.appendOrInit(cDataSelector)
        return self
    }

    /// Returns `true` when `node` is an element that passes every configured test.
    public func testAgainst(_ node: Node) -> Bool {
        // only elements can match
        guard let element = node as? Element else {
            return false
        }

        // tag name
        guard tagName.testAgainst(element.tagName) else {
            return false
        }

        // class names
        guard classNameSelector.testAgainst(element) else {
            return false
        }

        // attributes (including the id attribute added by `withId`)
        if let attributeSelectors = attributes,
           !attributeSelectors.allSatisfy({ $0.testSelector(against: element) }) {
            return false
        }

        // every child selector needs at least one matching child node
        if let childSelectors = childNodeSelectors,
           !childSelectors.allSatisfy({ HTMLTraverser.hasMatchingNode(in: element.childNodes, matching: [$0]) }) {
            return false
        }

        // every child selector path needs at least one matching node
        if let childSelectorPaths = childNodeSelectorPathsAll,
           !childSelectorPaths.allSatisfy({ HTMLTraverser.hasMatchingNode(in: element.childNodes, matching: $0) }) {
            return false
        }

        return true
    }
}
15 | 16 | public var childNodes: [Node] 17 | 18 | // index information 19 | public var depth: Int 20 | 21 | public var startIndex: String.Index { 22 | get { 23 | return openingTag.startIndex 24 | } 25 | } 26 | public var endIndex: String.Index { 27 | get { 28 | if closingTag != nil { 29 | return closingTag!.endIndex 30 | } else { 31 | return openingTag.endIndex 32 | } 33 | } 34 | } 35 | 36 | public var isEmptyElement: Bool { 37 | get { 38 | return openingTag.isEmptyElementTag 39 | } 40 | } 41 | 42 | public var isSelfClosingElement: Bool { 43 | get { 44 | return openingTag.isSelfClosing 45 | } 46 | } 47 | 48 | public var tagName: String { 49 | return openingTag.tagName 50 | } 51 | 52 | public var id: String? { 53 | return openingTag.attributes["id"]?.value 54 | } 55 | 56 | public var classNames: [String] { 57 | return openingTag.classNames 58 | } 59 | 60 | public let commentNodes: [Comment] 61 | // lazy var commentNodes: [Comment] = { 62 | // return childNodes.filter({ $0.nodeType == NodeType.comment }) as! [Comment] 63 | // }() 64 | 65 | public let textNodes: [TextNode] 66 | // lazy var textNodes: [TextNode] = { 67 | // return childNodes.filter({ $0.nodeType == NodeType.text }) as! [TextNode] 68 | // }() 69 | 70 | public let CDATASections: [CData] 71 | // lazy var CDATASections: [CData] = { 72 | // return childNodes.filter({ $0.nodeType == NodeType.CDATASection }) as! [CData] 73 | // }() 74 | 75 | public let childElements: [Element] 76 | // lazy var childElements: [Element] = { 77 | // return childNodes.filter({ $0.nodeType == NodeType.element }) as! [Element] 78 | // }() 79 | 80 | init(openingTag: Tag, closingTag: Tag?, childNodes: [Node], depth: Int) { 81 | self.depth = depth 82 | self.openingTag = openingTag 83 | self.closingTag = closingTag 84 | self.childNodes = childNodes 85 | self.depth = depth 86 | 87 | self.textNodes = childNodes.filter({ $0.nodeType == NodeType.text }) as! 
[TextNode] 88 | self.CDATASections = childNodes.filter({ $0.nodeType == NodeType.CDATASection }) as! [CData] 89 | self.commentNodes = childNodes.filter({ $0.nodeType == NodeType.comment }) as! [Comment] 90 | self.childElements = childNodes.filter({ $0.nodeType == NodeType.element }) as! [Element] 91 | } 92 | 93 | public func attribute(attributeName: String) -> Attribute? { 94 | return openingTag.attributes[attributeName] 95 | } 96 | 97 | public func attributeValue(for attributeName: String) -> String? { 98 | return openingTag.attributes[attributeName]?.value 99 | } 100 | 101 | func containsAttribute(_ attributeName: String) -> Bool { 102 | if openingTag.attributes[attributeName] != nil { 103 | return true 104 | } else { 105 | return false 106 | } 107 | } 108 | 109 | func innerTextBlocksContains(text: String) -> Bool { 110 | for textNode in textNodes { 111 | if textNode.text.contains(text) { 112 | return true 113 | } 114 | } 115 | 116 | return false 117 | } 118 | 119 | func innerCDataContains(text: String) -> Bool { 120 | for cData in CDATASections { 121 | if cData.text.contains(text) { 122 | return true 123 | } 124 | } 125 | 126 | return false 127 | } 128 | 129 | func commentsContains(text: String) -> Bool { 130 | for comment in commentNodes { 131 | if comment.text.contains(text) { 132 | return true 133 | } 134 | } 135 | 136 | return false 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /Sources/SwiftHTMLParser/Parser/Models/Tag.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Tag.swift 3 | // HTMLParser 4 | // 5 | // Created by Reid Nantes on 2018-02-13. 6 | // Copyright © 2018 Reid Nantes. All rights reserved. 
7 | // 8 | 9 | import Foundation 10 | 11 | // a closing or opening tag 12 | public struct Tag { 13 | let startIndex: String.Index 14 | let endIndex: String.Index 15 | 16 | fileprivate var _isEmptyElementTag: Bool = false 17 | var isEmptyElementTag: Bool { 18 | return _isEmptyElementTag 19 | } 20 | fileprivate var _isClosingTag: Bool = false 21 | var isClosingTag: Bool { 22 | return _isClosingTag 23 | } 24 | fileprivate var _isSelfClosing: Bool = false 25 | var isSelfClosing: Bool { 26 | return _isSelfClosing 27 | } 28 | 29 | fileprivate var classNamesCache = [String]() 30 | var classNames: [String] { 31 | guard let classAttribute = attributes["class"] else { 32 | return [] 33 | } 34 | 35 | guard let classAttributeValue = classAttribute.value else { 36 | return [] 37 | } 38 | 39 | return getClassNames(classAttributeValue: classAttributeValue) 40 | } 41 | 42 | let tagText: String 43 | 44 | let tagName: String 45 | let attributes: [String: Attribute] 46 | 47 | public init(startIndex: String.Index, endIndex: String.Index, tagText: String, tagName: String) { 48 | self.startIndex = startIndex 49 | self.endIndex = endIndex 50 | 51 | self.tagText = tagText 52 | self.tagName = tagName 53 | 54 | let attributeParser = AttributeParser() 55 | self.attributes = attributeParser.parseAttributes(tagText: tagText, tagName: tagName) 56 | 57 | self._isSelfClosing = checkIsSelfClosing(tagText: tagText) 58 | self._isEmptyElementTag = checkIsEmptyElementTag(tagName: tagName) 59 | self._isClosingTag = checkIsClosingTag() 60 | } 61 | 62 | // checks if empty element 63 | // HTML elements with no content are called empty elements. Empty elements do not have an end tag (ex:
) 64 | func checkIsEmptyElementTag(tagName: String) -> Bool { 65 | let tagNameWithoutSlash = tagName.replacingOccurrences(of: "/", with: "") 66 | 67 | // check if known empty element 68 | if emptyElementTagNames.contains(tagNameWithoutSlash) { 69 | return true 70 | } 71 | 72 | // check if DOCTYPE 73 | if tagName.caseInsensitiveCompare("!DOCTYPE") == ComparisonResult.orderedSame { 74 | return true 75 | } 76 | 77 | return false 78 | } 79 | 80 | // check if tag is self closing, ending with /> 81 | // ex: i.e 82 | func checkIsSelfClosing(tagText: String) -> Bool { 83 | let lastCharacter = tagText[tagText.index(tagText.endIndex, offsetBy: -1)] 84 | let secondLastCharacter = tagText[tagText.index(tagText.endIndex, offsetBy: -2)] 85 | 86 | if lastCharacter == ">" && secondLastCharacter == "/" { 87 | return true 88 | } else { 89 | return false 90 | } 91 | } 92 | 93 | func checkIsClosingTag() -> Bool { 94 | if tagText.prefix(2) == "" { 99 | return true 100 | } 101 | 102 | return false 103 | } 104 | 105 | func getDescription() -> String { 106 | var description = "" 107 | description = description + "tagText: \(tagText)\n" 108 | description = description + "tagText.count: \(tagText.count)\n" 109 | description = description + "tag.startIndex: \(startIndex.utf16Offset(in: tagText))\n" 110 | description = description + "tag.endIndex: \(endIndex.utf16Offset(in: tagText))\n" 111 | 112 | return description 113 | } 114 | 115 | func getClassNames(classAttributeValue: String) -> [String] { 116 | // (?=\s*) -> 0 or more whitespaces, but dont capture 117 | // [\w\d]+ -> 1 or more non-whitespace characters 118 | let classNameRegexPattern = "(?=\\s*)[^\\n\\r\\s]+(?=\\s*)" 119 | 120 | let regexHelper = RegexHelper() 121 | return regexHelper.matches(for: classNameRegexPattern, inString: classAttributeValue) 122 | } 123 | 124 | } 125 | -------------------------------------------------------------------------------- /Sources/SwiftHTMLParser/Traverser/Selectors/StringSelector.swift: 
import Foundation

/// A set of optional constraints that a target string is tested against.
///
/// A constraint that is `nil` has not been configured and is ignored.
public final class StringSelector {
    public init() {}

    // positives
    /// Matches if the target equals any of the keywords
    private(set) var stringIsAny: [String]?
    /// Matches if the target contains any of the keywords
    private(set) var stringContainsAny: [String]?
    /// Matches if the target contains all of the keywords
    private(set) var stringContainsAll: [String]?

    // negatives
    /// Does not match if the target equals any of the keywords
    private(set) var stringIsNotAny: [String]?
    /// Does not match if the target contains any of the keywords
    private(set) var stringDoesNotContainAny: [String]?
    /// Does not match if the target contains all of the keywords
    private(set) var stringDoesNotContainAll: [String]?
}

// Builder methods: each call adds keywords to the corresponding constraint.
internal extension StringSelector {
    /// Matches when the target equals the given value
    func withString(_ value: String) {
        stringIsAny.appendOrInit(value)
    }

    /// Matches when the target equals any of the given values
    func whenStringIsAny(_ values: [String]) {
        stringIsAny.appendOrInit(contentsOf: values)
    }

    /// Matches when the target contains any of the given keywords
    func whenStringContainsAny(_ keywords: [String]) {
        stringContainsAny.appendOrInit(contentsOf: keywords)
    }

    /// Matches when the target contains all of the given keywords
    func whenStringContainsAll(_ keywords: [String]) {
        stringContainsAll.appendOrInit(contentsOf: keywords)
    }

    /// Does not match when the target equals the given value
    func whenStringIsNot(_ value: String) {
        stringIsNotAny.appendOrInit(value)
    }

    /// Does not match when the target equals any of the given values
    func whenStringIsNotAny(_ values: [String]) {
        stringIsNotAny.appendOrInit(contentsOf: values)
    }

    /// Does not match when the target contains any of the given values
    func whenStringDoesNotContainAny(_ values: [String]) {
        stringDoesNotContainAny.appendOrInit(contentsOf: values)
    }

    /// Does not match when the target contains all of the given values
    func whenStringDoesNotContainAll(_ values: [String]) {
        stringDoesNotContainAll.appendOrInit(contentsOf: values)
    }
}

extension StringSelector {
    /// Tests the target against every configured constraint.
    ///
    /// - Parameter string: The target, or `nil` when there is no value to test.
    /// - Returns: `true` when no configured constraint rejects the target. A `nil` target
    ///   is accepted only when none of the positive constraints is configured.
    func testAgainst(_ string: String?) -> Bool {
        guard let target = string else {
            // a missing target cannot satisfy a positive constraint; negatives are not consulted
            return stringIsAny == nil && stringContainsAny == nil && stringContainsAll == nil
        }

        // positives

        if let accepted = stringIsAny, !accepted.contains(target) {
            return false
        }

        if let required = stringContainsAll, !required.allSatisfy({ target.contains($0) }) {
            return false
        }

        if let alternatives = stringContainsAny, !alternatives.contains(where: { target.contains($0) }) {
            return false
        }

        // negatives

        // rejected when the target equals any of the keywords
        if let rejected = stringIsNotAny, rejected.contains(target) {
            return false
        }

        // rejected when the target contains any of the keywords
        if let forbidden = stringDoesNotContainAny, forbidden.contains(where: { target.contains($0) }) {
            return false
        }

        // rejected when the target contains all of the keywords
        if let forbiddenTogether = stringDoesNotContainAll, forbiddenTogether.allSatisfy({ target.contains($0) }) {
            return false
        }

        return true
    }
}
/Tests/SwiftHTMLParserTests/CommentParserTests.swift: -------------------------------------------------------------------------------- 1 | // 2 | // CommentTests.swift 3 | // SwiftHTMLParser 4 | // 5 | // Created by Reid Nantes on 2018-12-11. 6 | // 7 | 8 | import XCTest 9 | @testable import SwiftHTMLParser 10 | import TestFiles 11 | 12 | final class CommentParserTests: XCTestCase { 13 | func testComments() { 14 | guard let fileURL = TestFileURLs.commentsTestFilesDirectoryURL? 15 | .appendingPathComponent("comments.html") else { 16 | XCTFail("Could not get url to test file") 17 | return 18 | } 19 | 20 | // get html string from file 21 | var htmlStringResult: String? = nil 22 | do { 23 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 24 | } catch { 25 | XCTFail("Could not open file at: \(fileURL.path)") 26 | } 27 | guard let htmlString = htmlStringResult else { 28 | XCTFail("Could not open file at: \(fileURL.path)") 29 | return 30 | } 31 | 32 | // create object from raw html file 33 | guard let elementArray = try? 
HTMLParser.parse(htmlString) else { 34 | XCTFail("Could not parse HTML") 35 | return 36 | } 37 | 38 | // find matching elements by traversing the created html object 39 | var nodeSelectorPath = [ 40 | ElementSelector().withTagName("html"), 41 | ElementSelector().withTagName("body") 42 | ] 43 | 44 | var matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath) 45 | 46 | XCTAssertEqual(matchingElements[0].childNodes.count, 15) 47 | XCTAssertEqual(matchingElements[0].commentNodes.count, 6) 48 | XCTAssertEqual(matchingElements[0].childElements.count, 3) 49 | XCTAssertEqual(matchingElements[0].textNodes.count, 6) 50 | 51 | XCTAssertEqual(matchingElements[0].commentNodes[0].text, " This is a comment ") 52 | XCTAssertEqual(matchingElements[0].commentNodes[1].text, " This is annother comment ") 53 | XCTAssertEqual(matchingElements[0].commentNodes[3].text, " no space between the comment and div ") 54 | XCTAssertEqual(matchingElements[0].commentNodes[4].text, "x") 55 | XCTAssertEqual(matchingElements[0].commentNodes[5].text, "") 56 | 57 | nodeSelectorPath = [ 58 | ElementSelector().withTagName("html"), 59 | ElementSelector().withTagName("body"), 60 | ElementSelector().withTagName("div"), 61 | ] 62 | 63 | matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath) 64 | XCTAssertEqual(matchingElements.count, 1) 65 | XCTAssertEqual(matchingElements[0].textNodes.first!.text, "This is a div") 66 | } 67 | 68 | func testConditionalComments() throws { 69 | guard let fileURL = TestFileURLs.commentsTestFilesDirectoryURL? 70 | .appendingPathComponent("conditional-comments-salvageable.html") else { 71 | XCTFail("Could not get url to test file") 72 | return 73 | } 74 | 75 | // get html string from file 76 | var htmlStringResult: String? 
= nil 77 | do { 78 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 79 | } catch { 80 | XCTFail("Could not open file at: \(fileURL.path)") 81 | } 82 | guard let htmlString = htmlStringResult else { 83 | XCTFail("Could not open file at: \(fileURL.path)") 84 | return 85 | } 86 | 87 | // create object from raw html file 88 | guard let elementArray = try? HTMLParser.parse(htmlString) else { 89 | XCTFail("Could not parse HTML") 90 | return 91 | } 92 | 93 | //XCTAssertEqual(elementArray.count, 2) 94 | 95 | // find matching elements by traversing the created html object 96 | let nodeSelectorPath = [ 97 | ElementSelector().withTagName("html"), 98 | ElementSelector().withTagName("body") 99 | ] 100 | 101 | let matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath) 102 | 103 | XCTAssertEqual(matchingElements.count, 1) 104 | XCTAssertEqual(matchingElements.first!.commentNodes.count, 1) 105 | //let commentText = try XCTUnwrap(matchingElements.first?.commentNodes.first?.text) 106 | let commentText = matchingElements.first!.commentNodes.first!.text 107 | XCTAssertTrue(commentText.contains("

You are using Internet Explorer 6. :(

import Foundation

// Class-name builder methods. Each call adds to the corresponding constraint of the
// element selector's `classNameSelector` and returns the selector for chaining.
extension ElementSelector {
    /// Matches if the element has the given className
    public func withClassName(_ className: String) -> Self {
        self.classNameSelector.hasClassNameAny.appendOrInit(className)
        return self
    }

    /// Matches if the element has any of the given classNames
    public func withClassNamesAny(_ classNames: [String]) -> Self {
        self.classNameSelector.hasClassNameAny.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Matches if the element has all of the given classNames
    public func withClassNamesAll(_ classNames: [String]) -> Self {
        self.classNameSelector.hasClassNamesAll.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Matches if the element has exactly the given classNames: all of them and no others.
    public func withClassNamesExact(_ classNames: [String]) -> Self {
        self.classNameSelector.hasClassNamesExact.appendOrInit(contentsOf: classNames)
        return self
    }

    // negatives
    /// Does not match if the element has the className
    public func withoutClassName(_ className : String) -> Self {
        self.classNameSelector.doesNotHaveClassNameAny.appendOrInit(className)
        return self
    }

    /// Does not match if any of the given classNames are present
    public func withoutClassNameAny(_ classNames : [String]) -> Self {
        self.classNameSelector.doesNotHaveClassNameAny.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Does not match if all of the given classNames are present
    public func withoutClassNameAll(_ classNames : [String]) -> Self {
        self.classNameSelector.doesNotHaveClassNamesAll.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Does not match if the element has exactly the given classNames
    public func withoutClassNameExact(_ classNames : [String]) -> Self {
        self.classNameSelector.doesNotHaveClassNamesExact.appendOrInit(contentsOf: classNames)
        return self
    }

}

/// Class-name constraints of an `ElementSelector`. A constraint that is `nil` is ignored.
internal final class ClassSelector {
    // positives
    // matches if any of the given classNames are present
    var hasClassNameAny: [String]?
    // matches if all of the given classNames are present
    var hasClassNamesAll: [String]?
    // matches if the element has exactly the given classNames
    var hasClassNamesExact: [String]?

    // negatives
    // does not match if any of the given classNames are present
    var doesNotHaveClassNameAny: [String]?
    // does not match if all of the given classNames are present
    var doesNotHaveClassNamesAll: [String]?
    // does not match if the element has the exact classNames
    var doesNotHaveClassNamesExact: [String]?

    /// Returns true if the element's class names satisfy every configured constraint.
    ///
    /// Class names are compared as sets: order and repetition are irrelevant, both in the
    /// element's `class` attribute and in the configured keyword lists.
    internal func testAgainst(_ element: Element) -> Bool {
        let classNamesSet = Set(element.classNames)
        let isPresent: (String) -> Bool = { classNamesSet.contains($0) }

        if let anyOf = hasClassNameAny, anyOf.contains(where: isPresent) == false {
            return false
        }

        if let allOf = hasClassNamesAll, allOf.allSatisfy(isPresent) == false {
            return false
        }

        // Set equality instead of "all present + same count": comparing the array count
        // with the de-duplicated set gave wrong answers when a keyword was repeated
        // (e.g. after calling `withClassNamesExact` twice with the same name).
        if let exact = hasClassNamesExact, Set(exact) != classNamesSet {
            return false
        }

        if let noneOf = doesNotHaveClassNameAny, noneOf.contains(where: isPresent) {
            return false
        }

        if let notAllOf = doesNotHaveClassNamesAll, notAllOf.allSatisfy(isPresent) {
            return false
        }

        // same set comparison as the positive "exact" constraint
        if let notExact = doesNotHaveClassNamesExact, Set(notExact) == classNamesSet {
            return false
        }

        return true
    }
}
6 | // 7 | 8 | import Foundation 9 | 10 | struct CommentSpecialCharacters { 11 | // strings 12 | let declarationOpening = "" 15 | let declarationClosing = ">" 16 | let conditionalCommentOpening = "" 18 | } 19 | 20 | enum CommentType { 21 | case comment 22 | case declaration 23 | } 24 | 25 | /// Parses comments 26 | struct CommentParser { 27 | fileprivate let lookaheadValidator = LookaheadValidator() 28 | fileprivate let SpecialCharacters = CommentSpecialCharacters() 29 | 30 | /// Parses a comment starting at currentIndex 31 | /// Example of a comment: 32 | func parseComment(source: String, currentIndex: String.Index, commentType: CommentType) throws -> Comment { 33 | let startIndex = currentIndex 34 | 35 | // skip over html comment opening i.e 84 | func parseConditionalComment(source: String, currentIndex: String.Index) throws -> Comment { 85 | let startIndex = currentIndex 86 | // skip over html comment opening i.e 6 | Guelph - Weather - Environment Canada 7 | 8 | 9 | 10 | 11 | Environment Canada 12 | https://www.weather.gc.ca 13 | 14 | 2018-12-14T10:01:44Z 15 | tag:weather.gc.ca,2013-04-16:20181214100144 16 | https://www.weather.gc.ca/template/gcweb/v4.0.24/assets/wmms-alt.png 17 | https://www.weather.gc.ca/template/gcweb/v4.0.24/assets/favicon.ico 18 | Copyright 2018, Environment Canada 19 | 20 | FOG ADVISORY , Guelph 21 | 22 | 2018-12-14T04:24:00Z 23 | 2018-12-14T04:24:00Z 24 | 25 | Persons in or near this area should be on the lookout for adverse weather conditions and take necessary safety precautions. Issued: 11:24 PM EST Thursday 13 December 2018 26 | tag:weather.gc.ca,2013-04-16:on-5_w1:201812140424 27 | 28 | 29 | Current Conditions: -0.8°C 30 | 31 | 2018-12-14T10:00:00Z 32 | 2018-12-14T10:00:00Z 33 | 34 | Observed at: Guelph Turfgrass 05:00 AM EST Friday 14 December 2018
35 | Temperature: -0.8°C
36 | Humidity: 99 %
37 | Dewpoint: -0.9°C
38 | Wind: 0 km/h
39 | Air Quality Health Index: 2
40 | ]]>
41 | tag:weather.gc.ca,2013-04-16:on-5_cc:20181214100000 42 |
43 | 44 | Friday: A few showers. High plus 4. 45 | 46 | 2018-12-14T10:00:00Z 47 | 2018-12-14T10:00:00Z 48 | 49 | Cloudy. A few rain showers beginning early this morning and ending early this afternoon. Risk of freezing rain early this morning. Fog dissipating this morning. Wind becoming southwest 20 km/h gusting to 40 near noon. High plus 4. UV index 1 or low. Forecast issued 05:00 AM EST Friday 14 December 2018 50 | tag:weather.gc.ca,2013-04-16:on-5_fc1:20181214100000 51 | 52 | 53 | Friday night: Mainly cloudy. Low minus 2. 54 | 55 | 2018-12-14T10:00:00Z 56 | 2018-12-14T10:00:00Z 57 | 58 | Mainly cloudy. Fog patches developing near midnight. Wind up to 15 km/h. Low minus 2. Wind chill minus 6 overnight. Forecast issued 05:00 AM EST Friday 14 December 2018 59 | tag:weather.gc.ca,2013-04-16:on-5_fc2:20181214100000 60 | 61 | 62 | Saturday: Mainly cloudy. High plus 4. 63 | 64 | 2018-12-14T10:00:00Z 65 | 2018-12-14T10:00:00Z 66 | 67 | Mainly cloudy. Fog patches dissipating in the morning. Wind becoming northeast 20 km/h gusting to 40 in the morning. High plus 4. Forecast issued 05:00 AM EST Friday 14 December 2018 68 | tag:weather.gc.ca,2013-04-16:on-5_fc3:20181214100000 69 | 70 | 71 | Saturday night: Chance of flurries. Low minus 1. POP 30% 72 | 73 | 2018-12-14T10:00:00Z 74 | 2018-12-14T10:00:00Z 75 | 76 | Cloudy periods with 30 percent chance of flurries. Low minus 1. Forecast issued 05:00 AM EST Friday 14 December 2018 77 | tag:weather.gc.ca,2013-04-16:on-5_fc4:20181214100000 78 | 79 | 80 | Sunday: A mix of sun and cloud. High plus 5. 81 | 82 | 2018-12-14T10:00:00Z 83 | 2018-12-14T10:00:00Z 84 | 85 | A mix of sun and cloud. High plus 5. Forecast issued 05:00 AM EST Friday 14 December 2018 86 | tag:weather.gc.ca,2013-04-16:on-5_fc5:20181214100000 87 | 88 | 89 | Sunday night: Chance of flurries. Low zero. POP 30% 90 | 91 | 2018-12-14T10:00:00Z 92 | 2018-12-14T10:00:00Z 93 | 94 | Cloudy periods with 30 percent chance of flurries. Low zero. 
Forecast issued 05:00 AM EST Friday 14 December 2018 95 | tag:weather.gc.ca,2013-04-16:on-5_fc6:20181214100000 96 | 97 | 98 | Monday: Chance of flurries. High zero. POP 30% 99 | 100 | 2018-12-14T10:00:00Z 101 | 2018-12-14T10:00:00Z 102 | 103 | Cloudy with 30 percent chance of flurries. High zero. Forecast issued 05:00 AM EST Friday 14 December 2018 104 | tag:weather.gc.ca,2013-04-16:on-5_fc7:20181214100000 105 | 106 | 107 | Monday night: Cloudy periods. Low minus 8. 108 | 109 | 2018-12-14T10:00:00Z 110 | 2018-12-14T10:00:00Z 111 | 112 | Cloudy periods. Low minus 8. Forecast issued 05:00 AM EST Friday 14 December 2018 113 | tag:weather.gc.ca,2013-04-16:on-5_fc8:20181214100000 114 | 115 | 116 | Tuesday: A mix of sun and cloud. High minus 2. 117 | 118 | 2018-12-14T10:00:00Z 119 | 2018-12-14T10:00:00Z 120 | 121 | A mix of sun and cloud. High minus 2. Forecast issued 05:00 AM EST Friday 14 December 2018 122 | tag:weather.gc.ca,2013-04-16:on-5_fc9:20181214100000 123 | 124 | 125 | Tuesday night: Cloudy periods. Low minus 5. 126 | 127 | 2018-12-14T10:00:00Z 128 | 2018-12-14T10:00:00Z 129 | 130 | Cloudy periods. Low minus 5. Forecast issued 05:00 AM EST Friday 14 December 2018 131 | tag:weather.gc.ca,2013-04-16:on-5_fc10:20181214100000 132 | 133 | 134 | Wednesday: Cloudy. High plus 1. 135 | 136 | 2018-12-14T10:00:00Z 137 | 2018-12-14T10:00:00Z 138 | 139 | Cloudy. High plus 1. Forecast issued 05:00 AM EST Friday 14 December 2018 140 | tag:weather.gc.ca,2013-04-16:on-5_fc11:20181214100000 141 | 142 | 143 | Wednesday night: Chance of flurries. Low zero. POP 40% 144 | 145 | 2018-12-14T10:00:00Z 146 | 2018-12-14T10:00:00Z 147 | 148 | Cloudy with 40 percent chance of flurries. Low zero. Forecast issued 05:00 AM EST Friday 14 December 2018 149 | tag:weather.gc.ca,2013-04-16:on-5_fc12:20181214100000 150 | 151 | 152 | Thursday: Chance of flurries or rain showers. High plus 2. 
POP 40% 153 | 154 | 2018-12-14T10:00:00Z 155 | 2018-12-14T10:00:00Z 156 | 157 | Cloudy with 40 percent chance of flurries or rain showers. High plus 2. Forecast issued 05:00 AM EST Friday 14 December 2018 158 | tag:weather.gc.ca,2013-04-16:on-5_fc13:20181214100000 159 | 160 | 161 | -------------------------------------------------------------------------------- /Sources/SwiftHTMLParser/Parser/TagParser.swift: -------------------------------------------------------------------------------- 1 | // 2 | // TagParser.swift 3 | // SwiftHTMLParser 4 | // 5 | // Created by Reid Nantes on 2018-12-04. 6 | // 7 | 8 | import Foundation 9 | 10 | enum TagParserState { 11 | case notWithinQuotesOrComment 12 | case withinDoubleQuotes 13 | case withinSingleQuotes 14 | } 15 | 16 | enum TagOpeningType { 17 | case element 18 | case CDATA 19 | case declaration 20 | case comment 21 | } 22 | 23 | struct TagSpecificCharacters { 24 | // characters 25 | let tagOpeningCharacter: Character = "<" 26 | let tagClosingCharacter: Character = ">" 27 | let doubleQuote: Character = "\"" // i.e " 28 | let singleQuote: Character = "'" // i.e ' 29 | let space: Character = " " 30 | let equalSign: Character = "=" 31 | 32 | // strings 33 | let declarationOpening = "" 36 | let conditionalCommentOpening = "" 38 | let CDATAOpening = "" 40 | 41 | // array 42 | 43 | } 44 | 45 | struct TagParser { 46 | fileprivate let commentParser = CommentParser() 47 | fileprivate let cdataParser = CDATAParser() 48 | fileprivate let lookaheadValidator = LookaheadValidator() 49 | fileprivate let specificCharacters = TagSpecificCharacters() 50 | fileprivate let isPoorlyFormattedCommentsAllowed: Bool = true 51 | 52 | func getNextTag(source: String, currentIndex: String.Index) throws -> (childNodes: [Node], tag: Tag?) { 53 | var isTagOpened = false 54 | var localCurrentIndex = currentIndex 55 | var tagStartIndex: String.Index? 
56 | 57 | var childNodes = [Node]() 58 | var parseState = TagParserState.notWithinQuotesOrComment 59 | 60 | // iterate through string indices until tag is found or end of string 61 | while source.encompassesIndex(localCurrentIndex) { 62 | 63 | if isTagOpened == false { 64 | if parseState == .notWithinQuotesOrComment { 65 | if let tagOpeningType = resolveTagOpeningType(source: source, index: 66 | localCurrentIndex) { 67 | 68 | // set inner text block 69 | if (currentIndex != localCurrentIndex) { 70 | var textBlockStartIndex = currentIndex 71 | 72 | // changed 73 | if let lastChildNode = childNodes.last { 74 | textBlockStartIndex = source.index(lastChildNode.endIndex, offsetBy: 1) 75 | } 76 | 77 | let textBlockEndIndex = source.index(localCurrentIndex, offsetBy: -1) 78 | 79 | // if tags or comments are right beside each other dont add text block i.e 80 | if textBlockStartIndex <= textBlockEndIndex { 81 | let textBlockText = String(source[textBlockStartIndex...textBlockEndIndex]) 82 | if (textBlockText.isEmptyOrWhitespace() == false) { 83 | let innerTextBlock = TextNode.init(startIndex: textBlockStartIndex, 84 | endIndex: textBlockEndIndex, 85 | text: textBlockText) 86 | childNodes.append(innerTextBlock) 87 | } 88 | } 89 | } 90 | 91 | switch tagOpeningType { 92 | case .element: 93 | isTagOpened = true 94 | tagStartIndex = localCurrentIndex 95 | case .comment: 96 | do { 97 | let comment = try commentParser.parseComment(source: source, 98 | currentIndex: localCurrentIndex, 99 | commentType: .comment) 100 | localCurrentIndex = comment.endIndex 101 | childNodes.append(comment) 102 | } catch { 103 | throw ParseError.endOfFileReachedBeforeCommentCloseFound 104 | } 105 | case .declaration: 106 | do { 107 | let comment = try commentParser.parseComment(source: source, 108 | currentIndex: localCurrentIndex, 109 | commentType: .declaration) 110 | localCurrentIndex = comment.endIndex 111 | childNodes.append(comment) 112 | } catch { 113 | throw 
ParseError.endOfFileReachedBeforeCommentCloseFound 114 | } 115 | case .CDATA: 116 | // is CDATA 117 | do { 118 | let cdata = try cdataParser.parse(source: source, currentIndex: localCurrentIndex) 119 | localCurrentIndex = cdata.endIndex 120 | childNodes.append(cdata) 121 | } catch { 122 | throw ParseError.endOfFileReachedBeforeCommentCloseFound 123 | } 124 | } 125 | } 126 | } 127 | } else { 128 | switch parseState { 129 | case .notWithinQuotesOrComment: 130 | if source[localCurrentIndex] == specificCharacters.tagClosingCharacter { 131 | // tag is closed 132 | do { 133 | let tag = try foundTag(source: source, tagStartIndex: tagStartIndex!, tagEndIndex: localCurrentIndex) 134 | return (childNodes, tag) 135 | } catch { 136 | throw error 137 | } 138 | } 139 | if source[localCurrentIndex] == specificCharacters.doubleQuote { 140 | parseState = .withinDoubleQuotes 141 | } else if source[localCurrentIndex] == specificCharacters.singleQuote { 142 | parseState = .withinSingleQuotes 143 | } 144 | case .withinDoubleQuotes: 145 | if source[localCurrentIndex] == specificCharacters.doubleQuote { 146 | parseState = .notWithinQuotesOrComment 147 | } 148 | case .withinSingleQuotes: 149 | if source[localCurrentIndex] == specificCharacters.singleQuote { 150 | parseState = .notWithinQuotesOrComment 151 | } 152 | } 153 | } 154 | 155 | // increment localCurrentIndex 156 | localCurrentIndex = source.index(localCurrentIndex, offsetBy: 1) 157 | 158 | // if source.encompassesIndex(localCurrentIndex) { 159 | // print("localCurrentIndex: \(localCurrentIndex)") 160 | // print(source[localCurrentIndex]) 161 | // } 162 | } 163 | 164 | // a tag not found before end of file reached 165 | return (childNodes, nil) 166 | } 167 | /// Classifies what starts at `index` in `source`: returns nil if the character there is not `tagOpeningCharacter`; returns `.comment`, `.CDATA` or `.declaration` if the lookahead matches `declarationOpening` (the comment and CDATA openings are tested first, in that order); returns `.element` otherwise. 168 | func resolveTagOpeningType(source: String, index: String.Index) -> TagOpeningType? 
{ 169 | if source[index] == specificCharacters.tagOpeningCharacter { 170 | if lookaheadValidator.isValidLookahead(for: source, atIndex: index, checkFor: specificCharacters.declarationOpening) { 171 | // check if comment opening 172 | if lookaheadValidator.isValidLookahead(for: source, atIndex: index, checkFor: specificCharacters.commentOpening) { 173 | return TagOpeningType.comment 174 | } else if lookaheadValidator.isValidLookahead(for: source, atIndex: index, checkFor: specificCharacters.CDATAOpening) { 175 | return TagOpeningType.CDATA 176 | } 177 | return TagOpeningType.declaration 178 | } 179 | return TagOpeningType.element 180 | } 181 | return nil 182 | } 183 | 184 | /// Builds a `Tag` from `source[tagStartIndex...tagEndIndex]` (both ends inclusive); the tag name is extracted with `parseTagName`, and `ParseError.tagNameNotFound` is thrown if no name can be parsed. 185 | func foundTag(source: String, tagStartIndex: String.Index, tagEndIndex: String.Index) throws -> Tag { 186 | // create tagText string from indexes 187 | let tagText = String(source[tagStartIndex...tagEndIndex]) 188 | 189 | // get tagName from tagText; any error from parseTagName is rethrown as tagNameNotFound (parseTagName returns a non-optional String, so the guard below always succeeds) 190 | let tagNameResult: String? 191 | do { 192 | tagNameResult = try parseTagName(tagText: tagText) 193 | } catch { 194 | throw ParseError.tagNameNotFound 195 | } 196 | guard let tagName = tagNameResult else { 197 | throw ParseError.tagNameNotFound 198 | } 199 | 200 | // create the tag 201 | return Tag.init(startIndex: tagStartIndex, endIndex: tagEndIndex, tagText: tagText, tagName: tagName) 202 | } 203 | /// Returns the tag name found in `tagText`: skips leading tag-opening characters and whitespace, then reads up to (not including) the next `>` or whitespace character. Throws `ParseError.tagNameNotFound` if `tagText` ends before such a terminator is reached. 204 | func parseTagName(tagText: String) throws -> String { 205 | var currentIndex = tagText.startIndex 206 | let endIndex = tagText.endIndex 207 | 208 | var startTagNameIndex: String.Index? 
209 | 210 | 211 | var isFirstCharacterFound = false 212 | while currentIndex < endIndex { 213 | if isFirstCharacterFound == false { 214 | // keep going until you find the first char (ignore < and whitespace) 215 | if tagText[currentIndex] != TagSpecificCharacters().tagOpeningCharacter && tagText[currentIndex].isWhitespace == false { 216 | isFirstCharacterFound = true 217 | // remember where the tag name starts 218 | startTagNameIndex = currentIndex 219 | } 220 | } else { 221 | if tagText[currentIndex] == ">" || tagText[currentIndex].isWhitespace { 222 | // don't include the terminating > or whitespace in tagName; startTagNameIndex is non-nil here because it is set when isFirstCharacterFound becomes true 223 | let endTagNameIndex = tagText.index(currentIndex, offsetBy: -1) 224 | let tagName = String(tagText[startTagNameIndex!...endTagNameIndex]) 225 | return tagName.trimmingCharacters(in: .whitespacesAndNewlines) 226 | } 227 | } 228 | 229 | currentIndex = tagText.index(currentIndex, offsetBy: 1) 230 | } 231 | 232 | throw ParseError.tagNameNotFound 233 | } 234 | 235 | } 236 | -------------------------------------------------------------------------------- /Tests/SwiftHTMLParserTests/ElementTraverserTests.swift: -------------------------------------------------------------------------------- 1 | // 2 | // ElementTraverserTests.swift 3 | // 4 | // 5 | // Created by Reid Nantes on 2019-10-22. 6 | // 7 | 8 | import XCTest 9 | import SwiftHTMLParser 10 | import TestFiles 11 | 12 | final class ElementTraverserTests: XCTestCase { 13 | 14 | func testSelectTagName() { 15 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL? 16 | .appendingPathComponent("attributes-multiple-value-class.html") else { 17 | XCTFail("Could find get file URL to parse") 18 | return 19 | } 20 | 21 | var nodeTreeResult: [Node]? 
= nil 22 | do { 23 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL) 24 | } catch { 25 | XCTFail(error.localizedDescription) 26 | return 27 | } 28 | guard let nodeTree = nodeTreeResult else { 29 | XCTFail("nodeTreeResult was nil") 30 | return 31 | } 32 | 33 | // find matching elements by traversing the created html object 34 | let nodeSelectorPath = [ 35 | ElementSelector().withTagName("html"), 36 | ElementSelector().containingTagName("bod"), 37 | ElementSelector().withTagName("p") 38 | ] 39 | 40 | let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 41 | XCTAssertEqual(matchingElements.count, 4) 42 | } 43 | 44 | 45 | func testSelectAttributes() { 46 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL? 47 | .appendingPathComponent("attributes-simple.html") else { 48 | XCTFail("Could find get file URL to parse") 49 | return 50 | } 51 | 52 | var nodeTreeResult: [Node]? = nil 53 | do { 54 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL) 55 | } catch { 56 | XCTFail(error.localizedDescription) 57 | return 58 | } 59 | guard let nodeTree = nodeTreeResult else { 60 | XCTFail("nodeTreeResult was nil") 61 | return 62 | } 63 | 64 | // find matching elements by traversing the created html object 65 | let nodeSelectorPath: [NodeSelector] = [ 66 | ElementSelector().withTagName("html"), 67 | ElementSelector().withTagName("body"), 68 | ElementSelector().withTagName("a") 69 | .withAttribute(AttributeSelector.init(name: "href").withValue("https://duckduckgo.com")) 70 | ] 71 | 72 | let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 73 | XCTAssertEqual(matchingElements.count, 1) 74 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is an alternate link") 75 | } 76 | 77 | func testSelectClassName() { 78 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL? 
79 | .appendingPathComponent("attributes-multiple-value-class.html") else { 80 | XCTFail("Could find get file URL to parse") 81 | return 82 | } 83 | 84 | var nodeTreeResult: [Node]? = nil 85 | do { 86 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL) 87 | } catch { 88 | XCTFail(error.localizedDescription) 89 | return 90 | } 91 | guard let nodeTree = nodeTreeResult else { 92 | XCTFail("nodeTreeResult was nil") 93 | return 94 | } 95 | 96 | // find matching elements by traversing the created html object 97 | var nodeSelectorPath = [ 98 | ElementSelector().withTagName("html"), 99 | ElementSelector().withTagName("body"), 100 | ElementSelector().withTagName("p") 101 | .withClassNamesAny(["body-paragraph"]) 102 | ] 103 | 104 | var matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 105 | XCTAssertTrue(matchingElements.count == 1) 106 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the second paragraph.") 107 | 108 | // find matching elements by traversing the created html object 109 | nodeSelectorPath = [ 110 | ElementSelector().withTagName("html"), 111 | ElementSelector().withTagName("body"), 112 | ElementSelector().withTagName("p").withClassName("stylized-paragraph") 113 | 114 | //.withoutClassName("into-paragraph") 115 | ] 116 | 117 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 118 | XCTAssertTrue(matchingElements.count == 4) 119 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the first paragraph.") 120 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the second paragraph.") 121 | XCTAssertEqual(matchingElements[2].textNodes[0].text, "This is the third paragraph.") 122 | XCTAssertEqual(matchingElements[3].textNodes[0].text, "This is the fourth paragraph.") 123 | 124 | // find matching elements by traversing the created html object 125 | nodeSelectorPath = [ 126 | ElementSelector().withTagName("html"), 127 | 
ElementSelector().withTagName("body"), 128 | ElementSelector().withTagName("p") 129 | .withClassNamesExact(["stylized-paragraph"]) 130 | ] 131 | 132 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 133 | XCTAssertTrue(matchingElements.count == 1) 134 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the third paragraph.") 135 | 136 | 137 | // find matching elements by traversing the created html object 138 | nodeSelectorPath = [ 139 | ElementSelector().withTagName("html"), 140 | ElementSelector().withTagName("body"), 141 | ElementSelector().withTagName("p") 142 | .withoutClassNameAny(["into-paragraph"]) 143 | ] 144 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 145 | XCTAssertEqual(matchingElements.count, 3) 146 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the second paragraph.") 147 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the third paragraph.") 148 | XCTAssertEqual(matchingElements[2].textNodes[0].text, "This is the fourth paragraph.") 149 | } 150 | 151 | func testSelectPosition() { 152 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL? 153 | .appendingPathComponent("attributes-multiple-value-class.html") else { 154 | XCTFail("Could find get file URL to parse") 155 | return 156 | } 157 | 158 | var nodeTreeResult: [Node]? 
= nil 159 | do { 160 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL) 161 | } catch { 162 | XCTFail(error.localizedDescription) 163 | return 164 | } 165 | guard let nodeTree = nodeTreeResult else { 166 | XCTFail("nodeTreeResult was nil") 167 | return 168 | } 169 | 170 | // test position equal 171 | var nodeSelectorPath = [ 172 | ElementSelector().withTagName("html"), 173 | ElementSelector().withTagName("body"), 174 | ElementSelector().withTagName("p").atPosition(1) 175 | ] 176 | var matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 177 | XCTAssertTrue(matchingElements.count == 1) 178 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the second paragraph.") 179 | 180 | // test position greater than 181 | nodeSelectorPath = [ 182 | ElementSelector().withTagName("html"), 183 | ElementSelector().withTagName("body"), 184 | ElementSelector().withTagName("p").whenPositionIsGreaterThan(1) 185 | ] 186 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 187 | XCTAssertTrue(matchingElements.count == 2) 188 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the third paragraph.") 189 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the fourth paragraph.") 190 | 191 | // test position less than 192 | nodeSelectorPath = [ 193 | ElementSelector().withTagName("html"), 194 | ElementSelector().withTagName("body"), 195 | ElementSelector().withTagName("p").whenPositionIsLessThan(3) 196 | ] 197 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 198 | XCTAssertTrue(matchingElements.count == 3) 199 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the first paragraph.") 200 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the second paragraph.") 201 | XCTAssertEqual(matchingElements[2].textNodes[0].text, "This is the third paragraph.") 202 | } 203 | 204 | func 
testSelectInnerText() { 205 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL? 206 | .appendingPathComponent("attributes-multiple-value-class.html") else { 207 | XCTFail("Could find get file URL to parse") 208 | return 209 | } 210 | 211 | var nodeTreeResult: [Node]? = nil 212 | do { 213 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL) 214 | } catch { 215 | XCTFail(error.localizedDescription) 216 | return 217 | } 218 | guard let nodeTree = nodeTreeResult else { 219 | XCTFail("nodeTreeResult was nil") 220 | return 221 | } 222 | 223 | // find matching elements by traversing the created html object 224 | let nodeSelectorPath = [ 225 | ElementSelector().withTagName("html"), 226 | ElementSelector().withTagName("body"), 227 | ElementSelector().withTagName("p") 228 | .withChildTextNode(TextNodeSelector().withText("This is the second paragraph.")) 229 | ] 230 | let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 231 | XCTAssertEqual(matchingElements.count, 1) 232 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the second paragraph.") 233 | } 234 | 235 | func testSelectInnerComment() { 236 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL? 237 | .appendingPathComponent("attributes-multiple-value-class.html") else { 238 | XCTFail("Could find get file URL to parse") 239 | return 240 | } 241 | 242 | var nodeTreeResult: [Node]? 
= nil 243 | do { 244 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL) 245 | } catch { 246 | XCTFail(error.localizedDescription) 247 | return 248 | } 249 | guard let nodeTree = nodeTreeResult else { 250 | XCTFail("nodeTreeResult was nil") 251 | return 252 | } 253 | 254 | // find matching elements by traversing the created html object 255 | let nodeSelectorPath = [ 256 | ElementSelector().withTagName("html"), 257 | ElementSelector().withTagName("body"), 258 | ElementSelector().withTagName("p") 259 | .withChildCommentNode(CommentSelector().containingText("This is a comment")) 260 | ] 261 | let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath) 262 | XCTAssertEqual(matchingElements.count, 1) 263 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the fourth paragraph.") 264 | } 265 | 266 | } 267 | 268 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Sources/SwiftHTMLParser/Parser/Tags/KnownHTMLTags.swift: -------------------------------------------------------------------------------- 1 | // 2 | // tags.swift 3 | // HTMLParser 4 | // 5 | // Created by Reid Nantes on 2018-05-26. 6 | // Copyright © 2018 Reid Nantes. All rights reserved. 
7 | // 8 | 9 | import Foundation 10 | /// Describes one HTML tag: its `name` and an `isEmpty` flag (default false). NOTE(review): `isEmpty` presumably marks tags that take no content or closing tag; confirm against the parser. 11 | struct HTMLTag { 12 | let name: String 13 | let isEmpty: Bool 14 | 15 | init(name: String, isEmpty: Bool = false) { 16 | self.name = name 17 | self.isEmpty = isEmpty 18 | } 19 | } 20 | /// Identifiers for the known HTML tags; each case's raw value is its tag-name string. 21 | enum HTMLTagID: String { 22 | case a = "a" 23 | case abbr = "abbr" 24 | case address = "address" 25 | case area = "area" 26 | case article = "article" 27 | case aside = "aside" 28 | case audio = "audio" 29 | case b = "b" 30 | case base = "base" 31 | case bdi = "bdi" 32 | case bdo = "bdo" 33 | case blockquote = "blockquote" 34 | case body = "body" 35 | case br = "br" 36 | case button = "button" 37 | case canvas = "canvas" 38 | case caption = "caption" 39 | case cite = "cite" 40 | case code = "code" 41 | case col = "col" 42 | case colgroup = "colgroup" 43 | case data = "data" 44 | case datalist = "datalist" 45 | case dd = "dd" 46 | case del = "del" 47 | case details = "details" 48 | case dfn = "dfn" 49 | case dialog = "dialog" 50 | case div = "div" 51 | case dl = "dl" 52 | case dt = "dt" 53 | case element = "Element" // NOTE(review): "Element" is not an HTML tag name; possibly a scraped list heading, confirm 54 | case em = "em" 55 | case embed = "embed" 56 | case fieldset = "fieldset" 57 | case figcaption = "figcaption" 58 | case figure = "figure" 59 | case footer = "footer" 60 | case form = "form" 61 | case h1 = "h1" 62 | case h2 = "h2" 63 | case h3 = "h3" 64 | case h4 = "h4" 65 | case h5 = "h5" 66 | case h6 = "h6" 67 | case head = "head" 68 | case header = "header" 69 | case hgroup = "hgroup" 70 | case hr = "hr" 71 | case html = "html" 72 | case i = "i" 73 | case iframe = "iframe" 74 | case img = "img" 75 | case input = "input" 76 | case ins = "ins" 77 | case kbd = "kbd" 78 | case label = "label" 79 | case legend = "legend" 80 | case li = "li" 81 | case link = "link" 82 | case main = "main" 83 | case map = "map" 84 | case mark = "mark" 85 | case mathML = "MathML" // NOTE(review): "MathML" is not a tag name either, confirm 86 | case math = "math " // NOTE(review): raw value has a trailing space, so HTMLTagID(rawValue: "math") returns nil; confirm whether intended 87 | case menu = "menu" 88 | case meta = "meta" 89 | case meter = "meter" 90 | case nav = "nav" 91 | case noscript = "noscript" 92 | case 
object = "object" 93 | case ol = "ol" 94 | case optgroup = "optgroup" 95 | case option = "option" 96 | case output = "output" 97 | case p = "p" 98 | case param = "param" 99 | case picture = "picture" 100 | case pre = "pre" 101 | case progress = "progress" 102 | case q = "q" 103 | case rp = "rp" 104 | case rt = "rt" 105 | case ruby = "ruby" 106 | case s = "s" 107 | case samp = "samp" 108 | case script = "script" 109 | case section = "section" 110 | case select = "select" 111 | case slot = "slot" 112 | case small = "small" 113 | case source = "source" 114 | case span = "span" 115 | case strong = "strong" 116 | case style = "style" 117 | case sub = "sub" 118 | case summary = "summary" 119 | case sup = "sup" 120 | case svg = "svg" 121 | case table = "table" 122 | case tbody = "tbody" 123 | case td = "td" 124 | case template = "template" 125 | case textarea = "textarea" 126 | case tfoot = "tfoot" 127 | case th = "th" 128 | case thead = "thead" 129 | case time = "time" 130 | case title = "title" 131 | case tr = "tr" 132 | case track = "track" 133 | case u = "u" 134 | case ul = "ul" 135 | case `var` = "var" 136 | case video = "video" 137 | case wbr = "wbr" 138 | } 139 | 140 | let htmlTags: [HTMLTagID: HTMLTag] = [ 141 | .a: HTMLTag.init(name: "a"), 142 | .abbr: HTMLTag.init(name: "abbr"), 143 | .address: HTMLTag.init(name: "address"), 144 | .area: HTMLTag.init(name: "area", isEmpty: true), 145 | .article: HTMLTag.init(name: "article"), 146 | .aside: HTMLTag.init(name: "aside"), 147 | .audio: HTMLTag.init(name: "audio"), 148 | .b: HTMLTag.init(name: "b"), 149 | .base: HTMLTag.init(name: "base", isEmpty: true), 150 | .bdi: HTMLTag.init(name: "bdi"), 151 | .bdo: HTMLTag.init(name: "bdo"), 152 | .blockquote: HTMLTag.init(name: "blockquote"), 153 | .body: HTMLTag.init(name: "body"), 154 | .br: HTMLTag.init(name: "br", isEmpty: true), 155 | .button: HTMLTag.init(name: "button"), 156 | .canvas: HTMLTag.init(name: "canvas"), 157 | .caption: HTMLTag.init(name: "caption"), 158 | 
.cite: HTMLTag.init(name: "cite"), 159 | .code: HTMLTag.init(name: "code"), 160 | .col: HTMLTag.init(name: "col", isEmpty: true), 161 | .colgroup: HTMLTag.init(name: "colgroup"), 162 | .data: HTMLTag.init(name: "data"), 163 | .datalist: HTMLTag.init(name: "datalist"), 164 | .dd: HTMLTag.init(name: "dd"), 165 | .del: HTMLTag.init(name: "del"), 166 | .details: HTMLTag.init(name: "details"), 167 | .dfn: HTMLTag.init(name: "dfn"), 168 | .dialog: HTMLTag.init(name: "dialog"), 169 | .div: HTMLTag.init(name: "div"), 170 | .dl: HTMLTag.init(name: "dl"), 171 | .dt: HTMLTag.init(name: "dt"), 172 | .element: HTMLTag.init(name: "Element"), 173 | .em: HTMLTag.init(name: "em"), 174 | .embed: HTMLTag.init(name: "embed", isEmpty: true), 175 | .fieldset: HTMLTag.init(name: "fieldset"), 176 | .figcaption: HTMLTag.init(name: "figcaption"), 177 | .figure: HTMLTag.init(name: "figure"), 178 | .footer: HTMLTag.init(name: "footer"), 179 | .form: HTMLTag.init(name: "form"), 180 | .h1: HTMLTag.init(name: "h1"), 181 | .h2: HTMLTag.init(name: "h2"), 182 | .h3: HTMLTag.init(name: "h3"), 183 | .h4: HTMLTag.init(name: "h4"), 184 | .h5: HTMLTag.init(name: "h5"), 185 | .h6: HTMLTag.init(name: "h6"), 186 | .head: HTMLTag.init(name: "head"), 187 | .header: HTMLTag.init(name: "header"), 188 | .hgroup: HTMLTag.init(name: "hgroup"), 189 | .hr: HTMLTag.init(name: "hr", isEmpty: true), 190 | .html: HTMLTag.init(name: "html"), 191 | .i: HTMLTag.init(name: "i"), 192 | .iframe: HTMLTag.init(name: "iframe", isEmpty: true), 193 | .img: HTMLTag.init(name: "img", isEmpty: true), 194 | .input: HTMLTag.init(name: "input", isEmpty: true), 195 | .ins: HTMLTag.init(name: "ins"), 196 | .kbd: HTMLTag.init(name: "kbd"), 197 | .label: HTMLTag.init(name: "label"), 198 | .legend: HTMLTag.init(name: "legend"), 199 | .li: HTMLTag.init(name: "li"), 200 | .link: HTMLTag.init(name: "link", isEmpty: true), 201 | .main: HTMLTag.init(name: "main"), 202 | .map: HTMLTag.init(name: "map"), 203 | .mark: HTMLTag.init(name: "mark"), 
204 | .mathML: HTMLTag.init(name: "MathML"), 205 | .math: HTMLTag.init(name: "math "), /* NOTE(review): name has a trailing space, matching the enum raw value; confirm whether intended */ 206 | .menu: HTMLTag.init(name: "menu"), 207 | .meta: HTMLTag.init(name: "meta", isEmpty: true), 208 | .meter: HTMLTag.init(name: "meter"), 209 | .nav: HTMLTag.init(name: "nav"), 210 | .noscript: HTMLTag.init(name: "noscript"), 211 | .object: HTMLTag.init(name: "object"), 212 | .ol: HTMLTag.init(name: "ol"), 213 | .optgroup: HTMLTag.init(name: "optgroup"), 214 | .option: HTMLTag.init(name: "option"), 215 | .output: HTMLTag.init(name: "output"), 216 | .p: HTMLTag.init(name: "p"), 217 | .param: HTMLTag.init(name: "param", isEmpty: true), 218 | .picture: HTMLTag.init(name: "picture"), 219 | .pre: HTMLTag.init(name: "pre"), 220 | .progress: HTMLTag.init(name: "progress"), 221 | .q: HTMLTag.init(name: "q"), 222 | .rp: HTMLTag.init(name: "rp"), 223 | .rt: HTMLTag.init(name: "rt"), 224 | .ruby: HTMLTag.init(name: "ruby"), 225 | .s: HTMLTag.init(name: "s"), 226 | .samp: HTMLTag.init(name: "samp"), 227 | .script: HTMLTag.init(name: "script"), 228 | .section: HTMLTag.init(name: "section"), 229 | .select: HTMLTag.init(name: "select"), 230 | .slot: HTMLTag.init(name: "slot"), 231 | .small: HTMLTag.init(name: "small"), 232 | .source: HTMLTag.init(name: "source", isEmpty: true), 233 | .span: HTMLTag.init(name: "span"), 234 | .strong: HTMLTag.init(name: "strong"), 235 | .style: HTMLTag.init(name: "style"), 236 | .sub: HTMLTag.init(name: "sub"), 237 | .summary: HTMLTag.init(name: "summary"), 238 | .sup: HTMLTag.init(name: "sup"), 239 | .svg: HTMLTag.init(name: "svg"), 240 | .table: HTMLTag.init(name: "table"), 241 | .tbody: HTMLTag.init(name: "tbody"), 242 | .td: HTMLTag.init(name: "td"), 243 | .template: HTMLTag.init(name: "template", isEmpty: true), 244 | .textarea: HTMLTag.init(name: "textarea"), 245 | .tfoot: HTMLTag.init(name: "tfoot"), 246 | .th: HTMLTag.init(name: "th"), 247 | .thead: HTMLTag.init(name: "thead"), 248 | .time: HTMLTag.init(name: "time"), 249 | .title: HTMLTag.init(name: 
"title"), 250 | .tr: HTMLTag.init(name: "tr"), 251 | .track: HTMLTag.init(name: "track", isEmpty: true), 252 | .u: HTMLTag.init(name: "u"), 253 | .ul: HTMLTag.init(name: "ul"), 254 | .var: HTMLTag.init(name: "var"), 255 | .video: HTMLTag.init(name: "video"), 256 | .wbr: HTMLTag.init(name: "wbr", isEmpty: true) 257 | ] 258 | /** Ids of the tags treated as self-closing; this is the same set of ids that `htmlTags` marks with `isEmpty: true`. NOTE(review): `iframe` and `template` are not void elements in the HTML Standard (both take end tags); confirm they belong in this list. */ let selfClosingHTMLTags: [HTMLTagID] = [ 259 | .area, 260 | .base, 261 | .br, 262 | .col, 263 | .embed, 264 | .hr, 265 | .iframe, 266 | .img, 267 | .input, 268 | .link, 269 | .meta, 270 | .param, 271 | .source, 272 | .template, 273 | .track, 274 | .wbr 275 | ] 276 | 277 | let allHTMLTagNames = [ 278 | "a", 279 | "abbr", 280 | "address", 281 | "area", 282 | "article", 283 | "aside", 284 | "audio", 285 | "b", 286 | "base", 287 | "bdi", 288 | "bdo", 289 | "blockquote", 290 | "body", 291 | "br", 292 | "button", 293 | "canvas", 294 | "caption", 295 | "cite", 296 | "code", 297 | "col", 298 | "colgroup", 299 | "data", 300 | "datalist", 301 | "dd", 302 | "del", 303 | "details", 304 | "dfn", 305 | "dialog", 306 | "div", 307 | "dl", 308 | "dt", 309 | "em", 310 | "embed", 311 | "fieldset", 312 | "figcaption", 313 | "figure", 314 | "footer", 315 | "form", 316 | "h1,", 317 | "h2,", 318 | "h3,", 319 | "h4,", 320 | "h5,", 321 | "h6", 322 | "head", 323 | "header", 324 | "hgroup", 325 | "hr", 326 | "html", 327 | "i", 328 | "iframe", 329 | "img", 330 | "input", 331 | "ins", 332 | "kbd", 333 | "label", 334 | "legend", 335 | "li", 336 | "link", 337 | "main", 338 | "map", 339 | "mark", 340 | "MathMLmath", 341 | "menu", 342 | "meta", 343 | "meter", 344 | "nav", 345 | "noscript", 346 | "object", 347 | "ol", 348 | "optgroup", 349 | "option", 350 | "output", 351 | "p", 352 | "param", 353 | "picture", 354 | "pre", 355 | "progress", 356 | "q", 357 | "rp", 358 | "rt", 359 | "ruby", 360 | "s", 361 | "samp", 362 | "script", 363 | "section", 364 | "select", 365 | "slot", 366 | "small", 367 | "source", 368 | "span", 369 | "strong", 370 | "style", 371 | "sub", 372 | "summary", 
373 | "sup", 374 | "SVG", 375 | "svg", 376 | "table", 377 | "tbody", 378 | "td", 379 | "template", 380 | "textarea", 381 | "tfoot", 382 | "th", 383 | "thead", 384 | "time", 385 | "title", 386 | "tr", 387 | "track", 388 | "u", 389 | "ul", 390 | "var", 391 | "video", 392 | "wbr" 393 | ] 394 | 395 | // elements with no end tag 396 | // reference: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element 397 | let emptyElementTagNames = [ 398 | "area", 399 | "base", 400 | "br", 401 | "col", 402 | "embed", 403 | "hr", 404 | "iframe", 405 | "img", 406 | "input", 407 | "link", 408 | "meta", 409 | "param", 410 | "source", 411 | "template", 412 | "track", 413 | "wbr" 414 | ] 415 | 416 | let ignoredTags: [String] = [ 417 | "svg", 418 | "script" 419 | ] 420 | -------------------------------------------------------------------------------- /Tests/SwiftHTMLParserTests/ElementTests.swift: -------------------------------------------------------------------------------- 1 | import XCTest 2 | @testable import SwiftHTMLParser 3 | import TestFiles 4 | 5 | final class SwiftHTMLParserTests: XCTestCase { 6 | 7 | func testOpenFile() { 8 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 9 | .appendingPathComponent("elements-simple.html") else { 10 | XCTFail("Could find get file URL to parse") 11 | return 12 | } 13 | 14 | // get html string from file 15 | var htmlStringResult: String? = nil 16 | do { 17 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 18 | } catch { 19 | XCTFail("Could not open file URL: \(fileURL)") 20 | return 21 | } 22 | guard let htmlString = htmlStringResult else { 23 | XCTFail("Could not open file URL: \(fileURL)") 24 | return 25 | } 26 | 27 | XCTAssertTrue(htmlString.count > 100) 28 | XCTAssertTrue(htmlString.hasPrefix("")) 29 | XCTAssertTrue(htmlString.contains("")) 30 | XCTAssertTrue(htmlString.contains("Test Simple Title")) 31 | XCTAssertTrue(htmlString.contains("

This is a Heading

")) 32 | XCTAssertTrue(htmlString.contains("")) 33 | } 34 | 35 | func testSimple() { 36 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 37 | .appendingPathComponent("elements-simple.html") else { 38 | XCTFail("Could find get file URL to parse") 39 | return 40 | } 41 | 42 | // get html string from file 43 | var htmlStringResult: String? = nil 44 | do { 45 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 46 | } catch { 47 | XCTFail("Could not open file URL: \(fileURL)") 48 | return 49 | } 50 | guard let htmlString = htmlStringResult else { 51 | XCTFail("Could not open file URL: \(fileURL)") 52 | return 53 | } 54 | 55 | // create object from raw html file 56 | guard let nodeArray = try? HTMLParser.parse(htmlString) else { 57 | XCTFail("Could not parse HTML") 58 | return 59 | } 60 | 61 | XCTAssertEqual(nodeArray.count, 2) 62 | 63 | // find matching elements by traversing the created html object 64 | var nodeSelectorPath = [ 65 | ElementSelector().withTagName("html"), 66 | ElementSelector().withTagName("head"), 67 | ElementSelector().withTagName("title") 68 | ] 69 | 70 | var matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath) 71 | 72 | XCTAssertEqual(matchingElements.count, 1) 73 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "Test Simple Title") 74 | 75 | nodeSelectorPath = [ 76 | ElementSelector().withTagName("html"), 77 | ElementSelector().withTagName("body"), 78 | ElementSelector().withTagName("p") 79 | ] 80 | 81 | matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath) 82 | 83 | XCTAssertEqual(matchingElements.count, 3) 84 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the second paragraph.") 85 | } 86 | 87 | func testQuotes() { 88 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 
89 | .appendingPathComponent("elements-quotes.html") else { 90 | XCTFail("Could find get file URL to parse") 91 | return 92 | } 93 | 94 | // get html string from file 95 | var htmlStringResult: String? = nil 96 | do { 97 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 98 | } catch { 99 | XCTFail("Could not open file URL: \(fileURL)") 100 | return 101 | } 102 | guard let htmlString = htmlStringResult else { 103 | XCTFail("Could not open file URL: \(fileURL)") 104 | return 105 | } 106 | 107 | // create object from raw html file 108 | guard let elementArray = try? HTMLParser.parse(htmlString) else { 109 | XCTFail("Could not parse HTML") 110 | return 111 | } 112 | 113 | // find matching elements by traversing the created html object 114 | let nodeSelectorPath = [ 115 | ElementSelector().withTagName("html"), 116 | ElementSelector().withTagName("body"), 117 | ElementSelector().withTagName("p") 118 | ] 119 | 120 | let matchingElements = HTMLTraverser.findElements(in: elementArray, 121 | matching: nodeSelectorPath) 122 | 123 | 124 | XCTAssertEqual(matchingElements.count, 4) 125 | XCTAssertEqual(matchingElements[0].textNodes.first!.text, "'John \"ShotGun\" Nelson'") 126 | XCTAssertEqual(matchingElements[1].textNodes.first!.text, "\"John 'ShotGun' Nelson\"") 127 | XCTAssertEqual(matchingElements[2].textNodes.first!.text, "It's alright") 128 | XCTAssertEqual(matchingElements[3].textNodes.first!.text, "I love the \" (double Quote) character") 129 | } 130 | 131 | func testClosingEmptyTag() { 132 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 133 | .appendingPathComponent("empty-element.html") else { 134 | XCTFail("Could find get file URL to parse") 135 | return 136 | } 137 | 138 | // get html string from file 139 | var htmlStringResult: String? 
= nil 140 | do { 141 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 142 | } catch { 143 | XCTFail("Could not open file URL: \(fileURL)") 144 | return 145 | } 146 | guard let htmlString = htmlStringResult else { 147 | XCTFail("Could not open file URL: \(fileURL)") 148 | return 149 | } 150 | 151 | // create object from raw html file 152 | guard let nodeArray = try? HTMLParser.parse(htmlString) else { 153 | XCTFail("Could not parse HTML") 154 | return 155 | } 156 | 157 | // find matching elements by traversing the created html object 158 | let nodeSelectorPath = [ 159 | ElementSelector().withTagName("html"), 160 | ElementSelector().withTagName("body"), 161 | ElementSelector().withTagName("form") 162 | ] 163 | 164 | let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath) 165 | 166 | XCTAssertEqual(matchingElements.count, 1) 167 | XCTAssertEqual(matchingElements[0].childElements.count, 1) 168 | } 169 | 170 | func testElementNameOnNewLine() { 171 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 172 | .appendingPathComponent("element-name-on-new-line.html") else { 173 | XCTFail("Could find get file URL to parse") 174 | return 175 | } 176 | 177 | // get html string from file 178 | var htmlStringResult: String? = nil 179 | do { 180 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 181 | } catch { 182 | XCTFail("Could not open file URL: \(fileURL)") 183 | return 184 | } 185 | guard let htmlString = htmlStringResult else { 186 | XCTFail("Could not open file URL: \(fileURL)") 187 | return 188 | } 189 | 190 | // create object from raw html file 191 | guard let nodeArray = try? 
HTMLParser.parse(htmlString) else { 192 | XCTFail("Could not parse HTML") 193 | return 194 | } 195 | 196 | // find matching elements by traversing the created html object 197 | let nodeSelectorPath = [ 198 | ElementSelector().withTagName("html"), 199 | ElementSelector().withTagName("body"), 200 | ElementSelector().withTagName("div") 201 | ] 202 | 203 | let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath) 204 | 205 | XCTAssertEqual(matchingElements.count, 1) 206 | XCTAssertEqual(matchingElements.first?.tagName, "div") 207 | XCTAssertEqual(matchingElements.first?.attributeValue(for: "name"), "bob") 208 | XCTAssertEqual(matchingElements.first?.attributeValue(for: "type"), "email") 209 | } 210 | 211 | func testElementUnclosedEndTag() { 212 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 213 | .appendingPathComponent("element-unclosed-end-tag.html") else { 214 | XCTFail("Could find get file URL to parse") 215 | return 216 | } 217 | 218 | // get html string from file 219 | var htmlStringResult: String? = nil 220 | do { 221 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 222 | } catch { 223 | XCTFail("Could not open file URL: \(fileURL)") 224 | return 225 | } 226 | guard let htmlString = htmlStringResult else { 227 | XCTFail("Could not open file URL: \(fileURL)") 228 | return 229 | } 230 | 231 | // create object from raw html file 232 | guard let nodeArray = try? 
HTMLParser.parse(htmlString) else { 233 | XCTFail("Could not parse HTML") 234 | return 235 | } 236 | 237 | // find matching elements by traversing the created html object 238 | let nodeSelectorPath = [ 239 | ElementSelector().withTagName("html"), 240 | ElementSelector().withTagName("body"), 241 | ElementSelector().withTagName("div") 242 | ] 243 | 244 | let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath) 245 | 246 | XCTAssertEqual(matchingElements.count, 1) 247 | XCTAssertEqual(matchingElements.first?.tagName, "div") 248 | XCTAssertEqual(matchingElements.first?.childElements.count, 1) 249 | } 250 | 251 | func testElementStrayEndTag() { 252 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 253 | .appendingPathComponent("elemnent-stray-end-tag.html") else { 254 | XCTFail("Could find get file URL to parse") 255 | return 256 | } 257 | 258 | // get html string from file 259 | var htmlStringResult: String? = nil 260 | do { 261 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 262 | } catch { 263 | XCTFail("Could not open file URL: \(fileURL)") 264 | return 265 | } 266 | guard let htmlString = htmlStringResult else { 267 | XCTFail("Could not open file URL: \(fileURL)") 268 | return 269 | } 270 | 271 | // create object from raw html file 272 | guard let nodeArray = try? 
HTMLParser.parse(htmlString) else { 273 | XCTFail("Could not parse HTML") 274 | return 275 | } 276 | 277 | // find matching elements by traversing the created html object 278 | let nodeSelectorPath = [ 279 | ElementSelector().withTagName("html"), 280 | ElementSelector().withTagName("body"), 281 | ElementSelector().withTagName("div") 282 | ] 283 | 284 | let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath) 285 | 286 | XCTAssertEqual(matchingElements.count, 1) 287 | XCTAssertEqual(matchingElements.first?.tagName, "div") 288 | XCTAssertEqual(matchingElements.first?.childElements.count, 1) 289 | } 290 | 291 | func testElementStrayHTMLEndTag() { 292 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL? 293 | .appendingPathComponent("elemnent-stray-end-html-tag.html") else { 294 | XCTFail("Could find get file URL to parse") 295 | return 296 | } 297 | 298 | // get html string from file 299 | var htmlStringResult: String? = nil 300 | do { 301 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8) 302 | } catch { 303 | XCTFail("Could not open file URL: \(fileURL)") 304 | return 305 | } 306 | guard let htmlString = htmlStringResult else { 307 | XCTFail("Could not open file URL: \(fileURL)") 308 | return 309 | } 310 | 311 | // create object from raw html file 312 | guard let nodeArray = try? 
HTMLParser.parse(htmlString) else { 313 | XCTFail("Could not parse HTML") 314 | return 315 | } 316 | 317 | // find matching elements by traversing the created html object 318 | let nodeSelectorPath = [ 319 | ElementSelector().withTagName("html"), 320 | ElementSelector().withTagName("body"), 321 | ElementSelector().withTagName("div") 322 | ] 323 | 324 | let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath) 325 | 326 | XCTAssertEqual(matchingElements.count, 1) 327 | XCTAssertEqual(matchingElements.first?.tagName, "div") 328 | XCTAssertEqual(matchingElements.first?.childElements.count, 1) 329 | } 330 | } 331 | --------------------------------------------------------------------------------