├── .gitignore ├── Dockerfile ├── LICENSE ├── Package.swift ├── README.md ├── Sources └── SwiftHTMLParser │ ├── Extensions │ ├── Appendable.swift │ ├── CharacterExtensions.swift │ └── StringExtensions.swift │ ├── Helpers │ ├── RegexError.swift │ └── RegexHelper.swift │ ├── Parser │ ├── AttributeParser.swift │ ├── CDATAParser.swift │ ├── CommentParser.swift │ ├── ElementParser.swift │ ├── HTMLParser.swift │ ├── LookaheadValidator.swift │ ├── Models │ │ ├── Attribute.swift │ │ ├── Nodes │ │ │ ├── CData.swift │ │ │ ├── Comment.swift │ │ │ ├── DocumentTypeNode.swift │ │ │ ├── Element.swift │ │ │ ├── Node.swift │ │ │ ├── NodeType.swift │ │ │ └── TextNode.swift │ │ └── Tag.swift │ ├── ParseError.swift │ ├── ParseFormat.swift │ ├── ScriptParser.swift │ ├── TagParser.swift │ └── Tags │ │ ├── KnownHTMLTags.swift │ │ ├── SVGTags.swift │ │ └── XMLTags.swift │ ├── ProjectConfig.swift │ └── Traverser │ ├── HTMLTraverser.swift │ └── Selectors │ ├── AttributeSelector.swift │ ├── ClassSelector.swift │ ├── IntSelector.swift │ ├── NodeSelector.swift │ ├── NodeSelectors │ ├── CDataSelector.swift │ ├── CommentSelector.swift │ ├── ElementSelector.swift │ └── TextNodeSelector.swift │ ├── SelectorBuilders │ ├── IdStringSelectorBuilder.swift │ ├── PositionIntSelectorBuilder.swift │ ├── TagNameStringSelectorBuilder.swift │ ├── TextStringSelectorBuilder.swift │ └── ValueStringSelectorBuilder.swift │ └── StringSelector.swift └── Tests ├── SwiftHTMLParserTests ├── AppendableTests.swift ├── AttributeParserTests.swift ├── CommentParserTests.swift ├── DocumentationTests.swift ├── ElementTests.swift ├── ElementTraverserTests.swift ├── JavascriptParserTests.swift ├── PerformanceTests.swift ├── RealWorldTests.swift ├── SVGParserTests.swift └── TestHelper.swift └── TestFiles ├── Mock ├── Attributes │ ├── attributes-multiple-value-class.html │ ├── attributes-quotes.html │ ├── attributes-simple.html │ └── attributes-tabs.html ├── Comments │ ├── comments.html │ ├── conditional-comments-salvageable.html 
│ └── declarations.html ├── Documentation │ └── simple.html ├── Elements │ ├── element-name-on-new-line.html │ ├── element-unclosed-end-tag.html │ ├── elements-quotes.html │ ├── elements-simple.html │ ├── elemnent-stray-end-html-tag.html │ ├── elemnent-stray-end-tag.html │ └── empty-element.html ├── Javascript │ ├── javascript-comments.html │ ├── javascript-quotes-with-escape-characters.html │ ├── javascript-quotes.html │ └── javascript-simple.html ├── Performance │ ├── deep.html │ └── long.txt └── SVG │ └── svg-simple.html ├── RealWorld ├── amazon-home-page.html ├── apple-home-page.html ├── cnn-home-page.html ├── digitalocean-home-page.html ├── espn-home-page.html ├── facebook-home-page.html ├── google-home-page.html ├── linkedin-home-page.html ├── medium-home-page.html ├── reddit-home-page.html ├── weather-forcast.html ├── weather-forcast.xml ├── weather-hourly.html ├── weather-radar-2.html ├── weather-radar.html ├── wikipedia-home-page.html └── youtube-trending.html └── TestFileURLs.swift /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /.build 3 | /Packages 4 | /*.xcodeproj 5 | /.swiftpm 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 1 2 | FROM vapor/swift:5.1-bionic 3 | # 2 4 | WORKDIR /package 5 | # 3 6 | COPY . ./ 7 | # 4 8 | RUN swift package resolve 9 | RUN swift package clean 10 | # 5 11 | #RUN swift test --enable-test-discovery 12 | CMD ["swift", "test", "--enable-test-discovery"] 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version:5.6 2 | 3 | import PackageDescription 4 | 5 | let package = Package( 6 | name: "swift-html-parser", 7 | products: [ 8 | // Products define the executables and libraries produced by a package, and make them visible to other packages. 9 | .library( 10 | name: "SwiftHTMLParser", 11 | targets: ["SwiftHTMLParser"]), 12 | ], 13 | dependencies: [ 14 | // Dependencies declare other packages that this package depends on. 15 | ], 16 | targets: [ 17 | // Targets are the basic building blocks of a package. A target can define a module or a test suite. 18 | // Targets can depend on other targets in this package, and on products in packages which this package depends on. 
19 | .target( 20 | name: "SwiftHTMLParser", 21 | dependencies: []), 22 | .target( 23 | name: "TestFiles", 24 | dependencies: [], 25 | path: "Tests/TestFiles", 26 | resources: [.copy("Mock"),.copy("RealWorld")] 27 | ), 28 | .testTarget( 29 | name: "SwiftHTMLParserTests", 30 | dependencies: ["SwiftHTMLParser", "TestFiles"]), 31 | ] 32 | ) 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SwiftHTMLParser 2 | SwiftHTMLParser is a library for parsing and traversing HTML and XML written in Swift. It parses plaintext HTML or XML into an object tree (DOM), and allows for the easy traversal and searching of the tree's nodes, similar to an HTML Selector or XPath. 3 | 4 | ## Installation 5 | To depend on SwiftHTMLParser in your own project, add it to the `dependencies` clause in your `Package.swift` file: 6 | ```swift 7 | dependencies: [ 8 | .package(url: "https://github.com/rnantes/swift-html-parser.git", from: "1.0.0") 9 | ] 10 | ``` 11 | 12 | ## Basic Structure 13 | Object naming is based on the [HTML Standard](https://html.spec.whatwg.org/dev/syntax.html#syntax). There are also easy to follow introductions available from [w3schools](https://www.w3schools.com/html/default.asp) and [w3](https://www.w3.org/TR/html53/introduction.html#a-quick-introduction-to-html). 14 | 15 | * `Node`, a protocol: - Consists of a start and closing `Tag`. (Closing tags may be omitted in some special cases) 16 | * `Tag`, a struct: - contains the tag's name, the opening tag contains any of the node's `Attribute`s 17 | * `Attribute`, a struct: - consists of a name and an associated value 18 | 19 | #### Nodes 20 | * `Element`, a struct: - a Node that may contain nested nodes. 21 | * `TextNode`, a struct:- a Node that represents a block of text. 22 | * `Comment`, a struct: - a Node that represents a single or multi-line comment within an element.
23 | * `CData`, a struct: - a Node that represents a CData section and its associated text. 24 | * `DocumentTypeNode`, a struct: - a Node which provides metadata on how to parse the document 25 | 26 | ## Using the API 27 | 28 | #### Read in Plaintext HTML from a File 29 | ```swift 30 | let fileURL = URL.init(fileURLWithPath: "/some/absolute/path/simple.html")! 31 | ``` 32 | 33 | #### Parse the HTML String Into a Tree of Node Objects (DOM) 34 | ```swift 35 | let nodeTree = try HTMLParser.parse(htmlString) 36 | ``` 37 | Alternatively, to parse an XML file 38 | ```swift 39 | let nodeTree = try XMLParser.parse(xmlString) 40 | ``` 41 | 42 | #### Create a Node Selector Path Then Traverse the Node Tree to Find Matching Nodes 43 | Element, Text, Comment, and CData selectors are available 44 | ```swift 45 | // create a node selector path to describe what nodes to match in the nodeTree 46 | let nodeSelectorPath: [NodeSelector] = [ 47 | ElementSelector().withTagName("html"), 48 | ElementSelector().withTagName("body"), 49 | ElementSelector().withTagName("div").withClassName("essay"), 50 | ElementSelector().withTagName("p").atPosition(0) 51 | ] 52 | 53 | // find the nodes that match the nodeSelectorPath 54 | let matchingNodes = HTMLTraverser.findNodes(in: nodeTree, matching: nodeSelectorPath) 55 | ``` 56 | 57 | ## Tutorial 58 | 59 | #### The HTML File We Will Use for The Following Examples 60 | We will use the example file: simple.html 61 | ```HTML 62 | 63 | 64 |
65 |This is the first paragraph.
72 |This is the second paragraph.
73 |This is the third paragraph.
74 |This is the fourth paragraph.
75 |This is the fifth paragraph.
76 | 77 | elements with the className 'body-paragraph'
123 | // will print: 3
124 | print(matchingElements.count)
125 | }
126 | ```
127 |
128 | #### Find a Matching Text Node
129 | ```swift
130 | func parseAndTraverseSimpleHTMLTextNode() throws {
131 | // get string from file
132 | let fileURL = URL.init(fileURLWithPath: "/some/absolute/path/simple.html")!
133 | let htmlString = try String(contentsOf: fileURL, encoding: .utf8)
134 |
135 | // parse the htmlString into a tree of node objects (DOM)
136 | let nodeTree = try HTMLParser.parse(htmlString)
137 |
138 | // create a node selector path to describe what nodes to match in the nodeTree
139 | // this is equivalent to the selector: html > body > div.bibliography > ul > li#citation-1999 (followed by its text node)
140 | let nodeSelectorPath: [NodeSelector] = [
141 | ElementSelector().withTagName("html"),
142 | ElementSelector().withTagName("body"),
143 | ElementSelector().withTagName("div").withClassName("bibliography"),
144 | ElementSelector().withTagName("ul"),
145 | ElementSelector().withTagName("li").withId("citation-1999"),
146 | TextNodeSelector()
147 | ]
148 |
149 | // find the nodes that match the nodeSelectorPath
150 | // Notice we use the findNodes() function which can match with any node type
151 | let matchingNodes = HTMLTraverser.findNodes(in: nodeTree, matching: nodeSelectorPath)
152 |
153 | // matchingNodes will contain the matching generic node
154 | // we have to cast the Node to a TextNode to access its text property
155 | guard let paragraphTextNode = matchingNodes.first as? TextNode else {
156 | // could not find paragraph text node
157 | return
158 | }
159 |
160 | // will print: This is the second citation.
161 | print(paragraphTextNode.text)
162 | }
163 | ```
164 |
165 | #### Find Matching Elements Using a Child Node Selector Path
166 | ```swift
167 | func parseAndTraverseSimpleHTMLChildNodeSelectorPath() throws {
168 | // get string from file
169 | let fileURL = URL.init(fileURLWithPath: "/some/absolute/path/simple.html")!
170 | let htmlString = try String(contentsOf: fileURL, encoding: .utf8)
171 |
172 | // parse the htmlString into a tree of node objects (DOM)
173 | let nodeTree = try HTMLParser.parse(htmlString)
174 |
175 | // create a child node selector path that will match the parent node
176 | // only if the childNodeSelectorPath matches the element's child nodes
177 | let childNodeSelectorPath: [NodeSelector] = [
178 | ElementSelector().withTagName("div"),
179 | ElementSelector().withTagName("h3"),
180 | TextNodeSelector().withText("Editor Notes")
181 | ]
182 |
183 | // create a node selector path to describe what nodes to match in the nodeTree
184 | // Notice the last ElementSelector will only match if the element contains
185 | // child nodes that match the childNodeSelectorPath
186 | let nodeSelectorPath: [NodeSelector] = [
187 | ElementSelector().withTagName("html"),
188 | ElementSelector().withTagName("body"),
189 | ElementSelector().withTagName("div").withChildNodeSelectorPath(childNodeSelectorPath)
190 | ]
191 |
192 | // find the nodes that match the nodeSelectorPath
193 | // Notice we use the findElements() function here, so the results can be used as elements without casting
194 | let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
195 |
196 | // matchingElements should only contain the div element with the 'essay' class name
197 | // will print: 1
198 | print(matchingElements.count)
199 |
200 | guard let divElement = matchingElements.first else {
201 | // could not find the matching div element
202 | XCTFail("could not find paragraph text node")
203 | return
204 | }
205 |
206 | guard let firstClassName = divElement.classNames.first else {
207 | // divElement does not have any classnames
208 | return
209 | }
210 |
211 | // will print: essay
212 | print(firstClassName)
213 | }
214 | ```
215 |
216 | ## Testing
217 | Automated testing was used to validate the parsing of tags, comments, single- and double-quoted attributes, embedded JavaScript, etc. Specially created sample HTML files as well as HTML from top sites were used in testing. However, not all cases may have been covered. Please open an issue on GitHub and provide sample HTML if you discover a bug so it can be fixed and a test case can be added.
218 |
219 |
220 | #### Run Tests Via the Command Line
221 | `swift test`
222 |
223 | #### Run Tests Via Docker
224 | `docker build -t swift-html-parser . && docker run -it swift-html-parser`
225 |
226 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Extensions/Appendable.swift:
--------------------------------------------------------------------------------
1 | //
2 | // Appendable.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-25.
6 | //
7 |
8 | import Foundation
9 |
10 | protocol Insertable: Collection {
11 | init()
12 | mutating func append(_ newElement: Element)
13 | mutating func append
62 | nodeSelectorPath = [
63 | ElementSelector().withTagName("html"),
64 | ElementSelector().withTagName("body"),
65 | ElementSelector().withTagName("div")
66 | ]
67 |
68 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
69 |
70 | XCTAssertEqual(matchingElements.first!.attributeValue(for: "emptyAtrribute")!, "")
71 |
72 |
73 | nodeSelectorPath = [
74 | ElementSelector().withTagName("html"),
75 | ElementSelector().withTagName("body"),
76 | ElementSelector().withTagName("form"),
77 | ElementSelector().withTagName("input")
78 | ]
79 |
80 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
81 |
82 | XCTAssertEqual(matchingElements.count, 1)
83 |
84 | // test attribute with name but no value
85 | XCTAssertEqual(matchingElements[0].containsAttribute("disabled"), true)
86 | XCTAssertEqual(matchingElements[0].attributeValue(for: "disabled"), nil)
87 | }
88 |
89 |     /// Checks that quotes nested inside an attribute value survive parsing.
90 |     ///
91 |     /// Parses `attributes-quotes.html` and inspects the `title` of each `<p>` matched by html > body > p.
92 |     func testAttributesQuotes() {
93 |         guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
94 |             .appendingPathComponent("attributes-quotes.html") else {
95 |             XCTFail("Could not get file URL to parse")
96 |             return
97 |         }
98 |
99 |         // get html string from file; a read failure is reported once and ends the test
100 |         guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
101 |             XCTFail("Could not open file URL: \(fileURL)")
102 |             return
103 |         }
104 |
105 |         // create object from raw html file
106 |         guard let elementArray = try? HTMLParser.parse(htmlString) else {
107 |             XCTFail("Could not parse HTML")
108 |             return
109 |         }
110 |
111 |         // find matching elements by traversing the created html object
112 |         let nodeSelectorPath = [
113 |             ElementSelector().withTagName("html"),
114 |             ElementSelector().withTagName("body"),
115 |             ElementSelector().withTagName("p")
116 |         ]
117 |
118 |         let matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath)
119 |
120 |         // stop here on a count mismatch so the subscripts below cannot trap
121 |         guard matchingElements.count == 2 else {
122 |             XCTFail("Expected 2 matching elements but found \(matchingElements.count)")
123 |             return
124 |         }
125 |
126 |         // test attribute with double quotes within single quotes
127 |         XCTAssertEqual(matchingElements[0].openingTag.attributes.count, 2)
128 |         XCTAssertEqual(matchingElements[0].attributeValue(for: "title"), "John \"ShotGun\" Nelson")
129 |
130 |         // test attribute with single quotes within double quotes
131 |         XCTAssertEqual(matchingElements[1].openingTag.attributes.count, 2)
132 |         XCTAssertEqual(matchingElements[1].attributeValue(for: "title"), "John 'ShotGun' Nelson")
133 |     }
134 |
135 |     /// Checks attribute parsing for the `<img>` element in `attributes-tabs.html`.
136 |     ///
137 |     /// Matches html > body > img and compares its `height`, `width`, `src` and `alt` values.
138 |     func testAttributesTabs() {
139 |         guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
140 |             .appendingPathComponent("attributes-tabs.html") else {
141 |             XCTFail("Could not get file URL to parse")
142 |             return
143 |         }
144 |
145 |         // get html string from file; a read failure is reported once and ends the test
146 |         guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
147 |             XCTFail("Could not open file URL: \(fileURL)")
148 |             return
149 |         }
150 |
151 |         // create object from raw html file
152 |         guard let elementArray = try? HTMLParser.parse(htmlString) else {
153 |             XCTFail("Could not parse HTML")
154 |             return
155 |         }
156 |
157 |         // find matching elements by traversing the created html object
158 |         let nodeSelectorPath = [
159 |             ElementSelector().withTagName("html"),
160 |             ElementSelector().withTagName("body"),
161 |             ElementSelector().withTagName("img")
162 |         ]
163 |
164 |         let matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath)
165 |
166 |         // stop here when the count is wrong so `imageElement` is known to exist
167 |         guard matchingElements.count == 1, let imageElement = matchingElements.first else {
168 |             XCTFail("Expected 1 matching element but found \(matchingElements.count)")
169 |             return
170 |         }
171 |         XCTAssertEqual(imageElement.openingTag.tagName, "img")
172 |
173 |         // test attribute
174 |         XCTAssertEqual(imageElement.attributeValue(for: "height"), "580")
175 |         XCTAssertEqual(imageElement.attributeValue(for: "width"), "480")
176 |         XCTAssertEqual(imageElement.attributeValue(for: "src"), "/some/img.jpg")
177 |         XCTAssertEqual(imageElement.attributeValue(for: "alt"), "/some/other/img.png")
178 |     }
179 |
180 | }
181 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/CommentParserTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // CommentParserTests.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-11.
6 | //
7 |
8 | import XCTest
9 | @testable import SwiftHTMLParser
10 | import TestFiles
11 |
12 | final class CommentParserTests: XCTestCase {
13 |     /// Checks how the comments inside `<body>` of `comments.html` are parsed: node counts and comment text
14 |     /// for html > body, followed by the text of html > body > div.
15 |     func testComments() {
16 |         guard let fileURL = TestFileURLs.commentsTestFilesDirectoryURL?
17 |             .appendingPathComponent("comments.html") else {
18 |             XCTFail("Could not get url to test file")
19 |             return
20 |         }
21 |
22 |         // get html string from file; a read failure is reported once and ends the test
23 |         guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
24 |             XCTFail("Could not open file at: \(fileURL.path)")
25 |             return
26 |         }
27 |
28 |         // create object from raw html file
29 |         guard let elementArray = try? HTMLParser.parse(htmlString) else {
30 |             XCTFail("Could not parse HTML")
31 |             return
32 |         }
33 |
34 |         // find matching elements by traversing the created html object
35 |         var nodeSelectorPath = [
36 |             ElementSelector().withTagName("html"),
37 |             ElementSelector().withTagName("body")
38 |         ]
39 |
40 |         var matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath)
41 |
42 |         // bail out instead of trapping when the body or one of its comments is missing
43 |         guard let body = matchingElements.first, body.commentNodes.count == 6 else {
44 |             XCTFail("Expected a body element containing 6 comment nodes")
45 |             return
46 |         }
47 |         XCTAssertEqual(body.childNodes.count, 15)
48 |         XCTAssertEqual(body.childElements.count, 3)
49 |         XCTAssertEqual(body.textNodes.count, 6)
50 |
51 |         XCTAssertEqual(body.commentNodes[0].text, " This is a comment ")
52 |         XCTAssertEqual(body.commentNodes[1].text, " This is annother comment ")
53 |         XCTAssertEqual(body.commentNodes[3].text, " no space between the comment and div ")
54 |         XCTAssertEqual(body.commentNodes[4].text, "x")
55 |         XCTAssertEqual(body.commentNodes[5].text, "")
56 |
57 |         nodeSelectorPath = [
58 |             ElementSelector().withTagName("html"),
59 |             ElementSelector().withTagName("body"),
60 |             ElementSelector().withTagName("div"),
61 |         ]
62 |
63 |         matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath)
64 |         XCTAssertEqual(matchingElements.count, 1)
65 |         XCTAssertEqual(matchingElements.first?.textNodes.first?.text, "This is a div")
66 |     }
67 |
68 | func testConditionalComments() throws {
69 | guard let fileURL = TestFileURLs.commentsTestFilesDirectoryURL?
70 | .appendingPathComponent("conditional-comments-salvageable.html") else {
71 | XCTFail("Could not get url to test file")
72 | return
73 | }
74 |
75 | // get html string from file
76 | var htmlStringResult: String? = nil
77 | do {
78 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8)
79 | } catch {
80 | XCTFail("Could not open file at: \(fileURL.path)")
81 | }
82 | guard let htmlString = htmlStringResult else {
83 | XCTFail("Could not open file at: \(fileURL.path)")
84 | return
85 | }
86 |
87 | // create object from raw html file
88 | guard let elementArray = try? HTMLParser.parse(htmlString) else {
89 | XCTFail("Could not parse HTML")
90 | return
91 | }
92 |
93 | //XCTAssertEqual(elementArray.count, 2)
94 |
95 | // find matching elements by traversing the created html object
96 | let nodeSelectorPath = [
97 | ElementSelector().withTagName("html"),
98 | ElementSelector().withTagName("body")
99 | ]
100 |
101 | let matchingElements = HTMLTraverser.findElements(in: elementArray, matching: nodeSelectorPath)
102 |
103 | XCTAssertEqual(matchingElements.count, 1)
104 | XCTAssertEqual(matchingElements.first!.commentNodes.count, 1)
105 | //let commentText = try XCTUnwrap(matchingElements.first?.commentNodes.first?.text)
106 | let commentText = matchingElements.first!.commentNodes.first!.text
107 | XCTAssertTrue(commentText.contains(" You are using Internet Explorer 6. :( elements with the className 'body-paragraph'
35 | // will print: 3
36 | print(matchingElements.count)
37 | }
38 |
/// Documentation example: parses `simple.html`, follows a node selector path
/// down to a text node and prints that node's text.
///
/// - Throws: Any error raised while reading the file or by `HTMLParser.parse`.
func parseAndTraverseSimpleHTMLTextNode() throws {
    // get string from file (fail gracefully instead of crashing when the
    // test-file directory cannot be resolved)
    guard let fileURL = TestFileURLs.documentationTestFilesDirectoryURL?
        .appendingPathComponent("simple.html") else {
        XCTFail("Could not get url to test file")
        return
    }
    let htmlString = try String(contentsOf: fileURL, encoding: .utf8)

    // parse the htmlString into a tree of node objects (DOM)
    let nodeTree = try HTMLParser.parse(htmlString)

    // create a node selector path to describe what nodes to match in the nodeTree
    // this is equivalent to the selector:
    //   html > body > div.bibliography > ul > li#citation-1999
    // followed by the text node(s) directly inside that list item
    let nodeSelectorPath: [NodeSelector] = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("div").withClassName("bibliography"),
        ElementSelector().withTagName("ul"),
        ElementSelector().withTagName("li").withId("citation-1999"),
        TextNodeSelector()
    ]

    // find the nodes that match the nodeSelectorPath
    // Notice we use the findNodes() function which can match with any node type
    let matchingNodes = HTMLTraverser.findNodes(in: nodeTree, matching: nodeSelectorPath)

    // matchingNodes will contain the matching node
    // we have to cast the Node to a TextNode to access its text property
    guard let citationTextNode = matchingNodes.first as? TextNode else {
        // could not find the citation text node
        return
    }

    // will print: This is the second citation.
    print(citationTextNode.text)
}
72 |
/// Verifies that an `ElementSelector` carrying a child node selector path only
/// matches the element whose descendants satisfy that path.
///
/// - Throws: Any error raised while reading the file or by `HTMLParser.parse`.
func testParseAndTraverseSimpleHTMLChildNodeSelectorPath() throws {
    // get string from file (fail the test instead of crashing when the
    // test-file directory cannot be resolved)
    guard let fileURL = TestFileURLs.documentationTestFilesDirectoryURL?
        .appendingPathComponent("simple.html") else {
        XCTFail("Could not get url to test file")
        return
    }
    let htmlString = try String(contentsOf: fileURL, encoding: .utf8)

    // parse the htmlString into a tree of node objects (DOM)
    let nodeTree = try HTMLParser.parse(htmlString)

    // create a child node selector path that will match the parent node
    // only if the childNodeSelectorPath matches the element's child nodes
    let childNodeSelectorPath: [NodeSelector] = [
        ElementSelector().withTagName("div"),
        ElementSelector().withTagName("p"),
        TextNodeSelector().withText("Editor Notes")
    ]

    // create a node selector path to describe what nodes to match in the nodeTree
    // Notice the last ElementSelector will only match if the element contains
    // child nodes that match the childNodeSelectorPath
    let nodeSelectorPath: [NodeSelector] = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("div").withChildNodeSelectorPath(childNodeSelectorPath),
    ]

    // find the elements that match the nodeSelectorPath
    // Notice we use the findElements() function, which only returns elements
    let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)

    // matchingElements should only contain the div element with the 'essay' class name
    // will print: 1
    print(matchingElements.count)

    guard let divElement = matchingElements.first else {
        // no div element matched the selector path
        XCTFail("could not find matching div element")
        return
    }

    guard let firstClassName = divElement.classNames.first else {
        // divElement does not have any classnames
        XCTFail("divElement does not have any classnames")
        return
    }

    // will print: essay
    print(firstClassName)

    XCTAssertEqual(matchingElements.count, 1)
    XCTAssertEqual(firstClassName, "essay")
}
124 |
125 | }
126 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/ElementTests.swift:
--------------------------------------------------------------------------------
1 | import XCTest
2 | @testable import SwiftHTMLParser
3 | import TestFiles
4 |
5 | final class SwiftHTMLParserTests: XCTestCase {
6 |
7 | func testOpenFile() {
8 | guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
9 | .appendingPathComponent("elements-simple.html") else {
10 | XCTFail("Could find get file URL to parse")
11 | return
12 | }
13 |
14 | // get html string from file
15 | var htmlStringResult: String? = nil
16 | do {
17 | htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8)
18 | } catch {
19 | XCTFail("Could not open file URL: \(fileURL)")
20 | return
21 | }
22 | guard let htmlString = htmlStringResult else {
23 | XCTFail("Could not open file URL: \(fileURL)")
24 | return
25 | }
26 |
27 | XCTAssertTrue(htmlString.count > 100)
28 | XCTAssertTrue(htmlString.hasPrefix(""))
29 | XCTAssertTrue(htmlString.contains(""))
30 | XCTAssertTrue(htmlString.contains("
10 | This is the first paragraph.
11 |
13 | This is the second paragraph.
14 |
16 | This is the third paragraph.
17 |
19 |
20 | This is the fourth paragraph.
21 |
10 | This is the first paragraph.
11 |
13 | This is the second paragraph.
14 | This is a paragraph. This is a paragraph. This is a paragraph. Above conditional comments incorect, should ignore div This is shown in chrome This is a paragraph. This is the first paragraph. This is the second paragraph. This is the third paragraph. This is the fourth paragraph. This is the fifth paragraph. Editor Notes Bibliography Notes This is the first paragraph. This is the second paragraph. 'John "ShotGun" Nelson' "John 'ShotGun' Nelson" It's alright I love the " (double Quote) character This is the first paragraph. This is the second paragraph. This is the third paragraph. This is the first paragraph. This is the second paragraph.(contentsOf newElements: S) where Element == S.Element, S : Sequence
14 | }
15 |
/// A collection that can be created empty and grown set-style, either one
/// member at a time or by merging in a whole sequence.
///
/// The requirements mirror the signatures of `Set.insert(_:)` and
/// `Set.formUnion(_:)`.
protocol SetInsertable: Collection {
    /// Creates an empty collection.
    init()

    /// Inserts `newMember`, reporting whether it was actually added.
    mutating func insert(_ newMember: Element) -> (inserted: Bool, memberAfterInsert: Element)

    /// Merges every element of `other` into the collection.
    /// `S` must be declared as a generic parameter for the `where` clause to compile.
    mutating func formUnion<S>(_ other: S) where Element == S.Element, S : Sequence
}
21 |
22 | extension Array: Insertable {}
23 | extension Set: SetInsertable {}
24 |
/// Append helpers for optionals wrapping an `Insertable` collection: a `nil`
/// optional is first replaced by an empty collection, then appended to.
extension Optional where Wrapped: Insertable {
    /// Appends `newElement`, creating the wrapped collection first when `self` is `nil`.
    mutating func appendOrInit(_ newElement: Wrapped.Iterator.Element) {
        if self == nil {
            self = Wrapped()
        }
        // mutate through optional chaining so the wrapped collection is updated in place
        self?.append(newElement)
    }

    /// Appends every element of `newElements`, creating the wrapped collection
    /// first when `self` is `nil`.
    /// `S` must be declared as a generic parameter for the `where` clause to compile.
    mutating func appendOrInit<S>(contentsOf newElements: S) where Wrapped.Iterator.Element == S.Element, S : Sequence {
        if self == nil {
            self = Wrapped()
        }
        self?.append(contentsOf: newElements)
    }
}
46 |
/// Insert helpers for optionals wrapping a `SetInsertable` collection: a `nil`
/// optional is first replaced by an empty collection, then inserted into.
extension Optional where Wrapped: SetInsertable {
    /// Inserts `newElement`, creating the wrapped collection first when `self` is `nil`.
    /// The insertion result is intentionally discarded.
    mutating func insertOrInit(_ newElement: Wrapped.Iterator.Element) {
        if self == nil {
            self = Wrapped()
        }
        // mutate through optional chaining so the wrapped collection is updated in place
        _ = self?.insert(newElement)
    }

    /// Merges every element of `other` into the wrapped collection, creating it
    /// first when `self` is `nil`.
    /// `S` must be declared as a generic parameter for the `where` clause to compile.
    mutating func formUnionOrInit<S>(_ other: S) where Wrapped.Iterator.Element == S.Element, S : Sequence {
        if self == nil {
            self = Wrapped()
        }
        self?.formUnion(other)
    }
}
68 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Extensions/CharacterExtensions.swift:
--------------------------------------------------------------------------------
1 | //
2 | // CharacterExtensions.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-09-11.
6 | //
7 |
8 | import Foundation
9 |
extension Character {
    /// Returns `true` when this character equals at least one entry of `characters`.
    /// An empty array never matches.
    func isEqualToOneOf(characters: [Character]) -> Bool {
        return characters.contains(self)
    }

    /// Returns `true` when this character equals none of the entries of `characters`.
    func isNotEqualToOneOf(characters: [Character]) -> Bool {
        return !isEqualToOneOf(characters: characters)
    }
}
25 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Extensions/StringExtensions.swift:
--------------------------------------------------------------------------------
1 | //
2 | // StringExtensions.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-08-19.
6 | //
7 |
8 | import Foundation
9 |
extension String {
    /// Returns the characters from `afterIndex` up to and including the character
    /// `numberOfCharacters` positions later.
    ///
    /// When that position lies at or beyond the end of the string, everything from
    /// `afterIndex` to the end is returned instead of trapping.
    ///
    /// - Parameters:
    ///   - afterIndex: Index of the first character to include.
    ///   - numberOfCharacters: Offset from `afterIndex` to the last character to include.
    func subscring(after afterIndex: String.Index, numberOfCharacters: Int) -> String {
        // `limitedBy:` yields nil rather than trapping when the offset passes endIndex
        guard let lastIndex = index(afterIndex, offsetBy: numberOfCharacters, limitedBy: endIndex),
              lastIndex < endIndex else {
            // endIndex is not subscriptable, so use an open-ended range here
            return String(self[afterIndex...])
        }

        return String(self[afterIndex...lastIndex])
    }

    /// Returns `true` when `index` lies before `endIndex`.
    func encompassesIndex(_ index: String.Index) -> Bool {
        return index < endIndex
    }

    /// Returns `true` when the string is empty or consists solely of whitespace
    /// and newline characters.
    func isEmptyOrWhitespace() -> Bool {
        return isEmpty || trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    }
}
40 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Helpers/RegexError.swift:
--------------------------------------------------------------------------------
1 | //
2 | // RegexError.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-05.
6 | //
7 |
8 | import Foundation
9 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Helpers/RegexHelper.swift:
--------------------------------------------------------------------------------
1 | //
2 | // RegexHelper.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-08-07.
6 | //
7 |
8 | import Foundation
9 |
10 | struct RegexHelper {
11 |
12 | func matchRanges(for regexPattern: String, inString inputString: String) -> [Range
)
/// Reports whether `tagName` denotes a tag that never has a matching closing tag.
///
/// - Parameter tagName: The tag name; any "/" it contains is ignored for the
///   empty-element lookup.
/// - Returns: `true` for names listed in `emptyElementTagNames` and for
///   `!DOCTYPE` (compared case-insensitively), otherwise `false`.
func checkIsEmptyElementTag(tagName: String) -> Bool {
    let strippedName = tagName.replacingOccurrences(of: "/", with: "")

    // known empty element, or a DOCTYPE declaration
    return emptyElementTagNames.contains(strippedName)
        || tagName.caseInsensitiveCompare("!DOCTYPE") == .orderedSame
}
79 |
/// Checks whether the tag text is self closing, i.e. ends with "/>".
///
/// Text shorter than two characters (including the empty string) is never
/// self closing; it is handled without indexing past the start of the string.
///
/// - Parameter tagText: The full text of the tag, including its delimiters.
/// - Returns: `true` when the last two characters are "/" followed by ">".
func checkIsSelfClosing(tagText: String) -> Bool {
    // suffix(2) returns the whole text when it is shorter than two characters
    return tagText.suffix(2) == "/>"
}
92 |
/// Reports whether this tag's text is a closing tag.
///
/// - Returns: `true` when `tagText` starts with "</" (for example "</div>").
func checkIsClosingTag() -> Bool {
    // A two-character prefix can never equal an empty literal for non-empty
    // text, so the comparison must be against the closing-tag opener "</".
    if tagText.prefix(2) == "</" {
        return true
    }

    // NOTE(review): the literal compared here is empty in this listing and was
    // probably longer in the original file - confirm the intended value.
    // It is left unchanged so empty tag text keeps being reported as closing.
    if tagText == "" {
        return true
    }

    return false
}
104 |
/// Builds a multi-line, human-readable summary of the tag for debugging.
///
/// - Returns: Four newline-terminated lines: the tag text, its character
///   count, and the UTF-16 offsets of `startIndex` and `endIndex`, both
///   computed with `utf16Offset(in: tagText)`.
func getDescription() -> String {
    let summaryLines = [
        "tagText: \(tagText)\n",
        "tagText.count: \(tagText.count)\n",
        "tag.startIndex: \(startIndex.utf16Offset(in: tagText))\n",
        "tag.endIndex: \(endIndex.utf16Offset(in: tagText))\n"
    ]

    return summaryLines.joined()
}
114 |
/// Splits the value of a `class` attribute into its individual class names.
///
/// - Parameter classAttributeValue: The raw attribute value.
/// - Returns: Whatever `RegexHelper.matches(for:inString:)` yields for the
///   class-name pattern applied to `classAttributeValue`.
func getClassNames(classAttributeValue: String) -> [String] {
    // (?=\s*)      -> zero-width lookahead for 0 or more whitespaces (captures nothing)
    // [^\n\r\s]+   -> 1 or more characters that are not newline, carriage return or whitespace
    let classNameRegexPattern = "(?=\\s*)[^\\n\\r\\s]+(?=\\s*)"

    return RegexHelper().matches(for: classNameRegexPattern, inString: classAttributeValue)
}
123 |
124 | }
125 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Parser/ParseError.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ParseError.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-05.
6 | //
7 |
8 | import Foundation
9 |
/// Errors thrown by the parser types in this module.
enum ParseError: Error {
    // MARK: - Tags

    case tagNotFound
    /// No tag name could be extracted from a tag's text.
    case tagNameNotFound
    case invalidTag
    case openingTagNotFound
    case canNotFindClosingTagWithoutAnyOpenedTags
    case closingTagNotFound(String)
    /// Carries the tag whose name did not match the opening tag.
    case closingTagNameDoesNotMatchOpeningTagName(erroredTag: Tag)

    // MARK: - Attributes

    case attributeNotFound

    // MARK: - Unexpected end of input

    case endOfFileReachedBeforeClosingTagFound
    /// The input ended while scanning a script body for its closing tag.
    case endOfFileReachedBeforeScriptClosingTagFound
    case endOfFileReachedBeforeCommentCloseFound
    case endOfFileReachedBeforeCDATACloseFound

    // MARK: - CDATA

    case invalidCDATA
}
25 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Parser/ParseFormat.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ParseFormat.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-13.
6 | //
7 |
8 | import Foundation
9 |
/// The markup dialects named by this module: HTML, XML and SVG.
public enum ParseFormat {
    case html, xml, svg
}
15 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Parser/ScriptParser.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ScriptParser.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-09.
6 | //
7 |
8 | import Foundation
9 |
/// Scans the body of a script element to locate its closing tag.
///
/// The script is not parsed as JavaScript; the scanner only tracks quotes and
/// comments so that a closing-tag look-alike inside them is not mistaken for
/// the real closing tag. Everything before the closing tag is kept as text.
struct ScriptParser {

    /// Lexical context the scanner is currently in.
    enum ScriptParseState {
        case notWithinQuotesOrComment
        case withinDoubleQuotes
        case withinSingleQuotes
        case withinMultiLineComment
        case withinSingleLineComment
    }

    /// Tokens the scanner looks for while walking the script text.
    struct ScriptSpecificCharacters {
        // Closing tag of a script element. Its length (9) agrees with the
        // tag's end offset of 8 and with the "/script" tag name used below.
        let scriptEndTag = "</script>"

        // strings
        let multiLineCommentOpening = "/*"
        let multiLineCommentClosing = "*/"
        let SingleLineCommentOpening = "//"

        let escapedBackslash = "\\\\" // i.e \\
        let escapedDoubleQuote = "\\\"" // i.e \"
        let escapedSingleQuote = "\\'" // i.e \'

        // characters
        let doubleQuote: Character = "\"" // i.e "
        let singleQuote: Character = "'" // i.e '
        let newline: Character = "\n"
    }

    fileprivate let lookaheadValidator = LookaheadValidator()

    /// Walks `source` from `currentIndex` until the script closing tag is found
    /// outside of quotes and comments.
    ///
    /// Not intended to fully parse javascript, rather save it to inner text.
    ///
    /// - Parameters:
    ///   - source: The complete document text.
    ///   - currentIndex: Index of the first character of the script body.
    /// - Returns: The script body as a text node plus the closing script tag.
    /// - Throws: `ParseError.endOfFileReachedBeforeScriptClosingTagFound` when
    ///   the end of `source` is reached without finding the closing tag.
    func parseScript(source: String, currentIndex: String.Index) throws -> (innerTextBlock: TextNode, closingScriptTag: Tag) {
        let specificCharacters = ScriptSpecificCharacters()
        var localCurrentIndex = currentIndex
        var parseState = ScriptParseState.notWithinQuotesOrComment

        while localCurrentIndex < source.endIndex {
            switch parseState {
            case .notWithinQuotesOrComment:
                if lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                       checkFor: specificCharacters.scriptEndTag) {
                    return makeResult(source: source,
                                      scriptStartIndex: currentIndex,
                                      tagStartIndex: localCurrentIndex,
                                      endTag: specificCharacters.scriptEndTag)
                }

                // look for quotes and comments
                if source[localCurrentIndex] == specificCharacters.doubleQuote {
                    parseState = .withinDoubleQuotes
                } else if source[localCurrentIndex] == specificCharacters.singleQuote {
                    parseState = .withinSingleQuotes
                } else if lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                              checkFor: specificCharacters.multiLineCommentOpening) {
                    parseState = .withinMultiLineComment
                    // step over the "*" of the opener so that "/*/" is not
                    // read as an opener immediately followed by a closer
                    localCurrentIndex = source.index(after: localCurrentIndex)
                } else if lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                              checkFor: specificCharacters.SingleLineCommentOpening) {
                    parseState = .withinSingleLineComment
                }
            case .withinDoubleQuotes:
                if lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                       checkFor: specificCharacters.escapedBackslash)
                    || lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                           checkFor: specificCharacters.escapedDoubleQuote) {
                    // escaped backslash or escaped double quote - step past the
                    // backslash; the increment below steps past the escaped character
                    localCurrentIndex = source.index(after: localCurrentIndex)
                } else if source[localCurrentIndex] == specificCharacters.doubleQuote {
                    parseState = .notWithinQuotesOrComment
                }
            case .withinSingleQuotes:
                if lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                       checkFor: specificCharacters.escapedBackslash)
                    || lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                           checkFor: specificCharacters.escapedSingleQuote) {
                    // escaped backslash or escaped single quote - step past the
                    // backslash; the increment below steps past the escaped character
                    localCurrentIndex = source.index(after: localCurrentIndex)
                } else if source[localCurrentIndex] == specificCharacters.singleQuote {
                    parseState = .notWithinQuotesOrComment
                }
            case .withinMultiLineComment:
                if lookaheadValidator.isValidLookahead(for: source, atIndex: localCurrentIndex,
                                                       checkFor: specificCharacters.multiLineCommentClosing) {
                    parseState = .notWithinQuotesOrComment
                    // step over the "*" of the closer so that its "/" cannot
                    // start a new comment together with a following "*" or "/"
                    localCurrentIndex = source.index(after: localCurrentIndex)
                }
            case .withinSingleLineComment:
                if source[localCurrentIndex] == specificCharacters.newline {
                    parseState = .notWithinQuotesOrComment
                }
            }

            // increment localCurrentIndex (go to next character in string)
            localCurrentIndex = source.index(after: localCurrentIndex)
        }

        // throw error if a tag not found before end of file reached
        throw ParseError.endOfFileReachedBeforeScriptClosingTagFound
    }

    /// Builds the text node for the script body and the tag for the closing
    /// script tag that starts at `tagStartIndex`.
    private func makeResult(source: String,
                            scriptStartIndex: String.Index,
                            tagStartIndex: String.Index,
                            endTag: String) -> (innerTextBlock: TextNode, closingScriptTag: Tag) {
        // the tag spans the whole end-tag token, inclusive of its last character
        let tagEndIndex = source.index(tagStartIndex, offsetBy: endTag.count - 1)
        let tagText = String(source[tagStartIndex...tagEndIndex])
        let tagName = "/script"

        // the text block ends on the character just before the closing tag;
        // avoid stepping before the start of the string
        let textBlockEndIndex = tagStartIndex > source.startIndex
            ? source.index(before: tagStartIndex)
            : tagStartIndex

        // keep the body whenever it holds at least one character
        // (a one-character body has a start/end distance of 0)
        let textBlockString = scriptStartIndex < tagStartIndex
            ? String(source[scriptStartIndex..<tagStartIndex])
            : ""

        let innerTextBlock = TextNode.init(startIndex: scriptStartIndex,
                                           endIndex: textBlockEndIndex,
                                           text: textBlockString)
        let tag = Tag.init(startIndex: tagStartIndex, endIndex: tagEndIndex, tagText: tagText, tagName: tagName)
        return (innerTextBlock, tag)
    }

    /// Placeholder with no behavior; kept so existing references keep compiling.
    func checkIfScriptClosingTag() {

    }
}
141 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Parser/TagParser.swift:
--------------------------------------------------------------------------------
1 | //
2 | // TagParser.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-04.
6 | //
7 |
8 | import Foundation
9 |
/// Quote context of the tag scanner: outside any quotes, or inside a
/// double-quoted or single-quoted run.
enum TagParserState {
    case notWithinQuotesOrComment, withinDoubleQuotes, withinSingleQuotes
}
15 |
/// Kind of construct that begins at a tag-opening character: an element tag,
/// a CDATA section, a declaration, or a comment.
enum TagOpeningType {
    case element, CDATA, declaration, comment
}
22 |
/// Characters and multi-character openers the tag scanner looks for.
///
/// NOTE(review): this listing skips several source lines inside the struct and
/// shows empty string literals, so markup-like literals appear to have been
/// lost. The values below are the standard markup openers; members that may
/// have lived on the skipped lines (for example closing delimiters) must be
/// restored from the original file.
struct TagSpecificCharacters {
    // characters
    let tagOpeningCharacter: Character = "<"
    let tagClosingCharacter: Character = ">"
    let doubleQuote: Character = "\"" // i.e "
    let singleQuote: Character = "'" // i.e '
    let space: Character = " "
    let equalSign: Character = "="

    // strings
    // Comments and CDATA sections are only looked for once the declaration
    // opener matched, so both openers must start with it.
    let declarationOpening = "<!"
    let commentOpening = "<!--"
    // NOTE(review): value not recoverable from this listing - confirm.
    let conditionalCommentOpening = ""
    let CDATAOpening = "<![CDATA["

    // array

}
44 |
/// Finds the next element tag in a document, collecting the text, comment,
/// declaration and CDATA nodes that precede it.
struct TagParser {
    fileprivate let commentParser = CommentParser()
    fileprivate let cdataParser = CDATAParser()
    fileprivate let lookaheadValidator = LookaheadValidator()
    fileprivate let specificCharacters = TagSpecificCharacters()
    fileprivate let isPoorlyFormattedCommentsAllowed: Bool = true

    /// Scans `source` from `currentIndex` for the next element tag.
    ///
    /// - Parameters:
    ///   - source: The complete document text.
    ///   - currentIndex: Index at which scanning starts.
    /// - Returns: The nodes met before the tag (non-whitespace text, comments,
    ///   declarations, CDATA) and the tag itself, or `nil` for the tag when the
    ///   end of `source` is reached first.
    /// - Throws: `ParseError.endOfFileReachedBeforeCommentCloseFound` when a
    ///   comment or declaration cannot be parsed,
    ///   `ParseError.endOfFileReachedBeforeCDATACloseFound` when a CDATA section
    ///   cannot be parsed, and `ParseError.tagNameNotFound` when the tag has no name.
    func getNextTag(source: String, currentIndex: String.Index) throws -> (childNodes: [Node], tag: Tag?) {
        var localCurrentIndex = currentIndex
        // non-nil once the opening character of an element tag has been seen
        var tagStartIndex: String.Index?

        var childNodes = [Node]()
        var parseState = TagParserState.notWithinQuotesOrComment

        // iterate through string indices until tag is found or end of string
        while source.encompassesIndex(localCurrentIndex) {

            if let openedTagStartIndex = tagStartIndex {
                // inside a tag: track quotes so a ">" within an attribute
                // value does not end the tag
                let character = source[localCurrentIndex]

                switch parseState {
                case .notWithinQuotesOrComment:
                    if character == specificCharacters.tagClosingCharacter {
                        // tag is closed
                        let tag = try foundTag(source: source,
                                               tagStartIndex: openedTagStartIndex,
                                               tagEndIndex: localCurrentIndex)
                        return (childNodes, tag)
                    }
                    if character == specificCharacters.doubleQuote {
                        parseState = .withinDoubleQuotes
                    } else if character == specificCharacters.singleQuote {
                        parseState = .withinSingleQuotes
                    }
                case .withinDoubleQuotes:
                    if character == specificCharacters.doubleQuote {
                        parseState = .notWithinQuotesOrComment
                    }
                case .withinSingleQuotes:
                    if character == specificCharacters.singleQuote {
                        parseState = .notWithinQuotesOrComment
                    }
                }
            } else if parseState == .notWithinQuotesOrComment,
                      let tagOpeningType = resolveTagOpeningType(source: source, index: localCurrentIndex) {

                // set inner text block
                if let innerTextBlock = makeInnerTextBlock(source: source,
                                                           scanStartIndex: currentIndex,
                                                           openingIndex: localCurrentIndex,
                                                           precedingNodes: childNodes) {
                    childNodes.append(innerTextBlock)
                }

                switch tagOpeningType {
                case .element:
                    tagStartIndex = localCurrentIndex
                case .comment:
                    do {
                        let comment = try commentParser.parseComment(source: source,
                                                                     currentIndex: localCurrentIndex,
                                                                     commentType: .comment)
                        // continue scanning right after the comment
                        localCurrentIndex = comment.endIndex
                        childNodes.append(comment)
                    } catch {
                        throw ParseError.endOfFileReachedBeforeCommentCloseFound
                    }
                case .declaration:
                    do {
                        let comment = try commentParser.parseComment(source: source,
                                                                     currentIndex: localCurrentIndex,
                                                                     commentType: .declaration)
                        localCurrentIndex = comment.endIndex
                        childNodes.append(comment)
                    } catch {
                        throw ParseError.endOfFileReachedBeforeCommentCloseFound
                    }
                case .CDATA:
                    // is CDATA
                    do {
                        let cdata = try cdataParser.parse(source: source, currentIndex: localCurrentIndex)
                        localCurrentIndex = cdata.endIndex
                        childNodes.append(cdata)
                    } catch {
                        // report a CDATA failure as such, not as a comment failure
                        throw ParseError.endOfFileReachedBeforeCDATACloseFound
                    }
                }
            }

            // increment localCurrentIndex
            localCurrentIndex = source.index(after: localCurrentIndex)
        }

        // a tag not found before end of file reached
        return (childNodes, nil)
    }

    /// Builds the text node lying between the previously collected node (or the
    /// scan start) and the opener at `openingIndex`.
    /// Returns `nil` when there is no text there or it is only whitespace.
    private func makeInnerTextBlock(source: String,
                                    scanStartIndex: String.Index,
                                    openingIndex: String.Index,
                                    precedingNodes: [Node]) -> TextNode? {
        guard scanStartIndex != openingIndex else {
            return nil
        }

        // text starts right after the last collected node, if there is one
        var textBlockStartIndex = scanStartIndex
        if let lastChildNode = precedingNodes.last {
            textBlockStartIndex = source.index(lastChildNode.endIndex, offsetBy: 1)
        }
        let textBlockEndIndex = source.index(openingIndex, offsetBy: -1)

        // if tags or comments are right beside each other dont add text block
        guard textBlockStartIndex <= textBlockEndIndex else {
            return nil
        }

        let textBlockText = String(source[textBlockStartIndex...textBlockEndIndex])
        guard textBlockText.isEmptyOrWhitespace() == false else {
            return nil
        }

        return TextNode.init(startIndex: textBlockStartIndex,
                             endIndex: textBlockEndIndex,
                             text: textBlockText)
    }

    /// Classifies what starts at `index`.
    ///
    /// - Returns: `nil` when the character at `index` is not the tag-opening
    ///   character; `.comment` or `.CDATA` when the matching opener follows;
    ///   `.declaration` for any other declaration opener; `.element` otherwise.
    func resolveTagOpeningType(source: String, index: String.Index) -> TagOpeningType? {
        guard source[index] == specificCharacters.tagOpeningCharacter else {
            return nil
        }

        guard lookaheadValidator.isValidLookahead(for: source, atIndex: index,
                                                  checkFor: specificCharacters.declarationOpening) else {
            return TagOpeningType.element
        }

        // check if comment opening
        if lookaheadValidator.isValidLookahead(for: source, atIndex: index,
                                               checkFor: specificCharacters.commentOpening) {
            return TagOpeningType.comment
        }
        if lookaheadValidator.isValidLookahead(for: source, atIndex: index,
                                               checkFor: specificCharacters.CDATAOpening) {
            return TagOpeningType.CDATA
        }
        return TagOpeningType.declaration
    }

    /// produces a `tag` from the found tag text, parsing attributes etc
    ///
    /// - Throws: `ParseError.tagNameNotFound` when no tag name can be extracted.
    func foundTag(source: String, tagStartIndex: String.Index, tagEndIndex: String.Index) throws -> Tag {
        // create tagText string from indexes
        let tagText = String(source[tagStartIndex...tagEndIndex])

        // get tagName from tagText
        let tagName = try parseTagName(tagText: tagText)

        // create the tag
        return Tag.init(startIndex: tagStartIndex, endIndex: tagEndIndex, tagText: tagText, tagName: tagName)
    }

    /// Extracts the tag name from `tagText`.
    ///
    /// The name starts at the first character that is neither the tag-opening
    /// character nor whitespace and runs up to, but not including, the next
    /// whitespace or ">".
    ///
    /// - Throws: `ParseError.tagNameNotFound` when the name is never terminated.
    func parseTagName(tagText: String) throws -> String {
        var startTagNameIndex: String.Index?

        for currentIndex in tagText.indices {
            let character = tagText[currentIndex]

            if let nameStartIndex = startTagNameIndex {
                if character == specificCharacters.tagClosingCharacter || character.isWhitespace {
                    // dont include last > or whitespace in tagName
                    let tagName = String(tagText[nameStartIndex..<currentIndex])
                    return tagName.trimmingCharacters(in: .whitespacesAndNewlines)
                }
            } else if character != specificCharacters.tagOpeningCharacter && character.isWhitespace == false {
                // keep going until you find the first char (ignore < and whitespace)
                startTagNameIndex = currentIndex
            }
        }

        throw ParseError.tagNameNotFound
    }

}
236 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Parser/Tags/KnownHTMLTags.swift:
--------------------------------------------------------------------------------
1 | //
2 | // tags.swift
3 | // HTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-05-26.
6 | // Copyright © 2018 Reid Nantes. All rights reserved.
7 | //
8 |
9 | import Foundation
10 |
/// Describes a tag by its name and by whether it is an empty element.
struct HTMLTag {
    /// The tag name.
    let name: String
    /// `true` for tags created as empty elements.
    let isEmpty: Bool

    /// Creates a tag description.
    ///
    /// - Parameters:
    ///   - name: The tag name.
    ///   - isEmpty: Whether the tag is an empty element; defaults to `false`.
    init(name: String, isEmpty: Bool = false) {
        (self.name, self.isEmpty) = (name, isEmpty)
    }
}
20 |
/// Identifiers for the known tags, keyed by tag name.
///
/// Unless stated otherwise the raw value is the case name itself, which Swift
/// assigns implicitly for `String`-backed enums.
enum HTMLTagID: String {
    case a, abbr, address, area, article, aside, audio
    case b, base, bdi, bdo, blockquote, body, br, button
    case canvas, caption, cite, code, col, colgroup
    case data, datalist, dd, del, details, dfn, dialog, div, dl, dt
    // raw value is capitalised, unlike the case name
    case element = "Element"
    case em, embed
    case fieldset, figcaption, figure, footer, form
    case h1, h2, h3, h4, h5, h6, head, header, hgroup, hr, html
    case i, iframe, img, input, ins
    case kbd
    case label, legend, li, link
    case main, map, mark
    // raw value is capitalised, unlike the case name
    case mathML = "MathML"
    // raw value must not carry trailing whitespace, otherwise
    // HTMLTagID(rawValue: "math") would not find this case
    case math
    case menu, meta, meter
    case nav, noscript
    case object, ol, optgroup, option, output
    case p, param, picture, pre, progress
    case q
    case rp, rt, ruby
    case s, samp, script, section, select, slot, small, source, span, strong, style, sub, summary, sup, svg
    case table, tbody, td, template, textarea, tfoot, th, thead, time, title, tr, track
    case u, ul
    case `var`
    case video
    case wbr
}
139 |
/// Lookup table from every `HTMLTagID` case to the `HTMLTag` describing it.
///
/// The entries created with `isEmpty: true` are the same sixteen names listed in
/// `emptyElementTagNames`.
///
/// NOTE(review): `iframe` and `template` are created with `isEmpty: true`, but the HTML
/// standard's list of void elements does not include them — confirm this is deliberate.
let htmlTags: [HTMLTagID: HTMLTag] = [
    .a: HTMLTag(name: "a"),
    .abbr: HTMLTag(name: "abbr"),
    .address: HTMLTag(name: "address"),
    .area: HTMLTag(name: "area", isEmpty: true),
    .article: HTMLTag(name: "article"),
    .aside: HTMLTag(name: "aside"),
    .audio: HTMLTag(name: "audio"),
    .b: HTMLTag(name: "b"),
    .base: HTMLTag(name: "base", isEmpty: true),
    .bdi: HTMLTag(name: "bdi"),
    .bdo: HTMLTag(name: "bdo"),
    .blockquote: HTMLTag(name: "blockquote"),
    .body: HTMLTag(name: "body"),
    .br: HTMLTag(name: "br", isEmpty: true),
    .button: HTMLTag(name: "button"),
    .canvas: HTMLTag(name: "canvas"),
    .caption: HTMLTag(name: "caption"),
    .cite: HTMLTag(name: "cite"),
    .code: HTMLTag(name: "code"),
    .col: HTMLTag(name: "col", isEmpty: true),
    .colgroup: HTMLTag(name: "colgroup"),
    .data: HTMLTag(name: "data"),
    .datalist: HTMLTag(name: "datalist"),
    .dd: HTMLTag(name: "dd"),
    .del: HTMLTag(name: "del"),
    .details: HTMLTag(name: "details"),
    .dfn: HTMLTag(name: "dfn"),
    .dialog: HTMLTag(name: "dialog"),
    .div: HTMLTag(name: "div"),
    .dl: HTMLTag(name: "dl"),
    .dt: HTMLTag(name: "dt"),
    .element: HTMLTag(name: "Element"),
    .em: HTMLTag(name: "em"),
    .embed: HTMLTag(name: "embed", isEmpty: true),
    .fieldset: HTMLTag(name: "fieldset"),
    .figcaption: HTMLTag(name: "figcaption"),
    .figure: HTMLTag(name: "figure"),
    .footer: HTMLTag(name: "footer"),
    .form: HTMLTag(name: "form"),
    .h1: HTMLTag(name: "h1"),
    .h2: HTMLTag(name: "h2"),
    .h3: HTMLTag(name: "h3"),
    .h4: HTMLTag(name: "h4"),
    .h5: HTMLTag(name: "h5"),
    .h6: HTMLTag(name: "h6"),
    .head: HTMLTag(name: "head"),
    .header: HTMLTag(name: "header"),
    .hgroup: HTMLTag(name: "hgroup"),
    .hr: HTMLTag(name: "hr", isEmpty: true),
    .html: HTMLTag(name: "html"),
    .i: HTMLTag(name: "i"),
    .iframe: HTMLTag(name: "iframe", isEmpty: true),
    .img: HTMLTag(name: "img", isEmpty: true),
    .input: HTMLTag(name: "input", isEmpty: true),
    .ins: HTMLTag(name: "ins"),
    .kbd: HTMLTag(name: "kbd"),
    .label: HTMLTag(name: "label"),
    .legend: HTMLTag(name: "legend"),
    .li: HTMLTag(name: "li"),
    .link: HTMLTag(name: "link", isEmpty: true),
    .main: HTMLTag(name: "main"),
    .map: HTMLTag(name: "map"),
    .mark: HTMLTag(name: "mark"),
    .mathML: HTMLTag(name: "MathML"),
    // The tag name must not carry a trailing space, otherwise it can never equal a parsed name.
    .math: HTMLTag(name: "math"),
    .menu: HTMLTag(name: "menu"),
    .meta: HTMLTag(name: "meta", isEmpty: true),
    .meter: HTMLTag(name: "meter"),
    .nav: HTMLTag(name: "nav"),
    .noscript: HTMLTag(name: "noscript"),
    .object: HTMLTag(name: "object"),
    .ol: HTMLTag(name: "ol"),
    .optgroup: HTMLTag(name: "optgroup"),
    .option: HTMLTag(name: "option"),
    .output: HTMLTag(name: "output"),
    .p: HTMLTag(name: "p"),
    .param: HTMLTag(name: "param", isEmpty: true),
    .picture: HTMLTag(name: "picture"),
    .pre: HTMLTag(name: "pre"),
    .progress: HTMLTag(name: "progress"),
    .q: HTMLTag(name: "q"),
    .rp: HTMLTag(name: "rp"),
    .rt: HTMLTag(name: "rt"),
    .ruby: HTMLTag(name: "ruby"),
    .s: HTMLTag(name: "s"),
    .samp: HTMLTag(name: "samp"),
    .script: HTMLTag(name: "script"),
    .section: HTMLTag(name: "section"),
    .select: HTMLTag(name: "select"),
    .slot: HTMLTag(name: "slot"),
    .small: HTMLTag(name: "small"),
    .source: HTMLTag(name: "source", isEmpty: true),
    .span: HTMLTag(name: "span"),
    .strong: HTMLTag(name: "strong"),
    .style: HTMLTag(name: "style"),
    .sub: HTMLTag(name: "sub"),
    .summary: HTMLTag(name: "summary"),
    .sup: HTMLTag(name: "sup"),
    .svg: HTMLTag(name: "svg"),
    .table: HTMLTag(name: "table"),
    .tbody: HTMLTag(name: "tbody"),
    .td: HTMLTag(name: "td"),
    .template: HTMLTag(name: "template", isEmpty: true),
    .textarea: HTMLTag(name: "textarea"),
    .tfoot: HTMLTag(name: "tfoot"),
    .th: HTMLTag(name: "th"),
    .thead: HTMLTag(name: "thead"),
    .time: HTMLTag(name: "time"),
    .title: HTMLTag(name: "title"),
    .tr: HTMLTag(name: "tr"),
    .track: HTMLTag(name: "track", isEmpty: true),
    .u: HTMLTag(name: "u"),
    .ul: HTMLTag(name: "ul"),
    .var: HTMLTag(name: "var"),
    .video: HTMLTag(name: "video"),
    .wbr: HTMLTag(name: "wbr", isEmpty: true)
]
/// Tag identifiers this module lists as self-closing.
///
/// NOTE(review): `iframe` and `template` are included although the HTML standard does not
/// list them as void elements — confirm this is intentional.
let selfClosingHTMLTags: [HTMLTagID] = [
    .area, .base, .br, .col,
    .embed, .hr, .iframe, .img,
    .input, .link, .meta, .param,
    .source, .template, .track, .wbr
]
276 |
/// Names of all HTML tags known to the parser, as plain strings.
///
/// Each entry is a bare tag name: no punctuation or whitespace, one name per element.
/// The upper-case `"SVG"` entry is kept next to `"svg"` for backward compatibility.
let allHTMLTagNames: [String] = [
    "a",
    "abbr",
    "address",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "base",
    "bdi",
    "bdo",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "cite",
    "code",
    "col",
    "colgroup",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "div",
    "dl",
    "dt",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    // heading names previously carried a stray trailing comma ("h1," ... "h5,")
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "img",
    "input",
    "ins",
    "kbd",
    "label",
    "legend",
    "li",
    "link",
    "main",
    "map",
    "mark",
    // previously fused into the single bogus entry "MathMLmath"
    "MathML",
    "math",
    "menu",
    "meta",
    "meter",
    "nav",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "slot",
    "small",
    "source",
    "span",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "SVG",
    "svg",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "u",
    "ul",
    "var",
    "video",
    "wbr"
]
394 |
/// Tag names of elements that have no end tag.
///
/// Reference: https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
///
/// NOTE(review): `iframe` and `template` do not appear to be on the referenced list —
/// confirm they are included here on purpose.
let emptyElementTagNames: [String] = [
    "area", "base", "br", "col",
    "embed", "hr", "iframe", "img",
    "input", "link", "meta", "param",
    "source", "template", "track", "wbr"
]
415 |
/// Tag names kept in the module's "ignored" list.
// NOTE(review): how the parser consumes this list is not visible here — verify against callers.
let ignoredTags: [String] = ["svg", "script"]
420 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Parser/Tags/SVGTags.swift:
--------------------------------------------------------------------------------
1 | //
2 | // SVGTags.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-12.
6 | //
7 |
8 | import Foundation
9 |
/// Names of SVG elements, keyed by a Swift-friendly case name.
///
/// The raw value is the tag name as written in markup. Cases without an explicit raw value
/// use their own name; hyphenated names and the `switch` keyword are spelled out explicitly.
enum SVGTagNames: String {
    case a, altGlyph, altGlyphDef, altGlyphItem
    case animate, animateColor, animateMotion, animateTransform
    case circle, clipPath
    case colorProfile = "color-profile"
    case cursor, defs, desc, ellipse

    // filter primitives
    case feBlend, feColorMatrix, feComponentTransfer, feComposite
    case feConvolveMatrix, feDiffuseLighting, feDisplacementMap, feDistantLight
    case feFlood, feFuncA, feFuncB, feFuncG, feFuncR
    case feGaussianBlur, feImage, feMerge, feMergeNode
    case feMorphology, feOffset, fePointLight, feSpecularLighting
    case feSpotLight, feTile, feTurbulence
    case filter

    // fonts
    case font
    case fontFace = "font-face"
    case fontFaceFormat = "font-face-format"
    case fontFaceName = "font-face-name"
    case fontFaceSRC = "font-face-src"
    case fontFaceURI = "font-face-uri"

    case foreignObject, g, glyph, glyphRef, hkern
    case image, line, linearGradient, marker, mask, metadata
    case missingGlyph = "missing-glyph"
    case mpath, path, pattern, polygon, polyline
    case radialGradient, rect, script, set, stop, style, svg
    // `switch` is a reserved word, so the case carries a suffix
    case switchTag = "switch"
    case symbol, text, textPath, title, tref, tspan
    case use, view, vkern
}
92 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Parser/Tags/XMLTags.swift:
--------------------------------------------------------------------------------
1 | //
2 | // XMLTags.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-13.
6 | //
7 |
8 | import Foundation
9 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/ProjectConfig.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ProjectConfig.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-04.
6 | //
7 |
8 | import Foundation
9 |
/// Compile-time switches for debugging output.
struct ProjectConfig {

    // MARK: - Debugging

    /// Debug flag for printing tags; off by default.
    static let shouldPrintTags: Bool = false

    /// Debug flag for printing warnings; off by default.
    static let shouldPrintWarnings: Bool = false
}
16 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/HTMLTraverser.swift:
--------------------------------------------------------------------------------
1 | //
2 | // HTMLTraverser.swift
3 | //
4 | // Created by Reid Nantes on 2018-05-27.
5 | // Copyright © 2018 Reid Nantes. All rights reserved.
6 | //
7 |
8 | import Foundation
9 |
/// Walks a parsed node tree and returns the nodes matched by a path of selectors.
///
/// A selector path is applied one selector per tree depth: the first selector is tested
/// against the supplied nodes, the second against the children of those matches, and so on.
public struct HTMLTraverser {

    /// Returns `true` when at least one node matches the whole selector path.
    ///
    /// - Parameters:
    ///   - parsedNodes: The nodes to start matching from.
    ///   - nodeSelectorPath: One selector per depth level.
    public static func hasMatchingNode(in parsedNodes: [Node], matching nodeSelectorPath: [NodeSelector]) -> Bool {
        return !findNodes(in: parsedNodes, matching: nodeSelectorPath).isEmpty
    }

    /// Same as `findNodes(in:matching:)`, keeping only the results that are `Element`s.
    public static func findElements(in parsedNodes: [Node], matching nodeSelectorPath: [NodeSelector]) -> [Element] {
        return findNodes(in: parsedNodes, matching: nodeSelectorPath).compactMap { $0 as? Element }
    }

    /// Returns the nodes matched by the last selector of `nodeSelectorPath`.
    ///
    /// An empty path returns `parsedNodes` unchanged. As soon as one depth yields no
    /// matches the walk stops and an empty array is returned.
    ///
    /// - Parameters:
    ///   - parsedNodes: The nodes to start matching from.
    ///   - nodeSelectorPath: One selector per depth level.
    /// - Returns: The nodes that satisfy the final selector.
    public static func findNodes(in parsedNodes: [Node], matching nodeSelectorPath: [NodeSelector]) -> [Node] {
        // start with every node matching
        var matchingNodes = parsedNodes
        let lastIndex = nodeSelectorPath.count - 1

        for (index, selector) in nodeSelectorPath.enumerated() {
            // nothing left to match against: the remaining selectors cannot succeed
            guard !matchingNodes.isEmpty else { break }

            // every selector except the last one hands the children of its matches to the next depth
            matchingNodes = getMatchesAtDepth(nodeSelector: selector,
                                              nodesAtDepth: matchingNodes,
                                              shouldReturnChildrenOfMatches: index != lastIndex)
        }

        return matchingNodes
    }

    /// Applies one selector to the nodes of a single depth.
    ///
    /// - Parameters:
    ///   - nodeSelector: The selector for this depth.
    ///   - nodesAtDepth: The candidate nodes at this depth.
    ///   - shouldReturnChildrenOfMatches: When `true`, the child nodes of matching elements
    ///     are returned in place of the matches themselves (matching non-element nodes
    ///     contribute nothing).
    private static func getMatchesAtDepth(nodeSelector: NodeSelector, nodesAtDepth: [Node], shouldReturnChildrenOfMatches: Bool) -> [Node] {
        var matchesAtDepth = [Node]()

        // position counts only the nodes that already satisfy the selector
        var currentPosition = 0

        for node in nodesAtDepth where nodeSelector.testAgainst(node) {
            let isPositionAccepted = nodeSelector.position.testAgainst(currentPosition)
            currentPosition += 1

            guard isPositionAccepted else { continue }

            if shouldReturnChildrenOfMatches {
                if let element = node as? Element {
                    matchesAtDepth.append(contentsOf: element.childNodes)
                }
            } else {
                matchesAtDepth.append(node)
            }
        }

        return matchesAtDepth
    }

}
91 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/AttributeSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // AttributeSelector.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-14.
6 | //
7 |
8 | import Foundation
9 |
/// Selects elements by one attribute: the attribute's name plus constraints on its value.
public class AttributeSelector: ValueStringSelectorBuilder {
    /// Name of the attribute to look up on the element.
    private(set) public var name: String
    /// Constraints applied to the attribute's value.
    private(set) public var value = StringSelector()

    /// Creates a selector for the attribute called `name`.
    public init(name: String) {
        self.name = name
    }

    /// Adds `value` to the value selector via `withString` and returns `self` for chaining.
    public func withValue(_ value: String) -> Self {
        self.value.withString(value)
        return self
    }

    /// Returns true if the element satisfies the selector.
    ///
    /// The element's value for `name` is looked up and handed to the value selector.
    public func testSelector(against element: Element) -> Bool {
        let candidate = element.attributeValue(for: name)
        return value.testAgainst(candidate)
    }

}
35 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/ClassSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ClassSelector.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-22.
6 | //
7 |
8 | import Foundation
9 |
// MARK: - Class-name builders

extension ElementSelector {

    /// Matches if the element has the given class name.
    /// The name is added to the "any of" list, so repeated calls widen the match.
    public func withClassName(_ className: String) -> Self {
        classNameSelector.hasClassNameAny.appendOrInit(className)
        return self
    }

    /// Matches if the element has at least one of the given class names.
    public func withClassNamesAny(_ classNames: [String]) -> Self {
        classNameSelector.hasClassNameAny.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Matches if the element has every one of the given class names.
    public func withClassNamesAll(_ classNames: [String]) -> Self {
        classNameSelector.hasClassNamesAll.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Matches if the element has exactly the given class names and no others.
    public func withClassNamesExact(_ classNames: [String]) -> Self {
        classNameSelector.hasClassNamesExact.appendOrInit(contentsOf: classNames)
        return self
    }

    // MARK: Negatives

    /// Does not match if the element has the given class name.
    public func withoutClassName(_ className: String) -> Self {
        classNameSelector.doesNotHaveClassNameAny.appendOrInit(className)
        return self
    }

    /// Does not match if the element has any of the given class names.
    public func withoutClassNameAny(_ classNames: [String]) -> Self {
        classNameSelector.doesNotHaveClassNameAny.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Does not match if the element has all of the given class names.
    public func withoutClassNameAll(_ classNames: [String]) -> Self {
        classNameSelector.doesNotHaveClassNamesAll.appendOrInit(contentsOf: classNames)
        return self
    }

    /// Does not match if the element has exactly the given class names.
    public func withoutClassNameExact(_ classNames: [String]) -> Self {
        classNameSelector.doesNotHaveClassNamesExact.appendOrInit(contentsOf: classNames)
        return self
    }

}
61 |
/// Holds the class-name constraints of an element selector and tests elements against them.
///
/// Every constraint list is optional; a `nil` list means "no constraint of that kind".
/// All comparisons treat class names as a set, so order and repetition do not matter.
internal final class ClassSelector {
    /// Matches only if the element has at least one of these class names.
    var hasClassNameAny: [String]?
    /// Matches only if the element has every one of these class names.
    var hasClassNamesAll: [String]?
    /// Matches only if the element's class names are exactly these.
    var hasClassNamesExact: [String]?

    // negatives
    /// Does not match if any of these class names are present.
    var doesNotHaveClassNameAny: [String]?
    /// Does not match if all of these class names are present.
    var doesNotHaveClassNamesAll: [String]?
    /// Does not match if the element has exactly these class names.
    var doesNotHaveClassNamesExact: [String]?

    /// Returns true if the element satisfies every configured class-name constraint.
    internal func testAgainst(_ element: Element) -> Bool {
        let classNamesSet = Set(element.classNames)

        if let anyOf = hasClassNameAny, classNamesSet.isDisjoint(with: anyOf) {
            return false
        }

        if let allOf = hasClassNamesAll, !classNamesSet.isSuperset(of: allOf) {
            return false
        }

        // Compare as sets: comparing the array's count against the set's count would make a
        // selector containing a repeated class name impossible to satisfy.
        if let exact = hasClassNamesExact, Set(exact) != classNamesSet {
            return false
        }

        if let noneOf = doesNotHaveClassNameAny, !classNamesSet.isDisjoint(with: noneOf) {
            return false
        }

        if let notAllOf = doesNotHaveClassNamesAll, classNamesSet.isSuperset(of: notAllOf) {
            return false
        }

        if let notExact = doesNotHaveClassNamesExact, Set(notExact) == classNamesSet {
            return false
        }

        return true
    }
}
119 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/IntSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // IntSelector.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-11-03.
6 | //
7 |
8 | import Foundation
9 |
/// Collects constraints on an integer value; each list stays `nil` until a constraint
/// of that kind is added.
public final class IntSelector {
    /// The value must equal one of these.
    private(set) var anyValues: [Int]?
    /// The value must be less than every one of these.
    private(set) var lessThanValues: [Int]?
    /// The value must be greater than every one of these.
    private(set) var greaterThanValues: [Int]?

    // MARK: Negatives

    /// The value must not equal any of these.
    private(set) var notAnyValues: [Int]?
}
18 |
// MARK: - Building and testing

internal extension IntSelector {

    /// Accepts `value` as one of the allowed values.
    func withValue(_ value: Int) {
        anyValues.appendOrInit(value)
    }

    /// Accepts every entry of `values` as an allowed value.
    func whenValueIsAny(_ values: [Int]) {
        anyValues.appendOrInit(contentsOf: values)
    }

    /// Requires the tested value to be less than `value`.
    func whenValueIsLessThan(_ value: Int) {
        lessThanValues.appendOrInit(value)
    }

    /// Requires the tested value to be greater than `value`.
    func whenValueIsGreaterThan(_ value: Int) {
        greaterThanValues.appendOrInit(value)
    }

    // negatives

    /// Rejects the tested value when it equals `value`.
    func whenValueIsNot(_ value: Int) {
        notAnyValues.appendOrInit(value)
    }

    /// Rejects the tested value when it equals any entry of `values`.
    func whenValueIsNotAny(_ values: [Int]) {
        notAnyValues.appendOrInit(contentsOf: values)
    }

    /// Returns true if `value` satisfies every configured constraint.
    ///
    /// A `nil` value passes only when no positive constraint (allowed values, upper bounds,
    /// lower bounds) has been configured; the exclusion list is not consulted for `nil`.
    func testAgainst(_ value: Int?) -> Bool {
        guard let candidate = value else {
            return anyValues == nil && lessThanValues == nil && greaterThanValues == nil
        }

        if let allowed = anyValues, !allowed.contains(candidate) {
            return false
        }

        // candidate must be strictly below every upper bound
        if let upperBounds = lessThanValues, upperBounds.contains(where: { candidate >= $0 }) {
            return false
        }

        // candidate must be strictly above every lower bound
        if let lowerBounds = greaterThanValues, lowerBounds.contains(where: { candidate <= $0 }) {
            return false
        }

        if let excluded = notAnyValues, excluded.contains(candidate) {
            return false
        }

        return true
    }
}
81 |
82 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/NodeSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // NodeSelector.swift
3 | // HTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-05-27.
6 | // Copyright © 2018 Reid Nantes. All rights reserved.
7 | //
8 |
9 | import Foundation
10 |
11 | /// based on Xpath and selector
/// A selector for a single node in a selector path, loosely modelled on XPath and CSS selectors.
///
/// Conforming types are reference types and also expose a `position` selector through
/// `PositionIntSelectorBuilder`.
public protocol NodeSelector: AnyObject, PositionIntSelectorBuilder {

    /// Tests this selector against the given node.
    ///
    /// - Parameter node: The node to test.
    /// - Returns: `true` if the node satisfies the selector.
    func testAgainst(_ node: Node) -> Bool
}
17 |
18 | //extension NodeSelector {
19 | //
20 | //// /// Matches when position is less than the given value
21 | //// func whenPositionLessThan(_ position: Int) -> NodeSelector {
22 | ////
23 | //// }
24 | ////
25 | //// /// Matches when position is greater than the given value
26 | //// func whenPositionGreaterThan(_ position: Int) -> NodeSelector {
27 | ////
28 | //// }
29 | //}
30 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/NodeSelectors/CDataSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // CDataSelector.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-29.
6 | //
7 |
8 | import Foundation
9 |
/// Selects `CData` nodes, with optional constraints on their text.
///
/// `position` is not examined by `testAgainst(_:)`; `HTMLTraverser` evaluates it while
/// walking the matches.
public final class CDataSelector: NodeSelector, TextStringSelectorBuilder {
    private(set) public var position = IntSelector()
    private(set) public var text = StringSelector()

    /// Creates a selector with no constraints configured.
    public init() {}

    /// Returns true if `node` is a `CData` node whose text satisfies the text selector.
    public func testAgainst(_ node: Node) -> Bool {
        // anything that is not a CData node can never match
        guard let cdata = node as? CData else {
            return false
        }

        return text.testAgainst(cdata.text)
    }

}
32 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/NodeSelectors/CommentSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // CommentSelector.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-31.
6 | //
7 |
8 | import Foundation
9 |
/// Selects `Comment` nodes, with optional constraints on their text.
///
/// `position` is not examined by `testAgainst(_:)`; `HTMLTraverser` evaluates it while
/// walking the matches.
public final class CommentSelector: NodeSelector, TextStringSelectorBuilder {
    private(set) public var position = IntSelector()
    private(set) public var text = StringSelector()

    /// Creates a selector with no constraints configured.
    public init() {}

    /// Returns true if `node` is a `Comment` whose text satisfies the text selector.
    public func testAgainst(_ node: Node) -> Bool {
        // anything that is not a Comment can never match
        guard let comment = node as? Comment else {
            return false
        }

        return text.testAgainst(comment.text)
    }
}
31 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/NodeSelectors/ElementSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ElementSelector.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-29.
6 | //
7 |
8 | import Foundation
9 |
/// Selects `Element` nodes by tag name, class names, attributes and child nodes.
///
/// NOTE(review): `testAgainst(_:)` never reads the `id` string selector, so constraints added
/// through the `IdStringSelectorBuilder` methods (e.g. `whenIdIsAny`) appear to have no effect;
/// only `withId(_:)` declared in this class is applied, as an attribute selector — confirm.
public class ElementSelector: NodeSelector, TagNameStringSelectorBuilder, IdStringSelectorBuilder {
    private(set) public var position = IntSelector()

    // MARK: String selectors

    private(set) public var tagName = StringSelector()
    private(set) public var id = StringSelector()

    // MARK: Class-name selector

    private(set) var classNameSelector = ClassSelector()

    // MARK: Attribute selectors

    private(set) public var attributes: [AttributeSelector]?

    // MARK: Child-node selectors

    /// Each selector must match at least one direct child node.
    private(set) public var childNodeSelectors: [NodeSelector]?
    /// Each selector path must match somewhere below the element, starting at its child nodes.
    private(set) public var childNodeSelectorPathsAll: [[NodeSelector]]?

    /// Creates a selector with no constraints configured.
    public init() {}

    // MARK: Builders

    /// Selects the element only if it satisfies the given attribute selector.
    public func withAttribute(_ attributeSelector: AttributeSelector) -> ElementSelector {
        attributes.appendOrInit(attributeSelector)
        return self
    }

    /// Selects the element only if its `id` attribute has the given value.
    public func withId(_ id: String) -> ElementSelector {
        let idAttribute = AttributeSelector(name: "id").withValue(id)
        attributes.appendOrInit(idAttribute)
        return self
    }

    /// Selects the element only if the given selector path matches, starting at its child nodes.
    public func withChildNodeSelectorPath(_ childNodeSelectorPath: [NodeSelector]) -> Self {
        childNodeSelectorPathsAll.appendOrInit(childNodeSelectorPath)
        return self
    }

    /// Selects the element only if one of its child nodes matches the given element selector.
    public func withChildElement(_ elementSelector: ElementSelector) -> Self {
        childNodeSelectors.appendOrInit(elementSelector)
        return self
    }

    /// Selects the element only if one of its child nodes matches the given text-node selector.
    public func withChildTextNode(_ textNodeSelector: TextNodeSelector) -> Self {
        childNodeSelectors.appendOrInit(textNodeSelector)
        return self
    }

    /// Selects the element only if one of its child nodes matches the given comment selector.
    public func withChildCommentNode(_ commentNodeSelector: CommentSelector) -> Self {
        childNodeSelectors.appendOrInit(commentNodeSelector)
        return self
    }

    /// Selects the element only if one of its child nodes matches the given CDATA selector.
    public func withChildCDataNode(_ cDataSelector: CDataSelector) -> Self {
        childNodeSelectors.appendOrInit(cDataSelector)
        return self
    }

    // MARK: Testing

    /// Returns true if `node` is an `Element` that satisfies every configured constraint.
    ///
    /// Checks run in this order and stop at the first failure: tag name, class names,
    /// attributes, single child selectors, child selector paths.
    public func testAgainst(_ node: Node) -> Bool {
        // anything that is not an element can never match
        guard let element = node as? Element else {
            return false
        }

        guard tagName.testAgainst(element.tagName) else {
            return false
        }

        guard classNameSelector.testAgainst(element) else {
            return false
        }

        // attributes (including an id added through withId)
        if let attributeSelectors = attributes,
           attributeSelectors.contains(where: { !$0.testSelector(against: element) }) {
            return false
        }

        // every single child selector needs at least one matching child node
        if let childSelectors = childNodeSelectors,
           childSelectors.contains(where: { !HTMLTraverser.hasMatchingNode(in: element.childNodes, matching: [$0]) }) {
            return false
        }

        // every child selector path needs at least one match
        if let childPaths = childNodeSelectorPathsAll,
           childPaths.contains(where: { !HTMLTraverser.hasMatchingNode(in: element.childNodes, matching: $0) }) {
            return false
        }

        return true
    }
}
106 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/NodeSelectors/TextNodeSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // TextNodeSelector.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-30.
6 | //
7 |
8 | import Foundation
9 |
/// Selects `TextNode`s, with optional constraints on their text.
///
/// `position` is not examined by `testAgainst(_:)`; `HTMLTraverser` evaluates it while
/// walking the matches.
public final class TextNodeSelector: NodeSelector, TextStringSelectorBuilder {
    private(set) public var text = StringSelector()
    private(set) public var position = IntSelector()

    /// Creates a selector with no constraints configured.
    public init() {}

    /// Returns true if `node` is a `TextNode` whose text satisfies the text selector.
    public func testAgainst(_ node: Node) -> Bool {
        // anything that is not a TextNode can never match
        guard let textNode = node as? TextNode else {
            return false
        }

        return text.testAgainst(textNode.text)
    }

}
32 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/IdStringSelectorBuilder.swift:
--------------------------------------------------------------------------------
1 | //
2 | // IdStringSelectorBuilder.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-11-02.
6 | //
7 |
8 | import Foundation
9 |
/// Adopted by selectors that carry an `id` string selector.
/// The chainable id-builder methods are supplied by the protocol extension.
public protocol IdStringSelectorBuilder {
    /// The string selector that collects the id constraints.
    var id: StringSelector { get }
}
13 |
// MARK: - Chainable id builders

public extension IdStringSelectorBuilder {

    /// Matches when the id equals the given string.
    func withId(_ id: String) -> Self {
        // `self.` is required here: the parameter shadows the `id` property
        self.id.withString(id)
        return self
    }

    /// Matches when the id equals any of the given strings.
    func whenIdIsAny(_ ids: [String]) -> Self {
        id.whenStringIsAny(ids)
        return self
    }

    /// Matches when the id contains the given keyword.
    func containsId(_ keyword: String) -> Self {
        id.whenStringContainsAny([keyword])
        return self
    }

    /// Matches when the id contains any of the given keywords.
    func whenIdContainsAny(_ keywords: [String]) -> Self {
        id.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the id contains all of the given keywords.
    func whenIdContainsAll(_ keywords: [String]) -> Self {
        id.whenStringContainsAll(keywords)
        return self
    }

    // MARK: Negatives

    /// Does not match when the id equals the given string.
    func whenIdIsNot(_ id: String) -> Self {
        // `self.` is required here: the parameter shadows the `id` property
        self.id.whenStringIsNot(id)
        return self
    }

    /// Does not match when the id equals any of the given strings.
    func whenIdIsNotAny(_ ids: [String]) -> Self {
        id.whenStringIsNotAny(ids)
        return self
    }

    /// Does not match when the id contains the given keyword.
    func whenIdDoesNotContain(_ keyword: String) -> Self {
        id.whenStringDoesNotContainAny([keyword])
        return self
    }

    /// Does not match when the id contains any of the given keywords.
    func whenIdDoesNotContainAny(_ keywords: [String]) -> Self {
        id.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match when the id contains all of the given keywords.
    func whenIdDoesNotContainAll(_ keywords: [String]) -> Self {
        id.whenStringDoesNotContainAll(keywords)
        return self
    }

}
77 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/PositionIntSelectorBuilder.swift:
--------------------------------------------------------------------------------
1 | //
2 | // PositionIntSelectorBuilder.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-11-03.
6 | //
7 |
8 | import Foundation
9 |
10 |
/// Adopted by selectors that carry a `position` integer selector.
/// The chainable position-builder methods are supplied by the protocol extension.
public protocol PositionIntSelectorBuilder {
    /// The integer selector that collects the position constraints.
    var position: IntSelector { get }
}
14 |
// MARK: - Chainable position builders

public extension PositionIntSelectorBuilder {

    /// Matches when the position equals the given value.
    func atPosition(_ value: Int) -> Self {
        position.withValue(value)
        return self
    }

    /// Matches when the position equals any of the given values.
    func whenPositionIsAny(_ values: [Int]) -> Self {
        position.whenValueIsAny(values)
        return self
    }

    /// Matches when the position is less than the given value.
    func whenPositionIsLessThan(_ value: Int) -> Self {
        position.whenValueIsLessThan(value)
        return self
    }

    /// Matches when the position is greater than the given value.
    func whenPositionIsGreaterThan(_ value: Int) -> Self {
        position.whenValueIsGreaterThan(value)
        return self
    }

    // MARK: Negatives

    /// Does not match when the position equals the given value.
    func whenPositionIsNot(_ value: Int) -> Self {
        position.whenValueIsNot(value)
        return self
    }

    /// Does not match when the position equals any of the given values.
    func whenPositionIsNotAny(_ values: [Int]) -> Self {
        position.whenValueIsNotAny(values)
        return self
    }

}
53 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/TagNameStringSelectorBuilder.swift:
--------------------------------------------------------------------------------
1 | //
2 | // TagNameStringSelectorBuilder.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-11-02.
6 | //
7 |
8 | import Foundation
9 |
/// Adopted by selector builders that can constrain a target by its tag name.
public protocol TagNameStringSelectorBuilder {
    /// Selector that receives every tag-name criterion registered through the builder methods.
    var tagName: StringSelector { get }
}

public extension TagNameStringSelectorBuilder {

    // MARK: - Positive criteria

    /// Matches when the tag name equals the given value.
    ///
    /// - Returns: The same builder, for chaining.
    func withTagName(_ value: String) -> Self {
        tagName.withString(value)
        return self
    }

    /// Matches when the tag name equals any of the given values.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameIsAny(_ values: [String]) -> Self {
        tagName.whenStringIsAny(values)
        return self
    }

    /// Matches when the tag name contains the given value.
    ///
    /// Single-keyword form of `whenTagNameContainsAny(_:)`.
    ///
    /// - Returns: The same builder, for chaining.
    func containingTagName(_ value: String) -> Self {
        return whenTagNameContainsAny([value])
    }

    /// Matches when the tag name contains any of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameContainsAny(_ keywords: [String]) -> Self {
        tagName.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the tag name contains all of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameContainsAll(_ keywords: [String]) -> Self {
        tagName.whenStringContainsAll(keywords)
        return self
    }

    // MARK: - Negative criteria

    /// Does not match when the tag name equals the given value.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameIsNot(_ value: String) -> Self {
        tagName.whenStringIsNot(value)
        return self
    }

    /// Does not match if the tag name equals any of the given values.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameIsNotAny(_ values: [String]) -> Self {
        tagName.whenStringIsNotAny(values)
        return self
    }

    /// Does not match if the tag name contains the given keyword.
    ///
    /// Single-keyword form of `whenTagNameDoesNotContainAny(_:)`.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameDoesNotContain(_ keyword: String) -> Self {
        return whenTagNameDoesNotContainAny([keyword])
    }

    /// Does not match if the tag name contains any of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameDoesNotContainAny(_ keywords: [String]) -> Self {
        tagName.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match if the tag name contains all of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTagNameDoesNotContainAll(_ keywords: [String]) -> Self {
        tagName.whenStringDoesNotContainAll(keywords)
        return self
    }
}
76 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/TextStringSelectorBuilder.swift:
--------------------------------------------------------------------------------
1 | //
2 | // TextStringSelectorBuilder.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-31.
6 | //
7 |
8 | import Foundation
9 |
/// Adopted by selector builders that can constrain a target by its text.
public protocol TextStringSelectorBuilder {
    /// Selector that receives every text criterion registered through the builder methods.
    var text: StringSelector { get }
}

public extension TextStringSelectorBuilder {

    // MARK: - Positive criteria

    /// Matches when the text equals the given value.
    ///
    /// - Returns: The same builder, for chaining.
    func withText(_ value: String) -> Self {
        text.withString(value)
        return self
    }

    /// Matches when the text equals any of the given values.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextIsAny(_ values: [String]) -> Self {
        text.whenStringIsAny(values)
        return self
    }

    /// Matches when the text contains the given value.
    ///
    /// Single-keyword form of `whenTextContainsAny(_:)`.
    ///
    /// - Returns: The same builder, for chaining.
    func containingText(_ value: String) -> Self {
        return whenTextContainsAny([value])
    }

    /// Matches when the text contains any of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextContainsAny(_ keywords: [String]) -> Self {
        text.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the text contains all of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextContainsAll(_ keywords: [String]) -> Self {
        text.whenStringContainsAll(keywords)
        return self
    }

    // MARK: - Negative criteria

    /// Does not match when the text equals the given value.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextIsNot(_ value: String) -> Self {
        text.whenStringIsNot(value)
        return self
    }

    /// Does not match if the text equals any of the given values.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextIsNotAny(_ values: [String]) -> Self {
        text.whenStringIsNotAny(values)
        return self
    }

    /// Does not match if the text contains the given keyword.
    ///
    /// Single-keyword form of `whenTextDoesNotContainAny(_:)`.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextDoesNotContain(_ keyword: String) -> Self {
        return whenTextDoesNotContainAny([keyword])
    }

    /// Does not match if the text contains any of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextDoesNotContainAny(_ keywords: [String]) -> Self {
        text.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match if the text contains all of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenTextDoesNotContainAll(_ keywords: [String]) -> Self {
        text.whenStringDoesNotContainAll(keywords)
        return self
    }
}
76 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/SelectorBuilders/ValueStringSelectorBuilder.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ValueStringSelectorBuilder.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-31.
6 | //
7 |
8 | import Foundation
9 |
/// Adopted by selector builders that can constrain a target by its value.
public protocol ValueStringSelectorBuilder {
    /// Selector that receives every value criterion registered through the builder methods.
    var value: StringSelector { get }
}

public extension ValueStringSelectorBuilder {

    // MARK: - Positive criteria

    /// Matches when the target value equals the given string.
    ///
    /// - Returns: The same builder, for chaining.
    func withValue(_ value: String) -> Self {
        // `self.` is required here: the parameter shadows the `value` selector property.
        self.value.withString(value)
        return self
    }

    /// Matches when the target value equals any of the given strings.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueIsAny(_ values: [String]) -> Self {
        value.whenStringIsAny(values)
        return self
    }

    /// Matches when the target value contains the given string.
    ///
    /// Single-keyword form of `whenValueContainsAny(_:)`.
    ///
    /// - Returns: The same builder, for chaining.
    func containingValue(_ value: String) -> Self {
        return whenValueContainsAny([value])
    }

    /// Matches when the target value contains any of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueContainsAny(_ keywords: [String]) -> Self {
        value.whenStringContainsAny(keywords)
        return self
    }

    /// Matches when the target value contains all of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueContainsAll(_ keywords: [String]) -> Self {
        value.whenStringContainsAll(keywords)
        return self
    }

    // MARK: - Negative criteria

    /// Does not match when the target value equals the given string.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueIsNot(_ value: String) -> Self {
        // `self.` is required here: the parameter shadows the `value` selector property.
        self.value.whenStringIsNot(value)
        return self
    }

    /// Does not match if the target value equals any of the given strings.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueIsNotAny(_ values: [String]) -> Self {
        value.whenStringIsNotAny(values)
        return self
    }

    /// Does not match if the target value contains the given keyword.
    ///
    /// Single-keyword form of `whenValueDoesNotContainAny(_:)`.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueDoesNotContain(_ keyword: String) -> Self {
        return whenValueDoesNotContainAny([keyword])
    }

    /// Does not match if the target value contains any of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueDoesNotContainAny(_ keywords: [String]) -> Self {
        value.whenStringDoesNotContainAny(keywords)
        return self
    }

    /// Does not match if the target value contains all of the given keywords.
    ///
    /// - Returns: The same builder, for chaining.
    func whenValueDoesNotContainAll(_ keywords: [String]) -> Self {
        value.whenStringDoesNotContainAll(keywords)
        return self
    }
}
76 |
--------------------------------------------------------------------------------
/Sources/SwiftHTMLParser/Traverser/Selectors/StringSelector.swift:
--------------------------------------------------------------------------------
1 | //
2 | // StringSelector.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-31.
6 | //
7 |
8 | import Foundation
9 |
/// Holds the string-matching criteria collected by the selector builders.
///
/// Every criterion starts out as `nil`, meaning "not configured".
public final class StringSelector {

    // MARK: - Positive criteria

    /// Matches if the target is any of the keywords
    private(set) var stringIsAny: [String]?

    /// Matches if the target contains any of the keywords
    private(set) var stringContainsAny: [String]?

    /// Matches if the target contains all the keywords
    private(set) var stringContainsAll: [String]?

    // MARK: - Negative criteria

    /// Does not match if the target is any of the keywords
    private(set) var stringIsNotAny: [String]?

    /// Does not match if the target contains any of the keywords
    private(set) var stringDoesNotContainAny: [String]?

    /// Does not match if the target contains all of the keywords
    private(set) var stringDoesNotContainAll: [String]?

    /// Creates a selector with no criteria configured.
    public init() {}
}
28 |
29 |
internal extension StringSelector {

    // MARK: - Positive criteria

    /// Matches when the target equals the given value.
    ///
    /// The value is added to `stringIsAny` through `appendOrInit`.
    func withString(_ value: String) {
        stringIsAny.appendOrInit(value)
    }

    /// Matches when the target equals any of the given values.
    func whenStringIsAny(_ values: [String]) {
        stringIsAny.appendOrInit(contentsOf: values)
    }

    /// Matches when the target contains any of the given keywords.
    func whenStringContainsAny(_ keywords: [String]) {
        stringContainsAny.appendOrInit(contentsOf: keywords)
    }

    /// Matches when the target contains all of the given keywords.
    func whenStringContainsAll(_ keywords: [String]) {
        stringContainsAll.appendOrInit(contentsOf: keywords)
    }

    // MARK: - Negative criteria

    /// Does not match when the target equals the given value.
    func whenStringIsNot(_ value: String) {
        stringIsNotAny.appendOrInit(value)
    }

    /// Does not match if the target equals any of the given values.
    func whenStringIsNotAny(_ values: [String]) {
        stringIsNotAny.appendOrInit(contentsOf: values)
    }

    /// Does not match if the target contains any of the given values.
    func whenStringDoesNotContainAny(_ values: [String]) {
        stringDoesNotContainAny.appendOrInit(contentsOf: values)
    }

    /// Does not match if the target contains all of the given values.
    func whenStringDoesNotContainAll(_ values: [String]) {
        stringDoesNotContainAll.appendOrInit(contentsOf: values)
    }
}
69 |
extension StringSelector {
    /// Evaluates every configured criterion against the given string.
    ///
    /// - Parameter string: The candidate to test; may be `nil`.
    /// - Returns: `true` when no configured criterion rejects the candidate.
    ///   A `nil` candidate passes only when no positive criterion
    ///   (`stringIsAny`, `stringContainsAny`, `stringContainsAll`) is configured;
    ///   negative criteria are not consulted for a `nil` candidate.
    func testAgainst(_ string: String?) -> Bool {
        guard let candidate = string else {
            // Nothing to compare against: only a selector without positive criteria can pass.
            return stringIsAny == nil && stringContainsAny == nil && stringContainsAll == nil
        }

        // positives

        // fails unless the candidate equals one of the accepted values
        if let acceptedValues = stringIsAny, !acceptedValues.contains(candidate) {
            return false
        }

        // fails unless the candidate contains every required keyword
        if let requiredKeywords = stringContainsAll,
            !requiredKeywords.allSatisfy({ candidate.contains($0) }) {
            return false
        }

        // fails unless the candidate contains at least one of the keywords
        if let alternativeKeywords = stringContainsAny,
            !alternativeKeywords.contains(where: { candidate.contains($0) }) {
            return false
        }

        // negatives

        // fails if the candidate equals any of the rejected values
        if let rejectedValues = stringIsNotAny, rejectedValues.contains(candidate) {
            return false
        }

        // fails if the candidate contains any of the rejected keywords
        if let rejectedKeywords = stringDoesNotContainAny,
            rejectedKeywords.contains(where: { candidate.contains($0) }) {
            return false
        }

        // fails if the candidate contains all of the rejected keywords
        if let rejectedCombination = stringDoesNotContainAll,
            rejectedCombination.allSatisfy({ candidate.contains($0) }) {
            return false
        }

        return true
    }
}
124 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/AppendableTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // AppendableTests.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-11-05.
6 | //
7 |
8 | import Foundation
9 |
10 | import XCTest
11 | @testable import SwiftHTMLParser
12 |
13 | final class AppendableTests: XCTestCase {
14 |
15 | func testAppendOrIntialize() {
16 | // single value
17 | var optArray: [String]? = nil
18 | optArray.appendOrInit("hello appendOrInit")
19 | XCTAssertEqual(optArray![0], "hello appendOrInit")
20 |
21 | // multiple values
22 | var optArray2: [String]? = nil
23 | optArray2.appendOrInit(contentsOf: ["sunny", "rainy", "cloudy"])
24 | XCTAssertEqual(optArray2?.count, 3)
25 |
26 | var optSet: SetThis is a Heading
"))
32 | XCTAssertTrue(htmlString.contains(""))
33 | }
34 |
/// Parses `elements-simple.html` and checks the `<title>` and `<p>` elements
/// found by walking tag-name selector paths.
func testSimple() {
    guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
        .appendingPathComponent("elements-simple.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    // get html string from file
    guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
        XCTFail("Could not open file URL: \(fileURL)")
        return
    }

    // create object from raw html file
    guard let nodeArray = try? HTMLParser.parse(htmlString) else {
        XCTFail("Could not parse HTML")
        return
    }

    XCTAssertEqual(nodeArray.count, 2)

    // html > head > title
    let titlePath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("head"),
        ElementSelector().withTagName("title")
    ]
    let titleElements = HTMLTraverser.findElements(in: nodeArray, matching: titlePath)

    // The count assertion does not stop the test, so read elements through
    // optional access rather than subscripts that would trap on a short array.
    XCTAssertEqual(titleElements.count, 1)
    XCTAssertEqual(titleElements.first?.textNodes.first?.text, "Test Simple Title")

    // html > body > p
    let paragraphPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("p")
    ]
    let paragraphElements = HTMLTraverser.findElements(in: nodeArray, matching: paragraphPath)

    XCTAssertEqual(paragraphElements.count, 3)
    XCTAssertEqual(paragraphElements.dropFirst().first?.textNodes.first?.text,
                   "This is the second paragraph.")
}
86 |
/// Parses `elements-quotes.html` and checks that quote characters inside
/// paragraph text are preserved.
func testQuotes() {
    guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
        .appendingPathComponent("elements-quotes.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    // get html string from file
    guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
        XCTFail("Could not open file URL: \(fileURL)")
        return
    }

    // create object from raw html file
    guard let nodeArray = try? HTMLParser.parse(htmlString) else {
        XCTFail("Could not parse HTML")
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("p")
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath)

    XCTAssertEqual(matchingElements.count, 4)

    // Compare the first text of every match in one assertion. This avoids the
    // subscripts and force-unwraps that would trap when fewer elements
    // (or no text nodes) are found.
    let firstTexts = matchingElements.map { $0.textNodes.first?.text }
    XCTAssertEqual(firstTexts, [
        "'John \"ShotGun\" Nelson'",
        "\"John 'ShotGun' Nelson\"",
        "It's alright",
        "I love the \" (double Quote) character"
    ])
}
130 |
/// Parses `empty-element.html` and checks that the `<form>` element is found
/// with exactly one child element.
func testClosingEmptyTag() {
    guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
        .appendingPathComponent("empty-element.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    // get html string from file
    guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
        XCTFail("Could not open file URL: \(fileURL)")
        return
    }

    // create object from raw html file
    guard let nodeArray = try? HTMLParser.parse(htmlString) else {
        XCTFail("Could not parse HTML")
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("form")
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath)

    XCTAssertEqual(matchingElements.count, 1)
    // `first?` instead of `[0]`: the count assertion above does not stop the
    // test, so a subscript would trap if no element matched.
    XCTAssertEqual(matchingElements.first?.childElements.count, 1)
}
169 |
/// Parses `element-name-on-new-line.html` and checks that the `<div>` and its
/// `name` / `type` attributes are read correctly.
func testElementNameOnNewLine() {
    guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
        .appendingPathComponent("element-name-on-new-line.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    // get html string from file
    guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
        XCTFail("Could not open file URL: \(fileURL)")
        return
    }

    // create object from raw html file
    guard let nodeArray = try? HTMLParser.parse(htmlString) else {
        XCTFail("Could not parse HTML")
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("div")
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath)

    XCTAssertEqual(matchingElements.count, 1)

    let div = matchingElements.first
    XCTAssertEqual(div?.tagName, "div")
    XCTAssertEqual(div?.attributeValue(for: "name"), "bob")
    XCTAssertEqual(div?.attributeValue(for: "type"), "email")
}
210 |
/// Parses `element-unclosed-end-tag.html` and checks that the `<div>` is still
/// found with exactly one child element.
func testElementUnclosedEndTag() {
    guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
        .appendingPathComponent("element-unclosed-end-tag.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    // get html string from file
    guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
        XCTFail("Could not open file URL: \(fileURL)")
        return
    }

    // create object from raw html file
    guard let nodeArray = try? HTMLParser.parse(htmlString) else {
        XCTFail("Could not parse HTML")
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("div")
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath)

    XCTAssertEqual(matchingElements.count, 1)

    let div = matchingElements.first
    XCTAssertEqual(div?.tagName, "div")
    XCTAssertEqual(div?.childElements.count, 1)
}
250 |
/// Parses the stray-end-tag fixture and checks that the `<div>` is still found
/// with exactly one child element.
func testElementStrayEndTag() {
    // NOTE: the fixture name is spelled "elemnent" here; it is kept unchanged
    // because it has to match the file on disk.
    guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
        .appendingPathComponent("elemnent-stray-end-tag.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    // get html string from file
    guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
        XCTFail("Could not open file URL: \(fileURL)")
        return
    }

    // create object from raw html file
    guard let nodeArray = try? HTMLParser.parse(htmlString) else {
        XCTFail("Could not parse HTML")
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("div")
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath)

    XCTAssertEqual(matchingElements.count, 1)

    let div = matchingElements.first
    XCTAssertEqual(div?.tagName, "div")
    XCTAssertEqual(div?.childElements.count, 1)
}
290 |
/// Parses the stray-`</html>`-end-tag fixture and checks that the `<div>` is
/// still found with exactly one child element.
func testElementStrayHTMLEndTag() {
    // NOTE: the fixture name is spelled "elemnent" here; it is kept unchanged
    // because it has to match the file on disk.
    guard let fileURL = TestFileURLs.elementsTestFilesDirectoryURL?
        .appendingPathComponent("elemnent-stray-end-html-tag.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    // get html string from file
    guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
        XCTFail("Could not open file URL: \(fileURL)")
        return
    }

    // create object from raw html file
    guard let nodeArray = try? HTMLParser.parse(htmlString) else {
        XCTFail("Could not parse HTML")
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("div")
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeArray, matching: nodeSelectorPath)

    XCTAssertEqual(matchingElements.count, 1)

    let div = matchingElements.first
    XCTAssertEqual(div?.tagName, "div")
    XCTAssertEqual(div?.childElements.count, 1)
}
330 | }
331 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/ElementTraverserTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // ElementTraverserTests.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-22.
6 | //
7 |
8 | import XCTest
9 | import SwiftHTMLParser
10 | import TestFiles
11 |
12 | final class ElementTraverserTests: XCTestCase {
13 |
14 | func testSelectTagName() {
15 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
16 | .appendingPathComponent("attributes-multiple-value-class.html") else {
17 | XCTFail("Could find get file URL to parse")
18 | return
19 | }
20 |
21 | var nodeTreeResult: [Node]? = nil
22 | do {
23 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL)
24 | } catch {
25 | XCTFail(error.localizedDescription)
26 | return
27 | }
28 | guard let nodeTree = nodeTreeResult else {
29 | XCTFail("nodeTreeResult was nil")
30 | return
31 | }
32 |
33 | // find matching elements by traversing the created html object
34 | let nodeSelectorPath = [
35 | ElementSelector().withTagName("html"),
36 | ElementSelector().containingTagName("bod"),
37 | ElementSelector().withTagName("p")
38 | ]
39 |
40 | let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
41 | XCTAssertEqual(matchingElements.count, 4)
42 | }
43 |
44 |
45 | func testSelectAttributes() {
46 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
47 | .appendingPathComponent("attributes-simple.html") else {
48 | XCTFail("Could find get file URL to parse")
49 | return
50 | }
51 |
52 | var nodeTreeResult: [Node]? = nil
53 | do {
54 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL)
55 | } catch {
56 | XCTFail(error.localizedDescription)
57 | return
58 | }
59 | guard let nodeTree = nodeTreeResult else {
60 | XCTFail("nodeTreeResult was nil")
61 | return
62 | }
63 |
64 | // find matching elements by traversing the created html object
65 | let nodeSelectorPath: [NodeSelector] = [
66 | ElementSelector().withTagName("html"),
67 | ElementSelector().withTagName("body"),
68 | ElementSelector().withTagName("a")
69 | .withAttribute(AttributeSelector.init(name: "href").withValue("https://duckduckgo.com"))
70 | ]
71 |
72 | let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
73 | XCTAssertEqual(matchingElements.count, 1)
74 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is an alternate link")
75 | }
76 |
77 | func testSelectClassName() {
78 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
79 | .appendingPathComponent("attributes-multiple-value-class.html") else {
80 | XCTFail("Could find get file URL to parse")
81 | return
82 | }
83 |
84 | var nodeTreeResult: [Node]? = nil
85 | do {
86 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL)
87 | } catch {
88 | XCTFail(error.localizedDescription)
89 | return
90 | }
91 | guard let nodeTree = nodeTreeResult else {
92 | XCTFail("nodeTreeResult was nil")
93 | return
94 | }
95 |
96 | // find matching elements by traversing the created html object
97 | var nodeSelectorPath = [
98 | ElementSelector().withTagName("html"),
99 | ElementSelector().withTagName("body"),
100 | ElementSelector().withTagName("p")
101 | .withClassNamesAny(["body-paragraph"])
102 | ]
103 |
104 | var matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
105 | XCTAssertTrue(matchingElements.count == 1)
106 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the second paragraph.")
107 |
108 | // find matching elements by traversing the created html object
109 | nodeSelectorPath = [
110 | ElementSelector().withTagName("html"),
111 | ElementSelector().withTagName("body"),
112 | ElementSelector().withTagName("p").withClassName("stylized-paragraph")
113 |
114 | //.withoutClassName("into-paragraph")
115 | ]
116 |
117 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
118 | XCTAssertTrue(matchingElements.count == 4)
119 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the first paragraph.")
120 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the second paragraph.")
121 | XCTAssertEqual(matchingElements[2].textNodes[0].text, "This is the third paragraph.")
122 | XCTAssertEqual(matchingElements[3].textNodes[0].text, "This is the fourth paragraph.")
123 |
124 | // find matching elements by traversing the created html object
125 | nodeSelectorPath = [
126 | ElementSelector().withTagName("html"),
127 | ElementSelector().withTagName("body"),
128 | ElementSelector().withTagName("p")
129 | .withClassNamesExact(["stylized-paragraph"])
130 | ]
131 |
132 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
133 | XCTAssertTrue(matchingElements.count == 1)
134 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the third paragraph.")
135 |
136 |
137 | // find matching elements by traversing the created html object
138 | nodeSelectorPath = [
139 | ElementSelector().withTagName("html"),
140 | ElementSelector().withTagName("body"),
141 | ElementSelector().withTagName("p")
142 | .withoutClassNameAny(["into-paragraph"])
143 | ]
144 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
145 | XCTAssertEqual(matchingElements.count, 3)
146 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the second paragraph.")
147 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the third paragraph.")
148 | XCTAssertEqual(matchingElements[2].textNodes[0].text, "This is the fourth paragraph.")
149 | }
150 |
151 | func testSelectPosition() {
152 | guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
153 | .appendingPathComponent("attributes-multiple-value-class.html") else {
154 | XCTFail("Could find get file URL to parse")
155 | return
156 | }
157 |
158 | var nodeTreeResult: [Node]? = nil
159 | do {
160 | nodeTreeResult = try TestHelper.openFileAndParseHTML(fileURL: fileURL)
161 | } catch {
162 | XCTFail(error.localizedDescription)
163 | return
164 | }
165 | guard let nodeTree = nodeTreeResult else {
166 | XCTFail("nodeTreeResult was nil")
167 | return
168 | }
169 |
170 | // test position equal
171 | var nodeSelectorPath = [
172 | ElementSelector().withTagName("html"),
173 | ElementSelector().withTagName("body"),
174 | ElementSelector().withTagName("p").atPosition(1)
175 | ]
176 | var matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
177 | XCTAssertTrue(matchingElements.count == 1)
178 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the second paragraph.")
179 |
180 | // test position greater than
181 | nodeSelectorPath = [
182 | ElementSelector().withTagName("html"),
183 | ElementSelector().withTagName("body"),
184 | ElementSelector().withTagName("p").whenPositionIsGreaterThan(1)
185 | ]
186 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
187 | XCTAssertTrue(matchingElements.count == 2)
188 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the third paragraph.")
189 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the fourth paragraph.")
190 |
191 | // test position less than
192 | nodeSelectorPath = [
193 | ElementSelector().withTagName("html"),
194 | ElementSelector().withTagName("body"),
195 | ElementSelector().withTagName("p").whenPositionIsLessThan(3)
196 | ]
197 | matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)
198 | XCTAssertTrue(matchingElements.count == 3)
199 | XCTAssertEqual(matchingElements[0].textNodes[0].text, "This is the first paragraph.")
200 | XCTAssertEqual(matchingElements[1].textNodes[0].text, "This is the second paragraph.")
201 | XCTAssertEqual(matchingElements[2].textNodes[0].text, "This is the third paragraph.")
202 | }
203 |
/// Verifies that a `p` element can be selected by the exact text of a child text node.
///
/// Parses `attributes-multiple-value-class.html`, selects `html > body > p` restricted to
/// paragraphs with a child text node reading "This is the second paragraph.", and expects
/// exactly one match carrying that text.
func testSelectInnerText() {
    guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
        .appendingPathComponent("attributes-multiple-value-class.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    let nodeTree: [Node]
    do {
        nodeTree = try TestHelper.openFileAndParseHTML(fileURL: fileURL)
    } catch {
        XCTFail(error.localizedDescription)
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("p")
            .withChildTextNode(TextNodeSelector().withText("This is the second paragraph."))
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)

    // XCTAssertEqual does not halt the test, so stop explicitly before subscripting;
    // otherwise an empty result would trap instead of being reported as a failure.
    guard matchingElements.count == 1 else {
        XCTFail("Expected exactly 1 matching element, found \(matchingElements.count)")
        return
    }
    XCTAssertEqual(matchingElements[0].textNodes.first?.text, "This is the second paragraph.")
}
234 |
/// Verifies that a `p` element can be selected by text contained in a child comment node.
///
/// Parses `attributes-multiple-value-class.html`, selects `html > body > p` restricted to
/// paragraphs with a child comment containing "This is a comment", and expects exactly one
/// match whose first text node reads "This is the fourth paragraph.".
func testSelectInnerComment() {
    guard let fileURL = TestFileURLs.attributesTestFilesDirectoryURL?
        .appendingPathComponent("attributes-multiple-value-class.html") else {
        XCTFail("Could not get file URL to parse")
        return
    }

    let nodeTree: [Node]
    do {
        nodeTree = try TestHelper.openFileAndParseHTML(fileURL: fileURL)
    } catch {
        XCTFail(error.localizedDescription)
        return
    }

    // find matching elements by traversing the created html object
    let nodeSelectorPath = [
        ElementSelector().withTagName("html"),
        ElementSelector().withTagName("body"),
        ElementSelector().withTagName("p")
            .withChildCommentNode(CommentSelector().containingText("This is a comment"))
    ]
    let matchingElements = HTMLTraverser.findElements(in: nodeTree, matching: nodeSelectorPath)

    // XCTAssertEqual does not halt the test, so stop explicitly before subscripting;
    // otherwise an empty result would trap instead of being reported as a failure.
    guard matchingElements.count == 1 else {
        XCTFail("Expected exactly 1 matching element, found \(matchingElements.count)")
        return
    }
    XCTAssertEqual(matchingElements[0].textNodes.first?.text, "This is the fourth paragraph.")
}
265 |
266 | }
267 |
268 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/JavascriptParserTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // JavascriptParserTests.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-09.
6 | //
7 |
8 | import XCTest
9 | @testable import SwiftHTMLParser
10 | import TestFiles
11 |
/// Tests covering how `<script>` elements are parsed.
///
/// Each test parses one fixture from the javascript test files directory and inspects the
/// element(s) found at `html > body > script`.
final class JavascriptParserTests: XCTestCase {

    /// Loads the named fixture from the javascript test files directory and parses it.
    ///
    /// Records exactly one test failure and returns `nil` if the URL cannot be built, the
    /// file cannot be read as UTF-8, or `HTMLParser.parse` throws.
    ///
    /// - Parameter fileName: File name inside the javascript test files directory.
    /// - Returns: The parsed nodes, or `nil` after a failure has been recorded.
    private func parseHTMLFile(named fileName: String) -> [Node]? {
        guard let fileURL = TestFileURLs.javascriptTestFilesDirectoryURL?
            .appendingPathComponent(fileName) else {
            XCTFail("Could not get url to test file")
            return nil
        }

        // get html string from file
        let htmlString: String
        do {
            htmlString = try String(contentsOf: fileURL, encoding: .utf8)
        } catch {
            XCTFail("Could not open file at: \(fileURL.path)")
            return nil
        }

        // create object from raw html file
        do {
            return try HTMLParser.parse(htmlString)
        } catch {
            // Surface the underlying error instead of discarding it with `try?`.
            XCTFail("Could not parse HTML: \(error)")
            return nil
        }
    }

    /// The script element of `javascript-simple.html` must have no child elements.
    func testJavascriptSimple() {
        guard let elementArray = parseHTMLFile(named: "javascript-simple.html") else {
            return
        }

        // find matching elements by traversing the created html object
        let nodeSelectorPath = [
            ElementSelector().withTagName("html"),
            ElementSelector().withTagName("body"),
            ElementSelector().withTagName("script")
        ]
        let matchingElements = HTMLTraverser.findElements(in: elementArray,
                                                          matching: nodeSelectorPath)

        // Fail (rather than trap on an out-of-range subscript) when nothing matched.
        guard let scriptElement = matchingElements.first else {
            XCTFail("Expected at least one matching script element")
            return
        }
        XCTAssertEqual(scriptElement.childElements.count, 0)
    }

    /// The script element of `javascript-comments.html` must have no child elements and
    /// exactly one text node.
    func testJavascriptComments() {
        guard let elementArray = parseHTMLFile(named: "javascript-comments.html") else {
            return
        }

        // find matching elements by traversing the created html object
        let nodeSelectorPath = [
            ElementSelector().withTagName("html"),
            ElementSelector().withTagName("body"),
            ElementSelector().withTagName("script")
        ]
        let matchingElements = HTMLTraverser.findElements(in: elementArray,
                                                          matching: nodeSelectorPath)

        // Fail (rather than trap on an out-of-range subscript) when nothing matched.
        guard let scriptElement = matchingElements.first else {
            XCTFail("Expected at least one matching script element")
            return
        }
        XCTAssertEqual(scriptElement.childElements.count, 0)
        XCTAssertEqual(scriptElement.textNodes.count, 1)
    }

    /// The script element of `javascript-quotes.html` must have no child elements and a
    /// single text node that is 803 characters long.
    func testJavascriptQuotes() {
        guard let elementArray = parseHTMLFile(named: "javascript-quotes.html") else {
            return
        }

        // find matching elements by traversing the created html object
        let nodeSelectorPath = [
            ElementSelector().withTagName("html"),
            ElementSelector().withTagName("body"),
            ElementSelector().withTagName("script")
        ]
        let matchingElements = HTMLTraverser.findElements(in: elementArray,
                                                          matching: nodeSelectorPath)

        // Fail (rather than trap on an out-of-range subscript) when nothing matched.
        guard let scriptElement = matchingElements.first else {
            XCTFail("Expected at least one matching script element")
            return
        }
        XCTAssertEqual(scriptElement.childElements.count, 0)
        XCTAssertEqual(scriptElement.textNodes.count, 1)
        XCTAssertEqual(scriptElement.textNodes.first?.text.count, 803)
    }

    /// `javascript-quotes-with-escape-characters.html` must parse into two top-level nodes
    /// and exactly one script element with no child elements and one text node.
    func testJavascriptQuotesWithEscapeCharacters() {
        guard let elementArray = parseHTMLFile(
            named: "javascript-quotes-with-escape-characters.html") else {
            return
        }

        XCTAssertEqual(elementArray.count, 2)

        // find matching elements by traversing the created html object
        let nodeSelectorPath = [
            ElementSelector().withTagName("html"),
            ElementSelector().withTagName("body"),
            ElementSelector().withTagName("script")
        ]
        let matchingElements = HTMLTraverser.findElements(in: elementArray,
                                                          matching: nodeSelectorPath)

        XCTAssertEqual(matchingElements.count, 1)
        // The count assertion above does not stop the test, so guard before reading.
        guard let scriptElement = matchingElements.first else {
            return
        }
        XCTAssertEqual(scriptElement.childElements.count, 0)
        XCTAssertEqual(scriptElement.textNodes.count, 1)
    }

}
170 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/PerformanceTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // PerformanceTests.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-11.
6 | //
7 |
8 | import XCTest
9 | @testable import SwiftHTMLParser
10 | import TestFiles
11 |
/// Timing experiments over the `google-home-page.html` fixture.
///
/// These tests print elapsed wall-clock time; they make no assertions about the results
/// and only fail when the fixture cannot be loaded.
final class PerformanceTests: XCTestCase {

    /// Loads `google-home-page.html` from the real-world test files directory as UTF-8 text.
    ///
    /// Records exactly one test failure and returns `nil` when the URL cannot be built or
    /// the file cannot be read.
    private func loadGoogleHomePage() -> String? {
        guard let fileURL = TestFileURLs.realWorldTestFilesDirectoryURL?
            .appendingPathComponent("google-home-page.html") else {
            XCTFail("Could not get url to test file")
            return nil
        }

        // get html string from file
        do {
            return try String(contentsOf: fileURL, encoding: .utf8)
        } catch {
            XCTFail("Could not open file at: \(fileURL.path)")
            return nil
        }
    }

    /// Times an index-by-index walk over the fixture while counting occurrences of "a".
    func testIteratingString() {
        guard let htmlString = loadGoogleHomePage() else {
            return
        }

        let charToMatch: Character = "a"
        var numberOfMatchingCharacters = 0
        var currentIndex = htmlString.startIndex

        let start = Date()
        while currentIndex < htmlString.endIndex {
            if htmlString[currentIndex] == charToMatch {
                numberOfMatchingCharacters += 1
            }

            // iterate current index
            currentIndex = htmlString.index(after: currentIndex)
        }
        let end = Date()

        let timeElapsed = end.timeIntervalSince(start)
        print("time elapsed: \(timeElapsed) seconds")

        print("found \(numberOfMatchingCharacters) matching the string '\(charToMatch)'")

        print("--------------------")
    }

    /// Times an index-by-index walk over the fixture that, at every position, calls
    /// `containsInner` and `LookaheadValidator.isValidLookahead`.
    func testStringIteration() {
        guard let text = loadGoogleHomePage() else {
            return
        }

        var numberOfMatchingCharacters = 0
        var currentIndex = text.startIndex

        let lookaheadValidator = LookaheadValidator()
        // NOTE(review): this literal is empty in the source under review; presumably it was
        // meant to hold the script closing tag and the markup was lost — confirm against
        // the repository before relying on the lookahead timing.
        let scriptEndTag = ""

        let start = Date()
        while currentIndex < text.endIndex {
            // test for character
            if containsInner(text: text, currentIndex: currentIndex) {
                numberOfMatchingCharacters += 1
            }

            // test speed of lookahead validator; the result itself is intentionally unused
            _ = lookaheadValidator.isValidLookahead(for: text,
                                                    atIndex: currentIndex,
                                                    checkFor: scriptEndTag)

            // iterate current index
            currentIndex = text.index(after: currentIndex)
        }
        let end = Date()

        let timeElapsed = end.timeIntervalSince(start)
        print("time elapsed: \(timeElapsed) seconds")

        print("found \(numberOfMatchingCharacters) matching characters.")

        print("--------------------")
    }

    /// Reports whether the character at `currentIndex` is "a" or "A".
    ///
    /// - Parameters:
    ///   - text: The string to inspect.
    ///   - currentIndex: A valid, non-end index into `text`.
    /// - Returns: `true` when the character at that index is "a" or "A".
    func containsInner(text: String, currentIndex: String.Index) -> Bool {
        let character = text[currentIndex]
        return character == "a" || character == "A"
    }

//    func testDeep() {
//        guard let fileURL = TestsConfig.performanceTestFilesDirectoryURL?
//            .appendingPathComponent("deep.html") else {
//                XCTFail("Could not get url to test file")
//                return
//        }
//
//        // get html string from file
//        var htmlStringResult: String? = nil
//        do {
//            htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8)
//        } catch {
//            XCTFail("Could not open file at: \(fileURL.path)")
//        }
//        guard let htmlString = htmlStringResult else {
//            XCTFail("Could not open file at: \(fileURL.path)")
//            return
//        }
//
//        // create object from raw html file
//        let htmlParser = HTMLParser()
//        guard let elementArray = try? HTMLParser.parse(htmlString) else {
//            XCTFail("Could not parse HTML")
//            return
//        }
//
//        // find matching elements by traversing the created html object
//        let nodeSelectorPath = [
//            ElementSelector.init(tagName: "html"),
//            ElementSelector.init(tagName: "body")
//        ]
//
//        let traverser = HTMLTraverser()
//        let matchingElements = traverser.findElements(in: elementArray,
//                                                      matchingNodeSelectorPath: nodeSelectorPath)
//
//        XCTAssertEqual(matchingElements[0].childElements.count, 300)
//    }

//    func testTimeDeep() {
//        guard let fileURL = TestsConfig.performanceTestFilesDirectoryURL?
//            .appendingPathComponent("deep.html") else {
//                XCTFail("Could not get url to test file")
//                return
//        }
//
//        // get html string from file
//        var htmlStringResult: String? = nil
//        do {
//            htmlStringResult = try String(contentsOf: fileURL, encoding: .utf8)
//        } catch {
//            XCTFail("Could not open file at: \(fileURL.path)")
//        }
//        guard let htmlString = htmlStringResult else {
//            XCTFail("Could not open file at: \(fileURL.path)")
//            return
//        }
//
//        // create object from raw html file
//        let start = Date()
//        let htmlParser = HTMLParser()
//        for _ in 0..<20 {
//            do {
//                _ = try HTMLParser.parse(htmlString)
//            } catch {
//                XCTFail("Could not parse HTML")
//            }
//        }
//        let end = Date()
//
//        let timeElapsed = end.timeIntervalSince(start)
//        print("time elapsed: \(timeElapsed) seconds")
//    }
}
190 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/SVGParserTests.swift:
--------------------------------------------------------------------------------
1 | //
2 | // SVGParserTests.swift
3 | // SwiftHTMLParser
4 | //
5 | // Created by Reid Nantes on 2018-12-12.
6 | //
7 |
8 | import Foundation
9 |
10 | import XCTest
11 | @testable import SwiftHTMLParser
12 | import TestFiles
13 |
14 |
/// Tests covering parsing of documents that embed SVG markup.
final class SVGParserTests: XCTestCase {

    /// Parses `svg-simple.html` and expects two top-level nodes.
    func testSVG() {
        let svgFixtureURL = TestFileURLs.svgTestFilesDirectoryURL?
            .appendingPathComponent("svg-simple.html")
        guard let fileURL = svgFixtureURL else {
            XCTFail("Could find get file URL to parse")
            return
        }

        // read the fixture as UTF-8 text
        guard let htmlString = try? String(contentsOf: fileURL, encoding: .utf8) else {
            XCTFail("Could not open file URL: \(fileURL)")
            return
        }

        // build the node tree from the raw html
        let parsedNodes: [Node]
        do {
            parsedNodes = try HTMLParser.parse(htmlString)
        } catch {
            XCTFail("Could not parse HTML")
            return
        }

        XCTAssertEqual(parsedNodes.count, 2)
    }
}
45 |
--------------------------------------------------------------------------------
/Tests/SwiftHTMLParserTests/TestHelper.swift:
--------------------------------------------------------------------------------
1 | //
2 | // TestHelper.swift
3 | //
4 | //
5 | // Created by Reid Nantes on 2019-10-25.
6 | //
7 |
8 | import Foundation
9 | import SwiftHTMLParser
10 |
11 |
/// Convenience helpers for the test target.
struct TestHelper {
    /// Reads the file at `fileURL` as UTF-8 text and parses it with `HTMLParser.parse`.
    ///
    /// - Parameter fileURL: Location of the HTML file to load.
    /// - Returns: The nodes produced by `HTMLParser.parse`.
    /// - Throws: Any error raised while reading the file or by `HTMLParser.parse`.
    static func openFileAndParseHTML(fileURL: URL) throws -> [Node] {
        let contents = try String(contentsOf: fileURL, encoding: .utf8)
        return try HTMLParser.parse(contents)
    }
}
23 |
--------------------------------------------------------------------------------
/Tests/TestFiles/Mock/Attributes/attributes-multiple-value-class.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | This is a Heading
8 |
9 | This is a Heading
8 |
9 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/Tests/TestFiles/Mock/Attributes/attributes-simple.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
This is a Heading
8 | This is a Heading
8 |
16 |
17 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/Tests/TestFiles/Mock/Comments/comments.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
This is a Heading
13 | This is a heading
12 |
13 |
16 | This is a Heading
13 | This is a Heading
8 |
9 |
23 |
27 |
28 | This is a Heading
8 |
16 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/Tests/TestFiles/Mock/Elements/element-unclosed-end-tag.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
This is a Heading
8 | This is a Heading
8 | This is a Heading
8 | This is a Heading
8 |