GetTestFiles()
40 | {
41 | //DirectoryInfo dir = new DirectoryInfo("SampleData");
42 | //return dir.GetFiles("*.html", SearchOption.AllDirectories);
43 | for (int i = 0; i < 10; i++)
44 | {
45 | yield return new FileInfo(Path.Combine("SampleData", "test.html"));
46 | }
47 | }
48 |
/// <summary>
/// Benchmark entry point. Parses every file returned by GetTestFiles(), saves each
/// resulting document to "test.xml", reloads that file with XDocument, and prints
/// timing statistics for the parse step and for the reload step.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    //var fragment1 = parser.ParseStringFragment("foo", "");
    //var fragment2 = parser.ParseStringFragment(" | foo", "table");

    Stopwatch sw = new Stopwatch();

    Console.Write("Parsing ... ");
    var result = GetTestFiles().Select((file) =>
        {
            // Time the HTML parse on its own.
            sw.Restart();
            var doc = parser.Parse(file.FullName);
            sw.Stop();
            var parseTime = sw.Elapsed;

            // Saving is deliberately not timed; only the XML re-load is measured.
            doc.Save("test.xml");
            sw.Restart();
            XDocument.Load("test.xml");
            sw.Stop();
            var reparseTime = sw.Elapsed;

            return new { Document = doc, Time = parseTime, ReparseTime = reparseTime };
        }
    ).ToList();

    Console.WriteLine("done.");
    Console.WriteLine("Found " + result.Count + " documents.");
    Console.WriteLine();

    if (result.Count == 0)
    {
        // First()/Min()/Max() throw on an empty sequence, so there is nothing to report.
        Console.ReadKey();
        return;
    }

    PrintStatistics(result.Select(val => val.Time).ToArray());

    Console.WriteLine();
    Console.WriteLine("=== Reparsing (XDocument) ===");

    // note: reparsing using XmlDocument instead gives similar results

    PrintStatistics(result.Select(val => val.ReparseTime).ToArray());
    Console.ReadKey();
}

/// <summary>
/// Prints total, first, average, average-without-first, minimum and maximum
/// for a non-empty series of measurements.
/// </summary>
/// <param name="times">The measurements in the order they were taken; must not be empty.</param>
private static void PrintStatistics(TimeSpan[] times)
{
    TimeSpan total = times.Aggregate(new TimeSpan(), (passed, current) => passed + current);
    TimeSpan first = times[0];

    PrintTime("Total", total);
    PrintTime("First", first);
    PrintTime("Average", TimeSpan.FromTicks(total.Ticks / times.Length));
    if (times.Length > 1)
    {
        // With a single measurement there is no "rest" to average; dividing by
        // (Length - 1) would throw DivideByZeroException.
        PrintTime("Average (without first)", TimeSpan.FromTicks((total.Ticks - first.Ticks) / (times.Length - 1)));
    }
    PrintTime("Min", times.Min());
    PrintTime("Max", times.Max());
}
99 |
/// <summary>
/// Writes a caption followed by the given time span, shown both in its default
/// string form and as a total number of milliseconds.
/// </summary>
/// <param name="caption">Label printed on the first line.</param>
/// <param name="time">The duration to print.</param>
private static void PrintTime(string caption, TimeSpan time)
{
    string formatted = time.ToString();
    double milliseconds = time.TotalMilliseconds;
    Console.WriteLine("{0}:\n {1} ({2} ms)", caption, formatted, milliseconds);
}
104 |
105 |
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/HtmlParserSharp/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General information about an assembly is controlled through the following
6 | // attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("HtmlParser")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("HtmlParser")]
13 | [assembly: AssemblyCopyright("Copyright © 2012")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to "false" makes the types in this assembly invisible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to "true" on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is the ID of the type library if this project is exposed to COM
23 | [assembly: Guid("dd2311df-4aa1-4f09-8fff-751cd048e652")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major version
28 | // Minor version
29 | // Build number
30 | // Revision
31 | //
32 | // You can specify all the values, or accept the default build and revision numbers
33 | // by entering "*":
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/HtmlParserSharp/SampleData/test.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jamietre/HtmlParserSharp/2a450f49bb908d50461eae95dd4f74b872b5094e/HtmlParserSharp/SampleData/test.html
--------------------------------------------------------------------------------
/HtmlParserSharp/SimpleHtmlParser.cs:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2012 /// Patrick Reisert
3 | * Copyright (c) 2005, 2006, 2007 Henri Sivonen
4 | * Copyright (c) 2007-2008 Mozilla Foundation
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a
7 | * copy of this software and associated documentation files (the "Software"),
8 | * to deal in the Software without restriction, including without limitation
9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 | * and/or sell copies of the Software, and to permit persons to whom the
11 | * Software is furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 | * DEALINGS IN THE SOFTWARE.
23 | */
24 |
25 | using System;
26 | using System.IO;
27 | using System.Xml;
28 | using HtmlParserSharp.Core;
29 |
30 | namespace HtmlParserSharp
31 | {
/// <summary>
/// This is a simple API for the parsing process.
/// Part of this is a port of the nu.validator.htmlparser.io.Driver class.
/// The parser currently ignores the encoding in the html source and parses everything as UTF-8.
/// </summary>
public class SimpleHtmlParser
{
    // Both collaborators are re-created by Reset() at the start of every parse.
    private Tokenizer tokenizer;
    private XmlTreeBuilder treeBuilder;

    /// <summary>
    /// Parses an HTML fragment held in a string.
    /// </summary>
    /// <param name="str">The markup to parse.</param>
    /// <param name="fragmentContext">The context passed to the tree builder via SetFragmentContext.</param>
    /// <returns>The parsed nodes as a document fragment.</returns>
    public XmlDocumentFragment ParseStringFragment(string str, string fragmentContext)
    {
        using (var reader = new StringReader(str))
        {
            return ParseFragment(reader, fragmentContext);
        }
    }

    /// <summary>
    /// Parses a complete HTML document held in a string.
    /// </summary>
    /// <param name="str">The markup to parse.</param>
    /// <returns>The document produced by the tree builder.</returns>
    public XmlDocument ParseString(string str)
    {
        using (var reader = new StringReader(str))
        {
            return Parse(reader);
        }
    }

    /// <summary>
    /// Parses the HTML file at the given path. The file is opened with a
    /// <see cref="StreamReader"/> using its default encoding handling.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The document produced by the tree builder.</returns>
    public XmlDocument Parse(string path)
    {
        using (var reader = new StreamReader(path))
        {
            return Parse(reader);
        }
    }

    /// <summary>
    /// Parses a complete HTML document from a reader.
    /// </summary>
    /// <param name="reader">Source of the markup; read until it reports end of input.</param>
    /// <returns>The document produced by the tree builder.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public XmlDocument Parse(TextReader reader)
    {
        // Validate before Reset() so that a bad call does not replace existing state.
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        Reset();
        Tokenize(reader);
        return treeBuilder.Document;
    }

    /// <summary>
    /// Parses an HTML fragment from a reader.
    /// </summary>
    /// <param name="reader">Source of the markup; read until it reports end of input.</param>
    /// <param name="fragmentContext">The context passed to the tree builder via SetFragmentContext.</param>
    /// <returns>The parsed nodes as a document fragment.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public XmlDocumentFragment ParseFragment(TextReader reader, string fragmentContext)
    {
        // Validate before Reset() so that a bad call does not replace existing state.
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        Reset();
        treeBuilder.SetFragmentContext(fragmentContext);
        Tokenize(reader);
        return treeBuilder.getDocumentFragment();
    }

    /// <summary>
    /// Creates a fresh tree builder and tokenizer for the next parse.
    /// Comments are switched off on the tree builder.
    /// </summary>
    private void Reset()
    {
        treeBuilder = new XmlTreeBuilder();
        tokenizer = new Tokenizer(treeBuilder, false);
        treeBuilder.WantsComments = false;

        // optionally: report errors and more

        //treeBuilder.ErrorEvent +=
        //    (sender, a) =>
        //    {
        //        ILocator loc = tokenizer as ILocator;
        //        Console.WriteLine("{0}: {1} (Line: {2})", a.IsWarning ? "Warning" : "Error", a.Message, loc.LineNumber);
        //    };
        //treeBuilder.DocumentModeDetected += (sender, a) => Console.WriteLine("Document mode: " + a.Mode.ToString());
        //tokenizer.EncodingDeclared += (sender, a) => Console.WriteLine("Encoding: " + a.Encoding + " (currently ignored)");
    }

    /// <summary>
    /// Feeds the reader's content to the tokenizer in chunks of up to 2048 chars.
    /// A U+FEFF at the very start of the input is skipped. tokenizer.End() is
    /// always called, even when tokenizing throws.
    /// </summary>
    /// <param name="reader">Source of the markup.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    private void Tokenize(TextReader reader)
    {
        if (reader == null)
        {
            // The single-string constructor takes the parameter name, not a message.
            throw new ArgumentNullException("reader");
        }

        tokenizer.Start();

        try
        {
            char[] buffer = new char[2048];
            UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
            bool lastWasCR = false;

            int len = reader.Read(buffer, 0, buffer.Length);
            if (len > 0)
            {
                int streamOffset = 0;
                int offset = 0;
                int length = len;

                // Swallow a leading byte order mark so it never reaches the tokenizer.
                if (buffer[0] == '\uFEFF')
                {
                    streamOffset = -1;
                    offset = 1;
                    length--;
                }

                if (length > 0)
                {
                    tokenizer.SetTransitionBaseOffset(streamOffset);
                    bufr.Start = offset;
                    bufr.End = offset + length;
                    lastWasCR = TokenizeChunk(bufr, lastWasCR);
                }

                streamOffset = length;
                while ((len = reader.Read(buffer, 0, buffer.Length)) > 0)
                {
                    tokenizer.SetTransitionBaseOffset(streamOffset);
                    bufr.Start = 0;
                    bufr.End = len;
                    lastWasCR = TokenizeChunk(bufr, lastWasCR);
                    streamOffset += len;
                }
            }

            tokenizer.Eof();
        }
        finally
        {
            tokenizer.End();
        }
    }

    /// <summary>
    /// Hands the current window of the buffer to the tokenizer until it is used up.
    /// </summary>
    /// <param name="bufr">The buffer window to consume.</param>
    /// <param name="lastWasCR">Carry-over flag from the previous tokenizer call; passed to Adjust.</param>
    /// <returns>The flag returned by the last tokenizer call, to carry into the next chunk.</returns>
    private bool TokenizeChunk(UTF16Buffer bufr, bool lastWasCR)
    {
        while (bufr.HasMore)
        {
            bufr.Adjust(lastWasCR);
            lastWasCR = false;
            // Adjust may have used up the remainder, so check again before tokenizing.
            if (bufr.HasMore)
            {
                lastWasCR = tokenizer.TokenizeBuffer(bufr);
            }
        }

        return lastWasCR;
    }
}
164 | }
165 |
--------------------------------------------------------------------------------
/HtmlParserSharp/TreeBuilders/XmlTreeBuilder.cs:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2007 Henri Sivonen
3 | * Copyright (c) 2008-2010 Mozilla Foundation
4 | * Copyright (c) 2012 Patrick Reisert
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a
7 | * copy of this software and associated documentation files (the "Software"),
8 | * to deal in the Software without restriction, including without limitation
9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 | * and/or sell copies of the Software, and to permit persons to whom the
11 | * Software is furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 | * DEALINGS IN THE SOFTWARE.
23 | */
24 |
25 | using System;
26 | using System.Collections.Generic;
27 | using System.Linq;
28 | using System.Text;
29 | using System.Xml;
30 | using HtmlParserSharp.Common;
31 | using HtmlParserSharp.Core;
32 |
33 | #pragma warning disable 1591 // Missing XML comment
34 | #pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
35 | #pragma warning disable 1587 // XML comment is not placed on a valid element
36 |
37 | namespace HtmlParserSharp
38 | {
/// <summary>
/// The tree builder glue for building a tree through the System.Xml DOM APIs.
/// </summary>
// NOTE(review): the base type is reproduced exactly as it appears here; a generic
// type argument may have been lost from this listing - confirm against the real file.
public class XmlTreeBuilder : CoalescingTreeBuilder
{
    /// <summary>
    /// The document currently being built; assigned in Start.
    /// </summary>
    private XmlDocument document;

    /// <summary>
    /// Copies attributes onto an element, leaving any attribute the element already has untouched.
    /// </summary>
    protected override void AddAttributesToElement(XmlElement element, HtmlAttributes attributes)
    {
        for (int index = 0; index < attributes.Length; index++)
        {
            String attributeName = attributes.GetLocalName(index);
            String attributeNamespace = attributes.GetURI(index);
            if (element.HasAttribute(attributeName, attributeNamespace))
            {
                continue;
            }

            element.SetAttribute(attributeName, attributeNamespace, attributes.GetValue(index));
        }
    }

    /// <summary>
    /// Appends text to an element, extending its last child when that child is a text node.
    /// </summary>
    protected override void AppendCharacters(XmlElement parent, string text)
    {
        if (TryExtendTextNode(parent.LastChild, text))
        {
            return;
        }

        parent.AppendChild(document.CreateTextNode(text));
    }

    /// <summary>
    /// Moves every child of <paramref name="oldParent"/> to the end of <paramref name="newParent"/>.
    /// </summary>
    protected override void AppendChildrenToNewParent(XmlElement oldParent, XmlElement newParent)
    {
        while (oldParent.HasChildNodes)
        {
            XmlNode moved = oldParent.FirstChild;
            newParent.AppendChild(moved);
        }
    }

    /// <summary>
    /// Creates a document type node and appends it to the document.
    /// Empty public/system identifiers are passed on as null.
    /// </summary>
    protected override void AppendDoctypeToDocument(string name, string publicIdentifier, string systemIdentifier)
    {
        // TODO: this method was not there originally. is it correct?
        // The resolver is cleared while the doctype node is created and set to a
        // new XmlUrlResolver afterwards.
        document.XmlResolver = null;

        string publicId = String.IsNullOrEmpty(publicIdentifier) ? null : publicIdentifier;
        string systemId = String.IsNullOrEmpty(systemIdentifier) ? null : systemIdentifier;

        XmlDocumentType doctypeNode = document.CreateDocumentType(name, publicId, systemId, null);
        document.XmlResolver = new XmlUrlResolver();
        document.AppendChild(doctypeNode);
    }

    /// <summary>
    /// Appends a comment node to the given element.
    /// </summary>
    protected override void AppendComment(XmlElement parent, String comment)
    {
        XmlComment node = document.CreateComment(comment);
        parent.AppendChild(node);
    }

    /// <summary>
    /// Appends a comment node directly to the document.
    /// </summary>
    protected override void AppendCommentToDocument(String comment)
    {
        XmlComment node = document.CreateComment(comment);
        document.AppendChild(node);
    }

    /// <summary>
    /// Creates an element in the given namespace and sets all supplied attributes on it.
    /// </summary>
    protected override XmlElement CreateElement(string ns, string name, HtmlAttributes attributes)
    {
        XmlElement created = document.CreateElement(name, ns);
        for (int index = 0; index < attributes.Length; index++)
        {
            created.SetAttribute(attributes.GetLocalName(index), attributes.GetURI(index), attributes.GetValue(index));
            if (attributes.GetType(index) == "ID")
            {
                // Marking ID attributes is not implemented yet.
                //created.setIdAttributeNS(null, attributes.GetLocalName(index), true); // FIXME
            }
        }

        return created;
    }

    /// <summary>
    /// Creates the root "html" element in the XHTML namespace, sets its attributes,
    /// and appends it to the document.
    /// </summary>
    protected override XmlElement CreateHtmlElementSetAsRoot(HtmlAttributes attributes)
    {
        XmlElement root = document.CreateElement("html", "http://www.w3.org/1999/xhtml");
        int index = 0;
        while (index < attributes.Length)
        {
            root.SetAttribute(attributes.GetLocalName(index), attributes.GetURI(index), attributes.GetValue(index));
            index++;
        }

        document.AppendChild(root);
        return root;
    }

    /// <summary>
    /// Appends <paramref name="child"/> as the last child of <paramref name="newParent"/>.
    /// </summary>
    protected override void AppendElement(XmlElement child, XmlElement newParent)
    {
        newParent.AppendChild(child);
    }

    /// <summary>
    /// Reports whether the element has any child nodes.
    /// </summary>
    protected override bool HasChildren(XmlElement element)
    {
        return element.HasChildNodes;
    }

    /// <summary>
    /// Creates an element; the form argument is currently not recorded anywhere.
    /// </summary>
    protected override XmlElement CreateElement(string ns, string name, HtmlAttributes attributes, XmlElement form)
    {
        //rv.setUserData("nu.validator.form-pointer", form, null); // TODO
        return CreateElement(ns, name, attributes);
    }

    /// <summary>
    /// Begins a new parse by creating an empty document. The fragment flag is not used yet.
    /// </summary>
    protected override void Start(bool fragment)
    {
        // implementation.createDocument(null, null, null);
        // TODO: fragment?
        document = new XmlDocument();
    }

    /// <summary>
    /// Receives the detected document mode; currently does nothing with it.
    /// </summary>
    protected override void ReceiveDocumentMode(DocumentMode mode, String publicIdentifier,
        String systemIdentifier, bool html4SpecificAdditionalErrorChecks)
    {
        //document.setUserData("nu.validator.document-mode", mode, null); // TODO
    }

    /// <summary>
    /// Returns the document.
    /// </summary>
    internal XmlDocument Document
    {
        get { return document; }
    }

    /// <summary>
    /// Return the document fragment: every child of the document's first child is moved
    /// into a new fragment, and the builder's document reference is cleared afterwards.
    /// </summary>
    /// <returns>The document fragment</returns>
    internal XmlDocumentFragment getDocumentFragment()
    {
        XmlDocumentFragment fragment = document.CreateDocumentFragment();
        XmlNode root = document.FirstChild;
        while (root.HasChildNodes)
        {
            XmlNode moved = root.FirstChild;
            fragment.AppendChild(moved);
        }

        document = null;
        return fragment;
    }

    /// <summary>
    /// Inserts text in front of a table when the table has a parent; otherwise appends
    /// it to <paramref name="stackParent"/>. An adjacent text node is extended instead
    /// of creating a new one.
    /// </summary>
    protected override void InsertFosterParentedCharacters(string text, XmlElement table, XmlElement stackParent)
    {
        XmlNode tableParent = table.ParentNode;
        if (tableParent == null)
        {
            // Detached table: fall back to the end of the stack parent.
            if (!TryExtendTextNode(stackParent.LastChild, text))
            {
                stackParent.AppendChild(document.CreateTextNode(text));
            }

            return;
        }

        // always an element if not null
        if (!TryExtendTextNode(table.PreviousSibling, text))
        {
            tableParent.InsertBefore(document.CreateTextNode(text), table);
        }
    }

    /// <summary>
    /// Inserts a child in front of a table when the table has a parent; otherwise appends
    /// it to <paramref name="stackParent"/>.
    /// </summary>
    protected override void InsertFosterParentedChild(XmlElement child, XmlElement table, XmlElement stackParent)
    {
        XmlNode tableParent = table.ParentNode;
        if (tableParent == null)
        {
            stackParent.AppendChild(child);
            return;
        }

        // always an element if not null
        tableParent.InsertBefore(child, table);
    }

    /// <summary>
    /// Removes the element from its parent, if it has one.
    /// </summary>
    protected override void DetachFromParent(XmlElement element)
    {
        XmlNode owner = element.ParentNode;
        if (owner == null)
        {
            return;
        }

        owner.RemoveChild(element);
    }

    /// <summary>
    /// Appends text to <paramref name="candidate"/> when it is a text node.
    /// </summary>
    /// <returns>true when the text was added to the existing node; false when the caller must create a new node.</returns>
    private static bool TryExtendTextNode(XmlNode candidate, string text)
    {
        if (candidate == null || candidate.NodeType != XmlNodeType.Text)
        {
            return false;
        }

        XmlText existing = (XmlText) candidate;
        existing.Data += text;
        return true;
    }
}
218 | }
219 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | This is for the HTML parser as a whole except the rewindable input stream,
2 | the named character classes and the Live DOM Viewer.
3 | For the copyright notices for individual files, please see individual files.
4 |
5 | /*
6 | * Copyright (c) 2005, 2006, 2007 Henri Sivonen
7 | * Copyright (c) 2007-2011 Mozilla Foundation
8 | * Portions of comments Copyright 2004-2007 Apple Computer, Inc., Mozilla
9 | * Foundation, and Opera Software ASA.
10 | *
11 | * Permission is hereby granted, free of charge, to any person obtaining a
12 | * copy of this software and associated documentation files (the "Software"),
13 | * to deal in the Software without restriction, including without limitation
14 | * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 | * and/or sell copies of the Software, and to permit persons to whom the
16 | * Software is furnished to do so, subject to the following conditions:
17 | *
18 | * The above copyright notice and this permission notice shall be included in
19 | * all copies or substantial portions of the Software.
20 | *
21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 | * DEALINGS IN THE SOFTWARE.
28 | */
29 |
30 | The following license is for the WHATWG spec from which the named character
31 | data was extracted.
32 |
33 | /*
34 | * Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and Opera
35 | * Software ASA.
36 | *
37 | * You are granted a license to use, reproduce and create derivative works of
38 | * this document.
39 | */
40 |
41 | The following license is for the rewindable input stream.
42 |
43 | /*
44 | * Copyright (c) 2001-2003 Thai Open Source Software Center Ltd
45 | * All rights reserved.
46 | *
47 | * Redistribution and use in source and binary forms, with or without
48 | * modification, are permitted provided that the following conditions
49 | * are met:
50 | *
51 | * * Redistributions of source code must retain the above copyright
52 | * notice, this list of conditions and the following disclaimer.
53 | * * Redistributions in binary form must reproduce the above
54 | * copyright notice, this list of conditions and the following
55 | * disclaimer in the documentation and/or other materials provided
56 | * with the distribution.
57 | * * Neither the name of the Thai Open Source Software Center Ltd nor
58 | * the names of its contributors may be used to endorse or promote
59 | * products derived from this software without specific prior
60 | * written permission.
61 | *
62 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
63 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
64 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
65 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
66 | * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
67 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
68 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
69 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
70 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
71 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
72 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
73 | * POSSIBILITY OF SUCH DAMAGE.
74 | */
75 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | HtmlParserSharp
2 | ===============
3 |
4 | This is a manual C# port of the [Validator.nu HTML Parser](http://about.validator.nu/htmlparser/), an HTML5 parser originally written in Java and (compiled to C++ using the Google Web Toolkit) used by Mozilla's Gecko rendering engine. This port is current as of Version 1.4.
5 |
6 | The code is DOM-agnostic and provides an interface via `TreeBuilder` for creating a DOM from its output using any object model. Included in the code base is a `TreeBuilder` that produces a DOM using System.Xml.
7 |
8 | Status
9 | ------
10 |
11 | This port was created by Patrick Reisert based on Validator.nu 1.3. It was adopted by James Treworgy in September, 2012 to use in [CsQuery](https://github.com/jamietre/CsQuery). However, since a general-purpose HTML5 parser is extraordinarily useful, I've kept it as an independent project. It's included as a submodule in CsQuery to simplify distribution. It may become an external dependency at some point if development of the parser substantially diverges from CsQuery in the future.
12 |
13 |
--------------------------------------------------------------------------------
|