GetTestFiles()
40 | {
41 | //DirectoryInfo dir = new DirectoryInfo("SampleData");
42 | //return dir.GetFiles("*.html", SearchOption.AllDirectories);
43 | for (int i = 0; i < 10; i++)
44 | {
45 | yield return new FileInfo(Path.Combine("SampleData", "test.html"));
46 | }
47 | }
48 |
/// <summary>
/// Benchmark entry point. Parses every file returned by <c>GetTestFiles()</c> with the
/// HTML parser, saves each resulting document to "test.xml", reloads that file with
/// <see cref="XDocument"/> and prints timing statistics for both steps.
/// </summary>
/// <param name="args">Command line arguments (not used).</param>
public static void Main(string[] args)
{
	//var fragment1 = parser.ParseStringFragment("foo", "");
	//var fragment2 = parser.ParseStringFragment(" | foo", "table");

	// A single stopwatch is shared by all iterations; it is restarted before each measurement.
	Stopwatch sw = new Stopwatch();

	Console.Write("Parsing ... ");
	var result = GetTestFiles().Select((file) =>
		{
			// Time the HTML parse.
			sw.Restart();
			var doc = parser.Parse(file.FullName);
			sw.Stop();
			var parseTime = sw.Elapsed;

			// Saving is deliberately not timed; only the XML re-parse below is.
			doc.Save("test.xml");

			sw.Restart();
			XDocument.Load("test.xml");
			sw.Stop();
			var reparseTime = sw.Elapsed;

			return new { Document = doc, Time = parseTime, ReparseTime = reparseTime };
		}
	).ToList();

	Console.WriteLine("done.");
	Console.WriteLine("Found " + result.Count + " documents.");

	if (result.Count == 0)
	{
		// First(), Min() and Max() throw on an empty sequence, so there is nothing to report.
		Console.ReadKey();
		return;
	}

	TimeSpan total = result.Aggregate(TimeSpan.Zero, (passed, current) => passed + current.Time);
	TimeSpan reparseTotal = result.Aggregate(TimeSpan.Zero, (passed, current) => passed + current.ReparseTime);

	var first = result[0];

	// The "without first" average needs at least two samples; otherwise the divisor would be zero.
	bool hasMoreThanOne = result.Count > 1;

	Console.WriteLine();
	PrintTime("Total", total);
	PrintTime("First", first.Time);
	PrintTime("Average", TimeSpan.FromTicks(total.Ticks / result.Count));
	if (hasMoreThanOne)
	{
		PrintTime("Average (without first)", TimeSpan.FromTicks((total.Ticks - first.Time.Ticks) / (result.Count - 1)));
	}
	PrintTime("Min", result.Min(val => val.Time));
	PrintTime("Max", result.Max(val => val.Time));

	Console.WriteLine();
	Console.WriteLine("=== Reparsing (XDocument) ===");

	// note: reparsing using XmlDocument instead gives similar results

	PrintTime("Total", reparseTotal);
	PrintTime("First", first.ReparseTime);
	PrintTime("Average", TimeSpan.FromTicks(reparseTotal.Ticks / result.Count));
	if (hasMoreThanOne)
	{
		PrintTime("Average (without first)", TimeSpan.FromTicks((reparseTotal.Ticks - first.ReparseTime.Ticks) / (result.Count - 1)));
	}
	PrintTime("Min", result.Min(val => val.ReparseTime));
	PrintTime("Max", result.Max(val => val.ReparseTime));
	Console.ReadKey();
}
99 |
/// <summary>
/// Writes a captioned duration to the console: the caption on one line, followed by
/// the time span in its default string form and its length in milliseconds.
/// </summary>
/// <param name="caption">Label printed before the value.</param>
/// <param name="time">Duration to print.</param>
private static void PrintTime(string caption, TimeSpan time)
{
	string formatted = time.ToString();
	double milliseconds = time.TotalMilliseconds;

	Console.WriteLine("{0}:\n {1} ({2} ms)", caption, formatted, milliseconds);
}
104 |
105 |
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/HtmlParserSharp/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General information about an assembly is controlled through the following
6 | // attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("HtmlParser")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("HtmlParser")]
13 | [assembly: AssemblyCopyright("Copyright © 2012")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to "false" makes the types in this assembly invisible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to "true" on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID determines the ID of the type library if this project is exposed to COM
23 | [assembly: Guid("dd2311df-4aa1-4f09-8fff-751cd048e652")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major version
28 | // Minor version
29 | // Build number
30 | // Revision
31 | //
32 | // You can specify all the values or accept the default build and revision numbers
33 | // by entering "*":
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/HtmlParserSharp/SampleData/test.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boddlnagg/HtmlParserSharp/bd48da4f4ce3b6309e32677bd1bebafd42ba280b/HtmlParserSharp/SampleData/test.html
--------------------------------------------------------------------------------
/HtmlParserSharp/SimpleHtmlParser.cs:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2012 Patrick Reisert
3 | * Copyright (c) 2005, 2006, 2007 Henri Sivonen
4 | * Copyright (c) 2007-2008 Mozilla Foundation
5 | *
6 | * Permission is hereby granted, free of charge, to any person obtaining a
7 | * copy of this software and associated documentation files (the "Software"),
8 | * to deal in the Software without restriction, including without limitation
9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 | * and/or sell copies of the Software, and to permit persons to whom the
11 | * Software is furnished to do so, subject to the following conditions:
12 | *
13 | * The above copyright notice and this permission notice shall be included in
14 | * all copies or substantial portions of the Software.
15 | *
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 | * DEALINGS IN THE SOFTWARE.
23 | */
24 |
25 | using System;
26 | using System.IO;
27 | using System.Xml;
28 | using HtmlParserSharp.Core;
29 |
namespace HtmlParserSharp
{
	/// <summary>
	/// This is a simple API for the parsing process.
	/// Part of this is a port of the nu.validator.htmlparser.io.Driver class.
	/// The parser currently ignores the encoding in the html source and parses everything as UTF-8.
	/// </summary>
	public class SimpleHtmlParser
	{
		/// <summary>Number of characters requested from the reader per read.</summary>
		private const int ReadBufferSize = 2048;

		/// <summary>Byte order mark; dropped when it is the very first character of the input.</summary>
		private const char ByteOrderMark = '\uFEFF';

		// Both are recreated by Reset() at the start of every parse.
		private Tokenizer tokenizer;
		private DomTreeBuilder treeBuilder;

		/// <summary>
		/// Parses a string as an HTML fragment.
		/// </summary>
		/// <param name="str">The markup to parse.</param>
		/// <param name="fragmentContext">The fragment context, passed on to the tree builder.</param>
		/// <returns>The document fragment produced by the tree builder.</returns>
		public XmlDocumentFragment ParseStringFragment(string str, string fragmentContext)
		{
			using (var reader = new StringReader(str))
			{
				return ParseFragment(reader, fragmentContext);
			}
		}

		/// <summary>
		/// Parses a string as a complete HTML document.
		/// </summary>
		/// <param name="str">The markup to parse.</param>
		/// <returns>The document produced by the tree builder.</returns>
		public XmlDocument ParseString(string str)
		{
			using (var reader = new StringReader(str))
			{
				return Parse(reader);
			}
		}

		/// <summary>
		/// Parses the file at the given path as a complete HTML document.
		/// The file is opened with <see cref="StreamReader"/> using its default encoding handling.
		/// </summary>
		/// <param name="path">Path of the file to parse.</param>
		/// <returns>The document produced by the tree builder.</returns>
		public XmlDocument Parse(string path)
		{
			using (var reader = new StreamReader(path))
			{
				return Parse(reader);
			}
		}

		/// <summary>
		/// Parses all text from the given reader as a complete HTML document.
		/// </summary>
		/// <param name="reader">Source of the markup; it is read to the end but not disposed.</param>
		/// <returns>The document produced by the tree builder.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
		public XmlDocument Parse(TextReader reader)
		{
			// Validate before Reset() so that a bad argument does not discard the previous parser state.
			if (reader == null)
			{
				throw new ArgumentNullException("reader");
			}

			Reset();
			Tokenize(reader);
			return treeBuilder.Document;
		}

		/// <summary>
		/// Parses all text from the given reader as an HTML fragment.
		/// </summary>
		/// <param name="reader">Source of the markup; it is read to the end but not disposed.</param>
		/// <param name="fragmentContext">The fragment context, passed on to the tree builder.</param>
		/// <returns>The document fragment produced by the tree builder.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
		public XmlDocumentFragment ParseFragment(TextReader reader, string fragmentContext)
		{
			// Validate before Reset() so that a bad argument does not discard the previous parser state.
			if (reader == null)
			{
				throw new ArgumentNullException("reader");
			}

			Reset();
			treeBuilder.SetFragmentContext(fragmentContext);
			Tokenize(reader);
			return treeBuilder.getDocumentFragment();
		}

		/// <summary>
		/// Creates a fresh tree builder and tokenizer for the next parse.
		/// </summary>
		private void Reset()
		{
			treeBuilder = new DomTreeBuilder();
			// NOTE(review): the meaning of the second constructor argument is not visible here - confirm against Tokenizer.
			tokenizer = new Tokenizer(treeBuilder, false);
			treeBuilder.IsIgnoringComments = false;

			// optionally: report errors and more

			//treeBuilder.ErrorEvent +=
			//	(sender, a) =>
			//	{
			//		ILocator loc = tokenizer as ILocator;
			//		Console.WriteLine("{0}: {1} (Line: {2})", a.IsWarning ? "Warning" : "Error", a.Message, loc.LineNumber);
			//	};
			//treeBuilder.DocumentModeDetected += (sender, a) => Console.WriteLine("Document mode: " + a.Mode.ToString());
			//tokenizer.EncodingDeclared += (sender, a) => Console.WriteLine("Encoding: " + a.Encoding + " (currently ignored)");
		}

		/// <summary>
		/// Reads the reader to its end in chunks of <see cref="ReadBufferSize"/> characters and
		/// feeds each chunk to the tokenizer. A byte order mark at the very start is skipped.
		/// </summary>
		/// <param name="reader">Source of the markup.</param>
		/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
		private void Tokenize(TextReader reader)
		{
			if (reader == null)
			{
				// The single-string constructor takes the parameter name, not a message.
				throw new ArgumentNullException("reader");
			}

			tokenizer.Start();

			try
			{
				char[] chars = new char[ReadBufferSize];
				UTF16Buffer chunk = new UTF16Buffer(chars, 0, 0);

				// Carried across chunks so a CR at the end of one chunk is known when the next begins.
				bool lastWasCR = false;

				int read = reader.Read(chars, 0, chars.Length);
				if (read != 0)
				{
					int streamOffset = 0;
					int offset = 0;
					int length = read;

					// Swallow a leading byte order mark.
					if (chars[0] == ByteOrderMark)
					{
						streamOffset = -1;
						offset = 1;
						length--;
					}

					// The first chunk may have consisted of the byte order mark only.
					if (length > 0)
					{
						tokenizer.SetTransitionBaseOffset(streamOffset);
						chunk.Start = offset;
						chunk.End = offset + length;
						lastWasCR = TokenizeChunk(chunk, lastWasCR);
					}

					streamOffset = length;
					while ((read = reader.Read(chars, 0, chars.Length)) != 0)
					{
						tokenizer.SetTransitionBaseOffset(streamOffset);
						chunk.Start = 0;
						chunk.End = read;
						lastWasCR = TokenizeChunk(chunk, lastWasCR);
						streamOffset += read;
					}
				}

				tokenizer.Eof();
			}
			finally
			{
				// Always runs, including when reading or tokenizing throws.
				tokenizer.End();
			}
		}

		/// <summary>
		/// Feeds the current contents of a buffer to the tokenizer until the buffer is exhausted.
		/// </summary>
		/// <param name="chunk">Buffer whose Start and End mark the characters to tokenize.</param>
		/// <param name="lastWasCR">The value returned by the previous tokenizer call, or false for the first chunk.</param>
		/// <returns>The value returned by the last tokenizer call, to be passed in for the next chunk.</returns>
		private bool TokenizeChunk(UTF16Buffer chunk, bool lastWasCR)
		{
			while (chunk.HasMore)
			{
				chunk.Adjust(lastWasCR);
				lastWasCR = false;

				// Adjust may have consumed the remaining input, so check again.
				if (chunk.HasMore)
				{
					lastWasCR = tokenizer.TokenizeBuffer(chunk);
				}
			}

			return lastWasCR;
		}
	}
}
165 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | HtmlParserSharp
2 | ===============
3 |
4 | This is a manual C# port of the [Validator.nu HTML Parser](http://about.validator.nu/htmlparser/), an HTML5 parser originally written in Java and (compiled to C++ using the Google Web Toolkit) used by Mozilla's Gecko rendering engine. The port uses the DOM implemented in [System.Xml](http://msdn.microsoft.com/en-us/library/system.xml.aspx).
5 |
6 | Status
7 | ------
8 | PLEASE SEE https://github.com/jamietre/HtmlParserSharp FOR AN ACTIVELY MAINTAINED VERSION OF THIS PROJECT.
9 |
10 | Currently the port is based on Validator.nu 1.3.1 and works, as far as I have tested it. However, as there are no unit tests, I'm not sure whether every detail works correctly. Tests showed that it is quite fast (about 3-6 times slower than parsing XML using .NET's XDocument API, but I think XML parsing is easier to implement, so this is okay and it's still FAST).
11 |
12 | What's missing
13 | --------------
14 | If you want to contribute, maybe you can start here:
15 |
16 | * Support for character encodings other than UTF-8
17 | * More C#-ish coding style
18 | * Unit tests
19 | * Look for TODOs in the code
20 |
--------------------------------------------------------------------------------
|