├── .gitignore ├── HtmlParserSharp.Tests ├── Basic.cs ├── ExtensionMethods.cs ├── HtmlParserSharp.Tests.csproj ├── Properties │ └── AssemblyInfo.cs ├── README.md ├── Resources │ └── HTML Standard.htm └── packages.config ├── HtmlParserSharp.sln ├── HtmlParserSharp ├── Common │ ├── Attributes.cs │ ├── DoctypeExpectation.cs │ ├── DocumentMode.cs │ ├── DocumentModeEventArgs.cs │ ├── EncodingDetectedEventArgs.cs │ ├── ITokenHandler.cs │ ├── ParserErrorEventArgs.cs │ └── XmlViolationPolicy.cs ├── Core │ ├── AttributeName.cs │ ├── CharsetState.cs │ ├── CoalescingTreeBuilder.cs │ ├── DispatchGroup.cs │ ├── ElementName.cs │ ├── HtmlAttributes.cs │ ├── ILocator.cs │ ├── ITreeBuilderState.cs │ ├── InsertionMode.cs │ ├── Locator.cs │ ├── NCName.cs │ ├── NamedCharacterAccel.cs │ ├── NamedCharacters.cs │ ├── Portability.cs │ ├── StackNode.cs │ ├── StateSnapshot.cs │ ├── TaintableLocator.cs │ ├── Tokenizer.cs │ ├── TreeBuilder.cs │ ├── TreeBuilderConstants.cs │ └── UTF16Buffer.cs ├── HtmlParserSharp.csproj ├── Parser.cs ├── Program.cs ├── Properties │ └── AssemblyInfo.cs ├── SampleData │ └── test.html ├── SimpleHtmlParser.cs └── TreeBuilders │ └── XmlTreeBuilder.cs ├── LICENSE.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Nonpublic stuff and generated junk 3 | ################# 4 | 5 | Build/ 6 | WebSite/ 7 | TestResults/ 8 | packages 9 | *.nupkg 10 | 11 | ################# 12 | ## Visual Studio 13 | ################# 14 | bin 15 | obj 16 | 17 | ## Ignore Visual Studio temporary files, build results, and 18 | ## files generated by popular Visual Studio add-ons. 
19 | 20 | # User-specific files 21 | *.suo 22 | *.user 23 | *.sln.docstates 24 | 25 | # Build results 26 | [Dd]ebug/ 27 | [Rr]elease/ 28 | *_i.c 29 | *_p.c 30 | *.ilk 31 | *.meta 32 | *.obj 33 | *.pch 34 | *.pgc 35 | *.pgd 36 | *.rsp 37 | *.sbr 38 | *.tlb 39 | *.tli 40 | *.tlh 41 | *.tmp 42 | *.vspscc 43 | .builds 44 | *.dotCover 45 | 46 | # Visual C++ cache files 47 | ipch/ 48 | *.aps 49 | *.ncb 50 | *.opensdf 51 | *.sdf 52 | 53 | # Visual Studio profiler 54 | *.psess 55 | *.vsp 56 | 57 | # ReSharper is a .NET coding add-in 58 | _ReSharper* 59 | 60 | # Installshield output folder 61 | [Ee]xpress 62 | 63 | # DocProject is a documentation generator add-in 64 | DocProject/buildhelp/ 65 | DocProject/Help/*.HxT 66 | DocProject/Help/*.HxC 67 | DocProject/Help/*.hhc 68 | DocProject/Help/*.hhk 69 | DocProject/Help/*.hhp 70 | DocProject/Help/Html2 71 | DocProject/Help/html 72 | 73 | # Click-Once directory 74 | publish 75 | 76 | # Others 77 | [Bb]in 78 | [Oo]bj 79 | sql 80 | TestResults 81 | *.Cache 82 | ClientBin 83 | stylecop.* 84 | ~$* 85 | *.dbmdl 86 | Generated_Code #added for RIA/Silverlight projects 87 | 88 | # Backup & report files from converting an old project file to a newer 89 | # Visual Studio version. Backup files are not needed, because we have git ;-) 90 | _UpgradeReport_Files/ 91 | Backup*/ 92 | UpgradeLog*.XML 93 | -------------------------------------------------------------------------------- /HtmlParserSharp.Tests/Basic.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using CsQuery; 4 | using Microsoft.VisualStudio.TestTools.UnitTesting; 5 | using System.IO; 6 | 7 | namespace HtmlParserSharp.Tests 8 | { 9 | [TestClass] 10 | public class Basic 11 | { 12 | static CQ Dom; 13 | 14 | /// 15 | /// This method ensures that the huge DOM gets parsed correctly by checking a few key selectors. 
16 | /// 17 | 18 | [ TestMethod] 19 | public void DomParsingTestWithNthChild() 20 | { 21 | 22 | // these values have been verified in Chrome with jQuery 1.7.2 23 | 24 | Assert.AreEqual(2704, Dom["div span:first-child"].Length); 25 | Assert.AreEqual(2517, Dom["div span:only-child"].Length); 26 | Assert.AreEqual(2, Dom["[type]"].Length); 27 | Assert.AreEqual(505, Dom["div:nth-child(2n+1)"].Length); 28 | Assert.AreEqual(13, Dom["div:nth-child(3)"].Length); 29 | Assert.AreEqual(534, Dom["div:nth-last-child(2n+1)"].Length); 30 | Assert.AreEqual(7, Dom["div:nth-last-child(3)"].Length); 31 | Assert.AreEqual(2605, Dom["div span:last-child"].Length); 32 | 33 | } 34 | 35 | [ TestMethod] 36 | public void AutoGeneratedTags() 37 | { 38 | 39 | // these values have been verified in Chrome with jQuery 1.7.2 40 | 41 | Assert.AreEqual(110, Dom["tbody"].Length); 42 | 43 | } 44 | 45 | 46 | [ClassInitialize] 47 | public static void ReadLargeDoc(TestContext context) 48 | { 49 | // CsQuery (version 1.3.0 and above) uses this code. 
50 | 51 | Dom = CQ.Create( 52 | CsQuery.Utility.Support.GetFile("HtmlParserSharp.Tests\\Resources\\html standard.htm") 53 | ); 54 | } 55 | 56 | [TestMethod] 57 | public void SvgHang() 58 | { 59 | var parser = new SimpleHtmlParser(); 60 | parser.Parse(new StringReader("")); 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /HtmlParserSharp.Tests/ExtensionMethods.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using System.Xml; 7 | using System.IO; 8 | 9 | namespace HtmlParserSharp.Tests 10 | { 11 | public class Utf8StringWriter : StringWriter 12 | { 13 | public override Encoding Encoding 14 | { 15 | get { return Encoding.UTF8; } 16 | } 17 | } 18 | 19 | public static class ExtensionMethods 20 | { 21 | public static string WriteString(this XmlDocument doc) 22 | { 23 | using (TextWriter writer = new Utf8StringWriter()) 24 | { 25 | using (XmlWriter xmlWriter = XmlWriter.Create(writer)) 26 | { 27 | doc.WriteContentTo(xmlWriter); 28 | } 29 | return writer.ToString(); 30 | } 31 | } 32 | 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /HtmlParserSharp.Tests/HtmlParserSharp.Tests.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | AnyCPU 6 | {B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4} 7 | Library 8 | Properties 9 | HtmlParserSharp.Tests 10 | HtmlParserSharp.Tests 11 | v4.5 12 | 512 13 | {3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} 14 | 10.0 15 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion) 16 | $(ProgramFiles)\Common Files\microsoft shared\VSTT\$(VisualStudioVersion)\UITestExtensionPackages 17 | False 18 | UnitTest 19 | false 20 | publish\ 21 | true 22 | Disk 23 | false 24 | 
Foreground 25 | 7 26 | Days 27 | false 28 | false 29 | true 30 | 0 31 | 1.0.0.%2a 32 | false 33 | true 34 | 35 | 36 | true 37 | full 38 | false 39 | bin\Debug\ 40 | DEBUG;TRACE 41 | prompt 42 | 4 43 | 44 | 45 | pdbonly 46 | true 47 | bin\Release\ 48 | TRACE 49 | prompt 50 | 4 51 | 52 | 53 | 54 | ..\packages\CsQuery.1.3.0-beta1\lib\net40\CsQuery.dll 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | {fd150915-d34f-436a-92c1-80aa505da754} 79 | HtmlParserSharp 80 | 81 | 82 | 83 | 84 | False 85 | Microsoft .NET Framework 4.5 %28x86 and x64%29 86 | true 87 | 88 | 89 | False 90 | .NET Framework 3.5 SP1 Client Profile 91 | false 92 | 93 | 94 | False 95 | .NET Framework 3.5 SP1 96 | false 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | False 111 | 112 | 113 | False 114 | 115 | 116 | False 117 | 118 | 119 | False 120 | 121 | 122 | 123 | 124 | 125 | 126 | 133 | -------------------------------------------------------------------------------- /HtmlParserSharp.Tests/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("HtmlParserSharp.Tests")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("HtmlParserSharp.Tests")] 13 | [assembly: AssemblyCopyright("Copyright © 2012")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. 
If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("e0289a0f-89bb-4efd-a05b-474cb3e1582e")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /HtmlParserSharp.Tests/README.md: -------------------------------------------------------------------------------- 1 | This test project is a placeholder. Right now it doesn't even test this code (it uses CsQuery to build the DOM) so don't think too much about it. 
-------------------------------------------------------------------------------- /HtmlParserSharp.Tests/packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /HtmlParserSharp.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2012 4 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HtmlParserSharp", "HtmlParserSharp\HtmlParserSharp.csproj", "{FD150915-D34F-436A-92C1-80AA505DA754}" 5 | EndProject 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HtmlParserSharp.Tests", "HtmlParserSharp.Tests\HtmlParserSharp.Tests.csproj", "{B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Debug|x64 = Debug|x64 12 | Release|Any CPU = Release|Any CPU 13 | Release|x64 = Release|x64 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {FD150915-D34F-436A-92C1-80AA505DA754}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {FD150915-D34F-436A-92C1-80AA505DA754}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {FD150915-D34F-436A-92C1-80AA505DA754}.Debug|x64.ActiveCfg = Debug|Any CPU 19 | {FD150915-D34F-436A-92C1-80AA505DA754}.Release|Any CPU.ActiveCfg = Release|Any CPU 20 | {FD150915-D34F-436A-92C1-80AA505DA754}.Release|Any CPU.Build.0 = Release|Any CPU 21 | {FD150915-D34F-436A-92C1-80AA505DA754}.Release|x64.ActiveCfg = Release|Any CPU 22 | {B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4}.Debug|Any CPU.Build.0 = Debug|Any CPU 24 | {B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4}.Debug|x64.ActiveCfg = Debug|Any CPU 25 | {B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4}.Release|Any CPU.ActiveCfg = Release|Any CPU 26 
| {B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4}.Release|Any CPU.Build.0 = Release|Any CPU 27 | {B0AD90A0-F661-4C4B-9B26-F2EC4DA03BC4}.Release|x64.ActiveCfg = Release|Any CPU 28 | EndGlobalSection 29 | GlobalSection(SolutionProperties) = preSolution 30 | HideSolutionNode = FALSE 31 | EndGlobalSection 32 | EndGlobal 33 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/Attributes.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 Patrick Reisert 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | using System; 24 | 25 | #pragma warning disable 1591 26 | 27 | namespace HtmlParserSharp.Common 28 | { 29 | // This file contains the attributes that correspond to the annotations 30 | // @NsUri, @Prefix and @Local in the Java code. Probably we can safely remove these. 
31 | 32 | [AttributeUsage(AttributeTargets.Field | AttributeTargets.Parameter | AttributeTargets.Method)] 33 | public class NsUriAttribute : Attribute { } 34 | 35 | [AttributeUsage(AttributeTargets.Field | AttributeTargets.Parameter | AttributeTargets.Method)] 36 | public class PrefixAttribute : Attribute { } 37 | 38 | [AttributeUsage(AttributeTargets.Field | AttributeTargets.Parameter | AttributeTargets.Method)] 39 | public class LocalAttribute : Attribute { } 40 | } 41 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/DoctypeExpectation.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2007 Henri Sivonen 3 | * Copyright (c) 2012 Patrick Reisert 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | using System; 25 | using System.Collections.Generic; 26 | using System.Linq; 27 | using System.Text; 28 | 29 | #pragma warning disable 1591 30 | 31 | namespace HtmlParserSharp.Common 32 | { 33 | /// 34 | /// Used for indicating desired behavior with legacy doctypes. 35 | /// 36 | public enum DoctypeExpectation 37 | { 38 | /// 39 | /// Be a pure HTML5 parser. 40 | /// 41 | Html, 42 | 43 | /// 44 | /// Require the HTML 4.01 Transitional public id. Turn on HTML4-specific 45 | /// additional errors regardless of doctype. 46 | /// 47 | Html401Transitional, 48 | 49 | /// 50 | /// Require the HTML 4.01 Transitional public id and a system id. Turn on 51 | /// HTML4-specific additional errors regardless of doctype. 52 | /// 53 | Html401Strict, 54 | 55 | /// 56 | /// Treat the doctype required by HTML 5, doctypes with the HTML 4.01 Strict 57 | /// public id and doctypes with the HTML 4.01 Transitional public id and a 58 | /// system id as non-errors. Turn on HTML4-specific additional errors if the 59 | /// public id is the HTML 4.01 Strict or Transitional public id. 60 | /// 61 | Auto, 62 | 63 | /// 64 | /// Never enable HTML4-specific error checks. Never report any doctype 65 | /// condition as an error. (Doctype tokens in wrong places will be 66 | /// reported as errors, though.) The application may decide what to log 67 | /// in response to calls to DocumentModeHandler. This mode 68 | /// is meant for doing surveys on existing content. 
69 | /// 70 | NoDoctypeErrors 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/DocumentMode.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2007 Henri Sivonen 3 | * Copyright (c) 2008 Mozilla Foundation 4 | * Copyright (c) 2012 Patrick Reisert 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | #pragma warning disable 1591 26 | 27 | namespace HtmlParserSharp.Common 28 | { 29 | public enum DocumentMode 30 | { 31 | /// 32 | /// The Standards Mode 33 | /// 34 | StandardsMode, 35 | 36 | /// 37 | /// The Limited Quirks Mode aka. 
The Almost Standards Mode 38 | /// 39 | AlmostStandardsMode, 40 | 41 | /// 42 | /// The Quirks Mode 43 | /// 44 | /// 45 | QuirksMode 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/DocumentModeEventArgs.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 Patrick Reisert 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | using System; 24 | 25 | #pragma warning disable 1591 26 | 27 | namespace HtmlParserSharp.Common 28 | { 29 | public class DocumentModeEventArgs : EventArgs 30 | { 31 | public DocumentMode Mode { get; private set; } 32 | public string PublicIdentifier { get; private set; } 33 | public string SystemIdentifier { get; private set; } 34 | public bool Html4SpecificAdditionalErrorChecks { get; private set; } 35 | 36 | /// 37 | /// Receive notification of the document mode. 38 | /// 39 | /// The document mode. 40 | /// The public identifier of the doctype or null if unavailable. 41 | /// The system identifier of the doctype or null if unavailable. 42 | /// true if HTML 4-specific checks were enabled, 43 | /// false otherwise 44 | public DocumentModeEventArgs(DocumentMode mode, string publicIdentifier, string systemIdentifier, bool html4SpecificAdditionalErrorChecks) 45 | { 46 | Mode = mode; 47 | PublicIdentifier = publicIdentifier; 48 | SystemIdentifier = systemIdentifier; 49 | Html4SpecificAdditionalErrorChecks = html4SpecificAdditionalErrorChecks; 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/EncodingDetectedEventArgs.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 Patrick Reisert 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | using System; 24 | 25 | #pragma warning disable 1591 26 | 27 | namespace HtmlParserSharp.Common 28 | { 29 | public class EncodingDetectedEventArgs : EventArgs 30 | { 31 | public string Encoding { get; private set; } 32 | 33 | /// 34 | /// When true, the client has decided to accept the charset-encoding for the document and the 35 | /// current processing must be suspended. (Set true to stop encoding). 36 | /// 37 | 38 | public bool AcceptEncoding { get; set; } 39 | 40 | public EncodingDetectedEventArgs(string encoding) 41 | { 42 | Encoding = encoding; 43 | AcceptEncoding = false; 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/ITokenHandler.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2007 Henri Sivonen 3 | * Copyright (c) 2008-2010 Mozilla Foundation 4 | * Copyright (c) 2012 Patrick Reisert 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright 
notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | using HtmlParserSharp.Core; 26 | 27 | #pragma warning disable 1591 28 | 29 | namespace HtmlParserSharp.Common 30 | { 31 | /// 32 | /// Tokenizer reports tokens through this interface. 33 | /// 34 | public interface ITokenHandler 35 | { 36 | 37 | /// 38 | /// This method is called at the start of tokenization before any other 39 | /// methods on this interface are called. Implementations should hold the 40 | /// reference to the Tokenizer in order to set the content 41 | /// model flag and in order to be able to query for Locator data. 42 | /// 43 | /// The Tokenizer. 44 | void StartTokenization(Tokenizer self); 45 | 46 | /// 47 | /// If this handler implementation cares about comments, return true. 48 | /// If not, return false 49 | /// 50 | /// Whether this handler wants comments 51 | bool WantsComments { get; } 52 | 53 | /// 54 | /// Receive a doctype token. 55 | /// 56 | /// The name. 57 | /// The public identifier. 58 | /// The system identifier. 59 | /// Whether the token is correct. 60 | void Doctype(string name, string publicIdentifier, string systemIdentifier, bool forceQuirks); 61 | 62 | /// 63 | /// Receive a start tag token. 64 | /// 65 | /// The tag name. 66 | /// The attributes. 
67 | /// TODO 68 | void StartTag(ElementName eltName, HtmlAttributes attributes, bool selfClosing); 69 | 70 | /// 71 | /// Receive an end tag token. 72 | /// 73 | /// The tag name. 74 | void EndTag(ElementName eltName); 75 | 76 | /// 77 | /// Receive a comment token. The data is junk if WantsComments 78 | /// returned false. 79 | /// 80 | /// The buffer holding the data. 81 | /// The offset into the buffer. 82 | /// The number of code units to read. 83 | void Comment(char[] buf, int start, int length); 84 | 85 | /// 86 | /// Receive character tokens. This method has the same semantics as the SAX 87 | /// method of the same name. 88 | /// 89 | /// A buffer holding the data. 90 | /// The offset into the buffer. 91 | /// The number of code units to read. 92 | void Characters(char[] buf, int start, int length); 93 | 94 | /// 95 | /// Reports a U+0000 that's being turned into a U+FFFD. 96 | /// 97 | void ZeroOriginatingReplacementCharacter(); 98 | 99 | /// 100 | /// The end-of-file token. 101 | /// 102 | void Eof(); 103 | 104 | /// 105 | /// Perform final cleanup. 106 | /// 107 | void EndTokenization(); 108 | 109 | /// 110 | /// Checks if the CDATA sections are allowed. 111 | /// 112 | /// true if CDATA sections are allowed 113 | bool IsCDataSectionAllowed { get; } 114 | 115 | /// 116 | /// Gets a value indicating whether self-closing tags should be allowed. When true, any tag may 117 | /// close itself. When false, a self-closing tag is treated like an opening-tag only. 
118 | /// 119 | 120 | bool AllowSelfClosingTags { get; } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/ParserErrorEventArgs.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 Patrick Reisert 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | using System; 24 | 25 | #pragma warning disable 1591 26 | 27 | namespace HtmlParserSharp.Common 28 | { 29 | public class ParserErrorEventArgs : EventArgs 30 | { 31 | public string Message { get; private set; } 32 | public bool IsWarning { get; private set; } 33 | 34 | public ParserErrorEventArgs(string message, bool isWarning) 35 | { 36 | Message = message; 37 | IsWarning = isWarning; 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /HtmlParserSharp/Common/XmlViolationPolicy.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2007 Henri Sivonen 3 | * Copyright (c) 2012 Patrick Reisert 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | using System; 25 | using System.Collections.Generic; 26 | using System.Linq; 27 | using System.Text; 28 | 29 | #pragma warning disable 1591 30 | 31 | namespace HtmlParserSharp.Common 32 | { 33 | /// 34 | /// Policy for XML 1.0 violations. 35 | /// 36 | /// 37 | public enum XmlViolationPolicy 38 | { 39 | /// 40 | /// Conform to HTML 5, allow XML 1.0 to be violated. 41 | /// 42 | Allow, 43 | 44 | /// 45 | /// Halt when something cannot be mapped to XML 1.0. 46 | /// 47 | Fatal, 48 | 49 | /// 50 | /// Be non-conforming and alter the infoset to fit 51 | /// XML 1.0 when something would otherwise not be 52 | /// mappable to XML 1.0. 53 | /// 54 | AlterInfoset 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /HtmlParserSharp/Core/CharsetState.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2007 Henri Sivonen 3 | * Copyright (c) 2007-2011 Mozilla Foundation 4 | * Portions of comments Copyright 2004-2008 Apple Computer, Inc., Mozilla 5 | * Foundation, and Opera Software ASA. 6 | * Copyright (c) 2012 Patrick Reisert 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a 9 | * copy of this software and associated documentation files (the "Software"), 10 | * to deal in the Software without restriction, including without limitation 11 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | * and/or sell copies of the Software, and to permit persons to whom the 13 | * Software is furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Numbered states, 0 through 11. Members 1-7 are named after the
    /// successive letters of the word "charset"; the remaining members are
    /// named for an equals sign and for single-quoted, double-quoted and
    /// unquoted forms.
    /// </summary>
    /// <remarks>
    /// NOTE(review): presumably these are the states of a matcher that scans
    /// for a <c>charset=...</c> declaration; confirm against the code that
    /// consumes this enum. The explicit numeric values are part of the
    /// contract and must not be renumbered.
    /// </remarks>
    public enum CharsetState
    {
        CHARSET_INITIAL = 0,

        CHARSET_C = 1,

        CHARSET_H = 2,

        CHARSET_A = 3,

        CHARSET_R = 4,

        CHARSET_S = 5,

        CHARSET_E = 6,

        CHARSET_T = 7,

        CHARSET_EQUALS = 8,

        CHARSET_SINGLE_QUOTED = 9,

        CHARSET_DOUBLE_QUOTED = 10,

        CHARSET_UNQUOTED = 11
    }
}
using System;
using System.Text;

#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// A common superclass for tree builders that coalesce their text nodes.
    /// Every buffer-based callback inherited from the base tree builder is
    /// converted to a <see cref="string"/> and forwarded to a string-based
    /// abstract method that concrete builders implement.
    /// </summary>
    /// <typeparam name="T">The node type handled by the tree builder (a reference type).</typeparam>
    public abstract class CoalescingTreeBuilder<T> : TreeBuilder<T> where T : class
    {
        /// <summary>
        /// Forwards a slice of a character buffer as a string.
        /// </summary>
        /// <param name="parent">The node the text is appended to.</param>
        /// <param name="buf">The buffer holding the characters.</param>
        /// <param name="start">Index of the first character in <paramref name="buf"/>.</param>
        /// <param name="length">Number of characters to take.</param>
        protected override void AppendCharacters(T parent, char[] buf, int start, int length)
        {
            AppendCharacters(parent, new string(buf, start, length));
        }

        /// <summary>
        /// Forwards the current contents of a <see cref="StringBuilder"/> as a string.
        /// </summary>
        protected override void AppendCharacters(T parent, StringBuilder sb)
        {
            AppendCharacters(parent, sb.ToString());
        }

        /// <summary>
        /// Appends the fixed prompt text used for <c>isindex</c>.
        /// </summary>
        protected override void AppendIsindexPrompt(T parent)
        {
            AppendCharacters(parent, "This is a searchable index. Enter search keywords: ");
        }

        /// <summary>
        /// Appends text to <paramref name="parent"/>.
        /// </summary>
        protected abstract void AppendCharacters(T parent, string text);

        /// <summary>
        /// Forwards a slice of a character buffer as a comment string.
        /// </summary>
        protected override void AppendComment(T parent, char[] buf, int start, int length)
        {
            AppendComment(parent, new string(buf, start, length));
        }

        /// <summary>
        /// Appends a comment to <paramref name="parent"/>.
        /// </summary>
        protected abstract void AppendComment(T parent, string comment);

        /// <summary>
        /// Forwards a slice of a character buffer as a document-level comment string.
        /// </summary>
        protected override void AppendCommentToDocument(char[] buf, int start, int length)
        {
            AppendCommentToDocument(new string(buf, start, length));
        }

        /// <summary>
        /// Appends a comment to the document itself.
        /// </summary>
        protected abstract void AppendCommentToDocument(string comment);

        // NOTE: a char[]-based InsertFosterParentedCharacters override used to
        // live here but was commented out; only the StringBuilder overload is
        // bridged to the string-based method below.

        /// <summary>
        /// Forwards foster-parented text held in a <see cref="StringBuilder"/> as a string.
        /// </summary>
        protected override void InsertFosterParentedCharacters(StringBuilder sb, T table, T stackParent)
        {
            InsertFosterParentedCharacters(sb.ToString(), table, stackParent);
        }

        /// <summary>
        /// Inserts foster-parented text, given the table node and the stack parent.
        /// </summary>
        protected abstract void InsertFosterParentedCharacters(string text, T table, T stackParent);
    }
}
6 | * Copyright (c) 2012 Patrick Reisert 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a 9 | * copy of this software and associated documentation files (the "Software"), 10 | * to deal in the Software without restriction, including without limitation 11 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | * and/or sell copies of the Software, and to permit persons to whom the 13 | * Software is furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | * DEALINGS IN THE SOFTWARE. 
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Numbered groups named after markup element names. A member whose name
    /// joins several element names with <c>_OR_</c> covers all of them with a
    /// single value; <c>OTHER</c> is 0.
    /// </summary>
    /// <remarks>
    /// The values run from 0 to 66 with one gap: no member has the value 21
    /// (<c>HEAD</c> is 20 and <c>HR</c> is 22). The explicit values must be
    /// kept as they are.
    /// NOTE(review): presumably the tree builder dispatches on these groups
    /// per element name; confirm against ElementName / TreeBuilder.
    /// </remarks>
    public enum DispatchGroup
    {
        OTHER = 0,

        A = 1,

        BASE = 2,

        BODY = 3,

        BR = 4,

        BUTTON = 5,

        CAPTION = 6,

        COL = 7,

        COLGROUP = 8,

        FORM = 9,

        FRAME = 10,

        FRAMESET = 11,

        IMAGE = 12,

        INPUT = 13,

        ISINDEX = 14,

        LI = 15,

        LINK_OR_BASEFONT_OR_BGSOUND = 16,

        MATH = 17,

        META = 18,

        SVG = 19,

        HEAD = 20,

        // 21 is intentionally left as-is: no member carries that value.
        HR = 22,

        HTML = 23,

        NOBR = 24,

        NOFRAMES = 25,

        NOSCRIPT = 26,

        OPTGROUP = 27,

        OPTION = 28,

        P = 29,

        PLAINTEXT = 30,

        SCRIPT = 31,

        SELECT = 32,

        STYLE = 33,

        TABLE = 34,

        TEXTAREA = 35,

        TITLE = 36,

        TR = 37,

        XMP = 38,

        TBODY_OR_THEAD_OR_TFOOT = 39,

        TD_OR_TH = 40,

        DD_OR_DT = 41,

        H1_OR_H2_OR_H3_OR_H4_OR_H5_OR_H6 = 42,

        MARQUEE_OR_APPLET = 43,

        PRE_OR_LISTING = 44,

        B_OR_BIG_OR_CODE_OR_EM_OR_I_OR_S_OR_SMALL_OR_STRIKE_OR_STRONG_OR_TT_OR_U = 45,

        UL_OR_OL_OR_DL = 46,

        IFRAME = 47,

        EMBED_OR_IMG = 48,

        AREA_OR_WBR = 49,

        DIV_OR_BLOCKQUOTE_OR_CENTER_OR_MENU = 50,

        ADDRESS_OR_ARTICLE_OR_ASIDE_OR_DETAILS_OR_DIR_OR_FIGCAPTION_OR_FIGURE_OR_FOOTER_OR_HEADER_OR_HGROUP_OR_NAV_OR_SECTION_OR_SUMMARY = 51,

        RUBY_OR_SPAN_OR_SUB_OR_SUP_OR_VAR = 52,

        RT_OR_RP = 53,

        COMMAND = 54,

        PARAM_OR_SOURCE_OR_TRACK = 55,

        MGLYPH_OR_MALIGNMARK = 56,

        MI_MO_MN_MS_MTEXT = 57,

        ANNOTATION_XML = 58,

        FOREIGNOBJECT_OR_DESC = 59,

        NOEMBED = 60,

        FIELDSET = 61,

        OUTPUT_OR_LABEL = 62,

        OBJECT = 63,

        FONT = 64,

        KEYGEN = 65,

        MENUITEM = 66
    }
}
using System;
using System.Diagnostics;
using HtmlParserSharp.Common;

#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Holder for the attributes of one element: a list of ordinary
    /// attributes (name/value pairs) plus a separate list of xmlns
    /// attributes, together with the mode used when names are resolved.
    /// Be careful with this class. QName is the name as produced by HTML
    /// tokenization. Otherwise, please refer to the interface doc.
    /// </summary>
    public sealed class HtmlAttributes : IEquatable<HtmlAttributes> /* : Sax.IAttributes*/
    {
        // [NOCPP[

        private static readonly AttributeName[] EMPTY_ATTRIBUTENAMES = new AttributeName[0];

        private static readonly string[] EMPTY_stringS = new string[0];

        // ]NOCPP]

        public static readonly HtmlAttributes EMPTY_ATTRIBUTES = new HtmlAttributes(AttributeName.HTML);

        private int mode;

        private int length;

        private AttributeName[] names;

        private string[] values;

        // [NOCPP[

        private string idValue;

        private int xmlnsLength;

        private AttributeName[] xmlnsNames;

        private string[] xmlnsValues;

        // ]NOCPP]

        /// <summary>
        /// Creates an empty attribute holder.
        /// </summary>
        /// <param name="mode">The mode passed to <see cref="AttributeName"/> lookups.</param>
        public HtmlAttributes(int mode)
        {
            this.mode = mode;
            this.length = 0;
            // The length of 5 covers 98.3% of elements according to Hixie
            this.names = new AttributeName[5];
            this.values = new string[5];

            // [NOCPP[
            this.idValue = null;
            this.xmlnsLength = 0;
            // The empty arrays are shared; AddAttribute replaces them on first growth.
            this.xmlnsNames = HtmlAttributes.EMPTY_ATTRIBUTENAMES;
            this.xmlnsValues = HtmlAttributes.EMPTY_stringS;
            // ]NOCPP]
        }

        /// <summary>True when <paramref name="index"/> addresses a stored ordinary attribute.</summary>
        private bool IsValidIndex(int index)
        {
            return index >= 0 && index < length;
        }

        /// <summary>True when <paramref name="index"/> addresses a stored xmlns attribute.</summary>
        private bool IsValidXmlnsIndex(int index)
        {
            return index >= 0 && index < xmlnsLength;
        }

        /// <summary>
        /// Finds an attribute by name object. Only use with a static argument.
        /// </summary>
        /// <returns>The index, or -1 when not present.</returns>
        public int GetIndex(AttributeName name)
        {
            for (int i = 0; i < length; i++)
            {
                if (names[i] == name)
                {
                    return i;
                }
            }
            return -1;
        }

        // [NOCPP[

        /// <summary>
        /// Finds an attribute by qualified name in the current mode.
        /// </summary>
        /// <returns>The index, or -1 when not present.</returns>
        public int GetIndex(string qName)
        {
            for (int i = 0; i < length; i++)
            {
                if (names[i].GetQName(mode) == qName)
                {
                    return i;
                }
            }
            return -1;
        }

        /// <summary>
        /// Finds an attribute by namespace URI and local name in the current mode.
        /// </summary>
        /// <returns>The index, or -1 when not present.</returns>
        public int GetIndex(string uri, string localName)
        {
            for (int i = 0; i < length; i++)
            {
                if (names[i].GetLocal(mode) == localName
                        && names[i].GetUri(mode) == uri)
                {
                    return i;
                }
            }
            return -1;
        }

        /// <summary>Type ("ID" or "CDATA") of the attribute with the given qualified name, or null.</summary>
        public string GetType(string qName)
        {
            int index = GetIndex(qName);
            return index == -1 ? null : GetType(index);
        }

        /// <summary>Type ("ID" or "CDATA") of the attribute with the given URI and local name, or null.</summary>
        public string GetType(string uri, string localName)
        {
            int index = GetIndex(uri, localName);
            return index == -1 ? null : GetType(index);
        }

        /// <summary>Value of the attribute with the given qualified name, or null.</summary>
        public string GetValue(string qName)
        {
            int index = GetIndex(qName);
            return index == -1 ? null : GetValue(index);
        }

        /// <summary>Value of the attribute with the given URI and local name, or null.</summary>
        public string GetValue(string uri, string localName)
        {
            int index = GetIndex(uri, localName);
            return index == -1 ? null : GetValue(index);
        }

        // ]NOCPP]

        /// <summary>Number of ordinary attributes stored.</summary>
        public int Length
        {
            get
            {
                return length;
            }
        }

        /// <summary>Local name at <paramref name="index"/>, or null when the index is out of range.</summary>
        [Local]
        public string GetLocalName(int index)
        {
            return IsValidIndex(index) ? names[index].GetLocal(mode) : null;
        }

        // [NOCPP[

        /// <summary>Qualified name at <paramref name="index"/>, or null when the index is out of range.</summary>
        public string GetQName(int index)
        {
            return IsValidIndex(index) ? names[index].GetQName(mode) : null;
        }

        /// <summary>
        /// "ID" when the attribute at <paramref name="index"/> is <c>AttributeName.ID</c>,
        /// "CDATA" for any other attribute, null when the index is out of range.
        /// </summary>
        public string GetType(int index)
        {
            if (!IsValidIndex(index))
            {
                return null;
            }
            return (names[index] == AttributeName.ID) ? "ID" : "CDATA";
        }

        // ]NOCPP]

        /// <summary>Name object at <paramref name="index"/>, or null when the index is out of range.</summary>
        public AttributeName GetAttributeName(int index)
        {
            return IsValidIndex(index) ? names[index] : null;
        }

        /// <summary>Namespace URI at <paramref name="index"/>, or null when the index is out of range.</summary>
        [NsUri]
        public string GetURI(int index)
        {
            return IsValidIndex(index) ? names[index].GetUri(mode) : null;
        }

        /// <summary>Prefix at <paramref name="index"/>, or null when the index is out of range.</summary>
        [Prefix]
        public string GetPrefix(int index)
        {
            return IsValidIndex(index) ? names[index].GetPrefix(mode) : null;
        }

        /// <summary>Value at <paramref name="index"/>, or null when the index is out of range.</summary>
        public string GetValue(int index)
        {
            return IsValidIndex(index) ? values[index] : null;
        }

        /// <summary>
        /// Value of the attribute with the given name object, or null.
        /// Only use with static argument.
        /// </summary>
        public string GetValue(AttributeName name)
        {
            int index = GetIndex(name);
            return index == -1 ? null : GetValue(index);
        }

        // [NOCPP[

        /// <summary>Value of the most recently added <c>AttributeName.ID</c> attribute, or null.</summary>
        public string Id
        {
            get
            {
                return idValue;
            }
        }

        /// <summary>Number of xmlns attributes stored.</summary>
        public int XmlnsLength
        {
            get
            {
                return xmlnsLength;
            }
        }

        /// <summary>Local name of the xmlns attribute at <paramref name="index"/>, or null when out of range.</summary>
        [Local]
        public string GetXmlnsLocalName(int index)
        {
            return IsValidXmlnsIndex(index) ? xmlnsNames[index].GetLocal(mode) : null;
        }

        /// <summary>Namespace URI of the xmlns attribute at <paramref name="index"/>, or null when out of range.</summary>
        [NsUri]
        public string GetXmlnsURI(int index)
        {
            return IsValidXmlnsIndex(index) ? xmlnsNames[index].GetUri(mode) : null;
        }

        /// <summary>Value of the xmlns attribute at <paramref name="index"/>, or null when out of range.</summary>
        public string GetXmlnsValue(int index)
        {
            return IsValidXmlnsIndex(index) ? xmlnsValues[index] : null;
        }

        /// <summary>Index of the given name in the xmlns list, or -1 when not present.</summary>
        public int GetXmlnsIndex(AttributeName name)
        {
            for (int i = 0; i < xmlnsLength; i++)
            {
                if (xmlnsNames[i] == name)
                {
                    return i;
                }
            }
            return -1;
        }

        /// <summary>Value of the xmlns attribute with the given name object, or null.</summary>
        public string GetXmlnsValue(AttributeName name)
        {
            int index = GetXmlnsIndex(name);
            return index == -1 ? null : GetXmlnsValue(index);
        }

        /// <summary>Name object of the xmlns attribute at <paramref name="index"/>, or null when out of range.</summary>
        public AttributeName GetXmlnsAttributeName(int index)
        {
            return IsValidXmlnsIndex(index) ? xmlnsNames[index] : null;
        }

        // ]NOCPP]

        /// <summary>
        /// Appends an attribute. An <c>AttributeName.ID</c> attribute also updates
        /// <see cref="Id"/>. An xmlns attribute is always recorded in the xmlns
        /// list; <paramref name="xmlnsPolicy"/> then decides whether it is also
        /// appended to the ordinary list (<c>Allow</c>), kept out of it
        /// (<c>AlterInfoset</c>), or rejected (<c>Fatal</c>).
        /// </summary>
        /// <exception cref="Exception">
        /// The name is an xmlns attribute and the policy is <c>Fatal</c>. The
        /// attribute has already been recorded in the xmlns list at that point.
        /// </exception>
        internal void AddAttribute(AttributeName name, string value
            // [NOCPP[
            , XmlViolationPolicy xmlnsPolicy
            // ]NOCPP]
        )
        {
            // [NOCPP[
            if (name == AttributeName.ID)
            {
                idValue = value;
            }

            if (name.IsXmlns)
            {
                if (xmlnsNames.Length == xmlnsLength)
                {
                    // Array.Resize allocates a fresh array, so the shared empty
                    // arrays assigned by the constructor are never mutated.
                    int newLen = xmlnsLength == 0 ? 2 : xmlnsLength << 1;
                    Array.Resize(ref xmlnsNames, newLen);
                    Array.Resize(ref xmlnsValues, newLen);
                }
                xmlnsNames[xmlnsLength] = name;
                xmlnsValues[xmlnsLength] = value;
                xmlnsLength++;
                switch (xmlnsPolicy)
                {
                    case XmlViolationPolicy.Fatal:
                        // this is ugly (TODO)
                        throw new Exception("Saw an xmlns attribute.");
                    case XmlViolationPolicy.AlterInfoset:
                        return;
                    case XmlViolationPolicy.Allow:
                        break; // continue and add to the ordinary list as well
                }
            }

            // ]NOCPP]

            if (names.Length == length)
            {
                // The first growth covers virtually 100% of elements according to Hixie.
                // The initial capacity is 5, so doubling never starts from zero.
                int newLen = length << 1;
                Array.Resize(ref names, newLen);
                Array.Resize(ref values, newLen);
            }
            names[length] = name;
            values[length] = value;
            length++;
        }

        /// <summary>
        /// Removes every ordinary and xmlns attribute, forgets the id value
        /// and switches to mode <paramref name="m"/>. The backing arrays are kept.
        /// </summary>
        internal void Clear(int m)
        {
            Array.Clear(names, 0, length);
            Array.Clear(values, 0, length);
            length = 0;
            mode = m;
            // [NOCPP[
            idValue = null;
            Array.Clear(xmlnsNames, 0, xmlnsLength);
            Array.Clear(xmlnsValues, 0, xmlnsLength);
            xmlnsLength = 0;
            // ]NOCPP]
        }

        /// <summary>
        /// This is only used for AttributeName ownership transfer
        /// in the isindex case to avoid freeing custom names twice in C++.
        /// Only the ordinary list is emptied; the mode, the id value and the
        /// xmlns list are left untouched.
        /// </summary>
        internal void ClearWithoutReleasingContents()
        {
            Array.Clear(names, 0, length);
            Array.Clear(values, 0, length);
            length = 0;
        }

        /// <summary>
        /// Whether <paramref name="name"/> occurs in the ordinary list or in the
        /// xmlns list, compared with <c>name.Equals</c>.
        /// </summary>
        public bool Contains(AttributeName name)
        {
            for (int i = 0; i < length; i++)
            {
                if (name.Equals(names[i]))
                {
                    return true;
                }
            }
            // [NOCPP[
            for (int i = 0; i < xmlnsLength; i++)
            {
                if (name.Equals(xmlnsNames[i]))
                {
                    return true;
                }
            }
            // ]NOCPP]
            return false;
        }

        /// <summary>Switches the mode to <c>AttributeName.MATHML</c>.</summary>
        public void AdjustForMath()
        {
            mode = AttributeName.MATHML;
        }

        /// <summary>Switches the mode to <c>AttributeName.SVG</c>.</summary>
        public void AdjustForSvg()
        {
            mode = AttributeName.SVG;
        }

        /// <summary>
        /// Builds a new holder in mode 0 by re-adding every ordinary attribute
        /// (with a cloned name) and then every xmlns attribute, all under the
        /// <c>Allow</c> policy.
        /// </summary>
        /// <remarks>
        /// NOTE(review): because <c>Allow</c> makes AddAttribute record an xmlns
        /// name in both lists, a source that holds xmlns attributes produces a
        /// clone whose lists are not identical to the source's (assuming the
        /// cloned name still reports IsXmlns). Behavior is kept as it was;
        /// confirm the intent before changing it.
        /// </remarks>
        public HtmlAttributes CloneAttributes()
        {
            Debug.Assert((length == 0 && xmlnsLength == 0) || mode == 0 || mode == 3);
            HtmlAttributes clone = new HtmlAttributes(0);
            for (int i = 0; i < length; i++)
            {
                clone.AddAttribute(names[i].CloneAttributeName(), values[i]
                    // [NOCPP[
                    , XmlViolationPolicy.Allow
                    // ]NOCPP]
                );
            }
            // [NOCPP[
            for (int i = 0; i < xmlnsLength; i++)
            {
                clone.AddAttribute(xmlnsNames[i],
                        xmlnsValues[i], XmlViolationPolicy.Allow);
            }
            // ]NOCPP]
            return clone; // XXX!!!
        }

        /// <summary>
        /// Compares the ordinary attributes of two holders: the counts must
        /// match and every own attribute must have a counterpart in
        /// <paramref name="other"/> with the same HTML local name and value.
        /// </summary>
        /// <param name="other">The holder to compare with; null compares unequal.</param>
        public bool Equals(HtmlAttributes other)
        {
            if (other == null)
            {
                // IEquatable<T>.Equals must report false for null rather than throw.
                return false;
            }
            Debug.Assert(mode == 0 || mode == 3, "Trying to compare attributes in foreign content.");
            int otherLength = other.Length;
            if (length != otherLength)
            {
                return false;
            }
            for (int i = 0; i < length; i++)
            {
                bool found = false;
                // Comparing just the local names is OK, since these attribute
                // holders are both supposed to belong to HTML formatting elements
                /*[Local]*/
                string ownLocal = names[i].GetLocal(AttributeName.HTML);
                for (int j = 0; j < otherLength; j++)
                {
                    if (ownLocal == other.names[j].GetLocal(AttributeName.HTML))
                    {
                        found = true;
                        if (values[i] != other.values[j])
                        {
                            return false;
                        }
                    }
                }
                if (!found)
                {
                    return false;
                }
            }
            return true;
        }

        // [NOCPP[

        /// <summary>
        /// Checks every ordinary attribute name against <c>IsNcName</c> in the
        /// current mode and applies <paramref name="namePolicy"/> to each one
        /// that fails: <c>AlterInfoset</c> replaces the name with an escaped one
        /// and then warns like <c>Allow</c>; <c>Allow</c> warns (except for
        /// <c>AttributeName.XML_LANG</c>); <c>Fatal</c> reports a fatal error.
        /// </summary>
        /// <typeparam name="T">The node type of the tree builder.</typeparam>
        /// <param name="treeBuilder">Receives the warnings and fatal errors.</param>
        /// <param name="namePolicy">The policy to apply to offending names.</param>
        internal void ProcessNonNcNames<T>(TreeBuilder<T> treeBuilder, XmlViolationPolicy namePolicy) where T : class
        {
            for (int i = 0; i < length; i++)
            {
                AttributeName attName = names[i];
                if (!attName.IsNcName(mode))
                {
                    string name = attName.GetLocal(mode);
                    switch (namePolicy)
                    {
                        case XmlViolationPolicy.AlterInfoset:
                            names[i] = AttributeName.Create(NCName.EscapeName(name));
                            goto case XmlViolationPolicy.Allow; // fall through
                        case XmlViolationPolicy.Allow:
                            // attName is still the original (unescaped) name here.
                            if (attName != AttributeName.XML_LANG)
                            {
                                treeBuilder.Warn("Attribute \u201C" + name + "\u201D is not serializable as XML 1.0.");
                            }
                            break;
                        case XmlViolationPolicy.Fatal:
                            treeBuilder.Fatal("Attribute \u201C" + name + "\u201D is not serializable as XML 1.0.");
                            break;
                    }
                }
            }
        }

        /// <summary>
        /// Adds every ordinary attribute of <paramref name="attributes"/> whose
        /// name this holder does not already contain, using the <c>Allow</c> policy.
        /// </summary>
        public void Merge(HtmlAttributes attributes)
        {
            int len = attributes.Length;
            for (int i = 0; i < len; i++)
            {
                AttributeName name = attributes.GetAttributeName(i);
                if (!Contains(name))
                {
                    AddAttribute(name, attributes.GetValue(i), XmlViolationPolicy.Allow);
                }
            }
        }

        // ]NOCPP]
    }
}
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Interface for getting the current line and column
    /// (Corresponds to the SAX Locator interface).
    /// This is implemented by Locator and Tokenizer.
    /// </summary>
    public interface ILocator
    {
        /// <summary>Gets the current line number.</summary>
        int LineNumber { get; }

        /// <summary>Gets the current column number.</summary>
        int ColumnNumber { get; }
    }
}
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Interface for exposing the state of the HTML5 tree builder so that the
    /// interface can be implemented by the tree builder itself and by snapshots.
    /// </summary>
    /// <typeparam name="T">The node type handled by the tree builder (a reference type).</typeparam>
    public interface ITreeBuilderState<T> where T : class
    {
        /// <summary>
        /// Gets the stack.
        /// </summary>
        /// <value>The stack</value>
        StackNode<T>[] Stack { get; }

        /// <summary>
        /// Gets the list of active formatting elements.
        /// </summary>
        /// <value>The list of active formatting elements.</value>
        StackNode<T>[] ListOfActiveFormattingElements { get; }

        /// <summary>
        /// Gets the form pointer.
        /// </summary>
        /// <value>The form pointer</value>
        T FormPointer { get; }

        /// <summary>
        /// Gets the head pointer.
        /// </summary>
        /// <value>The head pointer.</value>
        T HeadPointer { get; }

        /// <summary>
        /// Gets the deep tree surrogate parent.
        /// </summary>
        /// <value>The deep tree surrogate parent.</value>
        T DeepTreeSurrogateParent { get; }

        /// <summary>
        /// Gets the mode.
        /// </summary>
        /// <value>The mode.</value>
        InsertionMode Mode { get; }

        /// <summary>
        /// Gets the original mode.
        /// </summary>
        /// <value>The original mode.</value>
        InsertionMode OriginalMode { get; }

        /// <summary>
        /// Determines whether the frameset is OK.
        /// </summary>
        /// <value>
        /// <c>true</c> if the frameset is OK; otherwise, <c>false</c>.
        /// </value>
        bool IsFramesetOk { get; }

        /// <summary>
        /// Determines whether we need to drop LF.
        /// </summary>
        /// <value>
        /// <c>true</c> if we need to drop LF; otherwise, <c>false</c>.
        /// </value>
        bool IsNeedToDropLF { get; }

        /// <summary>
        /// Determines whether this instance is in quirks mode.
        /// </summary>
        /// <value>
        /// <c>true</c> if this instance is in quirks mode; otherwise, <c>false</c>.
        /// </value>
        bool IsQuirks { get; }
    }
}
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Numbered insertion modes, 0 through 21 without gaps. The explicit
    /// values are part of the contract and must not be renumbered.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the member names look like the insertion modes of the
    /// HTML5 tree construction algorithm; <c>FRAMESET_OK</c> shares the
    /// numbering but its role is not visible from this file. Confirm against
    /// TreeBuilder.
    /// </remarks>
    public enum InsertionMode
    {
        INITIAL = 0,

        BEFORE_HTML = 1,

        BEFORE_HEAD = 2,

        IN_HEAD = 3,

        IN_HEAD_NOSCRIPT = 4,

        AFTER_HEAD = 5,

        IN_BODY = 6,

        IN_TABLE = 7,

        IN_CAPTION = 8,

        IN_COLUMN_GROUP = 9,

        IN_TABLE_BODY = 10,

        IN_ROW = 11,

        IN_CELL = 12,

        IN_SELECT = 13,

        IN_SELECT_IN_TABLE = 14,

        AFTER_BODY = 15,

        IN_FRAMESET = 16,

        AFTER_FRAMESET = 17,

        AFTER_AFTER_BODY = 18,

        AFTER_AFTER_FRAMESET = 19,

        TEXT = 20,

        FRAMESET_OK = 21
    }
}
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// A fixed copy of the position reported by another <see cref="ILocator"/>.
    /// The line and column are read once in the constructor; the setters are private,
    /// so the values cannot be changed from outside this class afterwards.
    /// </summary>
    public class Locator : ILocator
    {
        /// <summary>Column number copied from the source locator at construction time.</summary>
        public int ColumnNumber { get; private set; }

        /// <summary>Line number copied from the source locator at construction time.</summary>
        public int LineNumber { get; private set; }

        /// <summary>
        /// Creates a locator holding the current column and line of <paramref name="locator"/>.
        /// </summary>
        /// <param name="locator">The locator whose position is copied; must not be null.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="locator"/> is null.
        /// </exception>
        public Locator(ILocator locator)
        {
            // Fail with a named argument instead of an anonymous NullReferenceException.
            // Fully qualified because this file has no 'using System;'.
            if (locator == null)
            {
                throw new System.ArgumentNullException("locator");
            }

            ColumnNumber = locator.ColumnNumber;
            LineNumber = locator.LineNumber;
        }
    }
}
using System;
using System.Text;

#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Character-class tests for colon-free names (NCNames) and a helper that escapes
    /// an arbitrary string into such a name.
    /// </summary>
    /// <remarks>
    /// All tests work on single UTF-16 code units, so no character outside the Basic
    /// Multilingual Plane is ever accepted as a name character.
    /// NOTE(review): the range tables appear to follow the XML 1.0 Appendix B character
    /// classes (Letter, Digit, CombiningChar, Extender) - verify against the
    /// specification before editing any range.
    /// </remarks>
    public sealed class NCName
    {
        // [NOCPP[

        // Added to (high << 10) + low to turn a UTF-16 surrogate pair into its code point.
        private const int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;

        private static readonly char[] HEX_TABLE = "0123456789ABCDEF".ToCharArray();

        /// <summary>
        /// Tells whether <paramref name="c"/> may be the first character of an NCName:
        /// one of the listed letter/ideograph ranges or an underscore.
        /// </summary>
        /// <param name="c">The UTF-16 code unit to test.</param>
        /// <returns>true if the character is allowed in first position; otherwise false.</returns>
        public static bool IsNCNameStart(char c)
        {
            return (c >= '\u0041' && c <= '\u005A') || (c >= '\u0061' && c <= '\u007A')
                || (c >= '\u00C0' && c <= '\u00D6') || (c >= '\u00D8' && c <= '\u00F6')
                || (c >= '\u00F8' && c <= '\u00FF') || (c >= '\u0100' && c <= '\u0131')
                || (c >= '\u0134' && c <= '\u013E') || (c >= '\u0141' && c <= '\u0148')
                || (c >= '\u014A' && c <= '\u017E') || (c >= '\u0180' && c <= '\u01C3')
                || (c >= '\u01CD' && c <= '\u01F0') || (c >= '\u01F4' && c <= '\u01F5')
                || (c >= '\u01FA' && c <= '\u0217') || (c >= '\u0250' && c <= '\u02A8')
                || (c >= '\u02BB' && c <= '\u02C1') || (c == '\u0386')
                || (c >= '\u0388' && c <= '\u038A') || (c == '\u038C')
                || (c >= '\u038E' && c <= '\u03A1') || (c >= '\u03A3' && c <= '\u03CE')
                || (c >= '\u03D0' && c <= '\u03D6') || (c == '\u03DA')
                || (c == '\u03DC') || (c == '\u03DE') || (c == '\u03E0')
                || (c >= '\u03E2' && c <= '\u03F3') || (c >= '\u0401' && c <= '\u040C')
                || (c >= '\u040E' && c <= '\u044F') || (c >= '\u0451' && c <= '\u045C')
                || (c >= '\u045E' && c <= '\u0481') || (c >= '\u0490' && c <= '\u04C4')
                || (c >= '\u04C7' && c <= '\u04C8') || (c >= '\u04CB' && c <= '\u04CC')
                || (c >= '\u04D0' && c <= '\u04EB') || (c >= '\u04EE' && c <= '\u04F5')
                || (c >= '\u04F8' && c <= '\u04F9')
                || (c >= '\u0531' && c <= '\u0556') || (c == '\u0559')
                || (c >= '\u0561' && c <= '\u0586') || (c >= '\u05D0' && c <= '\u05EA')
                || (c >= '\u05F0' && c <= '\u05F2') || (c >= '\u0621' && c <= '\u063A')
                || (c >= '\u0641' && c <= '\u064A') || (c >= '\u0671' && c <= '\u06B7')
                || (c >= '\u06BA' && c <= '\u06BE') || (c >= '\u06C0' && c <= '\u06CE')
                || (c >= '\u06D0' && c <= '\u06D3') || (c == '\u06D5')
                || (c >= '\u06E5' && c <= '\u06E6')
                || (c >= '\u0905' && c <= '\u0939') || (c == '\u093D')
                || (c >= '\u0958' && c <= '\u0961') || (c >= '\u0985' && c <= '\u098C')
                || (c >= '\u098F' && c <= '\u0990') || (c >= '\u0993' && c <= '\u09A8')
                || (c >= '\u09AA' && c <= '\u09B0') || (c == '\u09B2')
                || (c >= '\u09B6' && c <= '\u09B9') || (c >= '\u09DC' && c <= '\u09DD')
                || (c >= '\u09DF' && c <= '\u09E1') || (c >= '\u09F0' && c <= '\u09F1')
                || (c >= '\u0A05' && c <= '\u0A0A') || (c >= '\u0A0F' && c <= '\u0A10')
                || (c >= '\u0A13' && c <= '\u0A28') || (c >= '\u0A2A' && c <= '\u0A30')
                || (c >= '\u0A32' && c <= '\u0A33') || (c >= '\u0A35' && c <= '\u0A36')
                || (c >= '\u0A38' && c <= '\u0A39')
                || (c >= '\u0A59' && c <= '\u0A5C') || (c == '\u0A5E')
                || (c >= '\u0A72' && c <= '\u0A74')
                || (c >= '\u0A85' && c <= '\u0A8B') || (c == '\u0A8D')
                || (c >= '\u0A8F' && c <= '\u0A91') || (c >= '\u0A93' && c <= '\u0AA8')
                || (c >= '\u0AAA' && c <= '\u0AB0') || (c >= '\u0AB2' && c <= '\u0AB3')
                || (c >= '\u0AB5' && c <= '\u0AB9') || (c == '\u0ABD')
                || (c == '\u0AE0') || (c >= '\u0B05' && c <= '\u0B0C')
                || (c >= '\u0B0F' && c <= '\u0B10') || (c >= '\u0B13' && c <= '\u0B28')
                || (c >= '\u0B2A' && c <= '\u0B30') || (c >= '\u0B32' && c <= '\u0B33')
                || (c >= '\u0B36' && c <= '\u0B39') || (c == '\u0B3D')
                || (c >= '\u0B5C' && c <= '\u0B5D') || (c >= '\u0B5F' && c <= '\u0B61')
                || (c >= '\u0B85' && c <= '\u0B8A') || (c >= '\u0B8E' && c <= '\u0B90')
                || (c >= '\u0B92' && c <= '\u0B95')
                || (c >= '\u0B99' && c <= '\u0B9A') || (c == '\u0B9C')
                || (c >= '\u0B9E' && c <= '\u0B9F') || (c >= '\u0BA3' && c <= '\u0BA4')
                || (c >= '\u0BA8' && c <= '\u0BAA') || (c >= '\u0BAE' && c <= '\u0BB5')
                || (c >= '\u0BB7' && c <= '\u0BB9') || (c >= '\u0C05' && c <= '\u0C0C')
                || (c >= '\u0C0E' && c <= '\u0C10') || (c >= '\u0C12' && c <= '\u0C28')
                || (c >= '\u0C2A' && c <= '\u0C33') || (c >= '\u0C35' && c <= '\u0C39')
                || (c >= '\u0C60' && c <= '\u0C61') || (c >= '\u0C85' && c <= '\u0C8C')
                || (c >= '\u0C8E' && c <= '\u0C90') || (c >= '\u0C92' && c <= '\u0CA8')
                || (c >= '\u0CAA' && c <= '\u0CB3')
                || (c >= '\u0CB5' && c <= '\u0CB9') || (c == '\u0CDE')
                || (c >= '\u0CE0' && c <= '\u0CE1') || (c >= '\u0D05' && c <= '\u0D0C')
                || (c >= '\u0D0E' && c <= '\u0D10') || (c >= '\u0D12' && c <= '\u0D28')
                || (c >= '\u0D2A' && c <= '\u0D39') || (c >= '\u0D60' && c <= '\u0D61')
                || (c >= '\u0E01' && c <= '\u0E2E') || (c == '\u0E30')
                || (c >= '\u0E32' && c <= '\u0E33') || (c >= '\u0E40' && c <= '\u0E45')
                || (c >= '\u0E81' && c <= '\u0E82') || (c == '\u0E84')
                || (c >= '\u0E87' && c <= '\u0E88') || (c == '\u0E8A')
                || (c == '\u0E8D') || (c >= '\u0E94' && c <= '\u0E97')
                || (c >= '\u0E99' && c <= '\u0E9F')
                || (c >= '\u0EA1' && c <= '\u0EA3') || (c == '\u0EA5')
                || (c == '\u0EA7') || (c >= '\u0EAA' && c <= '\u0EAB')
                || (c >= '\u0EAD' && c <= '\u0EAE') || (c == '\u0EB0')
                || (c >= '\u0EB2' && c <= '\u0EB3') || (c == '\u0EBD')
                || (c >= '\u0EC0' && c <= '\u0EC4') || (c >= '\u0F40' && c <= '\u0F47')
                || (c >= '\u0F49' && c <= '\u0F69') || (c >= '\u10A0' && c <= '\u10C5')
                || (c >= '\u10D0' && c <= '\u10F6') || (c == '\u1100')
                || (c >= '\u1102' && c <= '\u1103')
                || (c >= '\u1105' && c <= '\u1107') || (c == '\u1109')
                || (c >= '\u110B' && c <= '\u110C')
                || (c >= '\u110E' && c <= '\u1112') || (c == '\u113C')
                || (c == '\u113E') || (c == '\u1140') || (c == '\u114C')
                || (c == '\u114E') || (c == '\u1150')
                || (c >= '\u1154' && c <= '\u1155') || (c == '\u1159')
                || (c >= '\u115F' && c <= '\u1161') || (c == '\u1163')
                || (c == '\u1165') || (c == '\u1167') || (c == '\u1169')
                || (c >= '\u116D' && c <= '\u116E')
                || (c >= '\u1172' && c <= '\u1173') || (c == '\u1175')
                || (c == '\u119E') || (c == '\u11A8') || (c == '\u11AB')
                || (c >= '\u11AE' && c <= '\u11AF')
                || (c >= '\u11B7' && c <= '\u11B8') || (c == '\u11BA')
                || (c >= '\u11BC' && c <= '\u11C2') || (c == '\u11EB')
                || (c == '\u11F0') || (c == '\u11F9')
                || (c >= '\u1E00' && c <= '\u1E9B') || (c >= '\u1EA0' && c <= '\u1EF9')
                || (c >= '\u1F00' && c <= '\u1F15') || (c >= '\u1F18' && c <= '\u1F1D')
                || (c >= '\u1F20' && c <= '\u1F45') || (c >= '\u1F48' && c <= '\u1F4D')
                || (c >= '\u1F50' && c <= '\u1F57') || (c == '\u1F59')
                || (c == '\u1F5B') || (c == '\u1F5D')
                || (c >= '\u1F5F' && c <= '\u1F7D') || (c >= '\u1F80' && c <= '\u1FB4')
                || (c >= '\u1FB6' && c <= '\u1FBC') || (c == '\u1FBE')
                || (c >= '\u1FC2' && c <= '\u1FC4') || (c >= '\u1FC6' && c <= '\u1FCC')
                || (c >= '\u1FD0' && c <= '\u1FD3') || (c >= '\u1FD6' && c <= '\u1FDB')
                || (c >= '\u1FE0' && c <= '\u1FEC') || (c >= '\u1FF2' && c <= '\u1FF4')
                || (c >= '\u1FF6' && c <= '\u1FFC') || (c == '\u2126')
                || (c >= '\u212A' && c <= '\u212B') || (c == '\u212E')
                || (c >= '\u2180' && c <= '\u2182') || (c >= '\u3041' && c <= '\u3094')
                || (c >= '\u30A1' && c <= '\u30FA') || (c >= '\u3105' && c <= '\u312C')
                || (c >= '\uAC00' && c <= '\uD7A3')
                || (c >= '\u4E00' && c <= '\u9FA5') || (c == '\u3007')
                || (c >= '\u3021' && c <= '\u3029') || (c == '_');
        }

        /// <summary>
        /// Tells whether <paramref name="c"/> may appear after the first character of an
        /// NCName: anything accepted by <see cref="IsNCNameStart"/>, plus '.', '-' and
        /// the digit, combining-mark and extender ranges listed in the private helpers.
        /// </summary>
        /// <param name="c">The UTF-16 code unit to test.</param>
        /// <returns>true if the character is allowed in a non-first position; otherwise false.</returns>
        public static bool IsNCNameTrail(char c)
        {
            // The start-character table used to be duplicated verbatim in this method;
            // delegating keeps the two tests from drifting apart.
            return IsNCNameStart(c)
                || (c == '.') || (c == '-')
                || IsDigitChar(c)
                || IsCombiningChar(c)
                || IsExtenderChar(c);
        }

        /// <summary>Decimal-digit ranges accepted in trailing position only.</summary>
        private static bool IsDigitChar(char c)
        {
            return (c >= '\u0030' && c <= '\u0039') || (c >= '\u0660' && c <= '\u0669')
                || (c >= '\u06F0' && c <= '\u06F9') || (c >= '\u0966' && c <= '\u096F')
                || (c >= '\u09E6' && c <= '\u09EF') || (c >= '\u0A66' && c <= '\u0A6F')
                || (c >= '\u0AE6' && c <= '\u0AEF') || (c >= '\u0B66' && c <= '\u0B6F')
                || (c >= '\u0BE7' && c <= '\u0BEF') || (c >= '\u0C66' && c <= '\u0C6F')
                || (c >= '\u0CE6' && c <= '\u0CEF') || (c >= '\u0D66' && c <= '\u0D6F')
                || (c >= '\u0E50' && c <= '\u0E59') || (c >= '\u0ED0' && c <= '\u0ED9')
                || (c >= '\u0F20' && c <= '\u0F29');
        }

        /// <summary>Combining-mark ranges accepted in trailing position only.</summary>
        private static bool IsCombiningChar(char c)
        {
            return (c >= '\u0300' && c <= '\u0345') || (c >= '\u0360' && c <= '\u0361')
                || (c >= '\u0483' && c <= '\u0486') || (c >= '\u0591' && c <= '\u05A1')
                || (c >= '\u05A3' && c <= '\u05B9')
                || (c >= '\u05BB' && c <= '\u05BD') || (c == '\u05BF')
                || (c >= '\u05C1' && c <= '\u05C2') || (c == '\u05C4')
                || (c >= '\u064B' && c <= '\u0652') || (c == '\u0670')
                || (c >= '\u06D6' && c <= '\u06DC') || (c >= '\u06DD' && c <= '\u06DF')
                || (c >= '\u06E0' && c <= '\u06E4') || (c >= '\u06E7' && c <= '\u06E8')
                || (c >= '\u06EA' && c <= '\u06ED')
                || (c >= '\u0901' && c <= '\u0903') || (c == '\u093C')
                || (c >= '\u093E' && c <= '\u094C') || (c == '\u094D')
                || (c >= '\u0951' && c <= '\u0954') || (c >= '\u0962' && c <= '\u0963')
                || (c >= '\u0981' && c <= '\u0983') || (c == '\u09BC')
                || (c == '\u09BE') || (c == '\u09BF')
                || (c >= '\u09C0' && c <= '\u09C4') || (c >= '\u09C7' && c <= '\u09C8')
                || (c >= '\u09CB' && c <= '\u09CD') || (c == '\u09D7')
                || (c >= '\u09E2' && c <= '\u09E3') || (c == '\u0A02')
                || (c == '\u0A3C') || (c == '\u0A3E') || (c == '\u0A3F')
                || (c >= '\u0A40' && c <= '\u0A42') || (c >= '\u0A47' && c <= '\u0A48')
                || (c >= '\u0A4B' && c <= '\u0A4D') || (c >= '\u0A70' && c <= '\u0A71')
                || (c >= '\u0A81' && c <= '\u0A83') || (c == '\u0ABC')
                || (c >= '\u0ABE' && c <= '\u0AC5') || (c >= '\u0AC7' && c <= '\u0AC9')
                || (c >= '\u0ACB' && c <= '\u0ACD')
                || (c >= '\u0B01' && c <= '\u0B03') || (c == '\u0B3C')
                || (c >= '\u0B3E' && c <= '\u0B43') || (c >= '\u0B47' && c <= '\u0B48')
                || (c >= '\u0B4B' && c <= '\u0B4D') || (c >= '\u0B56' && c <= '\u0B57')
                || (c >= '\u0B82' && c <= '\u0B83') || (c >= '\u0BBE' && c <= '\u0BC2')
                || (c >= '\u0BC6' && c <= '\u0BC8')
                || (c >= '\u0BCA' && c <= '\u0BCD') || (c == '\u0BD7')
                || (c >= '\u0C01' && c <= '\u0C03') || (c >= '\u0C3E' && c <= '\u0C44')
                || (c >= '\u0C46' && c <= '\u0C48') || (c >= '\u0C4A' && c <= '\u0C4D')
                || (c >= '\u0C55' && c <= '\u0C56') || (c >= '\u0C82' && c <= '\u0C83')
                || (c >= '\u0CBE' && c <= '\u0CC4') || (c >= '\u0CC6' && c <= '\u0CC8')
                || (c >= '\u0CCA' && c <= '\u0CCD') || (c >= '\u0CD5' && c <= '\u0CD6')
                || (c >= '\u0D02' && c <= '\u0D03') || (c >= '\u0D3E' && c <= '\u0D43')
                || (c >= '\u0D46' && c <= '\u0D48')
                || (c >= '\u0D4A' && c <= '\u0D4D') || (c == '\u0D57')
                || (c == '\u0E31') || (c >= '\u0E34' && c <= '\u0E3A')
                || (c >= '\u0E47' && c <= '\u0E4E') || (c == '\u0EB1')
                || (c >= '\u0EB4' && c <= '\u0EB9') || (c >= '\u0EBB' && c <= '\u0EBC')
                || (c >= '\u0EC8' && c <= '\u0ECD')
                || (c >= '\u0F18' && c <= '\u0F19') || (c == '\u0F35')
                || (c == '\u0F37') || (c == '\u0F39') || (c == '\u0F3E')
                || (c == '\u0F3F') || (c >= '\u0F71' && c <= '\u0F84')
                || (c >= '\u0F86' && c <= '\u0F8B')
                || (c >= '\u0F90' && c <= '\u0F95') || (c == '\u0F97')
                || (c >= '\u0F99' && c <= '\u0FAD')
                || (c >= '\u0FB1' && c <= '\u0FB7') || (c == '\u0FB9')
                || (c >= '\u20D0' && c <= '\u20DC') || (c == '\u20E1')
                || (c >= '\u302A' && c <= '\u302F') || (c == '\u3099')
                || (c == '\u309A');
        }

        /// <summary>Extender characters accepted in trailing position only.</summary>
        private static bool IsExtenderChar(char c)
        {
            return (c == '\u00B7') || (c == '\u02D0') || (c == '\u02D1')
                || (c == '\u0387') || (c == '\u0640') || (c == '\u0E46')
                || (c == '\u0EC6') || (c == '\u3005')
                || (c >= '\u3031' && c <= '\u3035')
                || (c >= '\u309D' && c <= '\u309E')
                || (c >= '\u30FC' && c <= '\u30FE');
        }

        /// <summary>
        /// Tells whether <paramref name="str"/> is an NCName: non-empty, first character
        /// accepted by <see cref="IsNCNameStart"/> and every following character accepted
        /// by <see cref="IsNCNameTrail"/>.
        /// </summary>
        /// <param name="str">The candidate name; null is allowed.</param>
        /// <returns>false for null or empty input or on the first offending character; otherwise true.</returns>
        public static bool IsNCName(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return false;
            }

            if (!IsNCNameStart(str[0]))
            {
                return false;
            }

            for (int i = 1; i < str.Length; i++)
            {
                if (!IsNCNameTrail(str[i]))
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>
        /// Appends 'U' followed by the low 24 bits of <paramref name="c"/> as exactly
        /// six upper-case hexadecimal digits.
        /// </summary>
        private static void AppendUHexTo(StringBuilder sb, int c)
        {
            sb.Append('U');
            // Emit the nibbles from bit 20 down to bit 0.
            for (int shift = 20; shift >= 0; shift -= 4)
            {
                sb.Append(HEX_TABLE[(c >> shift) & 0xF]);
            }
        }

        /// <summary>
        /// Rewrites <paramref name="str"/> so that every character is acceptable at its
        /// position in an NCName. Characters that are not acceptable, and every
        /// surrogate pair, are replaced by 'U' plus the six-digit upper-case hexadecimal
        /// code point; all other characters are copied unchanged.
        /// </summary>
        /// <param name="str">The string to escape; must not be null.</param>
        /// <returns>The interned escaped string (empty input yields the empty string).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
        public static string EscapeName(string str)
        {
            if (str == null)
            {
                throw new ArgumentNullException("str");
            }

            StringBuilder sb = new StringBuilder(str.Length);
            for (int i = 0; i < str.Length; i++)
            {
                char c = str[i];
                // Only combine a high surrogate with a low surrogate that really follows
                // it. Previously any high surrogate consumed the next code unit, which
                // threw at the end of the string and produced a bogus code point (and
                // swallowed the next character) when the pair was broken.
                if (char.IsHighSurrogate(c) && i + 1 < str.Length && char.IsLowSurrogate(str[i + 1]))
                {
                    char next = str[++i];
                    AppendUHexTo(sb, (c << 10) + next + SURROGATE_OFFSET);
                }
                else if (i == 0 ? !IsNCNameStart(c) : !IsNCNameTrail(c))
                {
                    // Includes unpaired surrogates: neither table contains them, so they
                    // are escaped as their own 16-bit value.
                    AppendUHexTo(sb, c);
                }
                else
                {
                    sb.Append(c);
                }
            }
            return String.Intern(sb.ToString());
        }
        // ]NOCPP]
    }

}
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and Opera 3 | * Software ASA. 4 | * 5 | * You are granted a license to use, reproduce and create derivative works of 6 | * this document. 7 | */ 8 | 9 | #pragma warning disable 1591 // Missing XML comment 10 | #pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason' 11 | #pragma warning disable 1587 // XML comment is not placed on a valid element 12 | 13 | namespace HtmlParserSharp.Core 14 | { 15 | public sealed class NamedCharactersAccel 16 | { 17 | internal static readonly int[][] HILO_ACCEL = new int[][] { 18 | null, 19 | null, 20 | null, 21 | null, 22 | null, 23 | null, 24 | null, 25 | null, 26 | null, 27 | null, 28 | null, 29 | null, 30 | null, 31 | null, 32 | null, 33 | null, 34 | null, 35 | null, 36 | null, 37 | null, 38 | null, 39 | null, 40 | null, 41 | null, 42 | null, 43 | null, 44 | null, 45 | null, 46 | null, 47 | null, 48 | null, 49 | null, 50 | null, 51 | null, 52 | null, 53 | null, 54 | null, 55 | null, 56 | null, 57 | null, 58 | null, 59 | null, 60 | null, 61 | null, 62 | null, 63 | null, 64 | null, 65 | null, 66 | null, 67 | null, 68 | null, 69 | null, 70 | null, 71 | null, 72 | null, 73 | null, 74 | null, 75 | null, 76 | null, 77 | null, 78 | null, 79 | null, 80 | null, 81 | null, 82 | null, 83 | new int[] { 0, 0, 0, 0, 0, 0, 0, 12386493, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84 | 0, 0, 0, 0, 0, 40174181, 0, 0, 0, 0, 60162966, 0, 0, 0, 85 | 75367550, 0, 0, 0, 82183396, 0, 0, 0, 0, 0, 115148507, 0, 86 | 0, 135989275, 139397199, 0, 0, 0, 0, }, 87 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28770743, 0, 88 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89 | 82248935, 0, 0, 0, 0, 0, 115214046, 0, 0, 0, 139528272, 0, 90 | 0, 0, 0, }, 91 | null, 92 | new int[] { 0, 0, 0, 4980811, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 93 | 0, 
38470219, 0, 0, 0, 0, 0, 0, 0, 0, 64553944, 0, 0, 0, 0, 94 | 0, 0, 0, 92145022, 0, 0, 0, 0, 0, 0, 0, 0, 139593810, 0, 0, 95 | 0, 0, }, 96 | new int[] { 65536, 0, 0, 0, 0, 0, 0, 0, 13172937, 0, 0, 0, 0, 0, 25297282, 0, 97 | 0, 28901816, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 98 | 71500866, 0, 0, 0, 0, 82380008, 0, 0, 0, 0, 0, 0, 0, 0, 0, 99 | 0, 0, 0, 0, 0, }, 100 | null, 101 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 102 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 103 | 94897574, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 104 | new int[] { 0, 0, 2555943, 0, 0, 0, 0, 0, 0, 0, 15532269, 0, 0, 0, 0, 0, 0, 105 | 0, 31785444, 34406924, 0, 0, 0, 0, 0, 40895088, 0, 0, 0, 106 | 60228503, 0, 0, 0, 0, 0, 0, 0, 82445546, 0, 0, 0, 0, 0, 107 | 115279583, 0, 0, 136054812, 0, 0, 0, 0, 0, }, 108 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 109 | 0, 0, 40239718, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 110 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 111 | new int[] { 0, 0, 0, 5046349, 0, 0, 10944679, 0, 13238474, 0, 15597806, 112 | 16056565, 0, 20578618, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 113 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 114 | 0, 0, 0, 0, 0, 0, }, 115 | null, 116 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118 | 95225257, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 119 | new int[] { 196610, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 120 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 122 | new int[] { 0, 0, 0, 0, 8454273, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 123 | 0, 0, 0, 0, 0, 0, 0, 46072511, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 125 | new int[] { 0, 0, 2687016, 0, 0, 0, 0, 0, 13304011, 0, 0, 0, 0, 0, 0, 0, 0, 126 | 0, 31850982, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 127 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 128 | null, 129 | null, 130 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131 | 34472462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132 | 0, 0, 0, 95290798, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 133 | new int[] { 0, 0, 0, 5111886, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 134 | 34603535, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 135 | 0, 0, 0, 0, 105776718, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 136 | new int[] { 0, 0, 0, 0, 8585346, 0, 11075752, 0, 0, 0, 0, 16187638, 0, 0, 0, 137 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 138 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 139 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28508594, 0, 0, 140 | 0, 0, 0, 0, 0, 40305255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 141 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 142 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 143 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 144 | 95421871, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 145 | null, 146 | null, 147 | null, 148 | new int[] { 0, 0, 0, 5177423, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 149 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 151 | null, 152 | null, 153 | null, 154 | null, 155 | null, 156 | null, 157 | new int[] { 327684, 1900571, 2949162, 5374032, 8716420, 0, 11206826, 158 | 12517566, 13435084, 0, 15663343, 16515320, 19988785, 159 | 20644155, 25428355, 27197855, 0, 29163962, 31916519, 160 | 34734609, 36045347, 0, 0, 0, 40436328, 40960625, 41615994, 161 | 46596800, 54264627, 60556184, 64750554, 68879387, 71763012, 162 | 75826303, 77268122, 0, 81462490, 83952875, 92865919, 163 | 96142769, 105973327, 110167691, 0, 116917984, 121833283, 164 | 132253665, 136251421, 140707923, 0, 0, 144574620, 165 | 145361066, }, 166 | new int[] { 393222, 0, 0, 0, 0, 0, 11272364, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 167 | 0, 0, 36176423, 38535756, 0, 0, 0, 0, 41681532, 46727880, 168 | 0, 60687261, 0, 0, 71828552, 75891846, 0, 0, 0, 84411650, 169 | 0, 96404924, 0, 0, 0, 117376761, 121898820, 132319203, 170 | 136382496, 0, 0, 0, 0, 0, }, 171 | new int[] { 589831, 1966110, 3276846, 5505107, 8978566, 10420383, 11468973, 172 | 12583104, 13631694, 15139046, 15794416, 16711933, 20054322, 173 | 20840764, 25624965, 27263392, 0, 29360574, 32244200, 174 | 34931219, 36373033, 38601293, 39584348, 0, 40567402, 175 | 41091698, 42205821, 46858954, 54723389, 60818335, 65143773, 176 | 68944924, 71959625, 75957383, 77530268, 80938194, 81593564, 177 | 84739337, 92997002, 96863680, 106235474, 110233234, 0, 178 | 117704448, 122816325, 132515812, 136579106, 140773476, 179 | 142149753, 143001732, 144705695, 145492139, }, 180 | new int[] { 0, 0, 3342387, 0, 9044106, 0, 11534512, 0, 13697233, 0, 0, 0, 0, 181 | 0, 25690504, 0, 0, 0, 0, 0, 36438572, 38732366, 0, 0, 0, 182 | 41157236, 0, 46924492, 54788932, 61080481, 65209315, 0, 183 | 72025163, 0, 0, 0, 0, 85132558, 93062540, 96929223, 184 | 106563158, 0, 0, 118032133, 123012947, 132581351, 185 | 136775717, 140839013, 0, 143067271, 0, 145557677, }, 186 | new int[] { 0, 2162719, 3473460, 5636181, 0, 0, 0, 0, 0, 0, 0, 18809088, 187 | 20185395, 21299519, 0, 0, 0, 29622721, 0, 0, 0, 39256656, 188 | 39649885, 0, 0, 41288309, 42336901, 47448781, 55182149, 189 | 61342629, 65274852, 69010461, 72811596, 76219528, 77726880, 190 | 0, 0, 86967572, 93128077, 97650120, 106628699, 110560915, 191 | 0, 118490890, 123733846, 132646888, 0, 141232230, 192 | 142411898, 0, 144836769, 145688750, }, 193 | new int[] { 655370, 2228258, 3538998, 5701719, 9109643, 10485920, 11600049, 194 | 12648641, 13762770, 15204584, 15859954, 18874656, 20250933, 195 | 21365062, 25756041, 27328929, 28574132, 29688261, 32309741, 196 | 34996758, 36504109, 39322200, 39715422, 39912033, 40632940, 197 | 41353847, 42467975, 47514325, 55247691, 61473705, 65405925, 198 | 
69272606, 72877144, 76285068, 77857955, 81003732, 81659102, 199 | 87164208, 93193614, 97715667, 106759772, 110626456, 200 | 114296528, 118687505, 123864929, 132712425, 136906792, 201 | 141297772, 142477438, 143132808, 144902307, 145754288, }, 202 | new int[] { 786443, 0, 0, 0, 9240716, 0, 11665586, 0, 13893843, 0, 0, 0, 0, 203 | 0, 25887114, 0, 0, 0, 0, 0, 36635182, 0, 0, 0, 0, 0, 204 | 42599049, 0, 0, 0, 65733607, 0, 73008217, 0, 77989029, 0, 205 | 81724639, 87295283, 0, 98305492, 107021918, 0, 0, 0, 0, 0, 206 | 137037866, 0, 0, 0, 0, 0, }, 207 | new int[] { 0, 0, 3604535, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27394466, 0, 208 | 29753798, 32571886, 35258903, 0, 0, 0, 0, 0, 0, 0, 0, 209 | 55509836, 61604779, 0, 0, 0, 0, 0, 0, 81790176, 87557429, 210 | 93259151, 98502109, 107152994, 110888601, 0, 119015188, 211 | 124323683, 133498858, 137234476, 0, 0, 143263881, 0, 212 | 145819825, }, 213 | new int[] { 0, 0, 3866680, 6160472, 0, 10616993, 0, 12714178, 0, 0, 0, 0, 214 | 20316470, 0, 0, 27460003, 0, 31261127, 32637426, 35521051, 215 | 0, 0, 0, 39977570, 0, 0, 0, 48366294, 56492880, 62391213, 216 | 0, 69338146, 73073755, 0, 78316711, 0, 0, 0, 93980048, 217 | 98764256, 107218532, 111085213, 114362065, 119736089, 218 | 125241194, 133957622, 0, 0, 0, 143329419, 144967844, 219 | 145885362, }, 220 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 221 | 0, 0, 0, 0, 0, 0, 0, 62456761, 0, 69403683, 73139292, 0, 222 | 78382252, 0, 81855713, 87622969, 0, 98829796, 0, 0, 0, 0, 223 | 0, 0, 0, 0, 0, 0, 0, 0, }, 224 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 225 | 0, 0, 0, 0, 0, 48431843, 0, 0, 0, 0, 0, 76416141, 0, 0, 0, 226 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 227 | new int[] { 851981, 0, 4063292, 0, 9306254, 0, 0, 0, 0, 0, 0, 19005729, 0, 0, 228 | 0, 27525540, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42795659, 229 | 49152740, 56623967, 62587834, 66061292, 69600292, 73401437, 230 | 0, 0, 0, 0, 87950650, 94111131, 
99878373, 107546213, 231 | 112002720, 0, 119932708, 125306744, 0, 137496623, 232 | 141363309, 0, 143460492, 0, 0, }, 233 | new int[] { 917518, 0, 0, 0, 9502863, 0, 0, 0, 14155989, 0, 0, 19071267, 0, 234 | 0, 26083724, 0, 0, 0, 32702963, 0, 36700720, 0, 0, 0, 0, 0, 235 | 43057806, 0, 0, 0, 66520049, 0, 0, 0, 78841005, 81069269, 236 | 0, 88147263, 0, 99943925, 107873898, 112068270, 0, 237 | 120063783, 125831033, 0, 137693235, 0, 0, 143526030, 0, 0, }, 238 | new int[] { 983055, 0, 0, 0, 0, 0, 0, 0, 14483673, 0, 0, 0, 0, 0, 0, 0, 0, 0, 239 | 0, 0, 37093937, 0, 0, 0, 0, 0, 44565138, 49349359, 0, 0, 240 | 66651128, 69665831, 73860193, 0, 79561908, 0, 0, 88606018, 241 | 94176669, 0, 0, 0, 0, 120129321, 0, 0, 0, 141494382, 0, 242 | 143591567, 0, 0, }, 243 | new int[] { 1114128, 2293795, 4587583, 8257631, 9633938, 10813603, 11731123, 244 | 12845251, 14680286, 15270121, 15925491, 19661092, 20382007, 245 | 24969543, 26149263, 27656613, 28639669, 31392222, 32768500, 246 | 35586591, 37225015, 39387737, 39780959, 40043107, 40698477, 247 | 41419384, 44696233, 52495090, 57738081, 63439804, 66782202, 248 | 69927976, 73925736, 76809359, 79824063, 81134806, 81921250, 249 | 89785673, 94307742, 100795894, 107939439, 112330415, 250 | 114427602, 120588074, 126158721, 134416381, 137824310, 251 | 141559920, 142542975, 143853712, 145033381, 145950899, }, 252 | new int[] { 1179666, 0, 0, 0, 9699476, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26280336, 253 | 0, 0, 0, 0, 0, 38076985, 0, 0, 0, 0, 0, 45220523, 52560674, 254 | 0, 0, 67175420, 69993516, 0, 0, 79889603, 0, 0, 89916763, 255 | 94373280, 101451267, 108136048, 0, 114493139, 120784689, 256 | 126355334, 134481924, 138414136, 141625457, 142608512, 0, 257 | 0, 0, }, 258 | new int[] { 0, 0, 0, 0, 9896085, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 259 | 33292789, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67830786, 0, 0, 260 | 0, 80020676, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127403913, 0, 0, 0, 261 | 0, 0, 0, 0, }, 262 | new int[] { 1310739, 2359332, 4653127, 0, 0, 0, 12189876, 
0, 0, 0, 0, 0, 0, 263 | 0, 26345874, 28246439, 0, 31457760, 0, 35652128, 38142534, 264 | 0, 0, 0, 0, 0, 45351603, 52757283, 57869170, 63636425, 265 | 67961868, 71304237, 73991273, 0, 0, 0, 0, 90309981, 0, 266 | 101910029, 108988019, 114034355, 0, 120850228, 127469465, 267 | 135464965, 138741825, 141690994, 142739585, 143984788, 0, 268 | 0, }, 269 | new int[] { 1441813, 2424869, 4718664, 8388735, 10027160, 10879142, 12255419, 270 | 12976325, 14745825, 15401194, 15991028, 19857709, 20447544, 271 | 25035134, 26542483, 28377520, 28705206, 31588833, 33358333, 272 | 35783201, 38208071, 39453274, 39846496, 40108644, 40764014, 273 | 41484921, 45613749, 53216038, 58196852, 63898572, 68158478, 274 | 71369793, 74253418, 77005973, 80479430, 81265879, 81986787, 275 | 90965347, 94504353, 103679508, 109250176, 114165453, 276 | 114558676, 121243445, 127731610, 135727124, 138807366, 277 | 142018675, 142805123, 144115862, 145098918, 146016436, }, 278 | new int[] { 1572887, 0, 0, 0, 10092698, 0, 12320956, 0, 14811362, 0, 0, 279 | 19923248, 0, 25166207, 26739094, 0, 0, 0, 33423870, 0, 280 | 38273608, 0, 0, 0, 0, 0, 45744825, 0, 58262393, 64095184, 281 | 68355089, 0, 75170926, 0, 80610509, 0, 0, 91817325, 0, 282 | 104203823, 109512324, 0, 0, 121636667, 128059294, 0, 283 | 139069511, 0, 0, 0, 0, 0, }, 284 | new int[] { 1703961, 2490406, 4849737, 0, 10223771, 0, 0, 13107399, 15007971, 285 | 15466732, 0, 0, 20513081, 25231745, 26870169, 0, 0, 286 | 31654371, 34275839, 0, 38404681, 0, 0, 0, 40829551, 0, 287 | 45875899, 53609261, 59900794, 64226259, 68551700, 0, 0, 0, 288 | 80807119, 81331417, 0, 91948410, 94700963, 104465975, 289 | 109643400, 114230991, 114951893, 121702209, 131663779, 0, 290 | 139266123, 0, 0, 144246936, 145295527, 0, }, 291 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27132315, 0, 0, 0, 0, 292 | 0, 0, 39518811, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75302012, 0, 293 | 0, 0, 0, 92079484, 0, 105383483, 109708938, 0, 0, 0, 0, 0, 294 | 0, 0, 0, 144312474, 0, 0, }, 295 | 
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 296 | 0, 0, 0, 0, 46006973, 0, 60031891, 64291797, 0, 0, 0, 0, 0, 297 | 0, 0, 0, 0, 105711177, 0, 0, 0, 0, 131991514, 135923736, 298 | 139331662, 0, 0, 144378011, 0, 146147509, }, 299 | new int[] { 0, 0, 0, 0, 10354845, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 300 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68813847, 0, 0, 0, 0, 0, 301 | 0, 0, 0, 0, 0, 0, 0, 121767746, 0, 0, 0, 0, 0, 0, 0, 0, }, 302 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 303 | 0, 0, 0, 0, 0, 0, 60097429, 0, 0, 0, 0, 77137048, 0, 0, 0, 304 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 305 | new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 306 | 0, 0, 0, 0, 0, 0, 0, 64422870, 0, 0, 0, 0, 0, 0, 0, 0, 0, 307 | 0, 0, 0, 0, 0, 132122591, 0, 0, 142084216, 0, 0, 0, 0, }, }; 308 | 309 | } 310 | 311 | } 312 | -------------------------------------------------------------------------------- /HtmlParserSharp/Core/Portability.cs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008-2009 Mozilla Foundation 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
using System;
using HtmlParserSharp.Common;

#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// String helpers retained from the C++-portable version of the parser.
    /// TODO: Remove this
    /// </summary>
    public sealed class Portability
    {
        // Allocating methods

        /// <summary>
        /// Builds an interned local-name string from a slice of a character buffer.
        /// In C++, the refcount must be set up in such a way that calling
        /// releaseLocal on the return value balances the refcount set by this method.
        /// </summary>
        /// <param name="buf">Source characters.</param>
        /// <param name="offset">Index of the first character of the name in <paramref name="buf"/>.</param>
        /// <param name="length">Number of characters in the name.</param>
        /// <returns>The interned string.</returns>
        [Local]
        public static String NewLocalNameFromBuffer(char[] buf, int offset, int length)
        {
            String name = new String(buf, offset, length);
            return string.Intern(name);
        }

        // Comparison methods

        /// <summary>
        /// Tests whether <paramref name="local"/> is exactly the
        /// <paramref name="length"/> characters of <paramref name="buf"/> that
        /// start at <paramref name="offset"/>.
        /// </summary>
        /// <returns>true when lengths and all characters match; otherwise false.</returns>
        public static bool LocalEqualsBuffer([Local] string local, char[] buf, int offset, int length)
        {
            if (local.Length != length)
            {
                return false;
            }

            int index = 0;
            while (index < length)
            {
                if (local[index] != buf[offset + index])
                {
                    return false;
                }
                index++;
            }
            return true;
        }

        /// <summary>
        /// Tests whether <paramref name="str"/> starts with
        /// <paramref name="lowerCaseLiteral"/>, folding only 'A'..'Z' in
        /// <paramref name="str"/> to lower case before comparing.
        /// </summary>
        /// <returns>false when <paramref name="str"/> is null or shorter than the literal.</returns>
        public static bool LowerCaseLiteralIsPrefixOfIgnoreAsciiCaseString(string lowerCaseLiteral, string str)
        {
            if (str == null || lowerCaseLiteral.Length > str.Length)
            {
                return false;
            }
            return LeadingCharsMatchIgnoringAsciiCase(lowerCaseLiteral, str);
        }

        /// <summary>
        /// Tests whether <paramref name="str"/> equals
        /// <paramref name="lowerCaseLiteral"/>, folding only 'A'..'Z' in
        /// <paramref name="str"/> to lower case before comparing.
        /// </summary>
        /// <returns>false when <paramref name="str"/> is null or the lengths differ.</returns>
        public static bool LowerCaseLiteralEqualsIgnoreAsciiCaseString(string lowerCaseLiteral, string str)
        {
            if (str == null || lowerCaseLiteral.Length != str.Length)
            {
                return false;
            }
            return LeadingCharsMatchIgnoringAsciiCase(lowerCaseLiteral, str);
        }

        /// <summary>
        /// Compares the first lowerCaseLiteral.Length characters of both strings.
        /// The caller guarantees that <paramref name="str"/> is at least that long.
        /// </summary>
        private static bool LeadingCharsMatchIgnoringAsciiCase(string lowerCaseLiteral, string str)
        {
            for (int pos = 0; pos < lowerCaseLiteral.Length; pos++)
            {
                char candidate = str[pos];
                if (candidate >= 'A' && candidate <= 'Z')
                {
                    // ASCII upper and lower case letters are 0x20 apart.
                    candidate = (char)(candidate + 0x20);
                }
                if (lowerCaseLiteral[pos] != candidate)
                {
                    return false;
                }
            }
            return true;
        }
    }
}
hereby granted, free of charge, to any person obtaining a 7 | * copy of this software and associated documentation files (the "Software"), 8 | * to deal in the Software without restriction, including without limitation 9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 | * and/or sell copies of the Software, and to permit persons to whom the 11 | * Software is furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in 14 | * all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 | * DEALINGS IN THE SOFTWARE. 
using System;
using System.Diagnostics;
using HtmlParserSharp.Common;

#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// An element entry holding the DOM node of type <typeparamref name="T"/>
    /// together with its names, namespace and element flags.
    /// </summary>
    public sealed class StackNode<T>
    {
        private const string XhtmlNamespace = "http://www.w3.org/1999/xhtml";
        private const string SvgNamespace = "http://www.w3.org/2000/svg";
        private const string MathMlNamespace = "http://www.w3.org/1998/Math/MathML";

        readonly int flags;

        [Local]
        internal readonly string name;

        [Local]
        internal readonly string popName;

        [NsUri]
        internal readonly string ns;

        internal readonly T node;

        // Only used on the list of formatting elements
        internal HtmlAttributes attributes;

        // Every node starts with a single reference.
        private int refcount = 1;

        private readonly TaintableLocator locator;

        /// <summary>
        /// Gets the locator supplied at construction.
        /// </summary>
        public TaintableLocator Locator
        {
            get { return locator; }
        }

        /// <summary>
        /// Gets the raw flag bits of this node.
        /// </summary>
        public int Flags
        {
            get { return flags; }
        }

        /// <summary>
        /// Gets the dispatch group encoded in the bits selected by ElementName.GROUP_MASK.
        /// </summary>
        public DispatchGroup Group
        {
            get { return (DispatchGroup)(flags & ElementName.GROUP_MASK); }
        }

        /// <summary>Gets whether the SCOPING flag is set.</summary>
        public bool IsScoping
        {
            get { return HasFlag(ElementName.SCOPING); }
        }

        /// <summary>Gets whether the SPECIAL flag is set.</summary>
        public bool IsSpecial
        {
            get { return HasFlag(ElementName.SPECIAL); }
        }

        /// <summary>Gets whether the FOSTER_PARENTING flag is set.</summary>
        public bool IsFosterParenting
        {
            get { return HasFlag(ElementName.FOSTER_PARENTING); }
        }

        /// <summary>Gets whether the HTML_INTEGRATION_POINT flag is set.</summary>
        public bool IsHtmlIntegrationPoint
        {
            get { return HasFlag(ElementName.HTML_INTEGRATION_POINT); }
        }

        /// <summary>Gets whether the OPTIONAL_END_TAG flag is set.</summary>
        public bool IsOptionalEndTag
        {
            get { return HasFlag(ElementName.OPTIONAL_END_TAG); }
        }

        /// <summary>
        /// Constructor for copying. This doesn't take another StackNode
        /// because in C++ the caller is responsible for reobtaining the local names
        /// from another interner. All other constructors delegate to this one.
        /// </summary>
        internal StackNode(int flags, [NsUri] String ns, [Local] String name, T node,
            [Local] String popName, HtmlAttributes attributes, TaintableLocator locator)
        {
            this.flags = flags;
            this.ns = ns;
            this.name = name;
            this.popName = popName;
            this.node = node;
            this.attributes = attributes;
            this.locator = locator;
        }

        /// <summary>
        /// Short hand for well-known HTML elements.
        /// </summary>
        internal StackNode(ElementName elementName, T node, TaintableLocator locator)
            : this(elementName.Flags, XhtmlNamespace, elementName.name, node,
                   elementName.name, null, locator)
        {
            Debug.Assert(!elementName.IsCustom, "Don't use this constructor for custom elements.");
        }

        /// <summary>
        /// Constructor for HTML formatting elements.
        /// </summary>
        internal StackNode(ElementName elementName, T node, HtmlAttributes attributes,
            TaintableLocator locator)
            : this(elementName.Flags, XhtmlNamespace, elementName.name, node,
                   elementName.name, attributes, locator)
        {
            Debug.Assert(!elementName.IsCustom, "Don't use this constructor for custom elements.");
        }

        /// <summary>
        /// The common-case HTML constructor.
        /// </summary>
        internal StackNode(ElementName elementName, T node, [Local] string popName,
            TaintableLocator locator)
            : this(elementName.Flags, XhtmlNamespace, elementName.name, node,
                   popName, null, locator)
        {
        }

        /// <summary>
        /// Constructor for SVG elements. Note that the order of the arguments is
        /// what distinguishes this from the HTML constructor.
        /// </summary>
        internal StackNode(ElementName elementName, [Local] string popName, T node,
            TaintableLocator locator)
            : this(PrepareSvgFlags(elementName.Flags), SvgNamespace, elementName.name, node,
                   popName, null, locator)
        {
        }

        /// <summary>
        /// Constructor for MathML.
        /// </summary>
        internal StackNode(ElementName elementName, T node, [Local] string popName,
            bool markAsIntegrationPoint, TaintableLocator locator)
            : this(PrepareMathFlags(elementName.Flags, markAsIntegrationPoint), MathMlNamespace,
                   elementName.name, node, popName, null, locator)
        {
        }

        /// <summary>
        /// Tests whether any bit of <paramref name="mask"/> is set in the node flags.
        /// </summary>
        private bool HasFlag(int mask)
        {
            return (flags & mask) != 0;
        }

        /// <summary>
        /// Clears the FOSTER_PARENTING, SCOPING, SPECIAL and OPTIONAL_END_TAG bits.
        /// </summary>
        private static int ClearHtmlOnlyFlags(int flags)
        {
            int htmlOnly = ElementName.FOSTER_PARENTING | ElementName.SCOPING
                | ElementName.SPECIAL | ElementName.OPTIONAL_END_TAG;
            return flags & ~htmlOnly;
        }

        /// <summary>
        /// Derives the flags of an SVG element: the HTML-only bits are cleared, and
        /// SCOPING_AS_SVG turns on SCOPING, SPECIAL and HTML_INTEGRATION_POINT.
        /// </summary>
        private static int PrepareSvgFlags(int flags)
        {
            int result = ClearHtmlOnlyFlags(flags);
            if ((result & ElementName.SCOPING_AS_SVG) != 0)
            {
                result |= ElementName.SCOPING | ElementName.SPECIAL | ElementName.HTML_INTEGRATION_POINT;
            }
            return result;
        }

        /// <summary>
        /// Derives the flags of a MathML element: the HTML-only bits are cleared,
        /// SCOPING_AS_MATHML turns on SCOPING and SPECIAL, and the integration
        /// point bit is set on request.
        /// </summary>
        private static int PrepareMathFlags(int flags, bool markAsIntegrationPoint)
        {
            int result = ClearHtmlOnlyFlags(flags);
            if ((result & ElementName.SCOPING_AS_MATHML) != 0)
            {
                result |= ElementName.SCOPING | ElementName.SPECIAL;
            }
            if (markAsIntegrationPoint)
            {
                result |= ElementName.HTML_INTEGRATION_POINT;
            }
            return result;
        }

        /// <summary>
        /// Discards the attributes held by this node.
        /// </summary>
        public void DropAttributes()
        {
            attributes = null;
        }

        /// <summary>
        /// Returns the element name of this node.
        /// </summary>
        public override String ToString()
        {
            return name;
        }

        // TODO: probably we won't need these

        /// <summary>
        /// Increments the reference count.
        /// </summary>
        public void Retain()
        {
            refcount += 1;
        }

        /// <summary>
        /// Decrements the reference count. Nothing is freed when it reaches zero.
        /// </summary>
        public void Release()
        {
            refcount -= 1;
        }
    }
}
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Immutable holder of the tree builder state values passed to its constructor.
    /// </summary>
    public class StateSnapshot<T> : ITreeBuilderState<T> where T : class
    {
        private readonly StackNode<T>[] stack;
        private readonly StackNode<T>[] listOfActiveFormattingElements;
        private readonly T formPointer;
        private readonly T headPointer;
        private readonly T deepTreeSurrogateParent;
        private readonly InsertionMode mode;
        private readonly InsertionMode originalMode;
        private readonly bool framesetOk;
        private readonly bool needToDropLF;
        private readonly bool quirks;

        internal StateSnapshot(StackNode<T>[] stack,
            StackNode<T>[] listOfActiveFormattingElements,
            T formPointer,
            T headPointer,
            T deepTreeSurrogateParent,
            InsertionMode mode,
            InsertionMode originalMode,
            bool framesetOk,
            bool needToDropLF,
            bool quirks)
        {
            this.stack = stack;
            this.listOfActiveFormattingElements = listOfActiveFormattingElements;
            this.formPointer = formPointer;
            this.headPointer = headPointer;
            this.deepTreeSurrogateParent = deepTreeSurrogateParent;
            this.mode = mode;
            this.originalMode = originalMode;
            this.framesetOk = framesetOk;
            this.needToDropLF = needToDropLF;
            this.quirks = quirks;
        }

        /// <summary>
        /// Gets the stack.
        /// </summary>
        public StackNode<T>[] Stack
        {
            get { return stack; }
        }

        /// <summary>
        /// Gets the list of active formatting elements.
        /// </summary>
        public StackNode<T>[] ListOfActiveFormattingElements
        {
            get { return listOfActiveFormattingElements; }
        }

        /// <summary>
        /// Gets the form pointer.
        /// </summary>
        public T FormPointer
        {
            get { return formPointer; }
        }

        /// <summary>
        /// Gets the head pointer.
        /// </summary>
        public T HeadPointer
        {
            get { return headPointer; }
        }

        /// <summary>
        /// Gets the deep tree surrogate parent.
        /// </summary>
        public T DeepTreeSurrogateParent
        {
            get { return deepTreeSurrogateParent; }
        }

        /// <summary>
        /// Gets the mode.
        /// </summary>
        public InsertionMode Mode
        {
            get { return mode; }
        }

        /// <summary>
        /// Gets the original mode.
        /// </summary>
        public InsertionMode OriginalMode
        {
            get { return originalMode; }
        }

        /// <summary>
        /// Gets a value indicating whether the frameset-ok flag was set.
        /// </summary>
        public bool IsFramesetOk
        {
            get { return framesetOk; }
        }

        /// <summary>
        /// Gets a value indicating whether the need-to-drop-LF flag was set.
        /// </summary>
        public bool IsNeedToDropLF
        {
            get { return needToDropLF; }
        }

        /// <summary>
        /// Gets a value indicating whether the quirks flag was set.
        /// </summary>
        public bool IsQuirks
        {
            get { return quirks; }
        }
    }
}
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// A <see cref="Locator"/> that carries a one-way "tainted" marker.
    /// </summary>
    public class TaintableLocator : Locator
    {
        // Starts out false; MarkTainted is the only writer.
        private bool tainted;

        /// <summary>
        /// Creates an untainted locator from <paramref name="locator"/>.
        /// </summary>
        public TaintableLocator(ILocator locator)
            : base(locator)
        {
        }

        /// <summary>
        /// Gets a value indicating whether <see cref="MarkTainted"/> has been called.
        /// </summary>
        public bool IsTainted
        {
            get { return tainted; }
        }

        /// <summary>
        /// Marks this locator as tainted. There is no way to clear the mark.
        /// </summary>
        public void MarkTainted()
        {
            tainted = true;
        }
    }
}
using HtmlParserSharp.Common;

#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// Constants (and pseudo-enums) moved out of the TreeBuilder class.
    /// </summary>
    public class TreeBuilderConstants
    {
        /// <summary>
        /// Array version of U+FFFD.
        /// </summary>
        internal static readonly char[] REPLACEMENT_CHARACTER = { '\uFFFD' };

        /// <summary>
        /// Public identifiers of the HTML 4.0 and 4.01 doctypes.
        /// </summary>
        internal static readonly string[] HTML4_PUBLIC_IDS =
        {
            "-//W3C//DTD HTML 4.0 Frameset//EN",
            "-//W3C//DTD HTML 4.0 Transitional//EN",
            "-//W3C//DTD HTML 4.0//EN",
            "-//W3C//DTD HTML 4.01 Frameset//EN",
            "-//W3C//DTD HTML 4.01 Transitional//EN",
            "-//W3C//DTD HTML 4.01//EN"
        };

        /// <summary>
        /// Lower-case public identifier strings of legacy doctypes.
        /// NOTE(review): presumably matched as prefixes to select quirks mode;
        /// confirm against the TreeBuilder code that reads this table.
        /// </summary>
        internal static readonly string[] QUIRKY_PUBLIC_IDS =
        {
            "+//silmaril//dtd html pro v0r11 19970101//",
            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
            "-//as//dtd html 3.0 aswedit + extensions//",
            "-//ietf//dtd html 2.0 level 1//",
            "-//ietf//dtd html 2.0 level 2//",
            "-//ietf//dtd html 2.0 strict level 1//",
            "-//ietf//dtd html 2.0 strict level 2//",
            "-//ietf//dtd html 2.0 strict//",
            "-//ietf//dtd html 2.0//",
            "-//ietf//dtd html 2.1e//",
            "-//ietf//dtd html 3.0//",
            "-//ietf//dtd html 3.2 final//",
            "-//ietf//dtd html 3.2//",
            "-//ietf//dtd html 3//",
            "-//ietf//dtd html level 0//",
            "-//ietf//dtd html level 1//",
            "-//ietf//dtd html level 2//",
            "-//ietf//dtd html level 3//",
            "-//ietf//dtd html strict level 0//",
            "-//ietf//dtd html strict level 1//",
            "-//ietf//dtd html strict level 2//",
            "-//ietf//dtd html strict level 3//",
            "-//ietf//dtd html strict//",
            "-//ietf//dtd html//",
            "-//metrius//dtd metrius presentational//",
            "-//microsoft//dtd internet explorer 2.0 html strict//",
            "-//microsoft//dtd internet explorer 2.0 html//",
            "-//microsoft//dtd internet explorer 2.0 tables//",
            "-//microsoft//dtd internet explorer 3.0 html strict//",
            "-//microsoft//dtd internet explorer 3.0 html//",
            "-//microsoft//dtd internet explorer 3.0 tables//",
            "-//netscape comm. corp.//dtd html//",
            "-//netscape comm. corp.//dtd strict html//",
            "-//o'reilly and associates//dtd html 2.0//",
            "-//o'reilly and associates//dtd html extended 1.0//",
            "-//o'reilly and associates//dtd html extended relaxed 1.0//",
            "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
            "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
            "-//spyglass//dtd html 2.0 extended//",
            "-//sq//dtd html 2.0 hotmetal + extensions//",
            "-//sun microsystems corp.//dtd hotjava html//",
            "-//sun microsystems corp.//dtd hotjava strict html//",
            "-//w3c//dtd html 3 1995-03-24//",
            "-//w3c//dtd html 3.2 draft//",
            "-//w3c//dtd html 3.2 final//",
            "-//w3c//dtd html 3.2//",
            "-//w3c//dtd html 3.2s draft//",
            "-//w3c//dtd html 4.0 frameset//",
            "-//w3c//dtd html 4.0 transitional//",
            "-//w3c//dtd html experimental 19960712//",
            "-//w3c//dtd html experimental 970421//",
            "-//w3c//dtd w3 html//",
            "-//w3o//dtd w3 html 3.0//",
            "-//webtechs//dtd mozilla html 2.0//",
            "-//webtechs//dtd mozilla html//"
        };

        /// <summary>
        /// Sentinel equal to int.MaxValue.
        /// NOTE(review): presumably returned by stack searches that find nothing;
        /// confirm against the callers.
        /// </summary>
        internal const int NOT_FOUND_ON_STACK = int.MaxValue;

        /// <summary>
        /// The local name "html".
        /// </summary>
        [Local]
        internal const string HTML_LOCAL = "html";
    }
}
#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp.Core
{
    /// <summary>
    /// An UTF-16 buffer that knows the start and end indices of its unconsumed
    /// content.
    /// </summary>
    public sealed class UTF16Buffer
    {
        /// <summary>
        /// Gets the backing store of the buffer. May be larger than the logical content
        /// of this UTF16Buffer.
        /// </summary>
        public char[] Buffer { get; private set; }

        /// <summary>
        /// Gets or sets the index of the first unconsumed character in the backing buffer.
        /// </summary>
        public int Start { get; set; }

        /// <summary>
        /// Gets or sets the index of the slot immediately after the last character in the backing
        /// buffer that is part of the logical content of this UTF16Buffer.
        /// </summary>
        public int End { get; set; }

        /// <summary>
        /// Constructor for wrapping an existing UTF-16 code unit array.
        /// </summary>
        /// <param name="buffer">The backing buffer.</param>
        /// <param name="start">The index of the first character to consume.</param>
        /// <param name="end">The index immediately after the last character to consume.</param>
        public UTF16Buffer(char[] buffer, int start, int end)
        {
            Buffer = buffer;
            Start = start;
            End = end;
        }

        /// <summary>
        /// Gets a value indicating whether this instance has data left.
        /// </summary>
        /// <value>
        /// <c>true</c> if there's data left; otherwise, <c>false</c>.
        /// </value>
        public bool HasMore
        {
            get
            {
                return Start < End;
            }
        }

        /// <summary>
        /// Adjusts the start index to skip over the first character if it is a line
        /// feed and the previous character was a carriage return. Does nothing when
        /// the buffer has no unconsumed content.
        /// </summary>
        /// <param name="lastWasCR">Whether the previous character was a carriage return.</param>
        public void Adjust(bool lastWasCR)
        {
            // Only look at Buffer[Start] while it is inside the logical content.
            // Without the HasMore check a drained buffer would either index past
            // the array or swallow a stale '\n' that lies beyond End.
            if (lastWasCR && HasMore && Buffer[Start] == '\n')
            {
                Start++;
            }
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using HtmlParserSharp.Core;
using HtmlParserSharp.Common;


namespace HtmlParserSharp
{
    /// <summary>
    /// Generic parser that accepts any ITokenHandler implementation. It's up to the client to poll
    /// the resulting document from their implementation.
    /// </summary>
    public class Parser
    {
        /// <summary>Number of UTF-16 code units read from the input per chunk.</summary>
        private const int ReadBufferSize = 2048;

        /// <summary>Tokenizer that feeds the token handler supplied to the constructor.</summary>
        private readonly Tokenizer tokenizer;

        /// <summary>
        /// Creates a new TokenHandler and parses the html with it.
        /// </summary>
        /// <typeparam name="T">
        /// The token handler type to instantiate; must have a public parameterless constructor.
        /// </typeparam>
        /// <param name="html">
        /// The HTML.
        /// </param>
        /// <returns>
        /// The populated TokenHandler
        /// </returns>
        public static T Create<T>(string html) where T : ITokenHandler, new()
        {
            T tokenHandler = new T();
            var parser = new Parser(tokenHandler);
            parser.Parse(html);
            return tokenHandler;
        }

        /// <summary>
        /// Creates a parser whose tokenizer reports to the given token handler.
        /// </summary>
        /// <param name="treeBuilder">The handler that receives the tokens.</param>
        public Parser(ITokenHandler treeBuilder)
        {
            // The tokenizer must exist before Parse is called; it is bound to the
            // caller's handler so the caller can read the result from it afterwards.
            tokenizer = new Tokenizer(treeBuilder, false);
        }

        /// <summary>
        /// Tokenizes the given HTML string and forwards the tokens to the handler
        /// passed to the constructor.
        /// </summary>
        /// <param name="html">The HTML source text.</param>
        public void Parse(string html)
        {
            using (var reader = new StringReader(html))
            {
                Tokenize(reader);
            }
        }

        /// <summary>
        /// Reads the input chunk by chunk and drives the tokenizer over it,
        /// dropping a leading byte order mark (U+FEFF) if present.
        /// </summary>
        /// <param name="reader">The source of UTF-16 code units.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        private void Tokenize(TextReader reader)
        {
            if (reader == null)
            {
                // The single-string constructor takes the parameter name, not a message.
                throw new ArgumentNullException("reader");
            }

            tokenizer.Start();

            try
            {
                char[] buffer = new char[ReadBufferSize];
                UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
                bool lastWasCR = false;
                int len = reader.Read(buffer, 0, buffer.Length);
                if (len != 0)
                {
                    int streamOffset = 0;
                    int offset = 0;
                    int length = len;
                    if (buffer[0] == '\uFEFF')
                    {
                        // Swallow the BOM: start one character in and shift the base offset.
                        streamOffset = -1;
                        offset = 1;
                        length--;
                    }
                    if (length > 0)
                    {
                        tokenizer.SetTransitionBaseOffset(streamOffset);
                        lastWasCR = TokenizeChunk(bufr, offset, offset + length, lastWasCR);
                    }
                    streamOffset = length;
                    while ((len = reader.Read(buffer, 0, buffer.Length)) != 0)
                    {
                        tokenizer.SetTransitionBaseOffset(streamOffset);
                        lastWasCR = TokenizeChunk(bufr, 0, len, lastWasCR);
                        streamOffset += len;
                    }
                }
                tokenizer.Eof();
            }
            finally
            {
                tokenizer.End();
            }
        }

        /// <summary>
        /// Runs the tokenizer over buffer positions [start, end) until they are consumed.
        /// </summary>
        /// <param name="bufr">The buffer wrapper shared across chunks.</param>
        /// <param name="start">Index of the first character of the chunk.</param>
        /// <param name="end">Index immediately after the last character of the chunk.</param>
        /// <param name="lastWasCR">Whether the previous chunk ended with a carriage return.</param>
        /// <returns>The value of the last <c>TokenizeBuffer</c> call, carried into the next chunk.</returns>
        private bool TokenizeChunk(UTF16Buffer bufr, int start, int end, bool lastWasCR)
        {
            bufr.Start = start;
            bufr.End = end;
            while (bufr.HasMore)
            {
                // Skip a line feed that directly follows a carriage return.
                bufr.Adjust(lastWasCR);
                lastWasCR = false;
                if (bufr.HasMore)
                {
                    lastWasCR = tokenizer.TokenizeBuffer(bufr);
                }
            }
            return lastWasCR;
        }
    }
}
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Xml.Linq;

namespace HtmlParserSharp
{
    /// <summary>
    /// Sample entry point used for manual testing and benchmarks.
    /// </summary>
    public class Program
    {
        private static readonly SimpleHtmlParser parser = new SimpleHtmlParser();

        /// <summary>
        /// Lazily yields the same sample file ten times so the benchmark has
        /// several runs to compare.
        /// </summary>
        private static IEnumerable<FileInfo> GetTestFiles()
        {
            //DirectoryInfo dir = new DirectoryInfo("SampleData");
            //return dir.GetFiles("*.html", SearchOption.AllDirectories);
            return Enumerable.Range(0, 10)
                .Select(_ => new FileInfo(Path.Combine("SampleData", "test.html")));
        }

        /// <summary>
        /// Parses every test file, saves the result to test.xml, reloads it with
        /// XDocument, and prints timing statistics for both steps.
        /// </summary>
        public static void Main(string[] args)
        {
            //var fragment1 = parser.ParseStringFragment("foo", "");
            //var fragment2 = parser.ParseStringFragment("foo", "table");

            Stopwatch timer = new Stopwatch();

            Console.Write("Parsing ... ");
            var runs = GetTestFiles().Select(file =>
            {
                timer.Restart();
                var doc = parser.Parse(file.FullName);
                timer.Stop();
                TimeSpan parseTime = timer.Elapsed;

                doc.Save("test.xml");

                timer.Restart();
                XDocument.Load("test.xml");
                timer.Stop();
                TimeSpan reparseTime = timer.Elapsed;

                return new { Document = doc, Time = parseTime, ReparseTime = reparseTime };
            }).ToList();

            Console.WriteLine("done.");
            Console.WriteLine("Found " + runs.Count + " documents.");
            Console.WriteLine();
            PrintStatistics(runs.Select(run => run.Time).ToList());

            Console.WriteLine();
            Console.WriteLine("=== Reparsing (XDocument) ===");

            // note: reparsing using XmlDocument instead gives similar results

            PrintStatistics(runs.Select(run => run.ReparseTime).ToList());
            Console.ReadKey();
        }

        /// <summary>
        /// Prints total, first, average, average-without-first, minimum and
        /// maximum of the given durations, in that order.
        /// </summary>
        private static void PrintStatistics(List<TimeSpan> times)
        {
            TimeSpan total = TimeSpan.Zero;
            foreach (TimeSpan time in times)
            {
                total += time;
            }

            PrintTime("Total", total);
            PrintTime("First", times.First());
            PrintTime("Average", TimeSpan.FromTicks(total.Ticks / times.Count));
            PrintTime("Average (without first)", TimeSpan.FromTicks((total.Ticks - times.First().Ticks) / (times.Count - 1)));
            PrintTime("Min", times.Min());
            PrintTime("Max", times.Max());
        }

        /// <summary>
        /// Writes a caption followed by the duration and its value in milliseconds.
        /// </summary>
        private static void PrintTime(string caption, TimeSpan time)
        {
            Console.WriteLine("{0}:\n {1} ({2} ms)", caption, time.ToString(), time.TotalMilliseconds);
        }
    }
}
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General information about an assembly is controlled through the following
// attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("HtmlParser")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("HtmlParser")]
[assembly: AssemblyCopyright("Copyright © 2012")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to "false" makes the types in this assembly invisible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to "true" on that type.
[assembly: ComVisible(false)]

// The following GUID determines the ID of the type library if this project is exposed to COM
[assembly: Guid("dd2311df-4aa1-4f09-8fff-751cd048e652")]

// Version information for an assembly consists of the following four values:
//
//      Major version
//      Minor version
//      Build number
//      Revision
//
// You can specify all the values, or accept the default build and revision numbers
// by entering "*":
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
using System;
using System.IO;
using System.Xml;
using HtmlParserSharp.Core;

namespace HtmlParserSharp
{
    /// <summary>
    /// This is a simple API for the parsing process.
    /// Part of this is a port of the nu.validator.htmlparser.io.Driver class.
    /// The parser currently ignores the encoding in the html source and parses everything as UTF-8.
    /// </summary>
    public class SimpleHtmlParser
    {
        /// <summary>Number of UTF-16 code units read from the input per chunk.</summary>
        private const int ReadBufferSize = 2048;

        private Tokenizer tokenizer;
        private XmlTreeBuilder treeBuilder;

        /// <summary>
        /// Parses a string as an HTML fragment.
        /// </summary>
        /// <param name="str">The HTML source text.</param>
        /// <param name="fragmentContext">The value handed to the tree builder's <c>SetFragmentContext</c>.</param>
        /// <returns>The document fragment produced by the tree builder.</returns>
        public XmlDocumentFragment ParseStringFragment(string str, string fragmentContext)
        {
            using (var reader = new StringReader(str))
            {
                return ParseFragment(reader, fragmentContext);
            }
        }

        /// <summary>
        /// Parses a string as a complete HTML document.
        /// </summary>
        /// <param name="str">The HTML source text.</param>
        /// <returns>The document produced by the tree builder.</returns>
        public XmlDocument ParseString(string str)
        {
            using (var reader = new StringReader(str))
            {
                return Parse(reader);
            }
        }

        /// <summary>
        /// Parses the file at the given path as a complete HTML document.
        /// </summary>
        /// <param name="path">Path of the file, opened with <see cref="StreamReader"/>.</param>
        /// <returns>The document produced by the tree builder.</returns>
        public XmlDocument Parse(string path)
        {
            using (var reader = new StreamReader(path))
            {
                return Parse(reader);
            }
        }

        /// <summary>
        /// Parses everything the reader yields as a complete HTML document.
        /// </summary>
        /// <param name="reader">The source of UTF-16 code units.</param>
        /// <returns>The document produced by the tree builder.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public XmlDocument Parse(TextReader reader)
        {
            Reset();
            Tokenize(reader);
            return treeBuilder.Document;
        }

        /// <summary>
        /// Parses everything the reader yields as an HTML fragment.
        /// </summary>
        /// <param name="reader">The source of UTF-16 code units.</param>
        /// <param name="fragmentContext">The value handed to the tree builder's <c>SetFragmentContext</c>.</param>
        /// <returns>The document fragment produced by the tree builder.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public XmlDocumentFragment ParseFragment(TextReader reader, string fragmentContext)
        {
            Reset();
            treeBuilder.SetFragmentContext(fragmentContext);
            Tokenize(reader);
            return treeBuilder.getDocumentFragment();
        }

        /// <summary>
        /// Creates a fresh tree builder and tokenizer pair for the next parse,
        /// with comment reporting switched off.
        /// </summary>
        private void Reset()
        {
            treeBuilder = new XmlTreeBuilder();
            tokenizer = new Tokenizer(treeBuilder, false);
            treeBuilder.WantsComments = false;

            // optionally: report errors and more

            //treeBuilder.ErrorEvent +=
            //    (sender, a) =>
            //    {
            //        ILocator loc = tokenizer as ILocator;
            //        Console.WriteLine("{0}: {1} (Line: {2})", a.IsWarning ? "Warning" : "Error", a.Message, loc.LineNumber);
            //    };
            //treeBuilder.DocumentModeDetected += (sender, a) => Console.WriteLine("Document mode: " + a.Mode.ToString());
            //tokenizer.EncodingDeclared += (sender, a) => Console.WriteLine("Encoding: " + a.Encoding + " (currently ignored)");
        }

        /// <summary>
        /// Reads the input chunk by chunk and drives the tokenizer over it,
        /// dropping a leading byte order mark (U+FEFF) if present.
        /// </summary>
        /// <param name="reader">The source of UTF-16 code units.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        private void Tokenize(TextReader reader)
        {
            if (reader == null)
            {
                // The single-string constructor takes the parameter name, not a message.
                throw new ArgumentNullException("reader");
            }

            tokenizer.Start();

            try
            {
                char[] buffer = new char[ReadBufferSize];
                UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0);
                bool lastWasCR = false;
                int len = reader.Read(buffer, 0, buffer.Length);
                if (len != 0)
                {
                    int streamOffset = 0;
                    int offset = 0;
                    int length = len;
                    if (buffer[0] == '\uFEFF')
                    {
                        // Swallow the BOM: start one character in and shift the base offset.
                        streamOffset = -1;
                        offset = 1;
                        length--;
                    }
                    if (length > 0)
                    {
                        tokenizer.SetTransitionBaseOffset(streamOffset);
                        lastWasCR = TokenizeChunk(bufr, offset, offset + length, lastWasCR);
                    }
                    streamOffset = length;
                    while ((len = reader.Read(buffer, 0, buffer.Length)) != 0)
                    {
                        tokenizer.SetTransitionBaseOffset(streamOffset);
                        lastWasCR = TokenizeChunk(bufr, 0, len, lastWasCR);
                        streamOffset += len;
                    }
                }
                tokenizer.Eof();
            }
            finally
            {
                tokenizer.End();
            }
        }

        /// <summary>
        /// Runs the tokenizer over buffer positions [start, end) until they are consumed.
        /// </summary>
        /// <param name="bufr">The buffer wrapper shared across chunks.</param>
        /// <param name="start">Index of the first character of the chunk.</param>
        /// <param name="end">Index immediately after the last character of the chunk.</param>
        /// <param name="lastWasCR">Whether the previous chunk ended with a carriage return.</param>
        /// <returns>The value of the last <c>TokenizeBuffer</c> call, carried into the next chunk.</returns>
        private bool TokenizeChunk(UTF16Buffer bufr, int start, int end, bool lastWasCR)
        {
            bufr.Start = start;
            bufr.End = end;
            while (bufr.HasMore)
            {
                // Skip a line feed that directly follows a carriage return.
                bufr.Adjust(lastWasCR);
                lastWasCR = false;
                if (bufr.HasMore)
                {
                    lastWasCR = tokenizer.TokenizeBuffer(bufr);
                }
            }
            return lastWasCR;
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using HtmlParserSharp.Common;
using HtmlParserSharp.Core;

#pragma warning disable 1591 // Missing XML comment
#pragma warning disable 1570 // XML comment on 'construct' has badly formed XML — 'reason'
#pragma warning disable 1587 // XML comment is not placed on a valid element

namespace HtmlParserSharp
{
    /// <summary>
    /// Tree builder glue that materialises the parse result as a System.Xml DOM.
    /// </summary>
    public class XmlTreeBuilder : CoalescingTreeBuilder<XmlElement>
    {
        /// <summary>
        /// The document currently being built.
        /// </summary>
        private XmlDocument document;

        /// <summary>
        /// Copies onto <paramref name="element"/> every attribute it does not already carry.
        /// </summary>
        protected override void AddAttributesToElement(XmlElement element, HtmlAttributes attributes)
        {
            for (int index = 0; index < attributes.Length; index++)
            {
                string localName = attributes.GetLocalName(index);
                string namespaceUri = attributes.GetURI(index);
                if (element.HasAttribute(localName, namespaceUri))
                {
                    continue;
                }
                element.SetAttribute(localName, namespaceUri, attributes.GetValue(index));
            }
        }

        /// <summary>
        /// Appends text to <paramref name="parent"/>, merging it into a trailing text node when there is one.
        /// </summary>
        protected override void AppendCharacters(XmlElement parent, string text)
        {
            if (!TryExtendTextNode(parent.LastChild, text))
            {
                parent.AppendChild(document.CreateTextNode(text));
            }
        }

        /// <summary>
        /// Moves all children of <paramref name="oldParent"/> to the end of <paramref name="newParent"/>.
        /// </summary>
        protected override void AppendChildrenToNewParent(XmlElement oldParent, XmlElement newParent)
        {
            XmlNode child = oldParent.FirstChild;
            while (child != null)
            {
                newParent.AppendChild(child);
                child = oldParent.FirstChild;
            }
        }

        /// <summary>
        /// Creates a document type node and appends it to the document. Empty
        /// identifiers are passed on as null.
        /// </summary>
        protected override void AppendDoctypeToDocument(string name, string publicIdentifier, string systemIdentifier)
        {
            // TODO: this method was not there originally. is it correct?
            // The resolver is cleared while the doctype node is created and set to a
            // new XmlUrlResolver afterwards.
            document.XmlResolver = null;

            string publicId = string.IsNullOrEmpty(publicIdentifier) ? null : publicIdentifier;
            string systemId = string.IsNullOrEmpty(systemIdentifier) ? null : systemIdentifier;

            XmlDocumentType doctype = document.CreateDocumentType(name, publicId, systemId, null);
            document.XmlResolver = new XmlUrlResolver();
            document.AppendChild(doctype);
        }

        /// <summary>
        /// Appends a comment node to <paramref name="parent"/>.
        /// </summary>
        protected override void AppendComment(XmlElement parent, string comment)
        {
            XmlComment node = document.CreateComment(comment);
            parent.AppendChild(node);
        }

        /// <summary>
        /// Appends a comment node directly to the document.
        /// </summary>
        protected override void AppendCommentToDocument(string comment)
        {
            XmlComment node = document.CreateComment(comment);
            document.AppendChild(node);
        }

        /// <summary>
        /// Creates a detached element in the given namespace carrying the given attributes.
        /// </summary>
        protected override XmlElement CreateElement(string ns, string name, HtmlAttributes attributes)
        {
            XmlElement created = document.CreateElement(name, ns);
            int count = attributes.Length;
            int index = 0;
            while (index < count)
            {
                created.SetAttribute(attributes.GetLocalName(index), attributes.GetURI(index), attributes.GetValue(index));
                if (attributes.GetType(index) == "ID")
                {
                    //created.setIdAttributeNS(null, attributes.GetLocalName(index), true); // FIXME
                }
                index++;
            }
            return created;
        }

        /// <summary>
        /// Creates the XHTML-namespace html element, copies the attributes onto it
        /// and appends it to the document.
        /// </summary>
        protected override XmlElement CreateHtmlElementSetAsRoot(HtmlAttributes attributes)
        {
            XmlElement root = document.CreateElement("html", "http://www.w3.org/1999/xhtml");
            int count = attributes.Length;
            for (int index = 0; index < count; index++)
            {
                root.SetAttribute(attributes.GetLocalName(index), attributes.GetURI(index), attributes.GetValue(index));
            }
            document.AppendChild(root);
            return root;
        }

        /// <summary>
        /// Appends <paramref name="child"/> as the last child of <paramref name="newParent"/>.
        /// </summary>
        protected override void AppendElement(XmlElement child, XmlElement newParent)
        {
            newParent.AppendChild(child);
        }

        /// <summary>
        /// Reports whether <paramref name="element"/> has any child nodes.
        /// </summary>
        protected override bool HasChildren(XmlElement element)
        {
            return element.HasChildNodes;
        }

        /// <summary>
        /// Creates an element exactly like the three-argument overload; the form
        /// pointer is currently not recorded.
        /// </summary>
        protected override XmlElement CreateElement(string ns, string name, HtmlAttributes attributes, XmlElement form)
        {
            //rv.setUserData("nu.validator.form-pointer", form, null); // TODO
            return CreateElement(ns, name, attributes);
        }

        /// <summary>
        /// Begins a new parse by creating an empty document.
        /// </summary>
        protected override void Start(bool fragment)
        {
            // implementation.createDocument(null, null, null);
            // TODO: fragment?
            document = new XmlDocument();
        }

        /// <summary>
        /// Receives the detected document mode; nothing is stored at present.
        /// </summary>
        protected override void ReceiveDocumentMode(DocumentMode mode, string publicIdentifier,
            string systemIdentifier, bool html4SpecificAdditionalErrorChecks)
        {
            //document.setUserData("nu.validator.document-mode", mode, null); // TODO
        }

        /// <summary>
        /// Gets the document being built.
        /// </summary>
        internal XmlDocument Document
        {
            get
            {
                return document;
            }
        }

        /// <summary>
        /// Moves the children of the document's first child into a new fragment,
        /// releases the document reference and returns the fragment.
        /// </summary>
        /// <returns>The document fragment</returns>
        internal XmlDocumentFragment getDocumentFragment()
        {
            XmlDocumentFragment fragment = document.CreateDocumentFragment();
            XmlNode root = document.FirstChild;
            for (XmlNode child = root.FirstChild; child != null; child = root.FirstChild)
            {
                fragment.AppendChild(child);
            }
            document = null;
            return fragment;
        }

        /// <summary>
        /// Inserts text immediately before <paramref name="table"/> when the table has a parent,
        /// otherwise at the end of <paramref name="stackParent"/>; in both cases an adjacent
        /// text node is extended instead of creating a new one.
        /// </summary>
        protected override void InsertFosterParentedCharacters(string text, XmlElement table, XmlElement stackParent)
        {
            XmlNode tableParent = table.ParentNode;
            if (tableParent == null)
            {
                if (!TryExtendTextNode(stackParent.LastChild, text))
                {
                    stackParent.AppendChild(document.CreateTextNode(text));
                }
                return;
            }

            // always an element if not null
            if (!TryExtendTextNode(table.PreviousSibling, text))
            {
                tableParent.InsertBefore(document.CreateTextNode(text), table);
            }
        }

        /// <summary>
        /// Inserts <paramref name="child"/> before <paramref name="table"/> when the table has a parent,
        /// otherwise appends it to <paramref name="stackParent"/>.
        /// </summary>
        protected override void InsertFosterParentedChild(XmlElement child, XmlElement table, XmlElement stackParent)
        {
            XmlNode tableParent = table.ParentNode;
            if (tableParent == null)
            {
                stackParent.AppendChild(child);
                return;
            }

            // always an element if not null
            tableParent.InsertBefore(child, table);
        }

        /// <summary>
        /// Removes <paramref name="element"/> from its parent, if it has one.
        /// </summary>
        protected override void DetachFromParent(XmlElement element)
        {
            XmlNode owner = element.ParentNode;
            if (owner == null)
            {
                return;
            }
            owner.RemoveChild(element);
        }

        /// <summary>
        /// Appends <paramref name="text"/> to <paramref name="candidate"/> when it is a text node.
        /// </summary>
        /// <returns>true when the text was merged; false when the caller must create a new node.</returns>
        private static bool TryExtendTextNode(XmlNode candidate, string text)
        {
            if (candidate == null || candidate.NodeType != XmlNodeType.Text)
            {
                return false;
            }
            XmlText existing = (XmlText)candidate;
            existing.Data += text;
            return true;
        }
    }
}
20 | * 21 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 24 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 26 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 27 | * DEALINGS IN THE SOFTWARE. 28 | */ 29 | 30 | The following license is for the WHATWG spec from which the named character 31 | data was extracted. 32 | 33 | /* 34 | * Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and Opera 35 | * Software ASA. 36 | * 37 | * You are granted a license to use, reproduce and create derivative works of 38 | * this document. 39 | */ 40 | 41 | The following license is for the rewindable input stream. 42 | 43 | /* 44 | * Copyright (c) 2001-2003 Thai Open Source Software Center Ltd 45 | * All rights reserved. 46 | * 47 | * Redistribution and use in source and binary forms, with or without 48 | * modification, are permitted provided that the following conditions 49 | * are met: 50 | * 51 | * * Redistributions of source code must retain the above copyright 52 | * notice, this list of conditions and the following disclaimer. 53 | * * Redistributions in binary form must reproduce the above 54 | * copyright notice, this list of conditions and the following 55 | * disclaimer in the documentation and/or other materials provided 56 | * with the distribution. 57 | * * Neither the name of the Thai Open Source Software Center Ltd nor 58 | * the names of its contributors may be used to endorse or promote 59 | * products derived from this software without specific prior 60 | * written permission. 
61 | * 62 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 63 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 64 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 65 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 66 | * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 67 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 68 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 69 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 70 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 71 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 72 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 73 | * POSSIBILITY OF SUCH DAMAGE. 74 | */ 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HtmlParserSharp 2 | =============== 3 | 4 | This is a manual C# port of the [Validator.nu HTML Parser](http://about.validator.nu/htmlparser/), a HTML5 parser originally written in Java and (compiled to C++ using the Google Web Toolkit) used by Mozilla's Gecko rendering engine. This port is current as of Version 1.4. 5 | 6 | The code is DOM-agnostic and provides an interface via `TreeBuilder` for creating a DOM from its output using any object model. Included in the code base is a `TreeBuilder` that produces a DOM using System.Xml. 7 | 8 | Status 9 | ------ 10 | 11 | This port was created by Patrick Reisert based on Validator.nu 1.3. It was adopted by James Treworgy in September, 2012 to use in [CsQuery](https://github.com/jamietre/CsQuery). However, since a general-purpose HTML5 parser is extraordinarily useful, I've kept it as an independent project. It's included as a submodule in CsQuery to simplify distribution. 
It may become an external dependency at some point if development of the parser substantially diverges from CsQuery in the future. 12 | 13 | --------------------------------------------------------------------------------