├── Test ├── Test References │ └── NSoup.accessor ├── htmltests │ ├── xml-test.xml │ ├── meta-charset-3.html │ ├── thumb.jpg │ ├── baidu-cn-home.html │ ├── baidu-variant.html │ ├── meta-charset-1.html │ ├── meta-charset-2.html │ └── README ├── NSoupTest.snk ├── TextUtil.cs ├── Properties │ └── AssemblyInfo.cs ├── Select │ └── QueryParserTest.cs ├── Integration │ ├── Benchmark.cs │ └── ParseTest.cs ├── Parser │ ├── TagTest.cs │ ├── XmlTreeBuilderTest.cs │ ├── TokenQueueTests.cs │ └── AttributeParseTest.cs ├── Helper │ ├── DataUtilTest.cs │ ├── StringUtilTest.cs │ └── HttpConnectionTest.cs ├── Nodes │ ├── TextNodeTest.cs │ ├── EntitiesTest.cs │ └── DocumentTest.cs └── Test.csproj ├── NSoup.suo ├── NSoup.v11.suo ├── NSoup ├── NSoup.snk ├── NSoup.csproj.user ├── HttpStatusException.cs ├── UnsupportedMimeTypeException.cs ├── Parse │ ├── ParseErrorList.cs │ ├── ParseError.cs │ ├── TreeBuilder.cs │ ├── XmlTreeBuilder.cs │ ├── Parser.cs │ └── Token.cs ├── Select │ ├── NodeVisitor.cs │ ├── NodeTraversor.cs │ ├── Collector.cs │ ├── CombiningEvaluator.cs │ ├── StructuralEvaluator.cs │ └── Selector.cs ├── Nodes │ ├── entities-base.txt │ ├── Comment.cs │ ├── DocumentType.cs │ ├── XmlDeclaration.cs │ ├── DataNode.cs │ ├── Attribute.cs │ ├── TextNode.cs │ └── Entities.cs ├── Properties │ └── AssemblyInfo.cs ├── Helper │ ├── DescendableLinkedList.cs │ ├── StringUtil.cs │ ├── DataUtil.cs │ └── LinkedHashSet.cs ├── NSoup.csproj └── Safety │ └── Cleaner.cs ├── LocalTestRun.testrunconfig ├── README.md ├── LICENSE ├── .gitignore ├── NSoup.sln └── NSoup.vsmdi /Test/Test References/NSoup.accessor: -------------------------------------------------------------------------------- 1 | NSoup.dll 2 | Desktop 3 | -------------------------------------------------------------------------------- /NSoup.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/NSoup.suo 
-------------------------------------------------------------------------------- /NSoup.v11.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/NSoup.v11.suo -------------------------------------------------------------------------------- /NSoup/NSoup.snk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/NSoup/NSoup.snk -------------------------------------------------------------------------------- /Test/htmltests/xml-test.xml: -------------------------------------------------------------------------------- 1 | OneTwoThree 2 | -------------------------------------------------------------------------------- /Test/NSoupTest.snk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/Test/NSoupTest.snk -------------------------------------------------------------------------------- /Test/htmltests/meta-charset-3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 新 4 | -------------------------------------------------------------------------------- /Test/htmltests/thumb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/Test/htmltests/thumb.jpg -------------------------------------------------------------------------------- /Test/htmltests/baidu-cn-home.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/Test/htmltests/baidu-cn-home.html -------------------------------------------------------------------------------- /Test/htmltests/baidu-variant.html: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GeReV/NSoup/HEAD/Test/htmltests/baidu-variant.html -------------------------------------------------------------------------------- /Test/htmltests/meta-charset-1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/Test/htmltests/meta-charset-1.html -------------------------------------------------------------------------------- /Test/htmltests/meta-charset-2.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeReV/NSoup/HEAD/Test/htmltests/meta-charset-2.html -------------------------------------------------------------------------------- /NSoup/NSoup.csproj.user: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LocalTestRun.testrunconfig: -------------------------------------------------------------------------------- 1 | 2 | 3 | This is a default test run configuration for a local test run. 
4 | 5 | -------------------------------------------------------------------------------- /Test/TextUtil.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Text.RegularExpressions; 6 | 7 | namespace Test 8 | { 9 | /// 10 | /// Text utils to ease testing 11 | /// 12 | /// 16 | public class TextUtil 17 | { 18 | public static string StripNewLines(string s) 19 | { 20 | return Regex.Replace(s, "(?:\\n\\s*)", string.Empty); 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **NSoup is currently unmaintained.** 2 | 3 | At this time, I am not actively working on this library. However, I will happily accept any help and pull requests, and perhaps return to working on it, should it gain any more traction. 4 | 5 | The source code has been migrated from CodePlex in the hopes it will get picked up by the GitHub community. It is by now fairly outdated and perhaps should be ported from scratch from the latest *jsoup*. 6 | 7 | # NSoup 8 | NSoup is a .NET port of the jsoup (https://github.com/jhy/jsoup) HTML parser and sanitizer originally written in Java. 9 | 10 | jsoup was originally written by [Jonathan Hedley](https://github.com/jhy). 11 | Ported to .NET by Amir Grozki. 12 | 13 | ## Features 14 | 15 | - jQuery-like CSS selectors for finding and extracting data from HTML pages. 16 | - Sanitize HTML sent from untrusted sources. 17 | - Manipulate HTML documents. 
18 | -------------------------------------------------------------------------------- /NSoup/HttpStatusException.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | 7 | namespace NSoup 8 | { 9 | /// 10 | /// Signals that a HTTP request resulted in a not OK HTTP response. 11 | /// 12 | public class HttpStatusException : IOException 13 | { 14 | private int _statusCode; 15 | private string _url; 16 | 17 | public HttpStatusException(string message, int statusCode, string url) 18 | : base(message) 19 | { 20 | this._statusCode = statusCode; 21 | this._url = url; 22 | } 23 | 24 | public int StatusCode 25 | { 26 | get { return _statusCode; } 27 | } 28 | 29 | public string Url 30 | { 31 | get { return _url; } 32 | } 33 | 34 | public override string ToString() 35 | { 36 | return base.ToString() + ". Status=" + StatusCode + ", URL=" + Url; 37 | } 38 | } 39 | } 40 | 41 | -------------------------------------------------------------------------------- /NSoup/UnsupportedMimeTypeException.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | 7 | namespace NSoup 8 | { 9 | /// 10 | /// Signals that a HTTP response returned a mime type that is not supported. 
11 | /// 12 | public class UnsupportedMimeTypeException : IOException 13 | { 14 | private string _mimeType; 15 | private string _url; 16 | 17 | public UnsupportedMimeTypeException(string message, string mimeType, string url) 18 | : base(message) 19 | { 20 | this._mimeType = mimeType; 21 | this._url = url; 22 | } 23 | 24 | public string MimeType 25 | { 26 | get { return _mimeType; } 27 | } 28 | 29 | public string Url 30 | { 31 | get { return _url; } 32 | } 33 | 34 | public override string ToString() 35 | { 36 | return base.ToString() + ". Mimetype=" + MimeType + ", URL=" + Url; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Amir Grozki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Test/htmltests/README: -------------------------------------------------------------------------------- 1 | Note 2 | ==== 3 | 4 | The HTML files in this directory (htmltests) are intended to be used for testing the Jsoup parser and improving its 5 | interoperability with real world published HTML. These files are not distributed in the core Jsoup library. 6 | 7 | These files remain the copyright of the original owner. 8 | 9 | If you are the copyright holder and do not wish your works to be used in this manner, please contact Jonathan Hedley 10 | (jonathan@hedley.net) and your works will be removed from this test-suite. 11 | 12 | Sources 13 | ======== 14 | 15 | * yahoo-article-1.html http://news.yahoo.com/s/nm/20100831/bs_nm/us_gm_china 1-Sep-2010 16 | * smh-biz-article-1.html http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html 17 | * news-com-au-home.html http://www.news.com.au/ 11-Jan-2010 18 | * google-ipod.html http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10 11-Jan-2010 19 | * yahoo-jp.html http://www.yahoo.co.jp/index.html 12-Jan-2010 20 | * baidu-cn-home.html http://www.baidu.com/ 15-Jul-2010 21 | * nyt-article-1.html http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp 22 | -------------------------------------------------------------------------------- /NSoup/Parse/ParseErrorList.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace NSoup.Parse 7 | { 8 | /// 9 | /// A container for ParseErrors. 
10 | /// 11 | public class ParseErrorList : List 12 | { 13 | private static readonly int INITIAL_CAPACITY = 16; 14 | private readonly int maxSize; 15 | 16 | public ParseErrorList(int initialCapacity, int maxSize) 17 | : base(initialCapacity) 18 | { 19 | this.maxSize = maxSize; 20 | } 21 | 22 | public bool CanAddError 23 | { 24 | get { return this.Count < maxSize; } 25 | } 26 | 27 | public int MaxSize 28 | { 29 | get { return maxSize; } 30 | } 31 | 32 | public static ParseErrorList NoTracking() 33 | { 34 | return new ParseErrorList(0, 0); 35 | } 36 | 37 | public static ParseErrorList Tracking(int maxSize) 38 | { 39 | return new ParseErrorList(INITIAL_CAPACITY, maxSize); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /NSoup/Parse/ParseError.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace NSoup.Parse 7 | { 8 | /// 9 | /// A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase. 10 | /// 11 | // todo: currently not ready for public consumption. 
revisit api, and exposure methods 12 | public class ParseError 13 | { 14 | 15 | private int _pos; 16 | private string _errorMsg; 17 | 18 | public ParseError(int pos, string errorMsg) 19 | { 20 | this._pos = pos; 21 | this._errorMsg = errorMsg; 22 | } 23 | 24 | public ParseError(int pos, string errorFormat, params object[] args) 25 | { 26 | this._errorMsg = string.Format(errorFormat, args); 27 | this._pos = pos; 28 | } 29 | 30 | public string ErrorMessage 31 | { 32 | get { return _errorMsg; } 33 | } 34 | 35 | public int Position 36 | { 37 | get { return _pos; } 38 | } 39 | 40 | public override string ToString() 41 | { 42 | return _pos + ": " + _errorMsg; 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /Test/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("Test")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("Test")] 13 | [assembly: AssemblyCopyright("Copyright © 2012")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 
20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("5f127e50-0b81-42c1-b965-ae074540eca0")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /NSoup/Select/NodeVisitor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using NSoup.Nodes; 6 | 7 | namespace NSoup.Select 8 | { 9 | /// 10 | /// Node visitor interface. Provide an implementing class to NodeTraversor to iterate through nodes. 11 | /// This interface provides two methods, Head() and Tail(). The head method is called when the node is first 12 | /// seen, and the tail method when all of the node's children have been visited. As an example, head can be used to 13 | /// create a start tag for a node, and tail to create the end tag. 14 | /// 15 | public interface NodeVisitor 16 | { 17 | /// 18 | /// Callback for when a node is first visited. 19 | /// 20 | /// The node being visited. 21 | /// The depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node 22 | /// of that will have depth 1. 23 | void Head(Node node, int depth); 24 | 25 | /// 26 | /// Callback for when a node is last visited, after all of its descendants have been visited. 27 | /// 28 | /// The node being visited. 29 | /// the depth of the node, relative to the root node. 
E.g., the root node has depth 0, and a child node 30 | /// of that will have depth 1. 31 | void Tail(Node node, int depth); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /NSoup/Nodes/entities-base.txt: -------------------------------------------------------------------------------- 1 | AElig=000C6 2 | AMP=00026 3 | Aacute=000C1 4 | Acirc=000C2 5 | Agrave=000C0 6 | Aring=000C5 7 | Atilde=000C3 8 | Auml=000C4 9 | COPY=000A9 10 | Ccedil=000C7 11 | ETH=000D0 12 | Eacute=000C9 13 | Ecirc=000CA 14 | Egrave=000C8 15 | Euml=000CB 16 | GT=0003E 17 | Iacute=000CD 18 | Icirc=000CE 19 | Igrave=000CC 20 | Iuml=000CF 21 | LT=0003C 22 | Ntilde=000D1 23 | Oacute=000D3 24 | Ocirc=000D4 25 | Ograve=000D2 26 | Oslash=000D8 27 | Otilde=000D5 28 | Ouml=000D6 29 | QUOT=00022 30 | REG=000AE 31 | THORN=000DE 32 | Uacute=000DA 33 | Ucirc=000DB 34 | Ugrave=000D9 35 | Uuml=000DC 36 | Yacute=000DD 37 | aacute=000E1 38 | acirc=000E2 39 | acute=000B4 40 | aelig=000E6 41 | agrave=000E0 42 | amp=00026 43 | aring=000E5 44 | atilde=000E3 45 | auml=000E4 46 | brvbar=000A6 47 | ccedil=000E7 48 | cedil=000B8 49 | cent=000A2 50 | copy=000A9 51 | curren=000A4 52 | deg=000B0 53 | divide=000F7 54 | eacute=000E9 55 | ecirc=000EA 56 | egrave=000E8 57 | eth=000F0 58 | euml=000EB 59 | frac12=000BD 60 | frac14=000BC 61 | frac34=000BE 62 | gt=0003E 63 | iacute=000ED 64 | icirc=000EE 65 | iexcl=000A1 66 | igrave=000EC 67 | iquest=000BF 68 | iuml=000EF 69 | laquo=000AB 70 | lt=0003C 71 | macr=000AF 72 | micro=000B5 73 | middot=000B7 74 | nbsp=000A0 75 | not=000AC 76 | ntilde=000F1 77 | oacute=000F3 78 | ocirc=000F4 79 | ograve=000F2 80 | ordf=000AA 81 | ordm=000BA 82 | oslash=000F8 83 | otilde=000F5 84 | ouml=000F6 85 | para=000B6 86 | plusmn=000B1 87 | pound=000A3 88 | quot=00022 89 | raquo=000BB 90 | reg=000AE 91 | sect=000A7 92 | shy=000AD 93 | sup1=000B9 94 | sup2=000B2 95 | sup3=000B3 96 | szlig=000DF 97 | thorn=000FE 98 | times=000D7 99 | uacute=000FA 
100 | ucirc=000FB 101 | ugrave=000F9 102 | uml=000A8 103 | uuml=000FC 104 | yacute=000FD 105 | yen=000A5 106 | yuml=000FF -------------------------------------------------------------------------------- /NSoup/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("NSoup")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("NSoup")] 13 | [assembly: AssemblyCopyright("Copyright © 2012 Amir Grozki")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("ea7c4382-d4cf-470b-9525-72d42850c656")] 24 | 25 | // Added by popular demand. 
26 | [assembly: System.Security.AllowPartiallyTrustedCallers] 27 | 28 | [assembly: InternalsVisibleTo("Test, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ff2769904d9601c999569e2ebea98b0822f0c58cb1a59d26ac1b3f0a9361cc58217d8c119b0cd0b8b16cb74e470f1b3c50334cefee49b3a0f9ca5830f418584c7ae8f9860a5fe91cdc2a51e8db16d8d1575c053c2e24bca46f644dcf12b3633829077a0ff5e68e4ca0491b9fbba3b19a06eb2887251bcb186486e2d2ddcbb3b5")] 29 | 30 | // Version information for an assembly consists of the following four values: 31 | // 32 | // Major Version 33 | // Minor Version 34 | // Build Number 35 | // Revision 36 | // 37 | // You can specify all the values or you can default the Build and Revision Numbers 38 | // by using the '*' as shown below: 39 | // [assembly: AssemblyVersion("1.0.*")] 40 | [assembly: AssemblyVersion("0.8.0.0")] 41 | [assembly: AssemblyFileVersion("0.8.0.0")] 42 | -------------------------------------------------------------------------------- /NSoup/Nodes/Comment.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace NSoup.Nodes 7 | { 8 | /// 9 | /// A comment node. 10 | /// 11 | /// 15 | public class Comment : Node 16 | { 17 | private static readonly string COMMENT_KEY = "comment"; 18 | 19 | /// 20 | /// Create a new comment node. 21 | /// 22 | /// The contents of the comment 23 | /// base URI 24 | public Comment(string data, string baseUri) 25 | : base(baseUri) 26 | { 27 | Attributes.Add(COMMENT_KEY, data); 28 | } 29 | 30 | /// 31 | /// Gets the node's name. 32 | /// 33 | public override string NodeName 34 | { 35 | get { return "#comment"; } 36 | } 37 | 38 | /// 39 | /// Get the contents of the comment. 
40 | /// 41 | /// Content 42 | public string GetData() 43 | { 44 | return Attributes.GetValue(COMMENT_KEY); 45 | } 46 | 47 | public override void OuterHtmlHead(StringBuilder accum, int depth, OutputSettings output) 48 | { 49 | if (output.PrettyPrint()) 50 | { 51 | Indent(accum, depth, output); 52 | } 53 | 54 | accum 55 | .Append(""); 58 | } 59 | 60 | public override void OuterHtmlTail(StringBuilder accum, int depth, OutputSettings output) 61 | { 62 | } 63 | 64 | public override string ToString() 65 | { 66 | return OuterHtml(); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /NSoup/Nodes/DocumentType.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using NSoup.Helper; 6 | 7 | namespace NSoup.Nodes 8 | { 9 | /// 10 | /// A <!DOCTYPE> node. 11 | /// 12 | public class DocumentType : Node 13 | { 14 | // todo: quirk mode from publicId and systemId 15 | 16 | /// 17 | /// Create a new doctype element. 
18 | /// 19 | /// the doctype's name 20 | /// the doctype's public ID 21 | /// the doctype's system ID 22 | /// the doctype's base URI 23 | public DocumentType(string name, string publicId, string systemId, string baseUri) 24 | : base(baseUri) 25 | { 26 | if (string.IsNullOrEmpty(name)) 27 | { 28 | throw new ArgumentNullException("name"); 29 | } 30 | 31 | Attr("name", name); 32 | Attr("publicId", publicId); 33 | Attr("systemId", systemId); 34 | } 35 | 36 | public override string NodeName 37 | { 38 | get { return "#doctype"; } 39 | } 40 | 41 | public override void OuterHtmlHead(StringBuilder accum, int depth, OutputSettings output) 42 | { 43 | accum.Append("'); 56 | } 57 | 58 | public override void OuterHtmlTail(StringBuilder accum, int depth, OutputSettings output) 59 | { 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Build Folders (you can keep bin if you'd like, to store dlls and pdbs) 2 | # [Bb]in/ 3 | [Oo]bj/ 4 | 5 | # mstest test results 6 | TestResults 7 | 8 | ## Ignore Visual Studio temporary files, build results, and 9 | ## files generated by popular Visual Studio add-ons. 
10 | 11 | # User-specific files 12 | *.suo 13 | *.user 14 | *.sln.docstates 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Rr]elease/ 19 | x64/ 20 | *_i.c 21 | *_p.c 22 | *.ilk 23 | *.meta 24 | *.obj 25 | *.pch 26 | *.pdb 27 | *.pgc 28 | *.pgd 29 | *.rsp 30 | *.sbr 31 | *.tlb 32 | *.tli 33 | *.tlh 34 | *.tmp 35 | *.log 36 | *.vspscc 37 | *.vssscc 38 | .builds 39 | 40 | # Visual C++ cache files 41 | ipch/ 42 | *.aps 43 | *.ncb 44 | *.opensdf 45 | *.sdf 46 | 47 | # Visual Studio profiler 48 | *.psess 49 | *.vsp 50 | *.vspx 51 | 52 | # Guidance Automation Toolkit 53 | *.gpState 54 | 55 | # ReSharper is a .NET coding add-in 56 | _ReSharper* 57 | 58 | # NCrunch 59 | *.ncrunch* 60 | .*crunch*.local.xml 61 | 62 | # Installshield output folder 63 | [Ee]xpress 64 | 65 | # DocProject is a documentation generator add-in 66 | DocProject/buildhelp/ 67 | DocProject/Help/*.HxT 68 | DocProject/Help/*.HxC 69 | DocProject/Help/*.hhc 70 | DocProject/Help/*.hhk 71 | DocProject/Help/*.hhp 72 | DocProject/Help/Html2 73 | DocProject/Help/html 74 | 75 | # Click-Once directory 76 | publish 77 | 78 | # Publish Web Output 79 | *.Publish.xml 80 | 81 | # NuGet Packages Directory 82 | # packages 83 | 84 | # Windows Azure Build Output 85 | csx 86 | *.build.csdef 87 | 88 | # Windows Store app package directory 89 | AppPackages/ 90 | 91 | # Others 92 | #[Bb]in 93 | [Oo]bj 94 | sql 95 | TestResults 96 | [Tt]est[Rr]esult* 97 | *.Cache 98 | ClientBin 99 | [Ss]tyle[Cc]op.* 100 | ~$* 101 | *.dbmdl 102 | Generated_Code #added for RIA/Silverlight projects 103 | 104 | # Backup & report files from converting an old project file to a newer 105 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 106 | _UpgradeReport_Files/ 107 | Backup*/ 108 | UpgradeLog*.XML 109 | 110 | .sass-cache 111 | .sass-cache/* 112 | 113 | .vs -------------------------------------------------------------------------------- /NSoup/Select/NodeTraversor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using NSoup.Nodes; 6 | 7 | namespace NSoup.Select 8 | { 9 | /// 10 | /// Depth-first node traversor. Use to iterate through all nodes under and including the specified root node. 11 | /// This implementation does not use recursion, so a deep DOM does not risk blowing the stack. 12 | /// 13 | internal class NodeTraversor 14 | { 15 | private NodeVisitor _visitor; 16 | 17 | /// 18 | /// Create a new traversor. 19 | /// 20 | /// A class implementing the NodeVisitor interface, to be called when visiting each node. 21 | public NodeTraversor(NodeVisitor visitor) 22 | { 23 | this._visitor = visitor; 24 | } 25 | 26 | /// 27 | /// Start a depth-first traverse of the root and all of its descendants. 28 | /// 29 | /// The root node point to traverse. 
30 | public void Traverse(Node root) 31 | { 32 | Node node = root; 33 | int depth = 0; 34 | 35 | while (node != null) 36 | { 37 | _visitor.Head(node, depth); 38 | if (node.ChildNodes.Count > 0) 39 | { 40 | node = node.ChildNodes[0]; 41 | depth++; 42 | } 43 | else 44 | { 45 | while (node.NextSibling == null && depth > 0) 46 | { 47 | _visitor.Tail(node, depth); 48 | node = node.ParentNode; 49 | depth--; 50 | } 51 | _visitor.Tail(node, depth); 52 | if (node == root) 53 | { 54 | break; 55 | } 56 | node = node.NextSibling; 57 | } 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /NSoup.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2012 4 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{C66E2854-5CFF-4DD3-B867-3FE49DE1E96A}" 5 | ProjectSection(SolutionItems) = preProject 6 | LocalTestRun.testrunconfig = LocalTestRun.testrunconfig 7 | NSoup.vsmdi = NSoup.vsmdi 8 | EndProjectSection 9 | EndProject 10 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NSoup", "NSoup\NSoup.csproj", "{EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}" 11 | EndProject 12 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}" 13 | EndProject 14 | Global 15 | GlobalSection(TestCaseManagementSettings) = postSolution 16 | CategoryFile = NSoup.vsmdi 17 | EndGlobalSection 18 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 19 | Debug|Any CPU = Debug|Any CPU 20 | Release|Any CPU = Release|Any CPU 21 | EndGlobalSection 22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 23 | {EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 24 | {EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}.Debug|Any CPU.Build.0 = Debug|Any CPU 25 | 
{EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}.Release|Any CPU.ActiveCfg = Release|Any CPU 26 | {EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}.Release|Any CPU.Build.0 = Release|Any CPU 27 | {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 28 | {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Debug|Any CPU.Build.0 = Debug|Any CPU 29 | {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Release|Any CPU.ActiveCfg = Release|Any CPU 30 | {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Release|Any CPU.Build.0 = Release|Any CPU 31 | {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 32 | {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Debug|Any CPU.Build.0 = Debug|Any CPU 33 | {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Release|Any CPU.ActiveCfg = Release|Any CPU 34 | {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Release|Any CPU.Build.0 = Release|Any CPU 35 | EndGlobalSection 36 | GlobalSection(SolutionProperties) = preSolution 37 | HideSolutionNode = FALSE 38 | EndGlobalSection 39 | EndGlobal 40 | -------------------------------------------------------------------------------- /NSoup/Select/Collector.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using NSoup.Nodes; 6 | 7 | namespace NSoup.Select 8 | { 9 | /// 10 | /// Collects a list of elements that match the supplied criteria. 11 | /// 12 | /// 16 | internal class Collector 17 | { 18 | private Collector() { } 19 | 20 | /// 21 | /// Build a list of elements, by visiting root and every descendant of root, and testing it against the evaluator. 
22 | /// 23 | /// Evaluator to test elements against 24 | /// root of tree to descend 25 | /// list of matches; empty if none 26 | public static Elements Collect(Evaluator eval, Element root) 27 | { 28 | Elements elements = new Elements(); 29 | new NodeTraversor(new Accumulator(root, elements, eval)).Traverse(root); 30 | return elements; 31 | } 32 | 33 | private class Accumulator : NodeVisitor 34 | { 35 | private readonly Element root; 36 | private readonly Elements elements; 37 | private readonly Evaluator eval; 38 | 39 | public Accumulator(Element root, Elements elements, Evaluator eval) 40 | { 41 | this.root = root; 42 | this.elements = elements; 43 | this.eval = eval; 44 | } 45 | 46 | public void Head(Node node, int depth) 47 | { 48 | if (node is Element) 49 | { 50 | Element el = (Element)node; 51 | if (eval.Matches(root, el)) 52 | { 53 | elements.Add(el); 54 | } 55 | } 56 | } 57 | 58 | public void Tail(Node node, int depth) 59 | { 60 | // void 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /Test/Select/QueryParserTest.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using NSoup.Select; 4 | 5 | namespace Test.Select 6 | { 7 | /// 8 | /// Tests for the Selector Query Parser. 
9 | /// 10 | /// 14 | [TestClass] 15 | public class QueryParserTest 16 | { 17 | [TestMethod] 18 | public void testOrGetsCorrectPrecedence() 19 | { 20 | // tests that a selector "a b, c d, e f" evals to (a AND b) OR (c AND d) OR (e AND f)" 21 | // top level or, three child ands 22 | Evaluator eval = QueryParser.Parse("a b, c d, e f"); 23 | Assert.IsTrue(eval is CombiningEvaluator.Or); 24 | CombiningEvaluator.Or or = (CombiningEvaluator.Or)eval; 25 | Assert.AreEqual(3, or.Evaluators.Count); 26 | foreach (Evaluator innerEval in or.Evaluators) 27 | { 28 | Assert.IsTrue(innerEval is CombiningEvaluator.And); 29 | CombiningEvaluator.And and = (CombiningEvaluator.And)innerEval; 30 | Assert.AreEqual(2, and.Evaluators.Count); 31 | Assert.IsTrue(and.Evaluators[0] is Evaluator.Tag); 32 | Assert.IsTrue(and.Evaluators[1] is StructuralEvaluator.Parent); 33 | } 34 | } 35 | 36 | [TestMethod] 37 | public void testParsesMultiCorrectly() 38 | { 39 | Evaluator eval = QueryParser.Parse(".foo > ol, ol > li + li"); 40 | Assert.IsTrue(eval is CombiningEvaluator.Or); 41 | CombiningEvaluator.Or or = (CombiningEvaluator.Or)eval; 42 | Assert.AreEqual(2, or.Evaluators.Count); 43 | 44 | CombiningEvaluator.And andLeft = (CombiningEvaluator.And)or.Evaluators[0]; 45 | CombiningEvaluator.And andRight = (CombiningEvaluator.And)or.Evaluators[1]; 46 | 47 | Assert.AreEqual("ol :ImmediateParent.foo", andLeft.ToString()); 48 | Assert.AreEqual(2, andLeft.Evaluators.Count); 49 | Assert.AreEqual("li :prevli :ImmediateParentol", andRight.ToString()); 50 | Assert.AreEqual(2, andLeft.Evaluators.Count); 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /NSoup/Nodes/XmlDeclaration.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace NSoup.Nodes 7 | { 8 | /// 9 | /// An XML Declaration. 
10 | /// 11 | /// 15 | public class XmlDeclaration : Node 16 | { 17 | private static readonly string DECL_KEY = "declaration"; 18 | private readonly bool _isProcessingInstruction; // 21 | /// Create a new XML declaration 22 | /// 23 | /// data 24 | /// base uri 25 | /// is processing instruction 26 | public XmlDeclaration(string data, string baseUri, bool isProcessingInstruction) 27 | : base(baseUri) 28 | { 29 | Attributes.Add(DECL_KEY, data); 30 | this._isProcessingInstruction = isProcessingInstruction; 31 | } 32 | 33 | /// 34 | /// Gets the node's name. 35 | /// 36 | public override string NodeName 37 | { 38 | get { return "#declaration"; } 39 | } 40 | 41 | /// 42 | /// Get the unencoded XML declaration. 43 | /// 44 | /// XML declaration 45 | public string GetWholeDeclaration() 46 | { 47 | return Attributes.GetValue(DECL_KEY); 48 | } 49 | 50 | public override void OuterHtmlHead(StringBuilder accum, int depth, OutputSettings output) 51 | { 52 | accum 53 | .Append("<") 54 | .Append(_isProcessingInstruction ? "!" : "?") 55 | .Append(GetWholeDeclaration()) 56 | .Append(">"); 57 | } 58 | 59 | public override void OuterHtmlTail(StringBuilder accum, int depth, OutputSettings output) { } 60 | 61 | public override string ToString() 62 | { 63 | return OuterHtml(); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /Test/Integration/Benchmark.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using Microsoft.VisualStudio.TestTools.UnitTesting; 6 | 7 | namespace Test.Integration 8 | { 9 | /// 10 | /// Does an A/B test on two methods, and prints out how long each took. 
11 | /// 12 | /// 16 | /*[TestClass] 17 | public class Benchmark 18 | { 19 | public Benchmark() 20 | { 21 | // 22 | // TODO: Add constructor logic here 23 | // 24 | } 25 | 26 | private TestContext testContextInstance; 27 | 28 | /// 29 | ///Gets or sets the test context which provides 30 | ///information about and functionality for the current test run. 31 | /// 32 | public TestContext TestContext 33 | { 34 | get 35 | { 36 | return testContextInstance; 37 | } 38 | set 39 | { 40 | testContextInstance = value; 41 | } 42 | } 43 | 44 | #region Additional test attributes 45 | // 46 | // You can use the following additional attributes as you write your tests: 47 | // 48 | // Use ClassInitialize to run code before running the first test in the class 49 | // [ClassInitialize()] 50 | // public static void MyClassInitialize(TestContext testContext) { } 51 | // 52 | // Use ClassCleanup to run code after all tests in a class have run 53 | // [ClassCleanup()] 54 | // public static void MyClassCleanup() { } 55 | // 56 | // Use TestInitialize to run code before running each test 57 | // [TestInitialize()] 58 | // public void MyTestInitialize() { } 59 | // 60 | // Use TestCleanup to run code after each test has run 61 | // [TestCleanup()] 62 | // public void MyTestCleanup() { } 63 | // 64 | #endregion 65 | 66 | [TestMethod] 67 | public void TestMethod1() 68 | { 69 | // 70 | // TODO: Add test logic here 71 | // 72 | } 73 | }*/ 74 | } 75 | -------------------------------------------------------------------------------- /NSoup/Parse/TreeBuilder.cs: -------------------------------------------------------------------------------- 1 | using NSoup.Helper; 2 | using NSoup.Nodes; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Text; 7 | 8 | namespace NSoup.Parse 9 | { 10 | public abstract class TreeBuilder 11 | { 12 | protected CharacterReader _reader; 13 | protected Tokeniser _tokeniser; 14 | protected Document _doc; // current doc we are 
building into 15 | protected DescendableLinkedList _stack; // the stack of open elements 16 | protected string _baseUri; // current base uri, for creating new elements 17 | protected Token _currentToken; // currentToken is used only for error tracking. 18 | protected ParseErrorList _errors; // null when not tracking errors 19 | 20 | protected virtual void InitialiseParse(string input, string baseUri, ParseErrorList errors) 21 | { 22 | if (input == null) 23 | { 24 | throw new ArgumentNullException("input", "String input must not be null"); // (paramName, message): the one-argument overload would treat the message as the parameter name 25 | } 26 | if (baseUri == null) 27 | { 28 | throw new ArgumentNullException("baseUri", "BaseURI must not be null"); // same (paramName, message) form as above 29 | } 30 | 31 | _doc = new Document(baseUri); 32 | _reader = new CharacterReader(input); 33 | _errors = errors; 34 | _tokeniser = new Tokeniser(_reader, errors); 35 | _stack = new DescendableLinkedList(); 36 | this._baseUri = baseUri; 37 | } 38 | 39 | public Document Parse(string input, string baseUri) 40 | { 41 | return Parse(input, baseUri, ParseErrorList.NoTracking()); 42 | } 43 | 44 | public virtual Document Parse(string input, string baseUri, ParseErrorList errors) 45 | { 46 | InitialiseParse(input, baseUri, errors); 47 | 48 | RunParser(); 49 | 50 | return _doc; 51 | } 52 | 53 | protected void RunParser() 54 | { 55 | while (true) 56 | { 57 | Token token = _tokeniser.Read(); 58 | Process(token); 59 | 60 | if (token.Type == Token.TokenType.EOF) 61 | { 62 | break; 63 | } 64 | } 65 | } 66 | 67 | public abstract bool Process(Token token); 68 | 69 | public Element CurrentElement 70 | { 71 | get { return _stack.Last.Value; } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /NSoup/Helper/DescendableLinkedList.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace NSoup.Helper 7 | { 8 | 9 | /// 10 | /// Provides a descending iterator 
and other 1.6 methods to allow support on the 1.5 JRE. 11 | /// 12 | /// 13 | public class DescendableLinkedList : LinkedList where T : class 14 | { 15 | 16 | /// 17 | /// Create a new DescendableLinkedList. 18 | /// 19 | public DescendableLinkedList() 20 | : base() 21 | { 22 | } 23 | 24 | /// 25 | /// Get an iterator that starts at the end of the list and works towards the start. 26 | /// 27 | /// an iterator that starts at the end of the list and works towards the start. 28 | public IEnumerator GetDescendingEnumerator() 29 | { 30 | return new DescendingEnumerator(this); 31 | } 32 | 33 | private class DescendingEnumerator : IEnumerator where V : class 34 | { 35 | private LinkedList list; 36 | private LinkedListNode curr = null; 37 | 38 | private bool first = true; 39 | 40 | public DescendingEnumerator(LinkedList list) 41 | { 42 | this.list = list; 43 | } 44 | 45 | #region IEnumerator Members 46 | 47 | public V Current 48 | { 49 | get { return curr == null ? null : curr.Value; } 50 | } 51 | 52 | #endregion 53 | 54 | #region IDisposable Members 55 | 56 | public void Dispose() 57 | { 58 | } 59 | 60 | #endregion 61 | 62 | #region IEnumerator Members 63 | 64 | object System.Collections.IEnumerator.Current 65 | { 66 | get { return curr == null ? 
null : curr.Value; } 67 | } 68 | 69 | public bool MoveNext() 70 | { 71 | if (first) 72 | { 73 | first = false; 74 | curr = list.Last; 75 | 76 | return curr != null; 77 | } 78 | 79 | if (curr == null || curr.Previous == null) // curr is null when the list was empty; keep returning false instead of dereferencing null 80 | { 81 | return false; 82 | } 83 | 84 | curr = curr.Previous; 85 | return true; 86 | } 87 | 88 | public void Reset() 89 | { 90 | first = true; 91 | } 92 | 93 | #endregion 94 | } 95 | } 96 | } -------------------------------------------------------------------------------- /NSoup/Nodes/DataNode.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Web; 6 | 7 | namespace NSoup.Nodes 8 | { 9 | /// 10 | /// A data node, for contents of style, script tags etc, where contents should not show in text(). 11 | /// 12 | /// 16 | public class DataNode : Node 17 | { 18 | private static readonly string DATA_KEY = "data"; 19 | 20 | /// 21 | /// Create a new DataNode. 22 | /// 23 | /// data contents 24 | /// base URI 25 | public DataNode(string data, string baseUri) 26 | : base(baseUri) 27 | { 28 | Attributes.Add(DATA_KEY, data); 29 | } 30 | 31 | /// 32 | /// Gets the node's name. 33 | /// 34 | public override string NodeName 35 | { 36 | get { return "#data"; } 37 | } 38 | 39 | /// 40 | /// Get the data contents of this node. Will be unescaped and with original new lines, space etc. 41 | /// 42 | /// data 43 | public string GetWholeData() 44 | { 45 | return Attributes.GetValue(DATA_KEY); 46 | } 47 | 48 | /// 49 | /// Set the data contents of this node. 
50 | /// 51 | /// unencoded data 52 | /// this node, for chaining 53 | public DataNode setWholeData(string data) 54 | { 55 | _attributes[DATA_KEY] = data; 56 | return this; 57 | } 58 | 59 | public override void OuterHtmlHead(StringBuilder accum, int depth, OutputSettings output) 60 | { 61 | accum.Append(GetWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain 62 | } 63 | 64 | public override void OuterHtmlTail(StringBuilder accum, int depth, OutputSettings output) { } 65 | 66 | public override string ToString() 67 | { 68 | return OuterHtml(); 69 | } 70 | 71 | /// 72 | /// Create a new DataNode from HTML encoded data. 73 | /// 74 | /// encoded data 75 | /// bass URI 76 | /// new DataNode 77 | public static DataNode CreateFromEncoded(string encodedData, string baseUri) 78 | { 79 | string data = Entities.Unescape(encodedData); 80 | return new DataNode(data, baseUri); 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /NSoup/Select/CombiningEvaluator.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using NSoup.Nodes; 6 | 7 | namespace NSoup.Select 8 | { 9 | /// 10 | /// Base combining (and, or) evaluator. 11 | /// 12 | internal abstract class CombiningEvaluator : Evaluator 13 | { 14 | protected readonly List _evaluators; 15 | 16 | private CombiningEvaluator() 17 | : base() 18 | { 19 | _evaluators = new List(); 20 | } 21 | 22 | private CombiningEvaluator(ICollection evaluators) 23 | : this() 24 | { 25 | this._evaluators.AddRange(evaluators); 26 | } 27 | 28 | public Evaluator RightMostEvaluator() 29 | { 30 | return _evaluators.Count > 0 ? 
_evaluators[_evaluators.Count - 1] : null; 31 | } 32 | 33 | public void ReplaceRightMostEvaluator(Evaluator replacement) 34 | { 35 | _evaluators[_evaluators.Count - 1] = replacement; 36 | } 37 | 38 | public List Evaluators 39 | { 40 | get { return _evaluators; } 41 | } 42 | 43 | public sealed class And : CombiningEvaluator 44 | { 45 | public And(ICollection evaluators) 46 | : base(evaluators) 47 | { 48 | } 49 | 50 | public And(params Evaluator[] evaluators) 51 | : base(evaluators) 52 | { 53 | } 54 | 55 | public override bool Matches(Element root, Element node) 56 | { 57 | for (int i = 0; i < _evaluators.Count; i++) 58 | { 59 | Evaluator s = _evaluators[i]; 60 | if (!s.Matches(root, node)) 61 | { 62 | return false; 63 | } 64 | } 65 | 66 | return true; 67 | } 68 | 69 | public override string ToString() 70 | { 71 | return string.Join(" ", _evaluators.Select(e => e.ToString()).ToArray()); 72 | } 73 | } 74 | 75 | public sealed class Or : CombiningEvaluator 76 | { 77 | public Or(ICollection evaluators) 78 | : base() 79 | { 80 | if (evaluators.Count > 1) 81 | { 82 | this._evaluators.Add(new And(evaluators)); 83 | } 84 | else // 0 or 1 85 | { 86 | this._evaluators.AddRange(evaluators); 87 | } 88 | } 89 | 90 | public Or() 91 | : base() 92 | {} 93 | 94 | public void Add(Evaluator e) 95 | { 96 | _evaluators.Add(e); 97 | } 98 | 99 | public override bool Matches(Element root, Element node) 100 | { 101 | for (int i = 0; i < _evaluators.Count; i++) 102 | { 103 | Evaluator s = _evaluators[i]; 104 | if (s.Matches(root, node)) 105 | { 106 | return true; 107 | } 108 | } 109 | return false; 110 | } 111 | 112 | public override string ToString() 113 | { 114 | return ":or[" + string.Join(", ", _evaluators.Select(e => e.ToString()).ToArray()) + "]"; // formatting the list itself only prints its type name, so the member evaluators are joined explicitly 115 | } 116 | } 117 | } 118 | } -------------------------------------------------------------------------------- /Test/Parser/TagTest.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using 
System.Linq; 4 | using System.Text; 5 | using Microsoft.VisualStudio.TestTools.UnitTesting; 6 | using NSoup.Parse; 7 | 8 | namespace Test.Parser 9 | { 10 | /// 11 | /// Tag tests. 12 | /// 13 | /// 17 | [TestClass] 18 | public class TagTest 19 | { 20 | public TagTest() 21 | { 22 | // 23 | // TODO: Add constructor logic here 24 | // 25 | } 26 | 27 | private TestContext testContextInstance; 28 | 29 | /// 30 | ///Gets or sets the test context which provides 31 | ///information about and functionality for the current test run. 32 | /// 33 | public TestContext TestContext 34 | { 35 | get 36 | { 37 | return testContextInstance; 38 | } 39 | set 40 | { 41 | testContextInstance = value; 42 | } 43 | } 44 | 45 | #region Additional test attributes 46 | // 47 | // You can use the following additional attributes as you write your tests: 48 | // 49 | // Use ClassInitialize to run code before running the first test in the class 50 | // [ClassInitialize()] 51 | // public static void MyClassInitialize(TestContext testContext) { } 52 | // 53 | // Use ClassCleanup to run code after all tests in a class have run 54 | // [ClassCleanup()] 55 | // public static void MyClassCleanup() { } 56 | // 57 | // Use TestInitialize to run code before running each test 58 | // [TestInitialize()] 59 | // public void MyTestInitialize() { } 60 | // 61 | // Use TestCleanup to run code after each test has run 62 | // [TestCleanup()] 63 | // public void MyTestCleanup() { } 64 | // 65 | #endregion 66 | 67 | [TestMethod] 68 | public void isCaseInsensitive() 69 | { 70 | Tag p1 = Tag.ValueOf("P"); 71 | Tag p2 = Tag.ValueOf("p"); 72 | Assert.AreEqual(p1, p2); 73 | } 74 | 75 | [TestMethod] 76 | public void trims() 77 | { 78 | Tag p1 = Tag.ValueOf("p"); 79 | Tag p2 = Tag.ValueOf(" p "); 80 | Assert.AreEqual(p1, p2); 81 | } 82 | 83 | [TestMethod] 84 | public void equality() 85 | { 86 | Tag p1 = Tag.ValueOf("p"); 87 | Tag p2 = Tag.ValueOf("p"); 88 | Assert.IsTrue(p1.Equals(p2)); 89 | Assert.IsTrue(p1 == p2); 90 | } 
91 | 92 | [TestMethod] 93 | public void divSemantics() 94 | { 95 | Tag div = Tag.ValueOf("div"); 96 | 97 | Assert.IsTrue(div.IsBlock); 98 | Assert.IsTrue(div.FormatAsBlock); 99 | } 100 | 101 | [TestMethod] 102 | public void pSemantics() 103 | { 104 | Tag p = Tag.ValueOf("p"); 105 | 106 | Assert.IsTrue(p.IsBlock); 107 | Assert.IsFalse(p.FormatAsBlock); 108 | } 109 | 110 | [TestMethod] 111 | public void imgSemantics() 112 | { 113 | Tag img = Tag.ValueOf("img"); 114 | 115 | Assert.IsTrue(img.IsInline); 116 | Assert.IsTrue(img.IsSelfClosing); 117 | Assert.IsFalse(img.IsBlock); 118 | } 119 | 120 | [TestMethod] 121 | public void defaultSemantics() 122 | { 123 | Tag foo = Tag.ValueOf("foo"); // not defined 124 | Tag foo2 = Tag.ValueOf("FOO"); 125 | 126 | Assert.AreEqual(foo, foo2); 127 | Assert.IsTrue(foo.IsInline); 128 | Assert.IsTrue(foo.FormatAsBlock); 129 | } 130 | 131 | [TestMethod] 132 | [ExpectedException(typeof(ArgumentNullException))] 133 | public void ValueOfChecksNotNull() 134 | { 135 | Tag.ValueOf(null); 136 | } 137 | 138 | [TestMethod] 139 | [ExpectedException(typeof(ArgumentException))] 140 | public void ValueOfChecksNotEmpty() 141 | { 142 | Tag.ValueOf(" "); 143 | } 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /Test/Parser/XmlTreeBuilderTest.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using NSoup.Parse; 4 | using NSoup.Nodes; 5 | using NSoup; 6 | using System.IO; 7 | 8 | namespace Test.Parser 9 | { 10 | 11 | /** 12 | * Tests XmlTreeBuilder. 13 | * 14 | * @author Jonathan Hedley 15 | */ 16 | [TestClass] 17 | public class XmlTreeBuilderTest 18 | { 19 | [TestMethod] 20 | public void testSimpleXmlParse() 21 | { 22 | string xml = "Foo
OneTwo
"; 23 | XmlTreeBuilder tb = new XmlTreeBuilder(); 24 | Document doc = tb.Parse(xml, "http://foo.com/"); 25 | Assert.AreEqual("Foo
OneTwo
", 26 | TextUtil.StripNewLines(doc.Html())); 27 | Assert.AreEqual(doc.GetElementById("2").AbsUrl("href"), "http://foo.com/bar"); 28 | } 29 | 30 | [TestMethod] 31 | public void testPopToClose() 32 | { 33 | // test: closes Two, ignored 34 | string xml = "OneTwoThree"; 35 | XmlTreeBuilder tb = new XmlTreeBuilder(); 36 | Document doc = tb.Parse(xml, "http://foo.com/"); 37 | Assert.AreEqual("OneTwoThree", 38 | TextUtil.StripNewLines(doc.Html())); 39 | } 40 | 41 | [TestMethod] 42 | public void testCommentAndDocType() 43 | { 44 | string xml = "One Two"; 45 | XmlTreeBuilder tb = new XmlTreeBuilder(); 46 | Document doc = tb.Parse(xml, "http://foo.com/"); 47 | Assert.AreEqual("One Two", 48 | TextUtil.StripNewLines(doc.Html())); 49 | } 50 | 51 | [TestMethod] 52 | public void testSupplyParserToJsoupClass() 53 | { 54 | String xml = "OneTwoThree"; 55 | Document doc = NSoupClient.Parse(xml, "http://foo.com/", NSoup.Parse.Parser.XmlParser()); 56 | Assert.AreEqual("OneTwoThree", 57 | TextUtil.StripNewLines(doc.Html())); 58 | } 59 | 60 | [Ignore] 61 | [TestMethod] 62 | public void testSupplyParserToConnection() 63 | { 64 | String xmlUrl = "http://direct.infohound.net/tools/jsoup-xml-test.xml"; 65 | 66 | // parse with both xml and html parser, ensure different 67 | Document xmlDoc = NSoupClient.Connect(xmlUrl).Parser(NSoup.Parse.Parser.XmlParser()).Get(); 68 | Document htmlDoc = NSoupClient.Connect(xmlUrl).Get(); 69 | 70 | Assert.AreEqual("OneTwoThree", 71 | TextUtil.StripNewLines(xmlDoc.Html())); 72 | Assert.AreNotSame(htmlDoc, xmlDoc); 73 | Assert.AreEqual(1, htmlDoc.Select("head").Count); // html parser normalises 74 | Assert.AreEqual(0, xmlDoc.Select("head").Count); // xml parser does not 75 | } 76 | 77 | [TestMethod] 78 | public void testSupplyParserToDataStream() { 79 | using (Stream input = getFile("Test.htmltests.xml-test.xml")) 80 | { 81 | Document doc = NSoupClient.Parse(input, null, "http://foo.com", NSoup.Parse.Parser.XmlParser()); 82 | Assert.AreEqual("OneTwoThree", 83 | 
TextUtil.StripNewLines(doc.Html())); 84 | } 85 | } 86 | 87 | [TestMethod] 88 | public void testDoesNotForceSelfClosingKnownTags() 89 | { 90 | // html will force "
one
" to "
One
". XML should be stay "
one
-- don't recognise tag. 91 | Document htmlDoc = NSoupClient.Parse("
one
"); 92 | Assert.AreEqual("
one\n
", htmlDoc.Body.Html()); 93 | 94 | Document xmlDoc = NSoupClient.Parse("
one
", "", NSoup.Parse.Parser.XmlParser()); 95 | Assert.AreEqual("
one
", xmlDoc.Html()); 96 | } 97 | 98 | Stream getFile(string resourceName) 99 | { 100 | try 101 | { 102 | return System.Reflection.Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName); 103 | } 104 | catch (Exception) 105 | { 106 | throw; 107 | } 108 | } 109 | } 110 | } -------------------------------------------------------------------------------- /NSoup.vsmdi: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /NSoup/Helper/StringUtil.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace NSoup.Helper 7 | { 8 | /** 9 | * A minimal String utility class. Designed for internal jsoup use only. 10 | */ 11 | public static class StringUtil 12 | { 13 | // memoised padding up to 10 14 | private static readonly string[] padding = { "", " ", " ", " ", " ", " ", " ", " ", " ", " ", " " }; 15 | 16 | /// 17 | /// Join a collection of strings by a separator 18 | /// 19 | /// collection of string objects 20 | /// string to place between strings 21 | /// joined string 22 | public static string Join(this ICollection strings, string sep) 23 | { 24 | return string.Join(sep, strings.ToArray()); 25 | } 26 | 27 | public static string Join(IEnumerator iterator, string sep) // joins every remaining item of the enumerator with sep; an exhausted enumerator yields "" 28 | { 29 | if (!iterator.MoveNext()) 30 | { 31 | return string.Empty; 32 | } 33 | 34 | var start = iterator.Current; 35 | if (!iterator.MoveNext()) 36 | { 37 | return start; 38 | } 39 | 40 | var sb = new StringBuilder(64).Append(start); 41 | do // the MoveNext() check above already advanced to the second item, so append it before advancing again 42 | { 43 | sb.Append(sep); 44 | sb.Append(iterator.Current); 45 | } while (iterator.MoveNext()); 46 | 47 | return sb.ToString(); 48 | } 49 | 50 | /// 51 | /// Returns space padding 52 | /// 53 | /// 
amount of padding desired 54 | /// string of spaces * width 55 | public static string Padding(int width) 56 | { 57 | if (width < 0) 58 | { 59 | throw new ArgumentException("width must be >= 0"); // zero is accepted by the check above, so the message must not claim "> 0" 60 | } 61 | 62 | if (width < padding.Length) 63 | { 64 | return padding[width]; 65 | } 66 | 67 | return string.Empty.PadLeft(width); 68 | } 69 | 70 | public static bool IsBlank(this string s) 71 | { 72 | return string.IsNullOrWhiteSpace(s); // null, empty and whitespace-only strings are all blank 73 | } 74 | 75 | public static bool IsNumeric(this string s) 76 | { 77 | if (string.IsNullOrEmpty(s)) 78 | { 79 | return false; 80 | } 81 | 82 | var anyNonDigits = s.ToCharArray().Any(c => !char.IsDigit(c)); 83 | return !(anyNonDigits); 84 | } 85 | 86 | /// 87 | /// Tests if a code point is "whitespace" as defined in the HTML spec. 88 | /// 89 | /// Code point to test 90 | /// True if code point is whitespace, false otherwise 91 | public static bool IsWhiteSpace(char c) 92 | { 93 | return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; 94 | } 95 | 96 | public static string NormaliseWhitespace(this string s) // collapses every run of HTML whitespace into a single space; leading/trailing runs are kept as one space 97 | { 98 | var sb = new StringBuilder(s.Length); 99 | 100 | var lastWasWhite = false; 101 | 102 | 103 | var l = s.Length; 104 | for (var i = 0; i < l; i++) 105 | { 106 | var c = s[i]; 107 | if (IsWhiteSpace(c)) 108 | { 109 | if (lastWasWhite) { continue; } 110 | sb.Append(' '); 111 | lastWasWhite = true; 112 | } 113 | else 114 | { 115 | sb.Append(c); 116 | lastWasWhite = false; 117 | 118 | } 119 | } 120 | 121 | return sb.ToString(); 122 | } 123 | 124 | public static bool In(string needle, params string[] haystack) 125 | { 126 | foreach (string hay in haystack) 127 | { 128 | if (string.Equals(hay, needle)) // static Equals is the same ordinal comparison but tolerates null entries 129 | { 130 | return true; 131 | } 132 | } 133 | return false; 134 | } 135 | 136 | public static bool InSorted(string needle, params string[] haystack) 137 | { 138 | return Array.BinarySearch(haystack, needle) >= 0; 139 | } 140 | 141 | public 
static Uri Resolve(Uri url, string relUrl) 142 | { 143 | Uri resultUri = null; 144 | if (url.IsAbsoluteUri && relUrl.IndexOf('.') == 0 && url.PathAndQuery.IndexOf('/') != 0) // PathAndQuery throws for a relative base, so check IsAbsoluteUri first 145 | { 146 | if (Uri.TryCreate(url.GetLeftPart(UriPartial.Authority) + "/" + url.PathAndQuery, UriKind.Absolute, out resultUri)) { url = resultUri; } // re-root the base; GetLeftPart keeps the "://" and ":port" separators that plain concatenation dropped 147 | } 148 | 149 | Uri.TryCreate(url, relUrl, out resultUri); 150 | return resultUri; 151 | } 152 | 153 | public static string Resolve(string url, string relUrl) 154 | { 155 | Uri baseUri = null; 156 | var validUri = Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out baseUri); 157 | if (validUri) 158 | { 159 | var resultUri = Resolve(baseUri, relUrl); 160 | return resultUri == null ? string.Empty : resultUri.ToString(); 161 | } 162 | 163 | validUri = Uri.TryCreate(relUrl, UriKind.RelativeOrAbsolute, out baseUri); 164 | return baseUri == null ? string.Empty : baseUri.ToString(); 165 | } 166 | } 167 | } -------------------------------------------------------------------------------- /Test/Parser/TokenQueueTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using Microsoft.VisualStudio.TestTools.UnitTesting; 6 | using NSoup.Parse; 7 | using NSoup.Nodes; 8 | 9 | namespace Test.Parser 10 | { 11 | /// 12 | /// Token queue tests. 13 | /// 14 | /// 18 | [TestClass] 19 | public class TokenQueueTest 20 | { 21 | public TokenQueueTest() 22 | { 23 | // 24 | // TODO: Add constructor logic here 25 | // 26 | } 27 | 28 | private TestContext testContextInstance; 29 | 30 | /// 31 | ///Gets or sets the test context which provides 32 | ///information about and functionality for the current test run. 
33 | /// 34 | public TestContext TestContext 35 | { 36 | get 37 | { 38 | return testContextInstance; 39 | } 40 | set 41 | { 42 | testContextInstance = value; 43 | } 44 | } 45 | 46 | #region Additional test attributes 47 | // 48 | // You can use the following additional attributes as you write your tests: 49 | // 50 | // Use ClassInitialize to run code before running the first test in the class 51 | // [ClassInitialize()] 52 | // public static void MyClassInitialize(TestContext testContext) { } 53 | // 54 | // Use ClassCleanup to run code after all tests in a class have run 55 | // [ClassCleanup()] 56 | // public static void MyClassCleanup() { } 57 | // 58 | // Use TestInitialize to run code before running each test 59 | // [TestInitialize()] 60 | // public void MyTestInitialize() { } 61 | // 62 | // Use TestCleanup to run code after each test has run 63 | // [TestCleanup()] 64 | // public void MyTestCleanup() { } 65 | // 66 | #endregion 67 | 68 | [TestMethod] 69 | public void chompBalanced() 70 | { 71 | TokenQueue tq = new TokenQueue(":contains(one (two) three) four"); 72 | string pre = tq.ConsumeTo("("); 73 | string guts = tq.ChompBalanced('(', ')'); 74 | string remainder = tq.Remainder(); 75 | 76 | Assert.AreEqual(":contains", pre); 77 | Assert.AreEqual("one (two) three", guts); 78 | Assert.AreEqual(" four", remainder); 79 | } 80 | 81 | [TestMethod] 82 | public void chompEscapedBalanced() 83 | { 84 | TokenQueue tq = new TokenQueue(":contains(one (two) \\( \\) \\) three) four"); 85 | string pre = tq.ConsumeTo("("); 86 | string guts = tq.ChompBalanced('(', ')'); 87 | string remainder = tq.Remainder(); 88 | 89 | Assert.AreEqual(":contains", pre); 90 | Assert.AreEqual("one (two) \\( \\) \\) three", guts); 91 | Assert.AreEqual("one (two) ( ) ) three", TokenQueue.Unescape(guts)); 92 | Assert.AreEqual(" four", remainder); 93 | } 94 | 95 | [TestMethod] 96 | public void chompBalancedMatchesAsMuchAsPossible() 97 | { 98 | TokenQueue tq = new 
TokenQueue("unbalanced(something(or another"); 99 | tq.ConsumeTo("("); 100 | string match = tq.ChompBalanced('(', ')'); 101 | Assert.AreEqual("something(or another", match); 102 | } 103 | 104 | [TestMethod] 105 | public void unescape() 106 | { 107 | Assert.AreEqual("one ( ) \\", TokenQueue.Unescape("one \\( \\) \\\\")); 108 | } 109 | 110 | [TestMethod] 111 | public void chompToIgnoreCase() 112 | { 113 | string t = ""; 114 | TokenQueue tq = new TokenQueue(t); 115 | string data = tq.ChompToIgnoreCase("one < two ", data); 117 | 118 | tq = new TokenQueue("