├── .gitignore ├── .hgignore ├── LICENSE ├── NuGet.Config ├── README.md ├── ReleaseNotes.md ├── ScrapySharp.Core ├── CssSelectorExecutor.fs ├── CssSelectorTokenizer.fs ├── Models.fs ├── NavigationProvider.fs └── ScrapySharp.Core.fsproj ├── ScrapySharp.Tests ├── Html │ ├── .gitattributes │ ├── Form1.htm │ ├── GeneratedHtml1.htm │ ├── InvalidPage1.htm │ ├── InvalidPage2.htm │ ├── InvalidPage3.htm │ ├── Page1.htm │ ├── ValidPage1.htm │ ├── ValidPage2.htm │ └── WebFormPage.htm ├── ScrapySharp.Tests.csproj ├── When_build_HtmlDom.cs ├── When_generate_HTML_with_HDocument.cs ├── When_parse_real_html_pages.cs ├── When_parses_using_CssSelector.cs ├── When_parses_using_CssSelector_with_HDocument.cs ├── When_parses_using_CssSelector_with_fsharp_tokenizer.cs ├── When_tokenize_CssSelector.cs ├── When_use_HtmlDeclarationReader.cs ├── When_use_browser.cs ├── When_use_code_reader.cs └── When_use_web_forms.cs ├── ScrapySharp.sln ├── ScrapySharp ├── Cache │ ├── WebResourceEntity.cs │ └── WebResourceStorage.cs ├── Exceptions │ └── ScrapingException.cs ├── Extensions │ ├── CssQueryExtensions.cs │ ├── DictionaryExtensions.cs │ ├── HDocumentCssQueryExtensions.cs │ ├── HElementNavigationProvider.cs │ ├── HtmlCreationHelper.cs │ ├── HtmlParsingHelper.cs │ ├── HtmlValue.cs │ ├── NodeValueComparer.cs │ ├── NodeValueComparison.cs │ ├── TokenHelper.cs │ └── UrlHelper.cs ├── Html │ ├── By.cs │ ├── Dom │ │ ├── DeclarationType.cs │ │ ├── HAttribute.cs │ │ ├── HComment.cs │ │ ├── HContainer.cs │ │ ├── HDocument.cs │ │ ├── HElement.cs │ │ ├── HtmlLinqExtensions.cs │ │ ├── IHSubContainer.cs │ │ └── TagDeclaration.cs │ ├── ElementFinder.cs │ ├── ElementSearchKind.cs │ ├── Forms │ │ ├── AgilityNodeParser.cs │ │ ├── FormField.cs │ │ ├── HElementFormParser.cs │ │ ├── HElementNodeParser.cs │ │ ├── HyperLink.cs │ │ ├── IHtmlNodeParser.cs │ │ ├── PageWebForm.cs │ │ └── WebForm.cs │ └── Parsing │ │ ├── CodeReader.cs │ │ ├── CodeReadingContext.cs │ │ ├── HtmlDeclarationReader.cs │ │ ├── HtmlDomBuilder.cs │ │ ├── Tokens.cs │ │ └── Word.cs ├── Network │ ├── CookiesParser.cs │ ├── FakeUserAgent.cs │ ├── FakeUserAgents.cs │ ├── HttpVerb.cs │ ├── RawRequest.cs │ ├── RawResponse.cs │ ├── ScrapingBrowser.cs │ ├── WebPage.cs │ └── WebResource.cs └── ScrapySharp.csproj ├── build.ps1 └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # MSTest test Results 33 | [Tt]est[Rr]esult*/ 34 | [Bb]uild[Ll]og.* 35 | 36 | # NUNIT 37 | *.VisualState.xml 38 | TestResult.xml 39 | 40 | # Build Results of an ATL Project 41 | [Dd]ebugPS/ 42 | [Rr]eleasePS/ 43 | dlldata.c 44 | 45 | # .NET Core 46 | project.lock.json 47 | project.fragment.lock.json 48 | artifacts/ 49 | **/Properties/launchSettings.json 50 | 51 | *_i.c 52 | *_p.c 53 | *_i.h 54 | *.ilk 55 | *.meta 56 | *.obj 57 | *.pch 58 | *.pdb 59 | *.pgc 60 | *.pgd 61 | *.rsp 62 | *.sbr 63 | *.tlb 64 | *.tli 65 | *.tlh 66 | *.tmp 67 | *.tmp_proj 68 | *.log 69 | *.vspscc 70 | *.vssscc 71 | .builds 72 | *.pidb 73 | *.svclog 74 | *.scc 75 | 76 | # Chutzpah Test files 77 | _Chutzpah* 78 | 79 | # Visual C++ cache files 80 | ipch/ 81 | *.aps 82 | *.ncb 83 | *.opendb 84 | *.opensdf 85 | *.sdf 86 | *.cachefile 87 | *.VC.db 88 | *.VC.VC.opendb 89 | 90 | # Visual Studio profiler 91 | *.psess 92 | *.vsp 93 | *.vspx 94 | *.sap 95 | 96 | # TFS 2012 Local Workspace 97 | $tf/ 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | *.DotSettings.user 106 | 107 | # JustCode is a .NET coding add-in 108 | .JustCode 109 | 110 | # TeamCity is a build add-in 111 | _TeamCity* 112 | 113 | # DotCover is a Code Coverage Tool 114 | *.dotCover 115 | 116 | # Visual Studio code coverage results 117 | *.coverage 118 | *.coveragexml 119 | 120 | # NCrunch 121 | _NCrunch_* 122 | .*crunch*.local.xml 123 | nCrunchTemp_* 124 | 125 | # MightyMoose 126 | *.mm.* 127 | AutoTest.Net/ 128 | 129 | # Web workbench (sass) 130 | .sass-cache/ 131 | 132 | # Installshield output folder 133 | [Ee]xpress/ 134 | 135 | # DocProject is a documentation generator add-in 136 | DocProject/buildhelp/ 137 | DocProject/Help/*.HxT 138 | DocProject/Help/*.HxC 139 | DocProject/Help/*.hhc 140 | DocProject/Help/*.hhk 141 | DocProject/Help/*.hhp 142 | DocProject/Help/Html2 143 | DocProject/Help/html 144 | 145 | # Click-Once directory 146 | publish/ 147 | 148 | # Publish Web Output 149 | *.[Pp]ublish.xml 150 | *.azurePubxml 151 | # TODO: Comment the next line if you want to checkin your web deploy settings 152 | # but database connection strings (with potential passwords) will be unencrypted 153 | *.pubxml 154 | *.publishproj 155 | 156 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 157 | # checkin your Azure Web App publish settings, but sensitive information contained 158 | # in these scripts will be unencrypted 159 | PublishScripts/ 160 | 161 | # NuGet Packages 162 | *.nupkg 163 | # The packages folder can be ignored because of Package Restore 164 | **/packages/* 165 | # except build/, which is used as an MSBuild target. 166 | !**/packages/build/ 167 | # Uncomment if necessary however generally it will be regenerated when needed 168 | #!**/packages/repositories.config 169 | # NuGet v3's project.json files produces more ignorable files 170 | *.nuget.props 171 | *.nuget.targets 172 | 173 | # Microsoft Azure Build Output 174 | csx/ 175 | *.build.csdef 176 | 177 | # Microsoft Azure Emulator 178 | ecf/ 179 | rcf/ 180 | 181 | # Windows Store app package directories and files 182 | AppPackages/ 183 | BundleArtifacts/ 184 | Package.StoreAssociation.xml 185 | _pkginfo.txt 186 | 187 | # Visual Studio cache files 188 | # files ending in .cache can be ignored 189 | *.[Cc]ache 190 | # but keep track of directories ending in .cache 191 | !*.[Cc]ache/ 192 | 193 | # Others 194 | ClientBin/ 195 | ~$* 196 | *~ 197 | *.dbmdl 198 | *.dbproj.schemaview 199 | *.jfm 200 | *.pfx 201 | *.publishsettings 202 | orleans.codegen.cs 203 | 204 | # Since there are multiple workflows, uncomment next line to ignore bower_components 205 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 206 | #bower_components/ 207 | 208 | # RIA/Silverlight projects 209 | Generated_Code/ 210 | 211 | # Backup & report files from converting an old project file 212 | # to a newer Visual Studio version. Backup files are not needed, 213 | # because we have git ;-) 214 | _UpgradeReport_Files/ 215 | Backup*/ 216 | UpgradeLog*.XML 217 | UpgradeLog*.htm 218 | 219 | # SQL Server files 220 | *.mdf 221 | *.ldf 222 | *.ndf 223 | 224 | # Business Intelligence projects 225 | *.rdl.data 226 | *.bim.layout 227 | *.bim_*.settings 228 | 229 | # Microsoft Fakes 230 | FakesAssemblies/ 231 | 232 | # GhostDoc plugin setting file 233 | *.GhostDoc.xml 234 | 235 | # Node.js Tools for Visual Studio 236 | .ntvs_analysis.dat 237 | node_modules/ 238 | 239 | # Typescript v1 declaration files 240 | typings/ 241 | 242 | # Visual Studio 6 build log 243 | *.plg 244 | 245 | # Visual Studio 6 workspace options file 246 | *.opt 247 | 248 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 249 | *.vbw 250 | 251 | # Visual Studio LightSwitch build output 252 | **/*.HTMLClient/GeneratedArtifacts 253 | **/*.DesktopClient/GeneratedArtifacts 254 | **/*.DesktopClient/ModelManifest.xml 255 | **/*.Server/GeneratedArtifacts 256 | **/*.Server/ModelManifest.xml 257 | _Pvt_Extensions 258 | 259 | # Paket dependency manager 260 | .paket/paket.exe 261 | paket-files/ 262 | 263 | # FAKE - F# Make 264 | .fake/ 265 | 266 | # JetBrains Rider 267 | .idea/ 268 | *.sln.iml 269 | 270 | # CodeRush 271 | .cr/ 272 | 273 | # Python Tools for Visual Studio (PTVS) 274 | __pycache__/ 275 | *.pyc 276 | 277 | # Cake - Uncomment if you are using it 278 | # tools/** 279 | # !tools/packages.config 280 | 281 | # Telerik's JustMock configuration file 282 | *.jmconfig 283 | 284 | # BizTalk build output 285 | *.btp.cs 286 | *.btm.cs 287 | *.odx.cs 288 | *.xsd.cs 289 | \.hg/ 290 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | #ignore files 2 | glob:_ReSharper.*/ 3 | glob:TestResults/ 4 | glob:**/bin/ 5 | glob:**/obj/ 6 | glob:*.suo 7 | glob:**.vssscc 8 | glob:**.vspscc 9 | glob:**.ReSharper 10 | glob:**.user 11 | glob:**/[Ww]eb.config 12 | glob:**/[Aa]pp.config 13 | glob:**/AssemblyFileVersion.cs 14 | Bin/ 15 | glob:**/[.]idea/* 16 | glob:UpgradeLog*.XML 17 | glob:**/UpgradeReport*.* 18 | glob:**.Publish.xml 19 | glob:**.vsprops 20 | glob:**.sln.cache 21 | glob:**.vs10x 22 | glob:bin 23 | glob:BIN 24 | glob:Bin 25 | glob:**packages/* 26 | syntax: glob 27 | *.docstates 28 | *.nupkg 29 | build/* 30 | .fake/* 31 | NuGet/* 32 | *.git/* 33 | LICENSE.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010-2018 @rflechner, Inc. https://github.com/rflechner/ScrapySharp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /NuGet.Config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | ScrapySharp has a Web Client able to simulate a real Web browser (handle referrer, cookies …) 4 | 5 | Html parsing has to be as natural as possible. So I like to use CSS Selectors and Linq. 6 | 7 | This framework wraps HtmlAgilityPack. 8 | 9 | ## Basic examples of CssSelect usages 10 | 11 | ```C# 12 | 13 | using System.Linq; 14 | using HtmlAgilityPack; 15 | using ScrapySharp.Extensions; 16 | 17 | class Example 18 | { 19 | public void Main() 20 | { 21 | var divs = html.CssSelect("div"); //all div elements 22 | var nodes = html.CssSelect("div.content"); //all div elements with css class ‘content’ 23 | var nodes = html.CssSelect("div.widget.monthlist"); //all div elements with the both css class 24 | var nodes = html.CssSelect("#postPaging"); //all HTML elements with the id postPaging 25 | var nodes = html.CssSelect("div#postPaging.testClass"); // all HTML elements with the id postPaging and css class testClass 26 | 27 | var nodes = html.CssSelect("div.content > p.para"); //p elements who are direct children of div elements with css class ‘content’ 28 | 29 | var nodes = html.CssSelect("input[type=text].login"); // textbox with css class login 30 | } 31 | } 32 | ``` 33 | 34 | ## Scrapysharp can also simulate a web browser 35 | 36 | ```C# 37 | 38 | ScrapingBrowser browser = new ScrapingBrowser(); 39 | 40 | //set UseDefaultCookiesParser as false if a website returns invalid cookies format 41 | //browser.UseDefaultCookiesParser = false; 42 | 43 | WebPage homePage = browser.NavigateToPage(new Uri("http://www.bing.com/")); 44 | 45 | PageWebForm form = homePage.FindFormById("sb_form"); 46 | form["q"] = "scrapysharp"; 47 | form.Method = HttpVerb.Get; 48 | WebPage resultsPage = form.Submit(); 49 | 50 | HtmlNode[] resultsLinks = resultsPage.Html.CssSelect("div.sb_tlst h3 a").ToArray(); 51 | 52 | WebPage blogPage = resultsPage.FindLinks(By.Text("romcyber blog | Just another WordPress site")).Single().Click(); 53 | ``` 54 | 55 | ## Install Scrapysharp in your project 56 | 57 | It's easy to use Scrapysharp in your project. 58 | 59 | A Nuget package exists on [nuget.org](https://www.nuget.org/packages/ScrapySharp) and on [myget](https://www.myget.org/feed/romcyber/package/nuget/ScrapySharp) 60 | 61 | ## News 62 | 63 | Scrapysharp V3 is a reborn. 64 | 65 | Old version under GPL license is still on [bitbucket](https://bitbucket.org/rflechner/scrapysharp/src) 66 | 67 | Version 3 is a conversion to .net standard 2.0 and a relicensing. 68 | -------------------------------------------------------------------------------- /ReleaseNotes.md: -------------------------------------------------------------------------------- 1 | # 3.0.0 2 | 3 | Alpha seems to be stable so I release. 4 | 5 | # 3.0.0-alpha2 6 | 7 | - dotnet standard 2.0 8 | - fix nuget meta data 9 | 10 | # 3.0.0-alpha1 11 | 12 | - dotnet standard 2.0 13 | 14 | # 2.6.2 15 | 16 | - Introduce async methods for .Net >= 4.5 17 | - Add possibility to add special headers for using "NavigateToPage" (Pull Request #6) 18 | -------------------------------------------------------------------------------- /ScrapySharp.Core/CssSelectorExecutor.fs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Core 2 | 3 | open System 4 | open System.IO 5 | open System.Net 6 | open System.Runtime.Serialization.Formatters.Binary 7 | open System.Text 8 | open System.Linq 9 | 10 | type FilterLevel = 11 | | Root 12 | | Children 13 | | Descendants 14 | | Parents 15 | | Ancestors 16 | 17 | type CssSelectorExecutor<'n>(nodes:System.Collections.Generic.List<'n>, tokens:System.Collections.Generic.List, navigator:INavigationProvider<'n>) = 18 | let mutable navigator = navigator 19 | let mutable nodes = Array.toList(nodes.ToArray()) 20 | let mutable tokens = Array.toList(tokens.ToArray()) 21 | let mutable level = FilterLevel.Descendants 22 | let mutable matchAncestors = false 23 | 24 | member public x.MatchAncestors 25 | with get() = 26 | matchAncestors 27 | and set(value) = 28 | matchAncestors <- value 29 | level <- if matchAncestors then FilterLevel.Ancestors else FilterLevel.Root 30 | 31 | member public x.GetElements() = 32 | let elements = x.selectElements() 33 | elements |> List.toArray 34 | 35 | member private x.selectElements() = 36 | 37 | let whiteSpaces = [|' '; '\t'; '\r'; '\n'|] 38 | 39 | let getTargets (acc:List<'n>) = 40 | if level = FilterLevel.Children then 41 | navigator.ChildNodes(new System.Collections.Generic.List<'n>(acc)).ToArray() |> Array.toList 42 | elif level = FilterLevel.Descendants then 43 | navigator.Descendants(new System.Collections.Generic.List<'n>(acc)).ToArray() |> Array.toList 44 | elif level = FilterLevel.Parents then 45 | navigator.ParentNodes(new System.Collections.Generic.List<'n>(acc)).ToArray() |> Array.toList 46 | elif level = FilterLevel.Ancestors then 47 | navigator.AncestorsAndSelf(new System.Collections.Generic.List<'n>(acc)).ToArray() |> Array.toList 48 | else 49 | acc 50 | 51 | let rec selectElements' (acc:List<'n>) source = 52 | match source with 53 | | Token.TagName(o, name) :: t -> 54 | let children = acc |> getTargets |> Seq.toList 55 | let selectedNodes = children |> Seq.filter(fun x -> navigator.GetName(x).Equals(name, StringComparison.InvariantCultureIgnoreCase)) |> Seq.toList 56 | level <- FilterLevel.Root 57 | selectElements' selectedNodes t 58 | 59 | | Token.ClassPrefix(o) :: Token.CssClass(o2, className) :: t -> 60 | let selectedNodes = acc |> getTargets 61 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x "class" String.Empty).Split(whiteSpaces).Contains(className) ) 62 | |> Seq.toList 63 | level <- FilterLevel.Root 64 | selectElements' selectedNodes t 65 | 66 | | Token.IdPrefix(o) :: Token.CssId(o2, id) :: t -> 67 | let selectedNodes = acc |> getTargets 68 | |> Seq.filter (fun x -> (navigator.GetId x) = id) 69 | |> Seq.toList 70 | level <- FilterLevel.Root 71 | selectElements' selectedNodes t 72 | 73 | | Token.OpenAttribute(o) :: Token.AttributeName(o1, name) :: Token.Assign(o2) :: Token.AttributeValue(o3, value) :: Token.CloseAttribute(o4) :: t -> 74 | let selectedNodes = acc |> getTargets 75 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x name String.Empty) = value) 76 | |> Seq.toList 77 | level <- FilterLevel.Root 78 | selectElements' selectedNodes t 79 | 80 | | Token.OpenAttribute(o) :: Token.AttributeName(o1, name) :: Token.EndWith(o2) :: Token.AttributeValue(o3, value) :: Token.CloseAttribute(o4) :: t -> 81 | let selectedNodes = acc |> getTargets 82 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x name String.Empty).EndsWith(value)) 83 | |> Seq.toList 84 | level <- FilterLevel.Root 85 | selectElements' selectedNodes t 86 | 87 | | Token.OpenAttribute(o) :: Token.AttributeName(o1, name) :: Token.StartWith(o2) :: Token.AttributeValue(o3, value) :: Token.CloseAttribute(o4) :: t -> 88 | let selectedNodes = acc |> getTargets 89 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x name String.Empty).StartsWith(value)) 90 | |> Seq.toList 91 | level <- FilterLevel.Root 92 | selectElements' selectedNodes t 93 | 94 | | Token.OpenAttribute(o) :: Token.AttributeName(o1, name) :: Token.AttributeContainsPrefix(o2) :: Token.AttributeValue(o3, value) :: Token.CloseAttribute(o4) :: t -> 95 | let selectedNodes = acc |> getTargets 96 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x name String.Empty).StartsWith(value)) 97 | |> Seq.toList 98 | level <- FilterLevel.Root 99 | selectElements' selectedNodes t 100 | 101 | | Token.OpenAttribute(o) :: Token.AttributeName(o1, name) :: Token.AttributeContains(o2) :: Token.AttributeValue(o3, value) :: Token.CloseAttribute(o4) :: t -> 102 | let selectedNodes = acc |> getTargets 103 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x name String.Empty).ToLowerInvariant().Contains(value.ToLowerInvariant())) 104 | |> Seq.toList 105 | level <- FilterLevel.Root 106 | selectElements' selectedNodes t 107 | 108 | | Token.OpenAttribute(o) :: Token.AttributeName(o1, name) :: Token.AttributeContainsWord(o2) :: Token.AttributeValue(o3, value) :: Token.CloseAttribute(o4) :: t -> 109 | let selectedNodes = acc |> getTargets 110 | |> Seq.filter (fun x -> 111 | let attr = (navigator.GetAttributeValue x name String.Empty) 112 | attr.Split(whiteSpaces).Any(fun s -> s.Equals(value, StringComparison.InvariantCultureIgnoreCase)) ) 113 | |> Seq.toList 114 | level <- FilterLevel.Root 115 | selectElements' selectedNodes t 116 | 117 | | Token.OpenAttribute(o) :: Token.AttributeName(o1, name) :: Token.AttributeNotEqual(o2) :: Token.AttributeValue(o3, value) :: Token.CloseAttribute(o4) :: t -> 118 | let selectedNodes = acc |> getTargets 119 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x name String.Empty) <> value) 120 | |> Seq.toList 121 | level <- FilterLevel.Root 122 | selectElements' selectedNodes t 123 | 124 | | Token.Checkbox(o) :: t -> 125 | let selectedNodes = acc |> getTargets 126 | |> Seq.filter (fun x -> (navigator.GetAttributeValue x "type" String.Empty) = "checkbox") 127 | |> Seq.toList 128 | level <- FilterLevel.Root 129 | selectElements' selectedNodes t 130 | 131 | | Token.Checked(o) :: t -> 132 | let selectedNodes = acc |> getTargets 133 | |> Seq.filter (fun x -> (navigator.Attributes x).AllKeys.Contains("checked")) 134 | |> Seq.toList 135 | level <- FilterLevel.Root 136 | selectElements' selectedNodes t 137 | 138 | | Token.Selected(o) :: t -> 139 | let selectedNodes = acc |> getTargets 140 | |> Seq.filter (fun x -> (navigator.Attributes x).AllKeys.Contains("selected")) 141 | |> Seq.toList 142 | level <- FilterLevel.Root 143 | selectElements' selectedNodes t 144 | 145 | | Token.Disabled(o) :: t -> 146 | let selectedNodes = acc |> getTargets 147 | |> Seq.filter (fun x -> (navigator.Attributes x).AllKeys.Contains("disabled")) 148 | |> Seq.toList 149 | level <- FilterLevel.Root 150 | selectElements' selectedNodes t 151 | 152 | | Token.Enabled(o) :: t -> 153 | let selectedNodes = acc |> getTargets 154 | |> Seq.filter (fun x -> (navigator.Attributes x).AllKeys.Contains("disabled") = false) 155 | |> Seq.toList 156 | level <- FilterLevel.Root 157 | selectElements' selectedNodes t 158 | 159 | | Token.AllChildren(o) :: t -> 160 | level <- if matchAncestors then FilterLevel.Ancestors else FilterLevel.Descendants 161 | selectElements' acc t 162 | 163 | | Token.DirectChildren(o) :: t -> 164 | level <- if matchAncestors then FilterLevel.Parents else FilterLevel.Children 165 | selectElements' acc t 166 | 167 | | Token.Ancestor(o) :: t -> 168 | level <- FilterLevel.Ancestors 169 | selectElements' acc t 170 | 171 | | [] -> acc 172 | | _ :: t -> failwith "Invalid token" 173 | 174 | selectElements' nodes tokens 175 | 176 | 177 | -------------------------------------------------------------------------------- /ScrapySharp.Core/CssSelectorTokenizer.fs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Core 2 | 3 | 4 | open System 5 | open System.IO 6 | open System.Net 7 | open System.Runtime.Serialization.Formatters.Binary 8 | open System.Text 9 | 10 | type CssSelectorTokenizer() = 11 | let mutable charCount:int = 0 12 | let mutable source = List.Empty 13 | let mutable cssSelector = "" 14 | let mutable inQuotes:bool = false 15 | 16 | let getOffset (t:List) = 17 | charCount - 1 - t.Length 18 | 19 | member public x.Tokenize(pCssSelector:string) = 20 | cssSelector <- pCssSelector 21 | source <- Array.toList(cssSelector.ToCharArray()) 22 | charCount <- source.Length 23 | x.tokenize() |> List.toArray 24 | 25 | member private x.tokenize() = 26 | let rec readString acc = function 27 | | c :: t when Char.IsLetterOrDigit(c) || c.Equals('-') || c.Equals('_') 28 | || c.Equals('+') || c.Equals('/') 29 | -> readString (acc + (c.ToString())) t 30 | | '\'' :: t -> 31 | if inQuotes then 32 | inQuotes <- false 33 | acc, t 34 | else 35 | inQuotes <- true 36 | readString acc t 37 | 38 | | '\\' :: '\'' :: t when inQuotes -> 39 | readString (acc + ('\''.ToString())) t 40 | 41 | | c :: t when inQuotes -> 42 | readString (acc + (c.ToString())) t 43 | | c :: t -> acc, c :: t 44 | | [] -> 45 | acc, [] 46 | | _ -> 47 | failwith "Invalid css selector syntax" 48 | 49 | let (|TokenStr|_|) (s:string) x = 50 | let chars = Seq.toList s 51 | 52 | let rec equal x s = 53 | match x, s with 54 | | x, [] -> Some(x) 55 | | xh :: xt, sh :: st when xh = sh -> equal xt st 56 | | _ -> None 57 | 58 | equal x chars 59 | 60 | let rec tokenize' acc sourceChars = 61 | match sourceChars with 62 | | w :: t when Char.IsWhiteSpace(w) -> 63 | let seqtoken = (acc |> List.toSeq |> Seq.skip(1) |> Seq.toList) 64 | match acc.Head with 65 | | Token.Ancestor(o) -> tokenize' (Token.Ancestor(getOffset(t)) :: seqtoken) t 66 | | Token.AllChildren(o) -> tokenize' (Token.AllChildren(getOffset(t)) :: seqtoken) t 67 | | Token.DirectChildren(o) -> tokenize' (Token.DirectChildren(getOffset(t)) :: seqtoken) t 68 | | _ -> tokenize' (Token.AllChildren(getOffset(t)) :: acc) t 69 | | '.' :: t -> 70 | let s, t' = readString "" t 71 | tokenize' (Token.CssClass(getOffset(t)+1, s) :: Token.ClassPrefix(getOffset(t)) :: acc) t' 72 | | '#' :: t -> 73 | let s, t' = readString "" t 74 | tokenize' (Token.CssId(getOffset(t)+1, s) :: Token.IdPrefix(getOffset(t)) :: acc) t' 75 | | '[' :: t -> 76 | let s, t' = readString "" t 77 | tokenize' (Token.AttributeName(getOffset(t)+1, s) :: Token.OpenAttribute(getOffset(t)) :: acc) t' 78 | | ']' :: t -> 79 | tokenize' (Token.CloseAttribute(getOffset(t)) :: acc) t 80 | | '=' :: t -> 81 | let s, t' = readString "" t 82 | tokenize' (Token.AttributeValue(getOffset(t)+1, s) :: Token.Assign(getOffset(t)) :: acc) t' 83 | | '$' :: '=' :: t -> 84 | let s, t' = readString "" t 85 | tokenize' (Token.AttributeValue(getOffset(t)+1, s) :: Token.EndWith(getOffset(t)) :: acc) t' 86 | | '^' :: '=' :: t -> 87 | let s, t' = readString "" t 88 | tokenize' (Token.AttributeValue(getOffset(t)+1, s) :: Token.StartWith(getOffset(t)) :: acc) t' 89 | | '|' :: '=' :: t -> 90 | let s, t' = readString "" t 91 | tokenize' (Token.AttributeValue(getOffset(t)+1, s) :: Token.AttributeContainsPrefix(getOffset(t)) :: acc) t' 92 | | '*' :: '=' :: t -> 93 | let s, t' = readString "" t 94 | tokenize' (Token.AttributeValue(getOffset(t)+1, s) :: Token.AttributeContains(getOffset(t)) :: acc) t' 95 | 96 | | '~' :: '=' :: t -> 97 | let s, t' = readString "" t 98 | tokenize' (Token.AttributeValue(getOffset(t)+1, s) :: Token.AttributeContainsWord(getOffset(t)) :: acc) t' 99 | 100 | | '!' :: '=' :: t -> 101 | let s, t' = readString "" t 102 | tokenize' (Token.AttributeValue(getOffset(t)+1, s) :: Token.AttributeNotEqual(getOffset(t)) :: acc) t' 103 | 104 | | TokenStr ":checkbox" t -> 105 | let s, t' = readString "" t 106 | tokenize' (Token.Checkbox(getOffset(t)+1) :: acc) t' 107 | 108 | | TokenStr ":selected" t -> 109 | let s, t' = readString "" t 110 | tokenize' (Token.Selected(getOffset(t)+1) :: acc) t' 111 | 112 | | TokenStr ":checked" t -> 113 | let s, t' = readString "" t 114 | tokenize' (Token.Checked(getOffset(t)+1) :: acc) t' 115 | 116 | | TokenStr ":disabled" t -> 117 | let s, t' = readString "" t 118 | tokenize' (Token.Disabled(getOffset(t)+1) :: acc) t' 119 | 120 | | TokenStr ":enabled" t -> 121 | let s, t' = readString "" t 122 | tokenize' (Token.Enabled(getOffset(t)+1) :: acc) t' 123 | 124 | | '>' :: t -> 125 | let seqtoken = (acc |> List.toSeq |> Seq.skip(1) |> Seq.toList) 126 | match acc.Head with 127 | | Token.AllChildren(o) -> tokenize' (Token.DirectChildren(getOffset(t)) :: seqtoken) t 128 | | _ -> tokenize' (Token.DirectChildren(getOffset(t)) :: acc) t 129 | | '<' :: t -> 130 | let seqtoken = (acc |> List.toSeq |> Seq.skip(1) |> Seq.toList) 131 | match acc.Head with 132 | | Token.AllChildren(o) -> tokenize' (Token.Ancestor(getOffset(t)) :: seqtoken) t 133 | | _ -> tokenize' (Token.Ancestor(getOffset(t)) :: acc) t 134 | | c :: t when Char.IsLetterOrDigit(c) -> 135 | let str = c.ToString() 136 | let s, t' = readString str t 137 | tokenize' (Token.TagName(getOffset(t), s) :: acc) t' 138 | | [] -> List.rev acc // A la fin, on inverse la liste, car la call stack nous sort les tokens à l'envers 139 | | c :: t when Char.IsLetterOrDigit(c) <> true -> 140 | let offset = getOffset t 141 | failwith (sprintf "Invalid css selector syntax (char '%c' at offset %d)" c offset) 142 | | _ -> 143 | failwith "Invalid css selector syntax" 144 | tokenize' [] source 145 | -------------------------------------------------------------------------------- /ScrapySharp.Core/Models.fs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Core 2 | 3 | open System 4 | open System.IO 5 | open System.Net 6 | open System.Runtime.Serialization.Formatters.Binary 7 | open System.Text 8 | 9 | type Token = 10 | | ClassPrefix of int 11 | | IdPrefix of int 12 | | TagName of int * string 13 | | CssClass of int * string 14 | | CssId of int * string 15 | | AllChildren of int 16 | | OpenAttribute of int 17 | | CloseAttribute of int 18 | | AttributeName of int * string 19 | | AttributeValue of int * string 20 | | Assign of int 21 | | EndWith of int 22 | | StartWith of int 23 | | DirectChildren of int 24 | | Ancestor of int 25 | | AttributeContainsPrefix of int 26 | | AttributeContains of int 27 | | AttributeContainsWord of int 28 | | AttributeNotEqual of int 29 | | Checkbox of int 30 | | Checked of int 31 | | Disabled of int 32 | | Enabled of int 33 | | Selected of int 34 | 35 | 36 | type TokenContainer(token:Token, offset:int) = 37 | member t.Offset = offset 38 | member t.Token = token 39 | 40 | type CharContainer(c:char, offset:int) = 41 | member t.Offset = offset 42 | member t.Char = c -------------------------------------------------------------------------------- /ScrapySharp.Core/NavigationProvider.fs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Core 2 | 3 | open System 4 | open System.IO 5 | open System.Net 6 | open System.Runtime.Serialization.Formatters.Binary 7 | open System.Text 8 | 9 | type INavigationProvider<'t> = 10 | abstract member ChildNodes : System.Collections.Generic.List<'t> -> System.Collections.Generic.List<'t> 11 | abstract member Descendants : System.Collections.Generic.List<'t> -> System.Collections.Generic.List<'t> 12 | abstract member ParentNodes : System.Collections.Generic.List<'t> -> System.Collections.Generic.List<'t> 13 | abstract member AncestorsAndSelf : System.Collections.Generic.List<'t> -> System.Collections.Generic.List<'t> 14 | abstract member GetName : 't -> string 15 | abstract member GetAttributeValue : 't -> string -> string -> string 16 | abstract member GetId : 't -> string 17 | abstract member Attributes : 't -> System.Collections.Specialized.NameValueCollection 18 | 19 | 20 | type AgilityNavigationProvider() = 21 | interface INavigationProvider with 22 | member this.ChildNodes(nodes) = 23 | let resutls = nodes |> Seq.map (fun x -> x.ChildNodes) |> Seq.collect (fun x -> x) 24 | new System.Collections.Generic.List<'t>(resutls) 25 | member this.Descendants(nodes) = 26 | let resutls = nodes |> Seq.map (fun x -> x.Descendants()) |> Seq.collect (fun x -> x) 27 | new System.Collections.Generic.List<'t>(resutls) 28 | member this.ParentNodes(nodes) = 29 | let results = nodes |> Seq.map (fun x -> x.ParentNode) 30 | new System.Collections.Generic.List<'t>(results) 31 | member this.AncestorsAndSelf(nodes) = 32 | let results = nodes |> Seq.map (fun x -> x.AncestorsAndSelf()) |> Seq.collect (fun x -> x) 33 | new System.Collections.Generic.List<'t>(results) 34 | member this.GetName(node) = 35 | node.Name 36 | member this.GetAttributeValue node name defaultValue = 37 | node.GetAttributeValue(name, defaultValue) 38 | member this.GetId(node) = 39 | node.Id 40 | member this.Attributes(node) = 41 | let attrs = new System.Collections.Specialized.NameValueCollection() 42 | for attr in node.Attributes do 43 | attrs.Add(attr.Name, attr.Value) 44 | attrs 45 | -------------------------------------------------------------------------------- /ScrapySharp.Core/ScrapySharp.Core.fsproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | netstandard2.0 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/.gitattributes: -------------------------------------------------------------------------------- 1 | * text eol=crlf 2 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/Form1.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 |
4 | 5 | 6 |

7 | 8 |

9 |

10 | 11 |

12 |

13 | Likes: 14 | 15 | 16 | 17 | 18 |

19 | 20 |  
21 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/GeneratedHtml1.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 |
4 | Login: 5 |
6 | 7 | 8 | 9 | 10 |
case1
11 | 12 | 13 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/InvalidPage1.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 |
6 | Welcome 7 | toto 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 24 | 25 | 26 |
column 1column 2
value 1value 2
value 3 23 | value 4
27 | 28 |
29 | 30 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/InvalidPage2.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 |
6 | Welcome 7 | toto 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 24 | 25 | 26 |
column 1column 2
value 1value 2
value 3 23 | value 4
27 | 28 |
29 | 30 | 31 | 34 | 35 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/InvalidPage3.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | Welcome 26 | toto 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 43 | 44 | 45 |
column 1column 2
value 1value 2
value 3 42 | value 4
46 | 47 |
48 | 49 | 50 | 53 | 54 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/Page1.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | Romcyber | .Net coding 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |
17 | 18 | 19 | 20 |
21 | 22 | 37 | 38 | 39 | 40 | 43 | Hi toto! 44 |
45 | Hi titi! 46 |
47 | 48 | 50 |
51 |
52 | 69 | 70 | 71 | 106 | 107 |
108 |
109 |
110 |
111 | 112 | 113 | 114 |
115 | 116 | tototata 117 | 118 | 119 |
120 |
121 | 122 | 126 | 127 |
128 | 129 |
130 | 131 | 132 | 133 | 134 |
135 | 136 | 137 | 138 | 139 | 143 |
144 |
145 |
146 |
147 | 148 | 165 |
166 | 167 |
168 | 169 | 170 | 171 | 172 | 177 |
178 | 179 | 180 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/ValidPage1.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 |
5 | Welcome 6 | toto 7 |
8 | 9 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/ValidPage2.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 | Valid page 2 4 | 20 | 21 | 22 | 23 |
24 | Welcome 25 | toto 26 |
27 | 28 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/Html/WebFormPage.htm: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | Romcyber | .Net coding 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |
17 | 18 | 19 | 20 |
21 | 22 | 37 | 38 | 39 | 40 | 43 | Hi toto! 44 |
45 | Hi titi! 46 |
47 | 48 | 50 |
51 |
52 | 69 | 70 | 71 | 106 | 107 |
108 |
109 |
110 |
111 | 112 | 113 | 114 |
115 | 116 | tototata 117 | 118 | 119 |
120 |
121 | 122 | 123 | 124 |

125 | 126 |

127 |

128 | 129 |

130 |

131 | Likes: 132 | 133 | 134 | 135 | 136 |

137 | 138 |

139 | Genre: 140 | 141 | 142 | 143 |

144 | 145 | 146 | 147 |
148 | 149 | 153 | 154 |
155 | 156 |
157 | 158 | 159 | 160 | 161 |
162 | 163 | 164 | 165 | 166 | 170 |
171 |
172 |
173 |
174 | 175 | 192 |
193 | 194 |
195 | 196 | 197 | 198 | 199 | 204 | 205 | 206 | 207 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/ScrapySharp.Tests.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | netcoreapp2.0 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | Always 19 | 20 | 21 | Always 22 | 23 | 24 | Always 25 | 26 | 27 | Always 28 | 29 | 30 | Always 31 | 32 | 33 | Always 34 | 35 | 36 | Always 37 | 38 | 39 | Always 40 | 41 | 42 | Always 43 | 44 | 45 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_build_HtmlDom.cs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rflechner/ScrapySharp/796a486333a9ddcc04e3970610831e63b7d41d55/ScrapySharp.Tests/When_build_HtmlDom.cs -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_generate_HTML_with_HDocument.cs: -------------------------------------------------------------------------------- 1 | using System.IO; 2 | using NUnit.Framework; 3 | using ScrapySharp.Html.Dom; 4 | 5 | namespace ScrapySharp.Tests 6 | { 7 | [TestFixture] 8 | public class When_generate_HTML_with_HDocument 9 | { 10 | [Test] 11 | public void When_generate_HTML_1() 12 | { 13 | var doc = new HDocument( 14 | new HElement("html", 15 | new HElement("body", 16 | new HElement("div", 17 | new HAttribute("id", "login-box"), 18 | new HElement("span", "Login:")), 19 | new HElement("table", 20 | new HElement("tr", 21 | new HElement("td", "case1")))) 22 | )); 23 | 24 | var html = doc.GetOuterHtml(HtmlGenerationStyle.Indent); 25 | 26 | var html2 = File.ReadAllText("Html/GeneratedHtml1.htm"); 27 | 28 | Assert.AreEqual(html2, html); 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_parse_real_html_pages.cs: -------------------------------------------------------------------------------- 1 | // ReSharper disable InconsistentNaming 2 | 3 | using System.IO; 4 | using NUnit.Framework; 5 | using ScrapySharp.Html.Dom; 6 | using System.Linq; 7 | using ScrapySharp.Extensions; 8 | 9 | namespace ScrapySharp.Tests 10 | { 11 | [TestFixture] 12 | public class When_parse_real_html_pages 13 | { 14 | [Test] 15 | public void When_parsing_InvalidPage1() 16 | { 17 | var source = File.ReadAllText("Html/Page1.htm"); 18 | var document = HDocument.Parse(source); 19 | 20 | } 21 | 22 | [Test] 23 | public void When_parsing_ValidPage2() 24 | { 25 | var source = File.ReadAllText("Html/ValidPage2.htm"); 26 | var document = HDocument.Parse(source); 27 | 28 | var js = document.Descendants("script").Single().InnerText; 29 | } 30 | 31 | [Test] 32 | public void When_parsing_InvalidPage2() 33 | { 34 | var source = File.ReadAllText("Html/InvalidPage2.htm"); 35 | var document = HDocument.Parse(source); 36 | 37 | Assert.AreEqual(1, document.CssSelect("div.login").Count()); 38 | 39 | Assert.AreEqual(3, document.CssSelect("div").Count()); 40 | 41 | Assert.AreEqual(1, document.CssSelect("div#footer").Count()); 42 | 43 | var outerHtml = document.GetOuterHtml(); 44 | } 45 | 46 | [Test] 47 | public void When_parsing_Comments() 48 | { 49 | var source = File.ReadAllText("Html/InvalidPage3.htm"); 50 | var document = HDocument.Parse(source); 51 | 52 | var body = document.CssSelect("body").Single(); 53 | var children = body.Children.ToArray(); 54 | 55 | Assert.AreEqual(1, children.OfType().Count()); 56 | 57 | var comment = children.OfType().Single(); 58 | 59 | var text = comment.GetOuterHtml(); 60 | Assert.AreEqual("", text); 61 | } 62 | } 63 | } 64 | 65 | // ReSharper restore InconsistentNaming -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_parses_using_CssSelector.cs: -------------------------------------------------------------------------------- 1 | // ReSharper disable InconsistentNaming 2 | 3 | using System.Linq; 4 | using HtmlAgilityPack; 5 | using NUnit.Framework; 6 | using ScrapySharp.Extensions; 7 | 8 | namespace ScrapySharp.Tests 9 | { 10 | [TestFixture] 11 | public class When_parses_using_CssSelector 12 | { 13 | private readonly HtmlNode html; 14 | 15 | public When_parses_using_CssSelector() 16 | { 17 | var htmlDocument = new HtmlDocument(); 18 | htmlDocument.Load(@"Html/Page1.htm"); 19 | html = htmlDocument.DocumentNode; 20 | } 21 | 22 | [Test] 23 | public void When_css_class_contains_no_alpha_numerics() 24 | { 25 | var spans = html.CssSelect("span.login-box").ToArray(); 26 | 27 | Assert.AreEqual(1, spans.Length); 28 | } 29 | 30 | [Test] 31 | public void When_id_contains_no_alpha_numerics() 32 | { 33 | var spans = html.CssSelect("span#pass-box").ToArray(); 34 | 35 | Assert.AreEqual(1, spans.Length); 36 | } 37 | 38 | [Test] 39 | public void When_uses_simple_tagName() 40 | { 41 | var divs = html.CssSelect("div").ToArray(); 42 | 43 | Assert.AreEqual(29, divs.Length); 44 | } 45 | 46 | [Test] 47 | public void When_uses_tagName_with_css_class() 48 | { 49 | Assert.AreEqual(3, html.CssSelect("div.content").Count()); 50 | 51 | Assert.AreEqual(1, html.CssSelect("div.widget.monthlist").Count()); 52 | } 53 | 54 | [Test] 55 | public void When_uses_tagName_with_css_class_using_inheritance() 56 | { 57 | Assert.AreEqual(1, html.CssSelect("div.left-corner div.node").Count()); 58 | 59 | var nodes = html.CssSelect("span#testSpan span").ToArray(); 60 | 61 | Assert.AreEqual(2, nodes.Length); 62 | 63 | Assert.AreEqual("tototata", nodes[0].InnerText); 64 | Assert.AreEqual("tata", nodes[1].InnerText); 65 | 66 | } 67 | 68 | [Test] 69 | public void When_uses_id() 70 | { 71 | Assert.AreEqual(1, html.CssSelect("#postPaging").Count()); 72 | 73 | Assert.AreEqual(1, html.CssSelect("div#postPaging").Count()); 74 | 75 | Assert.AreEqual(1, html.CssSelect("div#postPaging.testClass").Count()); 76 | } 77 | 78 | [Test] 79 | public void When_uses_tagName_with_css_class_using_direct_inheritance() 80 | { 81 | Assert.AreEqual(1, html.CssSelect("div.content > p.para").Count()); 82 | } 83 | 84 | [Test] 85 | public void When_uses_tagName_with_id_class_using_direct_inheritance() 86 | { 87 | Assert.AreEqual(1, html.CssSelect("ul#pagelist > li#listItem1").Count()); 88 | } 89 | 90 | [Test] 91 | public void When_uses_ancestor() 92 | { 93 | var ancestors = html.CssSelect("p.para").CssSelectAncestors("div div.menu").ToArray(); 94 | Assert.AreEqual(1, ancestors.Count()); 95 | } 96 | 97 | [Test] 98 | public void When_uses_direct_ancestor() 99 | { 100 | var ancestors1 = html.CssSelect("p.para").CssSelectAncestors("div.content > div.menu").ToArray(); 101 | Assert.AreEqual(0, ancestors1.Count()); 102 | 103 | var ancestors2 = html.CssSelect("p.para").CssSelectAncestors("div.content > div.widget").ToArray(); 104 | Assert.AreEqual(1, ancestors2.Count()); 105 | } 106 | 107 | [Test] 108 | public void When_uses_attribute_selector() 109 | { 110 | Assert.AreEqual(1, html.CssSelect("input[type=button]").Count()); 111 | 112 | Assert.AreEqual(2, html.CssSelect("input[type=text]").Count()); 113 | 114 | Assert.AreEqual(10, html.CssSelect("script[type=text/javascript]").Count()); 115 | 116 | Assert.AreEqual(2, html.CssSelect("link[type=application/rdf+xml]").Count()); 117 | } 118 | 119 | [Test] 120 | public void When_uses_attribute_selector_with_css_class() 121 | { 122 | Assert.AreEqual(1, html.CssSelect("input[type=text].login").Count()); 123 | } 124 | 125 | [Test] 126 | public void When_using_starts_with_attribute_selector() 127 | { 128 | var doc = new HtmlDocument(); 129 | doc.LoadHtml(@"



"); 130 | var node = doc.DocumentNode; 131 | 132 | var result = node.CssSelect("hr[id^=bla]").ToArray(); 133 | 134 | Assert.AreEqual(1, result.Length); 135 | } 136 | 137 | [Test] 138 | public void When_using_ends_with_attribute_selector() 139 | { 140 | var doc = new HtmlDocument(); 141 | doc.LoadHtml(@"



"); 142 | var node = doc.DocumentNode; 143 | 144 | var result = node.CssSelect("hr[id$=ing]").ToArray(); 145 | 146 | Assert.AreEqual(2, result.Length); 147 | } 148 | 149 | [Test] 150 | public void When_using_multiple_selector() 151 | { 152 | var doc = new HtmlDocument(); 153 | doc.LoadHtml(@"


"); 154 | var node = doc.DocumentNode; 155 | 156 | var result = node.CssSelect(new string[] { "#beep", "#boop" }).ToArray(); 157 | 158 | Assert.AreEqual(2, result.Length); 159 | } 160 | 161 | [Test] 162 | public void When_chain_methods() 163 | { 164 | var doc = new HtmlDocument(); 165 | doc.LoadHtml(@"" 166 | + "" 167 | + "" 168 | +"
Case 1Case 2Case 3
Case 4Case 5Case 6
"); 169 | var node = doc.DocumentNode; 170 | 171 | var trs1 = node.CssSelect("tr").ToArray(); 172 | Assert.AreEqual(2, trs1.Length); 173 | 174 | var tds1 = node.CssSelect("td").ToArray(); 175 | Assert.AreEqual(6, tds1.Length); 176 | 177 | var tds2 = trs1[1].CssSelect("td").ToArray(); 178 | Assert.AreEqual(3, tds2.Length); 179 | } 180 | 181 | [Test] 182 | public void When_select_ancestors() 183 | { 184 | var doc = new HtmlDocument(); 185 | doc.LoadHtml(@"" 186 | + "" 187 | + "" 188 | + "
Case 1Case 3
Case 4Case 5Case 6
"); 189 | var html = doc.DocumentNode; 190 | 191 | var labels = html.CssSelect("label[for=c1]").ToArray(); 192 | var trs = labels.CssSelectAncestors("tr").ToArray(); 193 | 194 | Assert.AreEqual(1, trs.Length); 195 | } 196 | } 197 | } 198 | 199 | // ReSharper restore InconsistentNaming 200 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_parses_using_CssSelector_with_HDocument.cs: -------------------------------------------------------------------------------- 1 | using System.IO; 2 | using System.Linq; 3 | using NUnit.Framework; 4 | using ScrapySharp.Extensions; 5 | using ScrapySharp.Html.Dom; 6 | 7 | namespace ScrapySharp.Tests 8 | { 9 | [TestFixture] 10 | public class When_parses_using_CssSelector_with_HDocument 11 | { 12 | public HDocument GetHtmlage1() 13 | { 14 | var source = File.ReadAllText("Html/Page1.htm"); 15 | return HDocument.Parse(source); 16 | } 17 | 18 | [Test] 19 | public void When_css_class_contains_no_alpha_numerics() 20 | { 21 | var spans = GetHtmlage1().CssSelect("span.login-box").ToArray(); 22 | 23 | Assert.AreEqual(1, spans.Length); 24 | } 25 | 26 | [Test] 27 | public void When_id_contains_no_alpha_numerics() 28 | { 29 | var spans = GetHtmlage1().CssSelect("span#pass-box").ToArray(); 30 | 31 | Assert.AreEqual(1, spans.Length); 32 | } 33 | 34 | [Test] 35 | public void When_uses_simple_tagName() 36 | { 37 | var divs = GetHtmlage1().CssSelect("div").ToArray(); 38 | 39 | Assert.AreEqual(29, divs.Length); 40 | } 41 | 42 | [Test] 43 | public void When_uses_tagName_with_css_class() 44 | { 45 | var html = GetHtmlage1(); 46 | 47 | Assert.AreEqual(3, html.CssSelect("div.content").Count()); 48 | 49 | Assert.AreEqual(1, html.CssSelect("div.widget.monthlist").Count()); 50 | } 51 | 52 | [Test] 53 | public void When_uses_tagName_with_css_class_using_inheritance() 54 | { 55 | var html = GetHtmlage1(); 56 | Assert.AreEqual(1, html.CssSelect("div.left-corner div.node").Count()); 57 | 58 | var nodes = html.CssSelect("span#testSpan span").ToArray(); 59 | 60 | Assert.AreEqual(2, nodes.Length); 61 | 62 | Assert.AreEqual("tototata", nodes[0].InnerText); 63 | Assert.AreEqual("tata", nodes[1].InnerText); 64 | 65 | } 66 | 67 | [Test] 68 | public void When_uses_id() 69 | { 70 | var html = GetHtmlage1(); 71 | Assert.AreEqual(1, html.CssSelect("#postPaging").Count()); 72 | 73 | Assert.AreEqual(1, html.CssSelect("div#postPaging").Count()); 74 | 75 | Assert.AreEqual(1, html.CssSelect("div#postPaging.testClass").Count()); 76 | } 77 | 78 | [Test] 79 | public void When_uses_tagName_with_css_class_using_direct_inheritance() 80 | { 81 | var html = GetHtmlage1(); 82 | Assert.AreEqual(1, html.CssSelect("div.content > p.para").Count()); 83 | } 84 | 85 | [Test] 86 | public void When_uses_tagName_with_id_class_using_direct_inheritance() 87 | { 88 | var html = GetHtmlage1(); 89 | Assert.AreEqual(1, html.CssSelect("ul#pagelist > li#listItem1").Count()); 90 | } 91 | 92 | [Test] 93 | public void When_uses_ancestor() 94 | { 95 | var html = GetHtmlage1(); 96 | var ancestors = html.CssSelect("p.para").CssSelectAncestors("div div.menu").ToArray(); 97 | Assert.AreEqual(1, ancestors.Count()); 98 | } 99 | 100 | [Test] 101 | public void When_uses_direct_ancestor() 102 | { 103 | var html = GetHtmlage1(); 104 | var ancestors1 = html.CssSelect("p.para").CssSelectAncestors("div.content > div.menu").ToArray(); 105 | Assert.AreEqual(0, ancestors1.Count()); 106 | 107 | var ancestors2 = html.CssSelect("p.para").CssSelectAncestors("div.content > div.widget").ToArray(); 108 | Assert.AreEqual(1, ancestors2.Count()); 109 | } 110 | 111 | [Test] 112 | public void When_uses_attribute_selector() 113 | { 114 | var html = GetHtmlage1(); 115 | Assert.AreEqual(1, html.CssSelect("input[type=button]").Count()); 116 | 117 | Assert.AreEqual(2, html.CssSelect("input[type=text]").Count()); 118 | 119 | Assert.AreEqual(10, html.CssSelect("script[type=text/javascript]").Count()); 120 | 121 | Assert.AreEqual(2, html.CssSelect("link[type=application/rdf+xml]").Count()); 122 | } 123 | 124 | [Test] 125 | public void When_uses_attribute_selector_with_css_class() 126 | { 127 | var html = GetHtmlage1(); 128 | Assert.AreEqual(1, html.CssSelect("input[type=text].login").Count()); 129 | } 130 | 131 | [Test] 132 | public void When_using_starts_with_attribute_selector() 133 | { 134 | var source = "



"; 135 | var doc = HDocument.Parse(source); 136 | 137 | var result = doc.CssSelect("hr[id^=bla]").ToArray(); 138 | 139 | Assert.AreEqual(1, result.Length); 140 | 141 | Assert.AreEqual(1, doc.CssSelect("hr[id|=bla]").Count()); 142 | } 143 | 144 | [Test] 145 | public void When_using_ends_with_attribute_selector() 146 | { 147 | var source = "



"; 148 | var doc = HDocument.Parse(source); 149 | 150 | var result = doc.CssSelect("hr[id$=ing]").ToArray(); 151 | 152 | Assert.AreEqual(2, result.Length); 153 | } 154 | 155 | [Test] 156 | public void When_using_selector_attribute_equals_with_spaces() 157 | { 158 | var source = 159 | "



"; 160 | var doc = HDocument.Parse(source); 161 | 162 | Assert.AreEqual(1, doc.CssSelect("hr.toto.tata").Count()); 163 | Assert.AreEqual(1, doc.CssSelect("hr[class='toto tata']").Count()); 164 | } 165 | 166 | [Test] 167 | public void When_using_attributeContains_selector() 168 | { 169 | var source = "



"; 170 | var doc = HDocument.Parse(source); 171 | 172 | Assert.AreEqual(2, doc.CssSelect("hr[id*=thi]").Count()); 173 | } 174 | 175 | [Test] 176 | public void When_using_attributeContainsWord_selector() 177 | { 178 | // http://api.jquery.com/attribute-contains-word-selector/ 179 | 180 | var source = @" 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | "; 189 | var doc = HDocument.Parse(source); 190 | 191 | Assert.AreEqual(1, doc.CssSelect("input[name~='man']").Count()); 192 | } 193 | 194 | [Test] 195 | public void When_using_attributeNotEqual_selector() 196 | { 197 | // http://api.jquery.com/attribute-not-equal-selector/ 198 | 199 | var source = @" 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | "; 208 | var doc = HDocument.Parse(source); 209 | 210 | Assert.AreEqual(3, doc.CssSelect("input[name!=man-news]").Count()); 211 | } 212 | 213 | [Test] 214 | public void When_using_checkbox_selector() 215 | { 216 | // http://api.jquery.com/checkbox-selector/ 217 | 218 | var source = @" 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | "; 231 | var doc = HDocument.Parse(source); 232 | 233 | Assert.AreEqual(1, doc.CssSelect("input:checkbox").Count()); 234 | Assert.AreEqual(1, doc.CssSelect(":checkbox").Count()); 235 | } 236 | 237 | [Test] 238 | public void When_using_checked_selector() 239 | { 240 | // http://api.jquery.com/checkbox-selector/ 241 | 242 | var source = @" 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | "; 257 | var doc = HDocument.Parse(source); 258 | 259 | Assert.AreEqual(2, doc.CssSelect("input:checked").Count()); 260 | Assert.AreEqual(2, doc.CssSelect(":checked").Count()); 261 | } 262 | 263 | [Test] 264 | public void When_using_disabled_and_enabled_selector() 265 | { 266 | // http://api.jquery.com/disabled-selector/ 267 | 268 | var source = @" 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | "; 283 | var doc = HDocument.Parse(source); 284 | 285 | Assert.AreEqual(2, doc.CssSelect("input:disabled").Count()); 286 | Assert.AreEqual(2, doc.CssSelect(":disabled").Count()); 287 | 288 | Assert.AreEqual(7, doc.CssSelect("input:enabled").Count()); 289 | } 290 | 291 | [Test] 292 | public void When_using_selected_selector() 293 | { 294 | var source = @" 295 | 296 | 301 | 302 | "; 303 | var doc = HDocument.Parse(source); 304 | 305 | Assert.AreEqual(1, doc.CssSelect(":selected").Count()); 306 | Assert.AreEqual(1, doc.CssSelect("select option:selected").Count()); 307 | } 308 | } 309 | } -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_parses_using_CssSelector_with_fsharp_tokenizer.cs: -------------------------------------------------------------------------------- 1 | // ReSharper disable InconsistentNaming 2 | 3 | using System.Linq; 4 | using HtmlAgilityPack; 5 | using NUnit.Framework; 6 | using ScrapySharp.Core; 7 | using ScrapySharp.Extensions; 8 | using ScrapySharp.Core; 9 | 10 | namespace ScrapySharp.Tests 11 | { 12 | [TestFixture] 13 | public class When_parses_using_CssSelector_with_fsharp_tokenizer 14 | { 15 | private readonly HtmlNode html; 16 | 17 | public When_parses_using_CssSelector_with_fsharp_tokenizer() 18 | { 19 | var htmlDocument = new HtmlDocument(); 20 | htmlDocument.Load(@"Html/Page1.htm"); 21 | html = htmlDocument.DocumentNode; 22 | } 23 | 24 | [Test] 25 | public void When_css_class_contains_no_alpha_numerics() 26 | { 27 | var tokenizer = new CssSelectorTokenizer(); 28 | var tokens = tokenizer.Tokenize("span.loginbox"); 29 | 30 | Assert.AreEqual(3, tokens.Length); 31 | 32 | tokens = tokenizer.Tokenize("span.login-box"); 33 | 34 | Assert.AreEqual(3, tokens.Length); 35 | } 36 | 37 | [Test] 38 | public void When_execute_css_selector1() 39 | { 40 | var tokenizer = new CssSelectorTokenizer(); 41 | var tokens = tokenizer.Tokenize("span.login-box"); 42 | Assert.AreEqual(3, tokens.Length); 43 | 44 | var executor = new CssSelectorExecutor(html.ChildNodes.ToList(), tokens.ToList(), new AgilityNavigationProvider()); 45 | HtmlNode[] htmlNodes = executor.GetElements(); 46 | 47 | Assert.AreEqual(1, htmlNodes.Length); 48 | } 49 | 50 | [Test] 51 | public void When_execute_css_selector2() 52 | { 53 | var tokenizer = new CssSelectorTokenizer(); 54 | var tokens = tokenizer.Tokenize("div.widget.monthlist"); 55 | Assert.AreEqual(5, tokens.Length); 56 | 57 | var executor = new CssSelectorExecutor(html.ChildNodes.ToList(), tokens.ToList(), new AgilityNavigationProvider()); 58 | HtmlNode[] htmlNodes = executor.GetElements(); 59 | 60 | Assert.AreEqual(1, htmlNodes.Length); 61 | } 62 | 63 | [Test] 64 | public void When_id_contains_no_alpha_numerics() 65 | { 66 | var spans = html.CssSelect("span#pass-box").ToArray(); 67 | 68 | Assert.AreEqual(1, spans.Length); 69 | } 70 | 71 | [Test] 72 | public void When_uses_simple_tagName() 73 | { 74 | var divs = html.CssSelect("div").ToArray(); 75 | 76 | Assert.AreEqual(29, divs.Length); 77 | } 78 | 79 | [Test] 80 | public void When_uses_tagName_with_css_class() 81 | { 82 | Assert.AreEqual(3, html.CssSelect("div.content").Count()); 83 | 84 | Assert.AreEqual(1, html.CssSelect("div.widget.monthlist").Count()); 85 | } 86 | 87 | 88 | [Test] 89 | public void When_uses_tagName_with_css_class_using_inheritance() 90 | { 91 | Assert.AreEqual(1, html.CssSelect("div.left-corner div.node").Count()); 92 | 93 | var nodes = html.CssSelect("span#testSpan span").ToArray(); 94 | 95 | Assert.AreEqual(2, nodes.Length); 96 | 97 | Assert.AreEqual("tototata", nodes[0].InnerText); 98 | Assert.AreEqual("tata", nodes[1].InnerText); 99 | } 100 | 101 | [Test] 102 | public void When_uses_id() 103 | { 104 | Assert.AreEqual(1, html.CssSelect("#postPaging").Count()); 105 | 106 | Assert.AreEqual(1, html.CssSelect("div#postPaging").Count()); 107 | 108 | Assert.AreEqual(1, html.CssSelect("div#postPaging.testClass").Count()); 109 | } 110 | 111 | [Test] 112 | public void When_uses_tagName_with_css_class_using_direct_inheritance() 113 | { 114 | var cssSelect1 = html.CssSelect("div.content > p.para").ToArray(); 115 | var CssSelect = html.CssSelect("div.content p.para").ToArray(); 116 | 117 | Assert.AreEqual(1, cssSelect1.Count()); 118 | Assert.AreEqual(2, CssSelect.Count()); 119 | } 120 | 121 | [Test] 122 | public void When_uses_tagName_with_id_class_using_direct_inheritance() 123 | { 124 | Assert.AreEqual(1, html.CssSelect("ul#pagelist > li#listItem1").Count()); 125 | } 126 | 127 | [Test] 128 | public void When_uses_ancestor() 129 | { 130 | var cssSelect1 = html.CssSelect("p.para"); 131 | 132 | var ancestors = cssSelect1.CssSelectAncestors("div div.menu").ToArray(); 133 | 134 | Assert.AreEqual(1, ancestors.Count()); 135 | } 136 | 137 | [Test] 138 | public void When_uses_direct_ancestor() 139 | { 140 | var ancestors1 = html.CssSelect("p.para").CssSelectAncestors("div.content > div.menu").ToArray(); 141 | Assert.AreEqual(0, ancestors1.Count()); 142 | 143 | var ancestors2 = html.CssSelect("p.para").CssSelectAncestors("div.content > div.widget").ToArray(); 144 | Assert.AreEqual(1, ancestors2.Count()); 145 | } 146 | 147 | [Test] 148 | public void When_uses_attribute_selector() 149 | { 150 | Assert.AreEqual(1, html.CssSelect("input[type=button]").Count()); 151 | 152 | Assert.AreEqual(2, html.CssSelect("input[type=text]").Count()); 153 | 154 | Assert.AreEqual(10, html.CssSelect("script[type=text/javascript]").Count()); 155 | 156 | Assert.AreEqual(2, html.CssSelect("link[type=application/rdf+xml]").Count()); 157 | } 158 | 159 | [Test] 160 | public void When_uses_attribute_selector_with_css_class() 161 | { 162 | Assert.AreEqual(1, html.CssSelect("input[type=text].login").Count()); 163 | } 164 | 165 | [Test] 166 | public void When_using_starts_with_attribute_selector() 167 | { 168 | var doc = new HtmlDocument(); 169 | doc.LoadHtml(@"



"); 170 | var node = doc.DocumentNode; 171 | 172 | var result = node.CssSelect("hr[id^=bla]").ToArray(); 173 | 174 | Assert.AreEqual(1, result.Length); 175 | } 176 | 177 | [Test] 178 | public void When_using_ends_with_attribute_selector() 179 | { 180 | var doc = new HtmlDocument(); 181 | doc.LoadHtml(@"



"); 182 | var node = doc.DocumentNode; 183 | 184 | var result = node.CssSelect("hr[id$=ing]").ToArray(); 185 | 186 | Assert.AreEqual(2, result.Length); 187 | } 188 | } 189 | } 190 | 191 | // ReSharper restore InconsistentNaming -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_tokenize_CssSelector.cs: -------------------------------------------------------------------------------- 1 | // ReSharper disable InconsistentNaming 2 | 3 | using NUnit.Framework; 4 | using ScrapySharp.Core; 5 | 6 | namespace ScrapySharp.Tests 7 | { 8 | [TestFixture] 9 | public class When_tokenize_CssSelector 10 | { 11 | [Test] 12 | public void When_tokenize_selectors_used_in_v1() 13 | { 14 | var tokenizer = new CssSelectorTokenizer(); 15 | var tokens = tokenizer.Tokenize("span.loginbox"); 16 | Assert.AreEqual(3, tokens.Length); 17 | 18 | tokens = tokenizer.Tokenize("span.login-box"); 19 | Assert.AreEqual(3, tokens.Length); 20 | 21 | tokens = tokenizer.Tokenize("script[type=text/javascript]"); 22 | Assert.AreEqual(6, tokens.Length); 23 | 24 | tokens = tokenizer.Tokenize("hr[id^=bla]"); 25 | Assert.AreEqual(6, tokens.Length); 26 | 27 | tokens = tokenizer.Tokenize("hr[id$=ing]"); 28 | Assert.AreEqual(6, tokens.Length); 29 | 30 | tokens = tokenizer.Tokenize("link[type=application/rdf+xml]"); 31 | Assert.AreEqual(6, tokens.Length); 32 | } 33 | } 34 | } 35 | 36 | // ReSharper restore InconsistentNaming -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_use_HtmlDeclarationReader.cs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rflechner/ScrapySharp/796a486333a9ddcc04e3970610831e63b7d41d55/ScrapySharp.Tests/When_use_HtmlDeclarationReader.cs -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_use_browser.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Specialized; 3 | using System.Globalization; 4 | using System.Linq; 5 | using System.Net; 6 | using System.Text; 7 | using NUnit.Framework; 8 | using ScrapySharp.Network; 9 | using ScrapySharp.Extensions; 10 | 11 | namespace ScrapySharp.Tests 12 | { 13 | [TestFixture] 14 | public class When_use_browser 15 | { 16 | [Test, Ignore("legacy")] 17 | [Category("Integration")] 18 | public void When_using_raw_request_response() 19 | { 20 | var browser = new ScrapingBrowser(); 21 | //browser.Encoding = Encoding.UTF8; 22 | WebPage page = browser.NavigateToPage(new Uri("https://bitbucket.org/repo/all"), HttpVerb.Post, new NameValueCollection 23 | { 24 | {"name", "test"}, 25 | }); 26 | 27 | page.SaveSnapshot("outdir"); 28 | } 29 | 30 | [Test] 31 | public void When_combine_urls() 32 | { 33 | var baseUrl = "http://toto.dada.com/izi/"; 34 | var relative1 = "../general/images/izi/logo.gif"; 35 | var abs1 = baseUrl.CombineUrl(relative1).ToString(); 36 | 37 | Assert.AreEqual("http://toto.dada.com/general/images/izi/logo.gif", abs1); 38 | 39 | 40 | var relative2 = "/images/izi/logo.gif"; 41 | var abs2 = baseUrl.CombineUrl(relative2).ToString(); 42 | 43 | Assert.AreEqual("http://toto.dada.com/images/izi/logo.gif", abs2); 44 | 45 | } 46 | 47 | [Test] 48 | [Category("Integration")] 49 | public void When_parses_cookies() 50 | { 51 | var exp1 = @"FBXSID=""8KgAN7h4ZQsvn9OWXy1fvBlrNuRdIr4J0bkguqR5AIdL7clHgA+NQ5URtThL10od""; Max-Age=86400; HTTPOnly"; 52 | var cookieContainer = new CookieContainer(); 53 | cookieContainer.SetCookies(new Uri("http://www.popo.com"), exp1); 54 | 55 | Assert.AreEqual(1, cookieContainer.Count); 56 | } 57 | 58 | [Test, Ignore("Integration")] 59 | [Category("Integration")] 60 | public void When_forcing_anguage() 61 | { 62 | var browser1 = new ScrapingBrowser(); 63 | var html1 = browser1.DownloadString(new Uri("http://www.google.com")); 64 | 65 | var browser2 = new ScrapingBrowser {Language = CultureInfo.CreateSpecificCulture("fr-FR")}; 66 | var html2 = browser2.DownloadString(new Uri("http://www.google.com")); 67 | 68 | Assert.AreNotEqual(html1, html2); 69 | } 70 | 71 | } 72 | } -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_use_code_reader.cs: -------------------------------------------------------------------------------- 1 | // ReSharper disable InconsistentNaming 2 | 3 | using NUnit.Framework; 4 | using ScrapySharp.Html.Parsing; 5 | 6 | namespace ScrapySharp.Tests 7 | { 8 | [TestFixture] 9 | public class When_use_code_reader 10 | { 11 | [Test] 12 | public void When_reading_2_colapsed_spans() 13 | { 14 | var sourceCode = "text 1text 2"; 15 | var codeReader = new CodeReader(sourceCode); 16 | 17 | var word = codeReader.ReadWord(); 18 | Assert.AreEqual("<", word.Value); 19 | 20 | word = codeReader.ReadWord(); 21 | Assert.AreEqual("span", word.Value); 22 | 23 | word = codeReader.ReadWord(); 24 | Assert.AreEqual(">", word.Value); 25 | 26 | word = codeReader.ReadWord(); 27 | Assert.AreEqual("text", word.Value); 28 | 29 | word = codeReader.ReadWord(); 30 | Assert.AreEqual(" ", word.Value); 31 | 32 | word = codeReader.ReadWord(); 33 | Assert.AreEqual("1", word.Value); 34 | 35 | word = codeReader.ReadWord(); 36 | Assert.AreEqual("<", word.Value); 37 | 38 | word = codeReader.ReadWord(); 39 | Assert.AreEqual("/", word.Value); 40 | 41 | word = codeReader.ReadWord(); 42 | Assert.AreEqual("span", word.Value); 43 | 44 | word = codeReader.ReadWord(); 45 | Assert.AreEqual(">", word.Value); 46 | 47 | } 48 | 49 | [Test] 50 | public void When_read_a_simple_tag_with_missing_quote_in_attibute() 51 | { 52 | var sourceCode = "
test
"; 53 | var codeReader = new CodeReader(sourceCode); 54 | 55 | var word = codeReader.ReadWord(); 56 | Assert.AreEqual("<", word.Value); 57 | 58 | word = codeReader.ReadWord(); 59 | Assert.AreEqual("div", word.Value); 60 | 61 | word = codeReader.ReadWord(); 62 | Assert.IsTrue(word.IsWhiteSpace); 63 | 64 | word = codeReader.ReadWord(); 65 | Assert.AreEqual("class", word.Value); 66 | 67 | word = codeReader.ReadWord(); 68 | Assert.AreEqual("=", word.Value); 69 | 70 | word = codeReader.ReadWord(); 71 | Assert.AreEqual("login id=", word.Value); 72 | 73 | word = codeReader.ReadWord(); 74 | Assert.AreEqual("lol", word.Value); 75 | 76 | word = codeReader.ReadWord(); 77 | Assert.AreEqual(">", word.Value); 78 | 79 | word = codeReader.ReadWord(); 80 | Assert.AreEqual("test", word.Value); 81 | 82 | word = codeReader.ReadWord(); 83 | Assert.AreEqual("<", word.Value); 84 | 85 | word = codeReader.ReadWord(); 86 | Assert.AreEqual("/", word.Value); 87 | 88 | word = codeReader.ReadWord(); 89 | Assert.AreEqual("div", word.Value); 90 | 91 | word = codeReader.ReadWord(); 92 | Assert.AreEqual(">", word.Value); 93 | } 94 | 95 | [Test] 96 | public void When_read_a_simple_tag() 97 | { 98 | var sourceCode = "
login: \n\t romcy
"; 99 | var codeReader = new CodeReader(sourceCode); 100 | 101 | var word = codeReader.ReadWord(); 102 | Assert.AreEqual("<", word.Value); 103 | 104 | word = codeReader.ReadWord(); 105 | Assert.AreEqual("div", word.Value); 106 | 107 | word = codeReader.ReadWord(); 108 | Assert.IsTrue(word.IsWhiteSpace); 109 | 110 | word = codeReader.ReadWord(); 111 | Assert.AreEqual("class", word.Value); 112 | Assert.IsFalse(word.IsQuoted); 113 | 114 | word = codeReader.ReadWord(); 115 | Assert.AreEqual("=", word.Value); 116 | 117 | word = codeReader.ReadWord(); 118 | Assert.AreEqual("login box1", word.Value); 119 | Assert.IsTrue(word.IsQuoted); 120 | 121 | word = codeReader.ReadWord(); 122 | Assert.AreEqual(">", word.Value); 123 | 124 | word = codeReader.ReadWord(); 125 | Assert.AreEqual("login:", word.Value); 126 | 127 | word = codeReader.ReadWord(); 128 | Assert.AreEqual(" ", word.Value); 129 | 130 | word = codeReader.ReadWord(); 131 | Assert.AreEqual("\n", word.Value); 132 | 133 | word = codeReader.ReadWord(); 134 | Assert.AreEqual("\t", word.Value); 135 | 136 | word = codeReader.ReadWord(); 137 | Assert.AreEqual(" ", word.Value); 138 | 139 | word = codeReader.ReadWord(); 140 | Assert.AreEqual("romcy", word.Value); 141 | 142 | 143 | word = codeReader.ReadWord(); 144 | Assert.AreEqual("<", word.Value); 145 | 146 | word = codeReader.ReadWord(); 147 | Assert.AreEqual("/", word.Value); 148 | 149 | word = codeReader.ReadWord(); 150 | Assert.AreEqual("div", word.Value); 151 | 152 | word = codeReader.ReadWord(); 153 | Assert.AreEqual(">", word.Value); 154 | 155 | } 156 | } 157 | } 158 | 159 | // ReSharper restore InconsistentNaming 160 | -------------------------------------------------------------------------------- /ScrapySharp.Tests/When_use_web_forms.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using HtmlAgilityPack; 4 | using NUnit.Framework; 5 | using ScrapySharp.Html; 6 | using ScrapySharp.Html.Dom; 7 | using ScrapySharp.Html.Forms; 8 | using ScrapySharp.Extensions; 9 | using System.Linq; 10 | using ScrapySharp.Network; 11 | 12 | namespace ScrapySharp.Tests 13 | { 14 | [TestFixture] 15 | public class When_use_web_forms 16 | { 17 | [Test,Ignore("Integration")] 18 | public void When_browsing_using_helpers() 19 | { 20 | ScrapingBrowser browser = new ScrapingBrowser(); 21 | 22 | //set UseDefaultCookiesParser as false if a website returns invalid cookies format 23 | //browser.UseDefaultCookiesParser = false; 24 | 25 | WebPage homePage = browser.NavigateToPage(new Uri("http://www.bing.com/")); 26 | 27 | PageWebForm form = homePage.FindFormById("sb_form"); 28 | form["q"] = "scrapysharp"; 29 | form.Method = HttpVerb.Get; 30 | WebPage resultsPage = form.Submit(); 31 | 32 | HtmlNode[] resultsLinks = resultsPage.Html.CssSelect("div.sb_tlst h3 a").ToArray(); 33 | 34 | WebPage blogPage = resultsPage.FindLinks(By.Text("romcyber blog | Just another WordPress site")).Single().Click(); 35 | } 36 | 37 | [Test] 38 | public void When_parsing_form() 39 | { 40 | var source = File.ReadAllText("Html/WebFormPage.htm"); 41 | var html = HDocument.Parse(source); 42 | 43 | var webForm = new WebForm(html.CssSelect("form[name=TestForm]").Single()); 44 | 45 | Assert.AreEqual(5, webForm.FormFields.Count); 46 | } 47 | 48 | [Test] 49 | public void When_parsing_form_with_agility_pack() 50 | { 51 | var source = File.ReadAllText("Html/WebFormPage.htm"); 52 | var html = source.ToHtmlNode(); 53 | 54 | var webForm = new WebForm(html.CssSelect("form[name=TestForm]").Single()); 55 | 56 | //Because HtmlAgilityPack fails the form parsing ! 57 | Assert.AreNotEqual(5, webForm.FormFields.Count); 58 | } 59 | 60 | [Test] 61 | public void When_parsing_partial_view() 62 | { 63 | var source = File.ReadAllText("Html/Form1.htm"); 64 | var html = HDocument.Parse(source); 65 | 66 | var form = html.CssSelect("form").SingleOrDefault(); 67 | 68 | Assert.IsNotNull(form); 69 | 70 | 71 | } 72 | } 73 | } -------------------------------------------------------------------------------- /ScrapySharp.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.23107.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{6755677C-979E-4D67-A21F-FAAF7598A993}" 7 | ProjectSection(SolutionItems) = preProject 8 | build.cmd = build.cmd 9 | build.fsx = build.fsx 10 | ReleaseNotes.txt = ReleaseNotes.txt 11 | ScrapySharp.nuspec = ScrapySharp.nuspec 12 | EndProjectSection 13 | GlobalSection(HgVSProperties) = preSolution 14 | SolutionIsControlled = True 15 | SolutionBindings = 16 | EndGlobalSection 17 | EndProject 18 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ScrapySharp", "ScrapySharp\ScrapySharp.csproj", "{19858AEA-E842-4633-AF1B-110ED4FE3770}" 19 | GlobalSection(HgVSProperties) = preSolution 20 | SolutionIsControlled = True 21 | SolutionBindings = 22 | EndGlobalSection 23 | EndProject 24 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ScrapySharp.Tests", "ScrapySharp.Tests\ScrapySharp.Tests.csproj", "{99F509FE-B51C-48A8-8C79-DD32AD2FDF10}" 25 | GlobalSection(HgVSProperties) = preSolution 26 | SolutionIsControlled = True 27 | SolutionBindings = 28 | EndGlobalSection 29 | EndProject 30 | Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "ScrapySharp.Core", "ScrapySharp.Core\ScrapySharp.Core.fsproj", "{CE27016B-2755-4B44-8A0E-574914B9256F}" 31 | GlobalSection(HgVSProperties) = preSolution 32 | SolutionIsControlled = True 33 | SolutionBindings = 34 | EndGlobalSection 35 | EndProject 36 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Libraries", "Libraries", "{F0E92760-C697-4CC6-8767-8929C6C9898E}" 37 | GlobalSection(HgVSProperties) = preSolution 38 | SolutionIsControlled = True 39 | SolutionBindings = 40 | EndGlobalSection 41 | EndProject 42 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tests", "Tests", "{D5D992B5-D7B1-4132-A8E1-078513363057}" 43 | GlobalSection(HgVSProperties) = preSolution 44 | SolutionIsControlled = True 45 | SolutionBindings = 46 | EndGlobalSection 47 | EndProject 48 | Global 49 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 50 | Debug|Any CPU = Debug|Any CPU 51 | Debug|Mixed Platforms = Debug|Mixed Platforms 52 | Debug|x64 = Debug|x64 53 | Debug|x86 = Debug|x86 54 | Release|Any CPU = Release|Any CPU 55 | Release|Mixed Platforms = Release|Mixed Platforms 56 | Release|x64 = Release|x64 57 | Release|x86 = Release|x86 58 | EndGlobalSection 59 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 60 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 61 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Debug|Any CPU.Build.0 = Debug|Any CPU 62 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU 63 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU 64 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Debug|x64.ActiveCfg = Debug|Any CPU 65 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Debug|x86.ActiveCfg = Debug|Any CPU 66 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Release|Any CPU.ActiveCfg = Release|Any CPU 67 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Release|Any CPU.Build.0 = Release|Any CPU 68 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU 69 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Release|Mixed Platforms.Build.0 = Release|Any CPU 70 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Release|x64.ActiveCfg = Release|Any CPU 71 | {19858AEA-E842-4633-AF1B-110ED4FE3770}.Release|x86.ActiveCfg = Release|Any CPU 72 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 73 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Debug|Any CPU.Build.0 = Debug|Any CPU 74 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU 75 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU 76 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Debug|x64.ActiveCfg = Debug|Any CPU 77 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Debug|x86.ActiveCfg = Debug|Any CPU 78 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Release|Any CPU.ActiveCfg = Release|Any CPU 79 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Release|Any CPU.Build.0 = Release|Any CPU 80 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU 81 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Release|Mixed Platforms.Build.0 = Release|Any CPU 82 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Release|x64.ActiveCfg = Release|Any CPU 83 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10}.Release|x86.ActiveCfg = Release|Any CPU 84 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 85 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Debug|Any CPU.Build.0 = Debug|Any CPU 86 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU 87 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU 88 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Debug|x64.ActiveCfg = Debug|Any CPU 89 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Debug|x86.ActiveCfg = Debug|Any CPU 90 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Release|Any CPU.ActiveCfg = Release|Any CPU 91 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Release|Any CPU.Build.0 = Release|Any CPU 92 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU 93 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Release|Mixed Platforms.Build.0 = Release|Any CPU 94 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Release|x64.ActiveCfg = Release|Any CPU 95 | {CE27016B-2755-4B44-8A0E-574914B9256F}.Release|x86.ActiveCfg = Release|Any CPU 96 | EndGlobalSection 97 | GlobalSection(SolutionProperties) = preSolution 98 | HideSolutionNode = FALSE 99 | EndGlobalSection 100 | GlobalSection(NestedProjects) = preSolution 101 | {19858AEA-E842-4633-AF1B-110ED4FE3770} = {F0E92760-C697-4CC6-8767-8929C6C9898E} 102 | {99F509FE-B51C-48A8-8C79-DD32AD2FDF10} = {D5D992B5-D7B1-4132-A8E1-078513363057} 103 | {CE27016B-2755-4B44-8A0E-574914B9256F} = {F0E92760-C697-4CC6-8767-8929C6C9898E} 104 | EndGlobalSection 105 | GlobalSection(HgVSProperties) = preSolution 106 | SolutionIsControlled = True 107 | SolutionBindings = 108 | EndGlobalSection 109 | GlobalSection(TestCaseManagementSettings) = postSolution 110 | CategoryFile = ScrapySharp.vsmdi 111 | EndGlobalSection 112 | EndGlobal 113 | -------------------------------------------------------------------------------- /ScrapySharp/Cache/WebResourceEntity.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Cache 2 | { 3 | public class WebResourceEntity 4 | { 5 | public string LastModified { get; set; } 6 | 7 | public string AbsoluteUrl { get; set; } 8 | 9 | public bool ForceDownload { get; set; } 10 | } 11 | } -------------------------------------------------------------------------------- /ScrapySharp/Cache/WebResourceStorage.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using ScrapySharp.Network; 3 | using System.Runtime.Caching; 4 | 5 | namespace ScrapySharp.Cache 6 | { 7 | public sealed class WebResourceStorage 8 | { 9 | private const string basePath = "_WebResourcesCache"; 10 | private MemoryCache cache; 11 | 12 | public WebResourceStorage() 13 | { 14 | Initialize(); 15 | } 16 | 17 | private void Initialize() 18 | { 19 | cache = new MemoryCache(basePath); 20 | } 21 | 22 | public void Save(WebResource webResource) 23 | { 24 | var cacheItem = new CacheItem(webResource.AbsoluteUrl.ToString(), webResource); 25 | var policy = new CacheItemPolicy 26 | { 27 | AbsoluteExpiration = new DateTimeOffset(DateTime.UtcNow.AddHours(2)) 28 | }; 29 | cache.AddOrGetExisting(cacheItem, policy); 30 | } 31 | 32 | public bool Exists(string key) 33 | { 34 | return cache.GetCacheItem(key) != null; 35 | } 36 | 37 | private static WebResourceStorage current; 38 | 39 | public static WebResourceStorage Current 40 | { 41 | get 42 | { 43 | if (current == null) 44 | current = new WebResourceStorage(); 45 | return current; 46 | } 47 | } 48 | } 49 | } -------------------------------------------------------------------------------- /ScrapySharp/Exceptions/ScrapingException.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace ScrapySharp.Exceptions 4 | { 5 | public class ScrapingException : Exception 6 | { 7 | public ScrapingException(string message) : base(message) 8 | { 9 | } 10 | 11 | public ScrapingException(string message, Exception innerException) : base(message, innerException) 12 | { 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/CssQueryExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using HtmlAgilityPack; 4 | using ScrapySharp.Core; 5 | 6 | namespace ScrapySharp.Extensions 7 | { 8 | public static class CssQueryExtensions 9 | { 10 | public static IEnumerable CssSelect(this IEnumerable nodes, string expression) 11 | { 12 | return nodes.SelectMany(node => CssSelect(node, expression)); 13 | } 14 | 15 | public static IEnumerable CssSelect(this HtmlNode node, string expression) 16 | { 17 | var tokenizer = new CssSelectorTokenizer(); 18 | var tokens = tokenizer.Tokenize(expression); 19 | var executor = new CssSelectorExecutor(new List { node }, tokens.ToList(), new AgilityNavigationProvider()); 20 | 21 | return executor.GetElements(); 22 | } 23 | 24 | public static IEnumerable CssSelect(this HtmlNode node, string[] expressions) 25 | { 26 | List elements = new List(); 27 | foreach (var expression in expressions) 28 | { 29 | var matchingElements = node.CssSelect(expression).ToList(); 30 | 31 | // Use a union to remove duplicates. 32 | elements = elements.Union(matchingElements).ToList(); 33 | } 34 | 35 | return elements.ToArray(); 36 | } 37 | 38 | public static IEnumerable CssSelectAncestors(this IEnumerable nodes, string expression) 39 | { 40 | var htmlNodes = nodes.SelectMany(node => CssSelectAncestors(node, expression)).ToArray(); 41 | return htmlNodes.Distinct(); 42 | } 43 | 44 | public static IEnumerable CssSelectAncestors(this HtmlNode node, string expression) 45 | { 46 | var tokenizer = new CssSelectorTokenizer(); 47 | var tokens = tokenizer.Tokenize(expression); 48 | var executor = new CssSelectorExecutor(new List { node }, tokens.ToList(), new AgilityNavigationProvider()); 49 | executor.MatchAncestors = true; 50 | 51 | return executor.GetElements(); 52 | } 53 | 54 | } 55 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/DictionaryExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Collections.Specialized; 4 | using System.Linq; 5 | using HtmlAgilityPack; 6 | 7 | namespace ScrapySharp.Extensions 8 | { 9 | public static class DictionaryExtensions 10 | { 11 | public static bool HasKeyIgnoreCase(this HtmlAttributeCollection dictionary, string name) 12 | { 13 | if (dictionary == null) 14 | return false; 15 | 16 | var key = dictionary.FirstOrDefault(k => k.Name.Equals(name, StringComparison.InvariantCultureIgnoreCase)); 17 | if (key == null) 18 | return false; 19 | 20 | return true; 21 | } 22 | 23 | public static string GetIgnoreCase(this HtmlAttributeCollection dictionary, string name) 24 | { 25 | if (dictionary == null) 26 | return null; 27 | 28 | var key = dictionary.FirstOrDefault(k => k.Name.Equals(name, StringComparison.InvariantCultureIgnoreCase)); 29 | if (key == null) 30 | return null; 31 | 32 | return dictionary[key.Name].Value; 33 | } 34 | 35 | public static bool HasKeyIgnoreCase(this NameValueCollection dictionary, string name) 36 | { 37 | if (dictionary == null) 38 | return false; 39 | 40 | var key = dictionary.AllKeys.FirstOrDefault(k => k.Equals(name, StringComparison.InvariantCultureIgnoreCase)); 41 | if (key == null) 42 | return false; 43 | 44 | return true; 45 | } 46 | 47 | public static string GetIgnoreCase(this NameValueCollection dictionary, string name) 48 | { 49 | if (dictionary == null) 50 | return null; 51 | 52 | var key = dictionary.AllKeys.FirstOrDefault(k => k.Equals(name, StringComparison.InvariantCultureIgnoreCase)); 53 | if (key == null) 54 | return null; 55 | 56 | return dictionary[key]; 57 | } 58 | 59 | public static string GetIgnoreCase(this IDictionary dictionary, string name) 60 | { 61 | if (dictionary == null) 62 | return null; 63 | 64 | var key = dictionary.Keys.FirstOrDefault(k => k.Equals(name, StringComparison.InvariantCultureIgnoreCase)); 65 | if (key == null) 66 | return null; 67 | 68 | return dictionary[key]; 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/HDocumentCssQueryExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text.RegularExpressions; 5 | using ScrapySharp.Core; 6 | using ScrapySharp.Html.Dom; 7 | 8 | namespace ScrapySharp.Extensions 9 | { 10 | public static class HDocumentCssQueryExtensions 11 | { 12 | public static IEnumerable CssSelect(this HDocument doc, string expression) 13 | { 14 | var hElement = new HElement 15 | { 16 | Children = doc.Children 17 | }; 18 | 19 | return hElement.CssSelect(expression); 20 | } 21 | 22 | public static IEnumerable CssSelect(this IEnumerable nodes, string expression) 23 | { 24 | return nodes.SelectMany(node => CssSelect(node, expression)); 25 | } 26 | 27 | public static IEnumerable CssSelect(this IEnumerable nodes, string[] expressions) 28 | { 29 | return nodes.SelectMany(node => CssSelect(nodes, expressions)); 30 | } 31 | 32 | public static IEnumerable CssSelectAncestors(this IEnumerable nodes, string expression) 33 | { 34 | return nodes.SelectMany(node => CssSelectAncestors(node, expression)).Distinct(); 35 | } 36 | 37 | public static IEnumerable CssSelectAncestors(this HElement node, string expression) 38 | { 39 | if (string.IsNullOrEmpty(expression)) 40 | return new HElement[] { }; 41 | 42 | var tokenizer = new CssSelectorTokenizer(); 43 | var tokens = tokenizer.Tokenize(expression); 44 | var executor = new CssSelectorExecutor(new List { node }, tokens.ToList(), new HElementNavigationProvider()); 45 | executor.MatchAncestors = true; 46 | 47 | return executor.GetElements(); 48 | } 49 | 50 | public static IEnumerable CssSelect(this HElement node, string expression) 51 | { 52 | if (string.IsNullOrEmpty(expression)) 53 | return new HElement[] { }; 54 | 55 | var tokenizer = new CssSelectorTokenizer(); 56 | var tokens = tokenizer.Tokenize(expression); 57 | var executor = new CssSelectorExecutor(new List { node }, tokens.ToList(), new HElementNavigationProvider()); 58 | 59 | return executor.GetElements(); 60 | } 61 | } 62 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/HElementNavigationProvider.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Collections.Specialized; 3 | using System.Linq; 4 | using ScrapySharp.Core; 5 | using ScrapySharp.Html.Dom; 6 | 7 | namespace ScrapySharp.Extensions 8 | { 9 | public class HElementNavigationProvider : INavigationProvider 10 | { 11 | public List ChildNodes(List nodes) 12 | { 13 | return nodes.SelectMany(n => n.Children).ToList(); 14 | } 15 | 16 | public List Descendants(List nodes) 17 | { 18 | return nodes.SelectMany(n => n.Descendants()).ToList(); 19 | } 20 | 21 | public List ParentNodes(List nodes) 22 | { 23 | return nodes.Select(n => n.ParentNode).ToList(); 24 | } 25 | 26 | public List AncestorsAndSelf(List nodes) 27 | { 28 | return nodes.SelectMany(n => n.Ancestors()).Concat(nodes).ToList(); 29 | } 30 | 31 | public string GetName(HElement node) 32 | { 33 | return node.Name; 34 | } 35 | 36 | public string GetAttributeValue(HElement node, string name, string defaultValue) 37 | { 38 | return node.GetAttributeValue(name, defaultValue); 39 | } 40 | 41 | public string GetId(HElement node) 42 | { 43 | return node.Id; 44 | } 45 | 46 | public NameValueCollection Attributes(HElement node) 47 | { 48 | if (node.Attributes == null) 49 | return new NameValueCollection(); 50 | return node.Attributes; 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/HtmlCreationHelper.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using HtmlAgilityPack; 4 | 5 | namespace ScrapySharp.Extensions 6 | { 7 | public static class HtmlCreationHelper 8 | { 9 | public static HtmlNode MergeInParentNode(this IEnumerable nodes, string name) 10 | { 11 | var doc = new HtmlDocument(); 12 | var htmlNode = doc.CreateElement(name); 13 | nodes.ToList().ForEach(n => htmlNode.AppendChild(n)); 14 | 15 | return htmlNode; 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/HtmlParsingHelper.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Globalization; 4 | using System.IO; 5 | using System.Net; 6 | using System.Text; 7 | using System.Text.RegularExpressions; 8 | using System.Linq; 9 | using System.Web; 10 | using HtmlAgilityPack; 11 | using ScrapySharp.Html.Dom; 12 | 13 | namespace ScrapySharp.Extensions 14 | { 15 | public static class HtmlParsingHelper 16 | { 17 | private static Regex spacesRegex = new Regex("[ ]+", RegexOptions.Compiled); 18 | private static Regex asciiRegex = new Regex("(([=][0-9A-F]{0,2})+)|([ ]+)", RegexOptions.Compiled); 19 | 20 | /// 21 | /// Convert a string to a date. 22 | /// 23 | /// The value. 24 | /// 25 | public static DateTime ToDate(this string value) 26 | { 27 | return Convert.ToDateTime(value); 28 | } 29 | 30 | /// 31 | /// Convert a string to a date. 32 | /// 33 | /// The value. 34 | /// The format. 35 | /// 36 | public static DateTime ToDate(this string value, string format) 37 | { 38 | return ToDate(value, format, CultureInfo.InvariantCulture); 39 | } 40 | 41 | /// 42 | /// Convert a string to a date. 43 | /// 44 | /// The value. 45 | /// The format. 46 | /// The culture info. 47 | /// 48 | public static DateTime ToDate(this string value, string format, CultureInfo cultureInfo) 49 | { 50 | DateTime result; 51 | if (DateTime.TryParseExact(value, format, cultureInfo, DateTimeStyles.None, out result)) 52 | return result; 53 | 54 | return DateTime.MinValue; 55 | } 56 | 57 | /// 58 | /// Gets the attribute value. 59 | /// 60 | /// The node. 61 | /// The name. 62 | /// 63 | public static string GetAttributeValue(this HtmlNode node, string name) 64 | { 65 | return node.GetAttributeValue(name, string.Empty); 66 | } 67 | 68 | /// 69 | /// Convert string value to HTML node. 70 | /// 71 | /// The content. 72 | /// 73 | public static HtmlNode ToHtmlNode(this string content) 74 | { 75 | var document = new HtmlDocument(); 76 | document.LoadHtml(content); 77 | 78 | return document.DocumentNode; 79 | } 80 | 81 | /// 82 | /// Convert WebResponse content to HTML node. 83 | /// 84 | /// The response. 85 | /// 86 | public static HtmlNode ToHtmlNode(this WebResponse response) 87 | { 88 | var document = new HtmlDocument(); 89 | string html; 90 | 91 | var responseStream = response.GetResponseStream(); 92 | if (responseStream == null) 93 | html = string.Empty; 94 | else 95 | using (var reader = new StreamReader(responseStream)) 96 | html = reader.ReadToEnd(); 97 | 98 | document.LoadHtml(html); 99 | 100 | return document.DocumentNode; 101 | } 102 | 103 | /// 104 | /// Convert string value to HDocument. 105 | /// 106 | /// The content. 107 | /// 108 | public static HDocument ToHDocument(this string content) 109 | { 110 | return HDocument.Parse(content); 111 | } 112 | 113 | /// 114 | /// Convert WebResponse content to HDocument. 115 | /// 116 | /// The response. 117 | /// 118 | public static HDocument ToHDocument(this WebResponse response) 119 | { 120 | string html; 121 | 122 | var responseStream = response.GetResponseStream(); 123 | if (responseStream == null) 124 | html = string.Empty; 125 | else 126 | using (var reader = new StreamReader(responseStream)) 127 | html = reader.ReadToEnd(); 128 | 129 | return html.ToHDocument(); 130 | } 131 | 132 | /// 133 | /// Gets the next sibling with specified tag name. 134 | /// 135 | /// The node. 136 | /// The name. 137 | /// 138 | public static HtmlNode GetNextSibling(this HtmlNode node, string name) 139 | { 140 | var currentNode = node.NextSibling; 141 | 142 | while (currentNode.NextSibling != null && currentNode.Name != name) 143 | currentNode = currentNode.NextSibling; 144 | 145 | return currentNode.Name == name ? currentNode : null; 146 | } 147 | 148 | /// 149 | /// Gets the next table cell value. 150 | /// 151 | /// The node. 152 | /// The name. 153 | /// The comparison type. 154 | /// 155 | public static HtmlValue GetNextTableCellValue(this HtmlNode node, string name) 156 | { 157 | var results = GetNodesFollowedByValue(node, "td", name, NodeValueComparison.Equals); 158 | if (!results.Any()) 159 | return null; 160 | 161 | var innerText = results.LastOrDefault().InnerText.CleanInnerHtmlAscii().CleanInnerText(); 162 | if (innerText.StartsWith(":")) 163 | innerText = innerText.Substring(1).CleanInnerHtmlAscii().CleanInnerText(); 164 | 165 | return innerText; 166 | } 167 | 168 | /// 169 | /// Gets the next table cell value. 170 | /// 171 | /// The node. 172 | /// The name. 173 | /// The comparison type. 174 | /// 175 | public static HtmlValue GetNextTableCellValue(this HtmlNode node, string name, NodeValueComparison comparison/* = NodeValueComparison.Equals*/) 176 | { 177 | var results = GetNodesFollowedByValue(node, "td", name, comparison); 178 | if (!results.Any()) 179 | return null; 180 | 181 | var innerText = results.LastOrDefault().InnerText.CleanInnerHtmlAscii().CleanInnerText(); 182 | if (innerText.StartsWith(":")) 183 | innerText = innerText.Substring(1).CleanInnerHtmlAscii().CleanInnerText(); 184 | 185 | return innerText; 186 | } 187 | 188 | /// 189 | /// Gets the nodes followed by value. 190 | /// 191 | /// The node. 192 | /// The name. 193 | /// The value. 194 | /// The comparison. 195 | /// 196 | public static IEnumerable GetNodesFollowedByValue(this HtmlNode node, string name, string value, NodeValueComparison comparison = NodeValueComparison.Equals) 197 | { 198 | var comparer = new NodeValueComparer(comparison); 199 | var cleanName = value.CleanInnerText(); 200 | return (from d in node.Descendants(name) 201 | where comparer.Compare(d.InnerText.CleanInnerHtmlAscii().CleanInnerText(), cleanName) 202 | select d.GetNextSibling(name)).ToArray(); 203 | } 204 | 205 | /// 206 | /// Gets the nodes followed by value. 207 | /// 208 | /// The nodes. 209 | /// The name. 210 | /// The value. 211 | /// The comparison. 212 | /// 213 | public static IEnumerable GetNodesFollowedByValue(this IEnumerable nodes, string name, string value, NodeValueComparison comparison = NodeValueComparison.Equals) 214 | { 215 | return nodes.SelectMany(node => node.GetNodesFollowedByValue(name, value, comparison)); 216 | } 217 | 218 | /// 219 | /// Gets the next table line value. 220 | /// 221 | /// The node. 222 | /// The name. 223 | /// The comparison type. 224 | /// 225 | public static HtmlValue GetNextTableLineValue(this HtmlNode node, string name, NodeValueComparison comparison = NodeValueComparison.Equals) 226 | { 227 | var results = GetNodesFollowedByValue(node, "tr", name, comparison); 228 | if (!results.Any()) 229 | return null; 230 | 231 | var innerText = results.FirstOrDefault().InnerText.CleanInnerHtmlAscii().CleanInnerText(); 232 | if (innerText.StartsWith(":")) 233 | innerText = innerText.Substring(1).CleanInnerHtmlAscii().CleanInnerText(); 234 | 235 | return innerText; 236 | } 237 | 238 | /// 239 | /// Cleans the inner HTML ASCII. 240 | /// 241 | /// 242 | /// "text =09".CleanInnerHtmlAscii() returns "text " 243 | /// 244 | /// The expression. 245 | /// 246 | public static string CleanInnerHtmlAscii(this string expression) 247 | { 248 | var cleaned = expression.Replace("=C3=B4", "ô"); 249 | cleaned = asciiRegex.Replace(cleaned, " "); 250 | 251 | return cleaned; 252 | } 253 | 254 | 255 | /// 256 | /// Cleans the inner text from excessive spaces characters. 257 | /// 258 | /// The expression. 259 | /// 260 | public static string CleanInnerText(this string expression) 261 | { 262 | var cleaned = expression.Replace('\t', ' ').Replace('\r', ' ') 263 | .Replace('\n', ' '); 264 | 265 | cleaned = WebUtility.HtmlDecode(cleaned); 266 | 267 | return spacesRegex.Replace(cleaned, " ").Trim(); 268 | } 269 | } 270 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/HtmlValue.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Globalization; 3 | 4 | namespace ScrapySharp.Extensions 5 | { 6 | public class HtmlValue : IEquatable 7 | { 8 | private readonly string value; 9 | 10 | public HtmlValue(string value) 11 | { 12 | this.value = value; 13 | } 14 | 15 | public override string ToString() 16 | { 17 | return value; 18 | } 19 | 20 | public static implicit operator string(HtmlValue htmlValue) 21 | { 22 | if (htmlValue == null) 23 | return null; 24 | return htmlValue.value; 25 | } 26 | 27 | public static implicit operator HtmlValue(string value) 28 | { 29 | return new HtmlValue(value); 30 | } 31 | 32 | public static explicit operator bool(HtmlValue htmlValue) 33 | { 34 | if (htmlValue == null) 35 | return false; 36 | return Convert.ToBoolean(htmlValue.value); 37 | } 38 | 39 | public static explicit operator bool?(HtmlValue htmlValue) 40 | { 41 | bool result; 42 | if (bool.TryParse(htmlValue.value, out result)) 43 | return result; 44 | return null; 45 | } 46 | 47 | public static explicit operator int(HtmlValue htmlValue) 48 | { 49 | return int.Parse(htmlValue.value, NumberStyles.AllowLeadingWhite 50 | | NumberStyles.AllowTrailingWhite, NumberFormatInfo.InvariantInfo); 51 | } 52 | 53 | public static explicit operator int?(HtmlValue htmlValue) 54 | { 55 | int result; 56 | if (int.TryParse(htmlValue.value, out result)) 57 | return result; 58 | return null; 59 | } 60 | 61 | public static explicit operator uint(HtmlValue htmlValue) 62 | { 63 | return uint.Parse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite 64 | | NumberStyles.AllowLeadingSign | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, 65 | NumberFormatInfo.InvariantInfo); 66 | } 67 | 68 | public static explicit operator uint?(HtmlValue htmlValue) 69 | { 70 | uint result; 71 | if (uint.TryParse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite 72 | | NumberStyles.AllowLeadingSign | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, 73 | NumberFormatInfo.InvariantInfo, out result)) 74 | return result; 75 | return null; 76 | } 77 | 78 | public static explicit operator long(HtmlValue htmlValue) 79 | { 80 | return long.Parse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 81 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo); 82 | } 83 | 84 | public static explicit operator long?(HtmlValue htmlValue) 85 | { 86 | long result; 87 | if (long.TryParse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 88 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo, out result)) 89 | return result; 90 | return null; 91 | } 92 | 93 | public static explicit operator ulong(HtmlValue htmlValue) 94 | { 95 | return ulong.Parse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 96 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo); 97 | } 98 | 99 | public static explicit operator ulong?(HtmlValue htmlValue) 100 | { 101 | ulong result; 102 | if (ulong.TryParse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 103 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo, out result)) 104 | return result; 105 | return null; 106 | } 107 | 108 | public static explicit operator float(HtmlValue htmlValue) 109 | { 110 | return float.Parse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 111 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo); 112 | } 113 | 114 | public static explicit operator float?(HtmlValue htmlValue) 115 | { 116 | float result; 117 | if (float.TryParse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 118 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo, out result)) 119 | return result; 120 | return null; 121 | } 122 | 123 | public static explicit operator double(HtmlValue htmlValue) 124 | { 125 | return double.Parse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 126 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo); 127 | } 128 | 129 | public static explicit operator double?(HtmlValue htmlValue) 130 | { 131 | double result; 132 | if (double.TryParse(htmlValue.value, NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite | NumberStyles.AllowLeadingSign 133 | | NumberStyles.AllowDecimalPoint | NumberStyles.AllowExponent, NumberFormatInfo.InvariantInfo, out result)) 134 | return result; 135 | return null; 136 | } 137 | 138 | public static explicit operator decimal(HtmlValue htmlValue) 139 | { 140 | return Convert.ToDecimal(htmlValue.value); 141 | } 142 | 143 | public static explicit operator decimal?(HtmlValue htmlValue) 144 | { 145 | decimal result; 146 | if (decimal.TryParse(htmlValue.value, out result)) 147 | return result; 148 | return null; 149 | } 150 | 151 | public static explicit operator DateTime(HtmlValue htmlValue) 152 | { 153 | if (htmlValue == null) 154 | return DateTime.MinValue; 155 | return htmlValue.value.ToDate(); 156 | } 157 | 158 | public static explicit operator DateTime?(HtmlValue htmlValue) 159 | { 160 | if (htmlValue == null) 161 | return null; 162 | return htmlValue.value.ToDate(); 163 | } 164 | 165 | public static explicit operator TimeSpan(HtmlValue htmlValue) 166 | { 167 | if (htmlValue == null) 168 | return TimeSpan.Zero; 169 | return TimeSpan.Parse(htmlValue.value); 170 | } 171 | 172 | public static explicit operator TimeSpan?(HtmlValue htmlValue) 173 | { 174 | TimeSpan result; 175 | if (TimeSpan.TryParse(htmlValue.value, out result)) 176 | return result; 177 | return null; 178 | } 179 | 180 | public static explicit operator Guid(HtmlValue htmlValue) 181 | { 182 | if (htmlValue == null) 183 | return Guid.Empty; 184 | return new Guid(htmlValue.value); 185 | } 186 | 187 | public static explicit operator Guid?(HtmlValue htmlValue) 188 | { 189 | if (htmlValue == null) 190 | return null; 191 | return new Guid(htmlValue.value); 192 | } 193 | 194 | #region IEquatable implementation 195 | 196 | public bool Equals(HtmlValue other) 197 | { 198 | if (ReferenceEquals(null, other)) return false; 199 | if (ReferenceEquals(this, other)) return true; 200 | return Equals(other.value, value); 201 | } 202 | 203 | public override bool Equals(object obj) 204 | { 205 | if (ReferenceEquals(null, obj)) return false; 206 | if (ReferenceEquals(this, obj)) return true; 207 | if (obj.GetType() != typeof (HtmlValue)) return false; 208 | return Equals((HtmlValue) obj); 209 | } 210 | 211 | public override int GetHashCode() 212 | { 213 | return (value != null ? value.GetHashCode() : 0); 214 | } 215 | 216 | public static bool operator ==(HtmlValue left, HtmlValue right) 217 | { 218 | return Equals(left, right); 219 | } 220 | 221 | public static bool operator !=(HtmlValue left, HtmlValue right) 222 | { 223 | return !Equals(left, right); 224 | } 225 | 226 | #endregion 227 | 228 | } 229 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/NodeValueComparer.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace ScrapySharp.Extensions 4 | { 5 | internal class NodeValueComparer 6 | { 7 | private readonly NodeValueComparison comparison; 8 | 9 | public NodeValueComparer(NodeValueComparison comparison) 10 | { 11 | this.comparison = comparison; 12 | } 13 | 14 | public bool Compare(string value1, string value2) 15 | { 16 | switch (comparison) 17 | { 18 | case NodeValueComparison.Equals: 19 | return value1.Equals(value2, StringComparison.InvariantCultureIgnoreCase); 20 | case NodeValueComparison.StartsWith: 21 | return value1.StartsWith(value2, StringComparison.InvariantCultureIgnoreCase); 22 | case NodeValueComparison.EndsWith: 23 | return value1.EndsWith(value2, StringComparison.InvariantCultureIgnoreCase); 24 | case NodeValueComparison.Contains: 25 | return value1.ToLowerInvariant().Contains(value2.ToLowerInvariant()); 26 | default: 27 | return value1 == value2; 28 | } 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/NodeValueComparison.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Extensions 2 | { 3 | public enum NodeValueComparison 4 | { 5 | Equals, 6 | StartsWith, 7 | EndsWith, 8 | Contains 9 | } 10 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/TokenHelper.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | using ScrapySharp.Html.Parsing; 3 | 4 | namespace ScrapySharp.Extensions 5 | { 6 | public static class TokenHelper 7 | { 8 | public static bool IsToken(this Word word) 9 | { 10 | return IsToken(word.Value); 11 | } 12 | 13 | public static bool IsToken(this char c) 14 | { 15 | return IsToken(c.ToString(CultureInfo.InvariantCulture)); 16 | } 17 | 18 | public static bool IsToken(this string value) 19 | { 20 | return value == Tokens.CloseTag || 21 | value == Tokens.CommentBegin || 22 | value == Tokens.CommentEnd || 23 | value == Tokens.Quote.ToString(CultureInfo.InvariantCulture) || 24 | value == Tokens.SimpleQuote.ToString(CultureInfo.InvariantCulture) || 25 | value == Tokens.TagBegin.ToString(CultureInfo.InvariantCulture) || 26 | value == Tokens.TagEnd.ToString(CultureInfo.InvariantCulture) || 27 | value == Tokens.Doctype || 28 | value == Tokens.CloseTagDeclarator; 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /ScrapySharp/Extensions/UrlHelper.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text.RegularExpressions; 3 | 4 | namespace ScrapySharp.Extensions 5 | { 6 | public static class UrlHelper 7 | { 8 | private static readonly Regex basePathRegex = new Regex("(?(http[s]?[:]//)?)(?[^/]+).*", RegexOptions.Compiled); 9 | 10 | public static Uri Combine(this Uri uri, string path) 11 | { 12 | var url = uri.ToString(); 13 | return CombineUrl(url, path); 14 | } 15 | 16 | public static Uri CombineUrl(this string url, string path) 17 | { 18 | if (string.IsNullOrWhiteSpace(path)) 19 | return new Uri(url); 20 | 21 | if (path.StartsWith("/")) 22 | { 23 | var match = basePathRegex.Match(url); 24 | if (match.Success) 25 | { 26 | var scheme = match.Groups["scheme"].Value; 27 | var site = match.Groups["site"].Value; 28 | 29 | return new Uri(scheme + site + path); 30 | } 31 | } 32 | 33 | if (!url.EndsWith("/")) 34 | url += '/'; 35 | 36 | string combined; 37 | if (url.EndsWith("/") && path.StartsWith("/")) 38 | combined = url + path.Substring(1); 39 | else 40 | combined = url + path; 41 | 42 | return new Uri(combined); 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/By.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using HtmlAgilityPack; 3 | 4 | namespace ScrapySharp.Html 5 | { 6 | public class By 7 | { 8 | internal string Query { get; private set; } 9 | internal ElementSearchKind SearchKind { get; private set; } 10 | internal StringComparison ComparisonType { get; private set; } 11 | 12 | private By(string query, ElementSearchKind searchKind, StringComparison comparisonType) 13 | { 14 | Query = query; 15 | SearchKind = searchKind; 16 | ComparisonType = comparisonType; 17 | } 18 | 19 | internal ElementFinder CreateElementFinder(HtmlNode html, string tagName) 20 | { 21 | return new ElementFinder(html, SearchKind, tagName, Query, ComparisonType); 22 | } 23 | 24 | public static By Id(string query, StringComparison comparisonType = StringComparison.CurrentCulture) 25 | { 26 | return new By(query, ElementSearchKind.Id, comparisonType); 27 | } 28 | 29 | public static By Name(string query, StringComparison comparisonType = StringComparison.CurrentCulture) 30 | { 31 | return new By(query, ElementSearchKind.Name, comparisonType); 32 | } 33 | 34 | public static By Text(string query, StringComparison comparisonType = StringComparison.CurrentCulture) 35 | { 36 | return new By(query, ElementSearchKind.Text, comparisonType); 37 | } 38 | 39 | public static By Class(string query, StringComparison comparisonType = StringComparison.CurrentCulture) 40 | { 41 | return new By(query, ElementSearchKind.Class, comparisonType); 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/DeclarationType.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html.Dom 2 | { 3 | public enum DeclarationType 4 | { 5 | TextElement, 6 | OpenTag, 7 | CloseTag, 8 | SelfClosedTag, 9 | Comment, 10 | } 11 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/HAttribute.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html.Dom 2 | { 3 | public class HAttribute : IHSubContainer 4 | { 5 | public HAttribute(string name, string value) 6 | { 7 | Name = name; 8 | Value = value; 9 | } 10 | 11 | public string Name { get; set; } 12 | public string Value { get; set; } 13 | } 14 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/HComment.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html.Dom 2 | { 3 | public class HComment : HElement 4 | { 5 | public override string GetOuterHtml(HtmlGenerationStyle generationStyle = HtmlGenerationStyle.None) 6 | { 7 | return string.Format("", innerText); 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/HContainer.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Text; 3 | using System.Linq; 4 | using System.Net; 5 | 6 | namespace ScrapySharp.Html.Dom 7 | { 8 | public abstract class HContainer 9 | { 10 | private string name; 11 | protected string innerText; 12 | 13 | 14 | protected HContainer() 15 | { 16 | Children = new List(); 17 | } 18 | 19 | public List Children { get; set; } 20 | 21 | public bool HasChildren 22 | { 23 | get { return Children != null && Children.Any(); } 24 | } 25 | 26 | public int IndentLevel { get; set; } 27 | 28 | public string Name 29 | { 30 | get 31 | { 32 | if (name == null) 33 | return string.Empty; 34 | return name; 35 | } 36 | set { name = value; } 37 | } 38 | 39 | public string InnerText 40 | { 41 | get 42 | { 43 | if (innerText == null) 44 | innerText = string.Empty; 45 | 46 | var builder = new StringBuilder(); 47 | builder.Append(innerText); 48 | 49 | if (Children != null) 50 | foreach (var child in Children) 51 | builder.Append(child.InnerText); 52 | 53 | return WebUtility.HtmlDecode(builder.ToString()); 54 | } 55 | set { innerText = value; } 56 | } 57 | 58 | public string InnerHtml 59 | { 60 | get 61 | { 62 | var builder = new StringBuilder(); 63 | foreach (var child in Children) 64 | { 65 | builder.Append(child.GetOuterHtml()); 66 | } 67 | 68 | return builder.ToString(); 69 | } 70 | set 71 | { 72 | Children.Clear(); 73 | Children.AddRange(HDocument.Parse(value).Children); 74 | } 75 | } 76 | } 77 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/HDocument.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using System.Text; 4 | using ScrapySharp.Html.Parsing; 5 | 6 | namespace ScrapySharp.Html.Dom 7 | { 8 | public class HDocument : HContainer 9 | { 10 | public HDocument(params HElement[] children) 11 | { 12 | Children = children.ToList(); 13 | } 14 | 15 | public HDocument() 16 | { 17 | Children = new List(); 18 | } 19 | 20 | public static HDocument Parse(string source) 21 | { 22 | var codeReader = new CodeReader(source); 23 | var declarationReader = new HtmlDeclarationReader(codeReader); 24 | var domBuilder = new HtmlDomBuilder(declarationReader); 25 | 26 | return new HDocument 27 | { 28 | Children = domBuilder.BuildDom().ToList() 29 | }; 30 | } 31 | 32 | public string GetOuterHtml(HtmlGenerationStyle generationStyle = HtmlGenerationStyle.None) 33 | { 34 | var builder = new StringBuilder(); 35 | 36 | var selfClosing = !HasChildren && !string.IsNullOrEmpty(innerText); 37 | 38 | if (!selfClosing) 39 | { 40 | if (!string.IsNullOrEmpty(innerText)) 41 | builder.Append(innerText); 42 | if (HasChildren) 43 | foreach (var child in Children) 44 | builder.Append(child.GetOuterHtml(generationStyle)); 45 | } 46 | 47 | return builder.ToString(); 48 | } 49 | } 50 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/HElement.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Collections.Specialized; 4 | using System.Text; 5 | using System.Linq; 6 | 7 | namespace ScrapySharp.Html.Dom 8 | { 9 | public class HElement : HContainer, IHSubContainer 10 | { 11 | public HElement(string name) : this() 12 | { 13 | Name = name; 14 | } 15 | 16 | public HElement(string name, string text, params IHSubContainer[] elements) : this(name, elements) 17 | { 18 | InnerText = text; 19 | } 20 | 21 | public HElement(string name, params IHSubContainer[] elements) 22 | { 23 | Name = name; 24 | Children = elements.OfType().ToList(); 25 | 26 | Attributes = new NameValueCollection(); 27 | 28 | elements.OfType().ToList() 29 | .ForEach(h => Attributes.Add(h.Name, h.Value)); 30 | } 31 | 32 | public HElement() 33 | { 34 | Children = new List(); 35 | Attributes = new NameValueCollection(); 36 | } 37 | 38 | public virtual string GetOuterHtml(HtmlGenerationStyle generationStyle = HtmlGenerationStyle.None) 39 | { 40 | var builder = new StringBuilder(); 41 | 42 | var selfClosing = !HasChildren && string.IsNullOrEmpty(innerText); 43 | 44 | if (generationStyle == HtmlGenerationStyle.Indent) 45 | builder.Append(string.Empty.PadLeft(IndentLevel, '\t')); 46 | 47 | if (!string.IsNullOrEmpty(Name)) 48 | { 49 | builder.Append('<'); 50 | builder.Append(Name); 51 | 52 | if (HasAttributes) 53 | foreach (var key in Attributes.AllKeys) 54 | builder.AppendFormat(" {0}=\"{1}\"", key, Attributes[key]); 55 | 56 | if (!selfClosing) 57 | builder.Append('>'); 58 | else 59 | builder.Append(" />"); 60 | } 61 | 62 | if (!selfClosing) 63 | { 64 | if (HasChildren) 65 | { 66 | if (generationStyle == HtmlGenerationStyle.Indent) 67 | builder.AppendLine(); 68 | foreach (var child in Children) 69 | { 70 | child.IndentLevel = IndentLevel + 1; 71 | builder.Append(child.GetOuterHtml(generationStyle)); 72 | } 73 | 74 | if (generationStyle == HtmlGenerationStyle.Indent) 75 | builder.Append(string.Empty.PadLeft(IndentLevel, '\t')); 76 | } 77 | 78 | if (string.IsNullOrEmpty(innerText)) 79 | builder.AppendFormat("", Name); 80 | 81 | } 82 | 83 | if (!string.IsNullOrEmpty(innerText)) 84 | { 85 | builder.Append(innerText); 86 | if (!selfClosing && !string.IsNullOrEmpty(Name)) 87 | builder.AppendFormat("", Name); 88 | } 89 | 90 | if (generationStyle == HtmlGenerationStyle.Indent) 91 | builder.AppendLine(); 92 | 93 | return builder.ToString(); 94 | } 95 | 96 | 97 | public string Id 98 | { 99 | get 100 | { 101 | if (HasAttributes) 102 | return Attributes["id"]; 103 | return string.Empty; 104 | } 105 | set 106 | { 107 | if (HasAttributes) 108 | Attributes["id"] = value; 109 | } 110 | } 111 | 112 | public NameValueCollection Attributes { get; internal set; } 113 | 114 | public bool HasAttributes 115 | { 116 | get { return Attributes != null && Attributes.Count > 0; } 117 | } 118 | 119 | public HElement ParentNode { get; set; } 120 | 121 | public IEnumerable Ancestors() 122 | { 123 | for (HElement node = this.ParentNode; node.ParentNode != null; node = node.ParentNode) 124 | yield return node.ParentNode; 125 | } 126 | 127 | public IEnumerable Ancestors(string name) 128 | { 129 | for (HElement n = this.ParentNode; n != null; n = n.ParentNode) 130 | { 131 | if (n.Name == name) 132 | yield return n; 133 | } 134 | } 135 | 136 | public string GetAttributeValue(string name, string def) 137 | { 138 | if (!HasAttributes) 139 | return def; 140 | var value = Attributes[name]; 141 | if (value == null) 142 | return def; 143 | return value; 144 | } 145 | 146 | public int GetAttributeValue(string name, int def) 147 | { 148 | if (!HasAttributes) 149 | return def; 150 | var value = Attributes[name]; 151 | if (value == null) 152 | return def; 153 | 154 | try 155 | { 156 | return Convert.ToInt32(value); 157 | } 158 | catch 159 | { 160 | return def; 161 | } 162 | } 163 | 164 | public bool GetAttributeValue(string name, bool def) 165 | { 166 | if (!HasAttributes) 167 | return def; 168 | var value = Attributes[name]; 169 | if (value == null) 170 | return def; 171 | 172 | try 173 | { 174 | return Convert.ToBoolean(value); 175 | } 176 | catch 177 | { 178 | return def; 179 | } 180 | } 181 | } 182 | 183 | public enum HtmlGenerationStyle 184 | { 185 | None, 186 | Indent 187 | } 188 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/HtmlLinqExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | 5 | namespace ScrapySharp.Html.Dom 6 | { 7 | public static class HtmlLinqExtensions 8 | { 9 | public static IEnumerable Descendants(this HContainer container, string name) 10 | { 11 | foreach (var element in container.Elements(name)) 12 | { 13 | yield return element; 14 | } 15 | 16 | foreach (var child in container.Children) 17 | { 18 | foreach (var element in child.Descendants(name)) 19 | { 20 | yield return element; 21 | } 22 | } 23 | } 24 | 25 | public static IEnumerable Descendants(this HContainer container) 26 | { 27 | foreach (var element in container.Children) 28 | { 29 | yield return element; 30 | } 31 | 32 | foreach (var child in container.Children) 33 | { 34 | foreach (var element in child.Descendants()) 35 | { 36 | yield return element; 37 | } 38 | } 39 | } 40 | 41 | public static IEnumerable Elements(this IEnumerable containers, string name) 42 | { 43 | return containers.SelectMany(c => c.Elements(name)); 44 | } 45 | 46 | public static IEnumerable Elements(this HContainer container, string name) 47 | { 48 | if (container.Children == null) 49 | return new HElement[0]; 50 | 51 | return container.Children.Where(c => c.Name.Equals(name, StringComparison.InvariantCultureIgnoreCase)); 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/IHSubContainer.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html.Dom 2 | { 3 | public interface IHSubContainer 4 | { 5 | 6 | } 7 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Dom/TagDeclaration.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Collections.Specialized; 3 | using ScrapySharp.Html.Parsing; 4 | 5 | namespace ScrapySharp.Html.Dom 6 | { 7 | public class TagDeclaration 8 | { 9 | public string InnerText { get; set; } 10 | 11 | public string Name { get; set; } 12 | 13 | public NameValueCollection Attributes { get; set; } 14 | 15 | public List Words { get; set; } 16 | 17 | public DeclarationType Type { get; set; } 18 | } 19 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/ElementFinder.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using HtmlAgilityPack; 5 | 6 | namespace ScrapySharp.Html 7 | { 8 | public class ElementFinder 9 | { 10 | private readonly HtmlNode html; 11 | private readonly ElementSearchKind searchKind; 12 | private readonly string tagName; 13 | private readonly string query; 14 | private readonly StringComparison comparisonType; 15 | 16 | internal ElementFinder(HtmlNode html, ElementSearchKind searchKind, string tagName, string query, StringComparison comparisonType = StringComparison.CurrentCulture) 17 | { 18 | this.html = html; 19 | this.searchKind = searchKind; 20 | this.tagName = tagName; 21 | this.query = query; 22 | this.comparisonType = comparisonType; 23 | } 24 | 25 | public IEnumerable FindElements() 26 | { 27 | switch (searchKind) 28 | { 29 | case ElementSearchKind.Text: 30 | return html.Descendants(tagName).Where(n => string.IsNullOrEmpty(n.InnerText) ? string.IsNullOrEmpty(query) : n.InnerText.Equals(query, comparisonType)); 31 | case ElementSearchKind.Id: 32 | return from n in html.Descendants(tagName) 33 | where string.IsNullOrEmpty(n.Id) ? string.IsNullOrEmpty(query) : n.Id.Equals(query, comparisonType) 34 | select n; 35 | case ElementSearchKind.Name: 36 | return from n in html.Descendants(tagName) 37 | let name = n.GetAttributeValue("name", string.Empty) 38 | where string.IsNullOrEmpty(name) ? string.IsNullOrEmpty(query) : name.Equals(query, comparisonType) 39 | select n; 40 | case ElementSearchKind.Class: 41 | return from n in html.Descendants(tagName) 42 | let @class = n.GetAttributeValue("class", string.Empty) 43 | let names = @class.Split(new []{' '}, StringSplitOptions.RemoveEmptyEntries) 44 | where names.Contains(query) 45 | select n; 46 | default: 47 | return new List(); 48 | } 49 | } 50 | } 51 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/ElementSearchKind.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html 2 | { 3 | public enum ElementSearchKind 4 | { 5 | Text, 6 | Id, 7 | Name, 8 | Class 9 | } 10 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/AgilityNodeParser.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Collections.Specialized; 3 | using System.Linq; 4 | using HtmlAgilityPack; 5 | using ScrapySharp.Extensions; 6 | 7 | namespace ScrapySharp.Html.Forms 8 | { 9 | internal class AgilityNodeParser : IHtmlNodeParser 10 | { 11 | private readonly HtmlNode node; 12 | 13 | public AgilityNodeParser(HtmlNode node) 14 | { 15 | this.node = node; 16 | 17 | Attributes = new NameValueCollection(); 18 | 19 | foreach (var attribute in node.Attributes) 20 | Attributes.Add(attribute.Name, attribute.Value); 21 | } 22 | 23 | public IEnumerable> CssSelect(string selector) 24 | { 25 | return node.CssSelect(selector).Select(n => new AgilityNodeParser(n)); 26 | } 27 | 28 | public string GetAttributeValue(string name) 29 | { 30 | return node.GetAttributeValue(name, string.Empty); 31 | } 32 | 33 | public NameValueCollection Attributes { get; private set; } 34 | 35 | public string InnerText 36 | { 37 | get { return node.InnerText; } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/FormField.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html.Forms 2 | { 3 | public class FormField 4 | { 5 | public string Name { get; set; } 6 | public string Value { get; set; } 7 | } 8 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/HElementFormParser.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using ScrapySharp.Extensions; 4 | using ScrapySharp.Html.Dom; 5 | 6 | namespace ScrapySharp.Html.Forms 7 | { 8 | public class HElementFormParser 9 | { 10 | public static List ParseFormFields(HElement html) 11 | { 12 | var hidden = from input in html.CssSelect("input") 13 | let value = input.GetAttributeValue("value", string.Empty) 14 | select new FormField 15 | { 16 | Name = input.GetAttributeValue("name", string.Empty), 17 | Value = string.IsNullOrEmpty(value) ? input.InnerText : value 18 | }; 19 | 20 | var selects = from @select in html.CssSelect("select") 21 | let name = @select.GetAttributeValue("name", string.Empty) 22 | let option = 23 | @select.CssSelect("option").FirstOrDefault(o => o.Attributes["selected"] != null) ?? 24 | @select.CssSelect("option").FirstOrDefault() 25 | let value = option.GetAttributeValue("value", string.Empty) 26 | select new FormField 27 | { 28 | Name = name, 29 | Value = string.IsNullOrEmpty(value) ? option.InnerText : value 30 | }; 31 | 32 | return hidden.Concat(selects).ToList(); 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/HElementNodeParser.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Collections.Specialized; 3 | using System.Linq; 4 | using ScrapySharp.Extensions; 5 | using ScrapySharp.Html.Dom; 6 | 7 | namespace ScrapySharp.Html.Forms 8 | { 9 | internal class HElementNodeParser : IHtmlNodeParser 10 | { 11 | private readonly HElement node; 12 | 13 | public HElementNodeParser(HElement node) 14 | { 15 | this.node = node; 16 | Attributes = node.Attributes; 17 | } 18 | 19 | public IEnumerable> CssSelect(string selector) 20 | { 21 | return node.CssSelect(selector).Select(n => new HElementNodeParser(n)); 22 | } 23 | 24 | public string GetAttributeValue(string name) 25 | { 26 | return node.GetAttributeValue(name, string.Empty); 27 | } 28 | 29 | public NameValueCollection Attributes { get; private set; } 30 | 31 | public string InnerText 32 | { 33 | get { return node.InnerText; } 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/HyperLink.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using HtmlAgilityPack; 3 | using ScrapySharp.Extensions; 4 | using ScrapySharp.Network; 5 | 6 | namespace ScrapySharp.Html.Forms 7 | { 8 | public class HyperLink 9 | { 10 | private readonly WebPage page; 11 | private readonly HtmlNode node; 12 | 13 | internal HyperLink(WebPage page, HtmlNode node) 14 | { 15 | this.page = page; 16 | this.node = node; 17 | } 18 | 19 | public string Text 20 | { 21 | get { return node.InnerText; } 22 | } 23 | 24 | public WebPage Click() 25 | { 26 | var href = node.GetAttributeValue("href", string.Empty); 27 | if (string.IsNullOrWhiteSpace(href)) 28 | return null; 29 | 30 | Uri url; 31 | if (Uri.TryCreate(href, UriKind.Absolute, out url)) 32 | return page.Browser.NavigateToPage(url, HttpVerb.Get, string.Empty); 33 | 34 | url = page.Browser.Referer.Combine(href); 35 | return page.Browser.NavigateToPage(url, HttpVerb.Get, string.Empty); 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/IHtmlNodeParser.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Collections.Specialized; 3 | 4 | namespace ScrapySharp.Html.Forms 5 | { 6 | internal interface IHtmlNodeParser 7 | { 8 | IEnumerable> CssSelect(string selector); 9 | string GetAttributeValue(string name); 10 | NameValueCollection Attributes { get; } 11 | string InnerText { get; } 12 | } 13 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/PageWebForm.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using System.Web; 7 | using HtmlAgilityPack; 8 | using ScrapySharp.Network; 9 | using ScrapySharp.Extensions; 10 | 11 | namespace ScrapySharp.Html.Forms 12 | { 13 | public class PageWebForm 14 | { 15 | private readonly HtmlNode html; 16 | private readonly ScrapingBrowser browser; 17 | private HttpVerb method; 18 | private string action; 19 | 20 | public PageWebForm(HtmlNode html, ScrapingBrowser browser) 21 | { 22 | this.html = html; 23 | this.browser = browser; 24 | Initialize(); 25 | } 26 | 27 | private void Initialize() 28 | { 29 | var nodeParser = new AgilityNodeParser(html); 30 | ParseAction(nodeParser); 31 | ParseMethod(nodeParser); 32 | 33 | FormFields = ParseFormFields(nodeParser); 34 | 35 | if (!FormFields.Any() && html.ParentNode != null) //forms can have a with agility pack 36 | { 37 | nodeParser = new AgilityNodeParser(html.ParentNode); 38 | FormFields = ParseFormFields(nodeParser); 39 | } 40 | } 41 | 42 | private void ParseMethod(IHtmlNodeParser nodeParser) 43 | { 44 | var value = nodeParser.GetAttributeValue("method"); 45 | 46 | if (!string.IsNullOrEmpty(value) && value.Equals("get")) 47 | method = HttpVerb.Get; 48 | else 49 | method = HttpVerb.Post; 50 | } 51 | 52 | private void ParseAction(IHtmlNodeParser nodeParser) 53 | { 54 | action = nodeParser.GetAttributeValue("action"); 55 | } 56 | 57 | internal static List ParseFormFields(IHtmlNodeParser node) 58 | { 59 | var inputs = from input in node.CssSelect("input") 60 | let value = input.GetAttributeValue("value") 61 | let type = input.GetAttributeValue("type") 62 | where type != "checkbox" && type != "radio" 63 | select new FormField 64 | { 65 | Name = input.GetAttributeValue("name"), 66 | Value = string.IsNullOrEmpty(value) ? input.InnerText : value 67 | }; 68 | 69 | var checkboxes = from input in node.CssSelect("input[type=checkbox]") 70 | let value = input.GetAttributeValue("value") 71 | where input.Attributes.AllKeys.Contains("checked") 72 | select new FormField 73 | { 74 | Name = input.GetAttributeValue("name"), 75 | Value = string.IsNullOrEmpty(value) ? input.InnerText : value 76 | }; 77 | 78 | var radios = from input in node.CssSelect("input[type=radio]") 79 | let value = input.GetAttributeValue("value") 80 | where input.Attributes.AllKeys.Contains("checked") 81 | select new FormField 82 | { 83 | Name = input.GetAttributeValue("name"), 84 | Value = string.IsNullOrEmpty(value) ? input.InnerText : value 85 | }; 86 | 87 | var selects = from @select in node.CssSelect("select") 88 | let name = @select.GetAttributeValue("name") 89 | let option = 90 | @select.CssSelect("option").FirstOrDefault(o => o.Attributes["selected"] != null) ?? 91 | @select.CssSelect("option").FirstOrDefault() 92 | let value = option.GetAttributeValue("value") 93 | select new FormField 94 | { 95 | Name = name, 96 | Value = string.IsNullOrEmpty(value) ? option.InnerText : value 97 | }; 98 | 99 | return inputs.Concat(selects).Concat(checkboxes).Concat(radios).ToList(); 100 | } 101 | 102 | public List FormFields { get; set; } 103 | 104 | public string SerializeFormFields() 105 | { 106 | var builder = new StringBuilder(); 107 | var fields = FormFields.ToArray(); 108 | 109 | for (int i = 0; i < fields.Length; i++) 110 | { 111 | if (string.IsNullOrWhiteSpace(fields[i].Name)) 112 | continue; 113 | 114 | if (i > 0) 115 | builder.Append('&'); 116 | builder.AppendFormat("{0}={1}", Uri.EscapeDataString(fields[i].Name), Uri.EscapeDataString(fields[i].Value)); 117 | } 118 | 119 | return builder.ToString(); 120 | } 121 | 122 | public string this[string key] 123 | { 124 | get 125 | { 126 | var field = FormFields.FirstOrDefault(f => f.Name == key); 127 | return field != null ? field.Value : null; 128 | } 129 | set 130 | { 131 | var field = FormFields.FirstOrDefault(f => f.Name == key); 132 | if (field != null) 133 | FormFields.Remove(field); 134 | 135 | FormFields.Add(new FormField { Name = key, Value = value }); 136 | } 137 | } 138 | 139 | public WebPage Submit(Uri url, HttpVerb verb) 140 | { 141 | return browser.NavigateToPage(url, verb, SerializeFormFields()); 142 | } 143 | 144 | public WebPage Submit(Uri url) 145 | { 146 | return browser.NavigateToPage(url, method, SerializeFormFields()); 147 | } 148 | 149 | public WebPage Submit() 150 | { 151 | Uri url; 152 | if (Uri.TryCreate(Action, UriKind.Absolute, out url)) 153 | { 154 | return browser.NavigateToPage(url, method, SerializeFormFields()); 155 | } 156 | 157 | url = browser.Referer.Combine(action); 158 | return browser.NavigateToPage(url, method, SerializeFormFields()); 159 | } 160 | 161 | public async Task SubmitAsync(Uri url, HttpVerb verb) 162 | { 163 | return await browser.NavigateToPageAsync(url, verb, SerializeFormFields()); 164 | } 165 | 166 | public async Task SubmitAsync(Uri url) 167 | { 168 | return await browser.NavigateToPageAsync(url, method, SerializeFormFields()); 169 | } 170 | 171 | public async Task SubmitAsync() 172 | { 173 | Uri url; 174 | if (Uri.TryCreate(Action, UriKind.Absolute, out url)) 175 | { 176 | return await browser.NavigateToPageAsync(url, method, SerializeFormFields()); 177 | } 178 | 179 | url = browser.Referer.Combine(action); 180 | return await browser.NavigateToPageAsync(url, method, SerializeFormFields()); 181 | } 182 | 183 | public HttpVerb Method 184 | { 185 | get { return method; } 186 | set { method = value; } 187 | } 188 | 189 | public string Action 190 | { 191 | get { return action; } 192 | set { action = value; } 193 | } 194 | 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /ScrapySharp/Html/Forms/WebForm.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Web; 6 | using HtmlAgilityPack; 7 | using ScrapySharp.Html.Dom; 8 | using ScrapySharp.Network; 9 | 10 | namespace ScrapySharp.Html.Forms 11 | { 12 | public class WebForm 13 | { 14 | private HttpVerb method; 15 | private string action; 16 | 17 | public WebForm() 18 | { 19 | FormFields = new List(); 20 | method = HttpVerb.Post; 21 | action = string.Empty; 22 | } 23 | 24 | public WebForm(HtmlNode html) 25 | { 26 | var nodeParser = new AgilityNodeParser(html); 27 | Initialize(nodeParser); 28 | } 29 | 30 | public WebForm(HElement html) 31 | { 32 | var nodeParser = new HElementNodeParser(html); 33 | Initialize(nodeParser); 34 | } 35 | 36 | private void Initialize(IHtmlNodeParser nodeParser) 37 | { 38 | FormFields = ParseFormFields(nodeParser); 39 | ParseAction(nodeParser); 40 | ParseMethod(nodeParser); 41 | } 42 | 43 | private void ParseMethod(IHtmlNodeParser nodeParser) 44 | { 45 | var value = nodeParser.GetAttributeValue("method"); 46 | 47 | if (!string.IsNullOrEmpty(value) && value.Equals("get")) 48 | method = HttpVerb.Get; 49 | else 50 | method = HttpVerb.Post; 51 | } 52 | 53 | private void ParseAction(IHtmlNodeParser nodeParser) 54 | { 55 | action = nodeParser.GetAttributeValue("action"); 56 | } 57 | 58 | internal static List ParseFormFields(IHtmlNodeParser node) 59 | { 60 | var inputs = from input in node.CssSelect("input") 61 | let value = input.GetAttributeValue("value") 62 | let type = input.GetAttributeValue("type") 63 | where type != "checkbox" && type != "radio" 64 | select new FormField 65 | { 66 | Name = input.GetAttributeValue("name"), 67 | Value = string.IsNullOrEmpty(value) ? input.InnerText : value 68 | }; 69 | 70 | var checkboxes = from input in node.CssSelect("input[type=checkbox]") 71 | let value = input.GetAttributeValue("value") 72 | where input.Attributes.AllKeys.Contains("checked") 73 | select new FormField 74 | { 75 | Name = input.GetAttributeValue("name"), 76 | Value = string.IsNullOrEmpty(value) ? input.InnerText : value 77 | }; 78 | 79 | var radios = from input in node.CssSelect("input[type=radio]") 80 | let value = input.GetAttributeValue("value") 81 | where input.Attributes.AllKeys.Contains("checked") 82 | select new FormField 83 | { 84 | Name = input.GetAttributeValue("name"), 85 | Value = string.IsNullOrEmpty(value) ? input.InnerText : value 86 | }; 87 | 88 | var selects = from @select in node.CssSelect("select") 89 | let name = @select.GetAttributeValue("name") 90 | let option = 91 | @select.CssSelect("option").FirstOrDefault(o => o.Attributes["selected"] != null) ?? 92 | @select.CssSelect("option").FirstOrDefault() 93 | let value = option.GetAttributeValue("value") 94 | select new FormField 95 | { 96 | Name = name, 97 | Value = string.IsNullOrEmpty(value) ? option.InnerText : value 98 | }; 99 | 100 | return inputs.Concat(selects).Concat(checkboxes).Concat(radios).ToList(); 101 | } 102 | 103 | public List FormFields { get; set; } 104 | 105 | public string SerializeFormFields() 106 | { 107 | var builder = new StringBuilder(); 108 | var fields = FormFields.ToArray(); 109 | 110 | for (int i = 0; i < fields.Length; i++) 111 | { 112 | if (string.IsNullOrWhiteSpace(fields[i].Name)) 113 | continue; 114 | 115 | if (i > 0) 116 | builder.Append('&'); 117 | builder.AppendFormat("{0}={1}", Uri.EscapeDataString(fields[i].Name), Uri.EscapeDataString(fields[i].Value)); 118 | } 119 | 120 | return builder.ToString(); 121 | } 122 | 123 | public string this[string key] 124 | { 125 | get 126 | { 127 | var field = FormFields.FirstOrDefault(f => f.Name == key); 128 | return field != null ? field.Value : null; 129 | } 130 | set 131 | { 132 | var field = FormFields.FirstOrDefault(f => f.Name == key); 133 | if (field != null) 134 | FormFields.Remove(field); 135 | 136 | FormFields.Add(new FormField { Name = key, Value = value }); 137 | } 138 | } 139 | 140 | public void Submit(ScrapingBrowser browser, Uri url, HttpVerb verb) 141 | { 142 | browser.NavigateTo(url, verb, SerializeFormFields()); 143 | } 144 | 145 | public void Submit(ScrapingBrowser browser, Uri url) 146 | { 147 | browser.NavigateTo(url, method, SerializeFormFields()); 148 | } 149 | 150 | public HttpVerb Method 151 | { 152 | get { return method; } 153 | set { method = value; } 154 | } 155 | 156 | public string Action 157 | { 158 | get { return action; } 159 | set { action = value; } 160 | } 161 | } 162 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Parsing/CodeReader.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | using System.Text; 3 | using ScrapySharp.Extensions; 4 | 5 | namespace ScrapySharp.Html.Parsing 6 | { 7 | public class CodeReader 8 | { 9 | private readonly string sourceCode; 10 | private readonly StringBuilder buffer; 11 | private int position; 12 | private CodeReadingContext context; 13 | 14 | private int lineNumber = 1; 15 | private int linePosition = 1; 16 | 17 | public CodeReader(string sourceCode) 18 | { 19 | if (sourceCode.EndsWith("\n")) 20 | this.sourceCode = sourceCode; 21 | else 22 | this.sourceCode = sourceCode + "\n"; 23 | 24 | buffer = new StringBuilder(); 25 | context = CodeReadingContext.None; 26 | } 27 | 28 | public int MaxWordCount 29 | { 30 | get { return sourceCode.Length; } 31 | } 32 | 33 | public Word ReadWord() 34 | { 35 | buffer.Remove(0, buffer.Length); 36 | var c = ReadChar(); 37 | 38 | //while (char.IsWhiteSpace(c)) 39 | // c = ReadChar(); 40 | 41 | if (char.IsWhiteSpace(c)) 42 | return new Word(c.ToString(CultureInfo.InvariantCulture), lineNumber, linePosition, false); 43 | 44 | if (context != CodeReadingContext.InQuotes && (c == Tokens.Quote || c == Tokens.SimpleQuote)) 45 | { 46 | context = CodeReadingContext.InQuotes; 47 | return ReadQuotedString(c); 48 | } 49 | 50 | buffer.Append(c); 51 | 52 | var letterOrDigit = IsLetterOrDigit(c); 53 | 54 | while (IsLetterOrDigit(GetNextChar()) == letterOrDigit && !char.IsWhiteSpace(GetNextChar()) && !GetNextChar().IsToken()) 55 | { 56 | c = ReadChar(); 57 | if (c == Tokens.Quote) 58 | { 59 | position--; 60 | break; 61 | } 62 | 63 | buffer.Append(c); 64 | 65 | //if (c.IsToken() && GetNextChar().IsToken()) 66 | // break; 67 | } 68 | 69 | return new Word(buffer.ToString(), lineNumber, linePosition, false); 70 | } 71 | 72 | private Word ReadQuotedString(char quoteChar) 73 | { 74 | var c = ReadChar(); 75 | 76 | while (!End && context == CodeReadingContext.InQuotes) 77 | { 78 | if (c == quoteChar) 79 | break; 80 | 81 | var nextChar = GetNextChar(); 82 | if (nextChar == Tokens.TagBegin || nextChar == Tokens.TagEnd) 83 | break; 84 | 85 | buffer.Append(c); 86 | 87 | if (c == Tokens.TagBegin || c == Tokens.TagEnd) 88 | break; 89 | 90 | c = ReadChar(); 91 | } 92 | 93 | context = CodeReadingContext.None; 94 | 95 | return new Word(buffer.ToString(), lineNumber, linePosition, true); 96 | } 97 | 98 | 99 | public char GetNextChar() 100 | { 101 | if (position >= sourceCode.Length) 102 | return (char)0; 103 | return sourceCode[position]; 104 | } 105 | 106 | public char GetPreviousChar() 107 | { 108 | if (position <= 1) 109 | return (char)0; 110 | return sourceCode[position - 2]; 111 | } 112 | 113 | public char ReadChar() 114 | { 115 | if (End) 116 | return (char)0; 117 | var c = sourceCode[position++]; 118 | linePosition++; 119 | 120 | if (c == '\n') 121 | { 122 | lineNumber++; 123 | linePosition = 1; 124 | } 125 | 126 | return c; 127 | } 128 | 129 | public bool End 130 | { 131 | get { return position >= sourceCode.Length; } 132 | } 133 | 134 | public int LineNumber 135 | { 136 | get { return lineNumber; } 137 | } 138 | 139 | public int LinePosition 140 | { 141 | get { return linePosition; } 142 | } 143 | 144 | 145 | public bool IsLetterOrDigit(char c) 146 | { 147 | return char.IsLetterOrDigit(c) || c == '-' || c == '_' 148 | || c == ':' || c == ';' || c == '+'; 149 | } 150 | } 151 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Parsing/CodeReadingContext.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html.Parsing 2 | { 3 | public enum CodeReadingContext 4 | { 5 | None, 6 | SearchingTag, 7 | InBeginTag, 8 | InTagContent, 9 | InTagEnd, 10 | InAttributeName, 11 | InAttributeValue, 12 | InQuotes 13 | } 14 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Parsing/HtmlDeclarationReader.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Collections.Specialized; 4 | using ScrapySharp.Extensions; 5 | using ScrapySharp.Html.Dom; 6 | using System.Linq; 7 | 8 | namespace ScrapySharp.Html.Parsing 9 | { 10 | public class HtmlDeclarationReader 11 | { 12 | private readonly List words; 13 | private int position; 14 | 15 | public HtmlDeclarationReader(CodeReader reader) 16 | { 17 | words = new List(); 18 | 19 | SkipSpaces = false; 20 | 21 | while (!reader.End) 22 | { 23 | var w = reader.ReadWord(); 24 | words.Add(w); 25 | } 26 | } 27 | 28 | public bool End 29 | { 30 | get { return position >= words.Count - 1; } 31 | } 32 | 33 | public TagDeclaration ReadTagDeclaration() 34 | { 35 | var w = ReadWord(); 36 | if (w == null) 37 | return null; 38 | 39 | if (w.IsToken() && (w == Tokens.TagBegin || w == Tokens.CloseTagDeclarator) && !GetNextWord().IsWhiteSpace) 40 | { 41 | if (w == Tokens.Doctype) 42 | { 43 | if (char.IsLetterOrDigit(GetNextWord(), 0)) 44 | return ReadDoctype(w); 45 | if (GetNextWord().Value != null && GetNextWord().Value.StartsWith("--")) 46 | return ReadComment(w); 47 | } 48 | 49 | var element = new TagDeclaration 50 | { 51 | Words = new List {w}, 52 | Attributes = new NameValueCollection() 53 | }; 54 | 55 | w = ReadWord(); 56 | element.Words.Add(w); 57 | element.Name = w; 58 | 59 | if (element.Name == Tokens.CloseTag) 60 | { 61 | w = ReadWord(); 62 | element.Words.Add(w); 63 | element.Name = w; 64 | } 65 | 66 | do 67 | { 68 | SkipSpaces = true; 69 | 70 | w = ReadWord(); 71 | element.Words.Add(w); 72 | if (IsTagDeclarationEnd(w)) 73 | break; 74 | var attributeName = w.Value; 75 | w = ReadWord(); 76 | element.Words.Add(w); 77 | if (IsTagDeclarationEnd(w)) 78 | { 79 | if (!attributeName.IsToken()) 80 | element.Attributes.Add(attributeName, attributeName); 81 | break; 82 | } 83 | 84 | if (w.Value == Tokens.Assign) 85 | { 86 | w = ReadWord(); 87 | element.Words.Add(w); 88 | if (IsTagDeclarationEnd(w)) 89 | break; 90 | element.Attributes.Add(attributeName, w.Value); 91 | } 92 | else 93 | element.Attributes.Add(attributeName, attributeName); 94 | 95 | } while (!End && w != Tokens.TagBegin && w != Tokens.TagEnd); 96 | 97 | SkipSpaces = false; 98 | element.Type = GetDeclarationType(element.Words); 99 | 100 | return element; 101 | } 102 | 103 | return ReadTextElement(w); 104 | } 105 | 106 | private TagDeclaration ReadComment(Word word) 107 | { 108 | var wordList = new List(); 109 | var w = word; 110 | 111 | wordList.Add(w); 112 | 113 | while (!End) 114 | { 115 | w = ReadWord(); 116 | 117 | if (w == "--" && GetNextWord() == Tokens.TagEnd) 118 | break; 119 | 120 | wordList.Add(w); 121 | } 122 | 123 | return new TagDeclaration 124 | { 125 | InnerText = string.Join(string.Empty, wordList.Skip(2).Select(i => i.QuotedValue)), 126 | Words = wordList, 127 | Type = DeclarationType.Comment, 128 | Name = "--" 129 | }; 130 | } 131 | 132 | private DeclarationType GetDeclarationType(List wordList) 133 | { 134 | if (wordList.Count < 3) 135 | return DeclarationType.TextElement; 136 | 137 | if (wordList.Last() != Tokens.TagEnd) 138 | return DeclarationType.TextElement; 139 | 140 | if (wordList[0] == Tokens.CloseTagDeclarator) 141 | return DeclarationType.CloseTag; 142 | 143 | if (wordList[0] == Tokens.TagBegin) 144 | { 145 | if (wordList[1] == Tokens.CloseTag) 146 | return DeclarationType.CloseTag; 147 | 148 | if (wordList[wordList.Count - 2] == Tokens.CloseTag) 149 | return DeclarationType.SelfClosedTag; 150 | 151 | return DeclarationType.OpenTag; 152 | } 153 | 154 | return DeclarationType.TextElement; 155 | } 156 | 157 | private TagDeclaration ReadDoctype(Word word) 158 | { 159 | var wordList = new List(); 160 | var w = word; 161 | 162 | wordList.Add(w); 163 | 164 | SkipSpaces = true; 165 | 166 | while (!End && GetNextWord() != Tokens.TagBegin && w != Tokens.TagEnd) 167 | { 168 | w = ReadWord(); 169 | wordList.Add(w); 170 | 171 | if (w.Value.Equals("DOCTYPE", StringComparison.InvariantCultureIgnoreCase)) 172 | SkipSpaces = false; 173 | } 174 | 175 | SkipSpaces = false; 176 | 177 | return new TagDeclaration 178 | { 179 | InnerText = string.Join(string.Empty, wordList.Select(i => i.QuotedValue)), 180 | Words = wordList, 181 | Type = DeclarationType.SelfClosedTag, 182 | Name = "DOCTYPE" 183 | }; 184 | } 185 | 186 | private TagDeclaration ReadTextElement(Word word) 187 | { 188 | var wordList = new List(); 189 | var w = word; 190 | 191 | wordList.Add(w); 192 | 193 | while (!End && GetNextWord() != Tokens.TagBegin && GetNextWord() != Tokens.TagEnd && GetNextWord() != Tokens.TagBegin) 194 | { 195 | w = ReadWord(); 196 | wordList.Add(w); 197 | } 198 | 199 | return new TagDeclaration 200 | { 201 | InnerText = string.Join(string.Empty, wordList.Select(i => i.QuotedValue)), 202 | Words = wordList, 203 | Type = DeclarationType.TextElement 204 | }; 205 | } 206 | 207 | private bool IsTagDeclarationEnd(Word w) 208 | { 209 | return End || w == Tokens.TagBegin || w == Tokens.TagEnd; 210 | } 211 | 212 | public Word GetNextWord(int count = 1) 213 | { 214 | if ((position + count - 1) >= words.Count) 215 | return null; 216 | return words[(position + count - 1)]; 217 | } 218 | 219 | public Word GetPreviousChar() 220 | { 221 | if (position <= 1) 222 | return null; 223 | return words[position - 2]; 224 | } 225 | 226 | public Word ReadWord() 227 | { 228 | if (SkipSpaces) 229 | { 230 | while (!End) 231 | { 232 | var w = words[position++]; 233 | if (!w.IsWhiteSpace) 234 | return w; 235 | } 236 | } 237 | 238 | return End ? null : words[position++]; 239 | } 240 | 241 | public bool SkipSpaces { get; set; } 242 | } 243 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Parsing/HtmlDomBuilder.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using ScrapySharp.Html.Dom; 4 | using System.Linq; 5 | 6 | namespace ScrapySharp.Html.Parsing 7 | { 8 | public class HtmlDomBuilder 9 | { 10 | private readonly List tags; 11 | 12 | public HtmlDomBuilder(HtmlDeclarationReader reader) 13 | { 14 | tags = new List(); 15 | 16 | while (!reader.End) 17 | { 18 | var d = reader.ReadTagDeclaration(); 19 | tags.Add(d); 20 | } 21 | } 22 | 23 | public IEnumerable BuildDom(List declarations, HElement parent) 24 | { 25 | for (var i = 0; i < declarations.Count; i++) 26 | { 27 | var declaration = declarations[i]; 28 | 29 | if (declaration.Type == DeclarationType.Comment) 30 | { 31 | yield return new HComment 32 | { 33 | Name = "!--", 34 | InnerText = declaration.InnerText 35 | }; 36 | } 37 | 38 | if (declaration.Type == DeclarationType.OpenTag) 39 | { 40 | var openning = 1; 41 | var closing = 0; 42 | var start = i; 43 | 44 | while (closing < openning && i < declarations.Count) 45 | { 46 | if (i >= declarations.Count - 1) 47 | break; 48 | var current = declarations[++i]; 49 | if (current.Type == DeclarationType.CloseTag && current.Name == declaration.Name) 50 | closing++; 51 | if (current.Type == DeclarationType.OpenTag && current.Name == declaration.Name) 52 | openning++; 53 | 54 | if (openning == closing) 55 | { 56 | var childrenTags = declarations.Skip(start+1).Take(i - start - 1).ToList(); 57 | 58 | var element = new HElement 59 | { 60 | Name = declaration.Name, 61 | Attributes = declaration.Attributes, 62 | InnerText = declaration.InnerText, 63 | ParentNode = parent 64 | }; 65 | var children = declarations.Count > childrenTags.Count ? BuildDom(childrenTags, element).ToList() : new List(); 66 | 67 | element.Children = children; 68 | 69 | yield return element; 70 | break; 71 | } 72 | } 73 | 74 | if (openning != closing) 75 | { 76 | var childrenTags = declarations.Skip(start + 1).Take(i - start - 1).ToList(); 77 | 78 | yield return new HElement 79 | { 80 | Name = declaration.Name, 81 | Attributes = declaration.Attributes, 82 | InnerText = declaration.InnerText, 83 | Children = declarations.Count > childrenTags.Count ? BuildDom(childrenTags, parent).ToList() : new List(), 84 | ParentNode = parent 85 | }; 86 | } 87 | } 88 | 89 | if (declaration.Type == DeclarationType.TextElement || declaration.Type == DeclarationType.SelfClosedTag) 90 | yield return new HElement 91 | { 92 | InnerText = declaration.InnerText, 93 | Name = declaration.Name, 94 | Attributes = declaration.Attributes, 95 | ParentNode = parent 96 | }; 97 | } 98 | } 99 | 100 | public IEnumerable BuildDom() 101 | { 102 | return BuildDom(tags, null); 103 | } 104 | } 105 | } -------------------------------------------------------------------------------- /ScrapySharp/Html/Parsing/Tokens.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Html.Parsing 2 | { 3 | public class Tokens 4 | { 5 | public const char TagBegin = '<'; 6 | public const char TagEnd = '>'; 7 | 8 | public const char Quote = '\"'; 9 | public const char SimpleQuote = '\''; 10 | 11 | public const string CloseTag = "/"; 12 | 13 | public const string CommentBegin = "!--"; 14 | public const string CommentEnd = "--"; 15 | 16 | public const string Assign = "="; 17 | 18 | public const string CloseTagDeclarator = "[^=]+)=(?[^;]+)?[,;]+", RegexOptions.Compiled); 12 | 13 | public CookiesParser(string defaultDomain) 14 | { 15 | this.defaultDomain = defaultDomain; 16 | } 17 | 18 | public List> ParseValuePairs(string cookiesExpression) 19 | { 20 | var list = new List>(); 21 | 22 | var match = splitCookiesRegex.Match(cookiesExpression); 23 | 24 | while (match.Success) 25 | { 26 | if (match.Groups["name"].Success && match.Groups["val"].Success) 27 | { 28 | try 29 | { 30 | list.Add(new KeyValuePair(match.Groups["name"].Value, match.Groups["val"].Value)); 31 | } 32 | catch (CookieException) { } 33 | } 34 | match = match.NextMatch(); 35 | } 36 | 37 | return list; 38 | } 39 | 40 | public List ParseCookies(string cookiesExpression) 41 | { 42 | var cookies = new List(); 43 | var keyValuePairs = ParseValuePairs(cookiesExpression); 44 | 45 | for (int i = 0; i < keyValuePairs.Count; i++) 46 | { 47 | var pair = keyValuePairs[i]; 48 | if (pair.Key.Equals("path", StringComparison.InvariantCultureIgnoreCase) 49 | || pair.Key.Equals("domain", StringComparison.InvariantCultureIgnoreCase) 50 | || pair.Key.Equals("expires", StringComparison.InvariantCultureIgnoreCase)) 51 | continue; 52 | 53 | var name = pair.Key; 54 | var value = pair.Value; 55 | string path = null; 56 | string domain = null; 57 | 58 | var next1 = i + 1; 59 | if (next1 < keyValuePairs.Count) 60 | { 61 | if (keyValuePairs[next1].Key.Equals("path", StringComparison.InvariantCultureIgnoreCase)) 62 | path = keyValuePairs[next1].Value; 63 | if (keyValuePairs[next1].Key.Equals("domain", StringComparison.InvariantCultureIgnoreCase)) 64 | domain = keyValuePairs[next1].Value; 65 | } 66 | 67 | var next2 = i + 2; 68 | if (next2 < keyValuePairs.Count) 69 | { 70 | if (keyValuePairs[next2].Key.Equals("path", StringComparison.InvariantCultureIgnoreCase)) 71 | path = keyValuePairs[next2].Value; 72 | if (keyValuePairs[next2].Key.Equals("domain", StringComparison.InvariantCultureIgnoreCase)) 73 | domain = keyValuePairs[next2].Value; 74 | } 75 | 76 | if (string.IsNullOrEmpty(domain) && !string.IsNullOrEmpty(path)) 77 | cookies.Add(new Cookie(name, value, path, defaultDomain)); 78 | else if (!string.IsNullOrEmpty(domain) && !string.IsNullOrEmpty(path)) 79 | cookies.Add(new Cookie(name, value, path, domain)); 80 | else 81 | cookies.Add(new Cookie(name, value, "/", defaultDomain)); 82 | } 83 | 84 | return cookies; 85 | } 86 | } 87 | } -------------------------------------------------------------------------------- /ScrapySharp/Network/FakeUserAgent.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Network 2 | { 3 | public class FakeUserAgent 4 | { 5 | private string name; 6 | private string userAgent; 7 | 8 | public FakeUserAgent(string name, string userAgent) 9 | { 10 | this.name = name; 11 | this.userAgent = userAgent; 12 | } 13 | 14 | public string Name 15 | { 16 | get { return name; } 17 | } 18 | 19 | public string UserAgent 20 | { 21 | get { return userAgent; } 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /ScrapySharp/Network/FakeUserAgents.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Network 2 | { 3 | public static class FakeUserAgents 4 | { 5 | public static readonly FakeUserAgent Chrome = new FakeUserAgent("Chrome", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.98 Safari/534.13"); 6 | public static readonly FakeUserAgent Chrome24 = new FakeUserAgent("Chrome", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17"); 7 | public static readonly FakeUserAgent InternetExplorer8 = new FakeUserAgent("Internet Explorer 8", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; CMDTDF; .NET4.0C; .NET4.0E)"); 8 | } 9 | } -------------------------------------------------------------------------------- /ScrapySharp/Network/HttpVerb.cs: -------------------------------------------------------------------------------- 1 | namespace ScrapySharp.Network 2 | { 3 | public enum HttpVerb 4 | { 5 | Get, 6 | Head, 7 | Post, 8 | Put, 9 | Delete, 10 | Trace, 11 | Options 12 | } 13 | } -------------------------------------------------------------------------------- /ScrapySharp/Network/RawRequest.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace ScrapySharp.Network 6 | { 7 | public class RawRequest 8 | { 9 | internal RawRequest(string verb, Uri url, Version httpVersion, List> headers, byte[] body, Encoding encoding) 10 | { 11 | Encoding = encoding; 12 | Verb = verb; 13 | Url = url; 14 | HttpVersion = httpVersion; 15 | Headers = headers; 16 | Body = body; 17 | } 18 | 19 | public string Verb { get; private set; } 20 | public Uri Url { get; private set; } 21 | public Version HttpVersion { get; private set; } 22 | public List> Headers { get; private set; } 23 | public byte[] Body { get; private set; } 24 | public Encoding Encoding { get; private set; } 25 | 26 | public override string ToString() 27 | { 28 | var builder = new StringBuilder(); 29 | builder.AppendFormat("{0} {1} HTTP/{2}.{3}\r\n", Verb, Url, HttpVersion.Major, HttpVersion.Minor); 30 | 31 | foreach (var header in Headers) 32 | builder.AppendFormat("{0}: {1}\r\n", header.Key, header.Value); 33 | 34 | builder.AppendFormat("\r\n"); 35 | 36 | if (Body != null && Body.Length > 0) 37 | builder.AppendFormat("{0}\r\n", Encoding.ASCII.GetString(Body)); 38 | 39 | return builder.ToString(); 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /ScrapySharp/Network/RawResponse.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Collections.Specialized; 4 | using System.Linq; 5 | using System.Net; 6 | using System.Text; 7 | 8 | namespace ScrapySharp.Network 9 | { 10 | public class RawResponse 11 | { 12 | internal RawResponse(Version httpVersion, HttpStatusCode statusCode, string statusDescription, NameValueCollection headers, byte[] body, Encoding encoding) 13 | { 14 | Encoding = encoding; 15 | HttpVersion = httpVersion; 16 | StatusCode = (int)statusCode; 17 | StatusDescription = statusDescription; 18 | Body = body; 19 | Headers = headers.AllKeys.Select(k => new KeyValuePair(k, headers[k])).ToList(); 20 | } 21 | 22 | public Version HttpVersion { get; private set; } 23 | public int StatusCode { get; private set; } 24 | public string StatusDescription { get; private set; } 25 | public List> Headers { get; private set; } 26 | public byte[] Body { get; private set; } 27 | public Encoding Encoding { get; private set; } 28 | 29 | public override string ToString() 30 | { 31 | var builder = new StringBuilder(); 32 | builder.AppendFormat("HTTP/{0}.{1} {2} {3}\r\n", HttpVersion.Major, HttpVersion.Minor, StatusCode, StatusDescription); 33 | 34 | foreach (var header in Headers) 35 | builder.AppendFormat("{0}: {1}\r\n", header.Key, header.Value); 36 | builder.AppendFormat("\r\n"); 37 | 38 | if (Body != null && Body.Length > 0) 39 | builder.AppendFormat("{0}\r\n", Encoding.ASCII.GetString(Body)); 40 | 41 | return builder.ToString(); 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /ScrapySharp/Network/WebResource.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | 4 | namespace ScrapySharp.Network 5 | { 6 | public class WebResource : IDisposable 7 | { 8 | private readonly MemoryStream content; 9 | private readonly string lastModified; 10 | private readonly Uri absoluteUrl; 11 | private readonly bool forceDownload; 12 | private readonly string contentType; 13 | 14 | public WebResource(MemoryStream content, string lastModified, Uri absoluteUrl, bool forceDownload, string contentType) 15 | { 16 | this.content = content; 17 | this.lastModified = lastModified; 18 | this.absoluteUrl = absoluteUrl; 19 | this.forceDownload = forceDownload; 20 | this.contentType = contentType; 21 | } 22 | 23 | public void Dispose() 24 | { 25 | content.Dispose(); 26 | } 27 | 28 | public MemoryStream Content 29 | { 30 | get { return content; } 31 | } 32 | 33 | public string LastModified 34 | { 35 | get { return lastModified; } 36 | } 37 | 38 | public Uri AbsoluteUrl 39 | { 40 | get { return absoluteUrl; } 41 | } 42 | 43 | public bool ForceDownload 44 | { 45 | get { return forceDownload; } 46 | } 47 | 48 | public string ContentType 49 | { 50 | get { return contentType; } 51 | } 52 | 53 | public string GetTextContent() 54 | { 55 | content.Position = 0; 56 | using (var reader = new StreamReader(content)) 57 | return reader.ReadToEnd(); 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /ScrapySharp/ScrapySharp.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | netstandard2.0 4 | $(TargetsForTfmSpecificBuildOutput);IncludeP2PAssets 5 | 6 | ScrapySharp 7 | Scraping Framework containing : 8 | - a web client able to simulate a web browser. 9 | - an HtmlAgilityPack extension to select elements using css selector (like JQuery) 10 | Copyright 2011 Romain Flechner 11 | fr-FR 12 | Romain Flechner 13 | Library 14 | ScrapySharp 15 | Scraping;html;css;linq;agility 16 | 17 | 18 | https://github.com/rflechner/ScrapySharp 19 | https://github.com/rflechner/ScrapySharp/blob/master/LICENSE 20 | true 21 | hg 22 | https://github.com/rflechner/ScrapySharp.git 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /build.ps1: -------------------------------------------------------------------------------- 1 | # functions coming from https://blogs.msdn.microsoft.com/jaredpar/2009/01/16/powershell-linq-take-count-and-take-while/ 2 | #============================================================================ 3 | # Take count elements fro the pipeline 4 | #============================================================================ 5 | function Take-Count() { 6 | param ( [int]$count = $(throw "Need a count") ) 7 | begin { 8 | $total = 0; 9 | } 10 | process { 11 | if ( $total -lt $count ) { 12 | $_ 13 | } 14 | $total += 1 15 | } 16 | } 17 | 18 | #============================================================================ 19 | # Take elements from the pipeline while the predicate is true 20 | #============================================================================ 21 | function Take-While() { 22 | param ( [scriptblock]$pred = $(throw "Need a predicate") ) 23 | begin { 24 | $take = $true 25 | } 26 | process { 27 | if ( $take ) { 28 | $take = & $pred $_ 29 | if ( $take ) { 30 | $_ 31 | } 32 | } 33 | } 34 | } 35 | 36 | $lines = Get-Content .\ReleaseNotes.md | Where-Object { $_.Length -gt 0 } 37 | $top = $lines | Select-Object -First 1 38 | $version = $top.Replace("#", "").Trim() 39 | $releaseNotes = "" 40 | 41 | $notes = $lines | Select-Object -Skip 1 | Take-While { -not $_.Trim().StartsWith("#") } 42 | 43 | foreach ($note in $notes) { 44 | $releaseNotes += $note + "`n" 45 | } 46 | 47 | dotnet restore 48 | dotnet test ScrapySharp.Tests\ScrapySharp.Tests.csproj 49 | dotnet build --configuration release 50 | dotnet pack --configuration release /p:PackageVersion=$version /p:PackageReleaseNotes=$releaseNotes 51 | if (Test-Path .\release) 52 | { 53 | Remove-Item -Recurse -Force -Path release 54 | } 55 | mkdir release 56 | xcopy .\ScrapySharp\bin\Release\*.nupkg release 57 | Remove-Item .\ScrapySharp\bin\**\*.nupkg 58 | Remove-Item .\ScrapySharp.Core\bin\**\*.nupkg 59 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ScrapySharp - .Net scraping tools 5 | 6 | 7 | 8 | 9 |

ScrapySharp - .Net scraping tools

10 | 11 |

12 | 13 | Content comming soon ... 14 | 15 |

16 | 17 | 18 | --------------------------------------------------------------------------------