├── .codecov.yml
├── .editorconfig
├── .github
├── FUNDING.yml
├── dependabot.yml
├── release-drafter.yml
└── workflows
│ ├── build.yml
│ └── release-drafter.yml
├── .gitignore
├── CodeCoverage.runsettings
├── InfinityCrawler.sln
├── License.txt
├── README.md
├── images
└── icon.png
├── src
├── Directory.Build.props
└── InfinityCrawler
│ ├── CrawlLink.cs
│ ├── CrawlResult.cs
│ ├── CrawlSettings.cs
│ ├── CrawledUri.cs
│ ├── Crawler.cs
│ ├── InfinityCrawler.csproj
│ ├── Internal
│ ├── CrawlRunner.cs
│ └── UriExtensions.cs
│ ├── Processing
│ ├── Content
│ │ ├── CrawlHeaders.cs
│ │ ├── DefaultContentProcessor.cs
│ │ └── IContentProcessor.cs
│ └── Requests
│ │ ├── DefaultRequestProcessor.cs
│ │ ├── IRequestProcessor.cs
│ │ ├── RequestContext.cs
│ │ ├── RequestProcessorOptions.cs
│ │ └── RequestResult.cs
│ └── UriCrawlState.cs
└── tests
├── InfinityCrawler.Tests.Benchmarks
├── BasicSiteCrawlBenchmark.cs
├── InfinityCrawler.Tests.Benchmarks.csproj
└── Program.cs
├── InfinityCrawler.Tests.TestSite
├── Controllers
│ ├── HelperController.cs
│ └── RobotsController.cs
├── InfinityCrawler.Tests.TestSite.csproj
├── Resources
│ ├── BasicSite
│ │ ├── basic-page.html
│ │ ├── index.html
│ │ ├── looping-links.html
│ │ ├── robots-blocked-child.html
│ │ ├── robots-blocked.html
│ │ └── robots.txt
│ ├── DefaultContentProcessor
│ │ ├── AbsoluteCanonicalUri.html
│ │ ├── BaseHrefCrawlLink.html
│ │ ├── CrawlLinkContent.html
│ │ ├── MetaNoFollow.html
│ │ ├── MetaNoIndex.html
│ │ ├── MetaNoIndexNoFollow.html
│ │ ├── MetaNone.html
│ │ ├── NoCanonicalUri.html
│ │ └── RelativeCanonicalUri.html
│ ├── DefaultRequestProcessor
│ │ └── index.html
│ └── EmptySite
│ │ └── readme.txt
├── SiteContext.cs
├── Startup.cs
├── TestHttpMessageHandler.cs
└── TestSiteManager.cs
└── InfinityCrawler.Tests
├── BasicSiteTests.cs
├── ContentProcessorTestBase.cs
├── CrawlerTestBase.cs
├── DefaultContentProcessorTests.cs
├── DefaultRequestProcessorTests.cs
├── ExpectedExceptionPatternAttribute.cs
├── InfinityCrawler.Tests.csproj
├── TestBase.cs
└── TestSiteConfiguration.cs
/.codecov.yml:
--------------------------------------------------------------------------------
1 | comment: off
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # Based on the EditorConfig from Roslyn
2 | # top-most EditorConfig file
3 | root = true
4 |
5 | [*.cs]
6 | indent_style = tab
7 |
8 | # Sort using and Import directives with System.* appearing first
9 | dotnet_sort_system_directives_first = true
10 | # Avoid "this." and "Me." if not necessary
11 | dotnet_style_qualification_for_field = false:suggestion
12 | dotnet_style_qualification_for_property = false:suggestion
13 | dotnet_style_qualification_for_method = false:suggestion
14 | dotnet_style_qualification_for_event = false:suggestion
15 |
16 | # Use language keywords instead of framework type names for type references
17 | dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion
18 | dotnet_style_predefined_type_for_member_access = true:suggestion
19 |
20 | # Suggest more modern language features when available
21 | dotnet_style_object_initializer = true:suggestion
22 | dotnet_style_collection_initializer = true:suggestion
23 | dotnet_style_coalesce_expression = true:suggestion
24 | dotnet_style_null_propagation = true:suggestion
25 | dotnet_style_explicit_tuple_names = true:suggestion
26 |
27 | # Prefer "var" everywhere
28 | csharp_style_var_for_built_in_types = true:suggestion
29 | csharp_style_var_when_type_is_apparent = true:suggestion
30 | csharp_style_var_elsewhere = true:suggestion
31 |
32 | # Prefer method-like constructs to have a block body
33 | csharp_style_expression_bodied_methods = false:none
34 | csharp_style_expression_bodied_constructors = false:none
35 | csharp_style_expression_bodied_operators = false:none
36 |
37 | # Prefer property-like constructs to have an expression-body
38 | csharp_style_expression_bodied_properties = when_on_single_line:suggestion
39 | csharp_style_expression_bodied_indexers = true:none
40 | csharp_style_expression_bodied_accessors = when_on_single_line:suggestion
41 |
42 | # Suggest more modern language features when available
43 | csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion
44 | csharp_style_pattern_matching_over_as_with_null_check = true:suggestion
45 | csharp_style_inlined_variable_declaration = true:suggestion
46 | csharp_style_throw_expression = true:suggestion
47 | csharp_style_conditional_delegate_call = true:suggestion
48 |
49 | # Newline settings
50 | csharp_new_line_before_open_brace = all
51 | csharp_new_line_before_else = true
52 | csharp_new_line_before_catch = true
53 | csharp_new_line_before_finally = true
54 | csharp_new_line_before_members_in_object_initializers = true
55 | csharp_new_line_before_members_in_anonymous_types = true
56 |
57 | # Misc
58 | csharp_space_after_keywords_in_control_flow_statements = true
59 | csharp_space_between_method_declaration_parameter_list_parentheses = false
60 | csharp_space_between_method_call_parameter_list_parentheses = false
61 | csharp_space_between_parentheses = false
62 | csharp_preserve_single_line_statements = false
63 | csharp_preserve_single_line_blocks = true
64 | csharp_indent_case_contents = true
65 | csharp_indent_switch_labels = true
66 | csharp_indent_labels = no_change
67 |
68 | # Custom naming conventions
69 | dotnet_naming_rule.non_field_members_must_be_capitalized.symbols = non_field_member_symbols
70 | dotnet_naming_symbols.non_field_member_symbols.applicable_kinds = property,method,event,delegate
71 | dotnet_naming_symbols.non_field_member_symbols.applicable_accessibilities = *
72 |
73 | dotnet_naming_rule.non_field_members_must_be_capitalized.style = pascal_case_style
74 | dotnet_naming_style.pascal_case_style.capitalization = pascal_case
75 |
76 | dotnet_naming_rule.non_field_members_must_be_capitalized.severity = suggestion
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: Turnerj
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: nuget
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | open-pull-requests-limit: 10
8 |
--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
1 | name-template: '$RESOLVED_VERSION'
2 | tag-template: '$RESOLVED_VERSION'
3 | categories:
4 | - title: '🚀 Features'
5 | labels:
6 | - 'feature'
7 | - 'enhancement'
8 | - title: '🐛 Bug Fixes'
9 | labels:
10 | - 'bug'
11 | - 'bugfix'
12 | - title: '🧰 Maintenance'
13 | labels:
14 | - 'dependencies'
15 | - 'maintenance'
16 | change-template: '- $TITLE by @$AUTHOR (#$NUMBER)'
17 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
18 | version-resolver:
19 | major:
20 | labels:
21 | - 'major'
22 | minor:
23 | labels:
24 | - 'minor'
25 | patch:
26 | labels:
27 | - 'patch'
28 | default: patch
29 | template: |
30 | ## Changes
31 |
32 | $CHANGES
33 |
34 | ## 👨🏼‍💻 Contributors
35 |
36 | $CONTRIBUTORS
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | pull_request:
7 | release:
8 | types: [ published ]
9 |
10 | env:
11 | # Disable the .NET logo in the console output.
12 | DOTNET_NOLOGO: true
13 | # Disable the .NET first time experience to skip caching NuGet packages and speed up the build.
14 | DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
15 | # Disable sending .NET CLI telemetry to Microsoft.
16 | DOTNET_CLI_TELEMETRY_OPTOUT: true
17 |
18 | BUILD_ARTIFACT_PATH: ${{github.workspace}}/build-artifacts
19 |
20 | jobs:
21 |
22 | build:
23 | name: Build ${{matrix.os}}
24 | runs-on: ${{matrix.os}}
25 | strategy:
26 | matrix:
27 | os: [ubuntu-latest, windows-latest, macOS-latest]
28 | steps:
29 | - name: Checkout
30 | uses: actions/checkout@v2
31 | - name: Setup dotnet 3.1 5.0 6.0
32 | uses: actions/setup-dotnet@v1
33 | with:
34 | dotnet-version: |
35 | 3.1.x
36 | 5.0.x
37 | 6.0.x
38 | - name: Install dependencies
39 | run: dotnet restore
40 | - name: Build
41 | run: dotnet build --no-restore -c Release /p:ContinuousIntegrationBuild=true
42 | - name: Test with Coverage
43 | run: dotnet test --no-restore --logger trx --results-directory ${{env.BUILD_ARTIFACT_PATH}}/coverage --collect "XPlat Code Coverage" --settings CodeCoverage.runsettings /p:SkipBuildVersioning=true
44 | - name: Pack
45 | run: dotnet pack --no-build -c Release /p:PackageOutputPath=${{env.BUILD_ARTIFACT_PATH}} /p:ContinuousIntegrationBuild=true
46 | - name: Publish artifacts
47 | uses: actions/upload-artifact@v2
48 | with:
49 | name: ${{matrix.os}}
50 | path: ${{env.BUILD_ARTIFACT_PATH}}
51 |
52 | coverage:
53 | name: Process code coverage
54 | runs-on: ubuntu-latest
55 | needs: build
56 | steps:
57 | - name: Checkout
58 | uses: actions/checkout@v2
59 | - name: Download coverage reports
60 | uses: actions/download-artifact@v2
61 | - name: Install ReportGenerator tool
62 | run: dotnet tool install -g dotnet-reportgenerator-globaltool
63 | - name: Prepare coverage reports
64 | run: reportgenerator -reports:*/coverage/*/coverage.cobertura.xml -targetdir:./ -reporttypes:Cobertura
65 | - name: Upload coverage report
66 | uses: codecov/codecov-action@v1.0.13
67 | with:
68 | file: Cobertura.xml
69 | fail_ci_if_error: false
70 | - name: Save combined coverage report as artifact
71 | uses: actions/upload-artifact@v2
72 | with:
73 | name: coverage-report
74 | path: Cobertura.xml
75 |
76 | push-to-github-packages:
77 | name: 'Push GitHub Packages'
78 | needs: build
79 | if: github.ref == 'refs/heads/main' || github.event_name == 'release'
80 | environment:
81 | name: 'GitHub Packages'
82 | url: https://github.com/TurnerSoftware/InfinityCrawler/packages
83 | permissions:
84 | packages: write
85 | runs-on: ubuntu-latest
86 | steps:
87 | - name: 'Download build'
88 | uses: actions/download-artifact@v2
89 | with:
90 | name: 'ubuntu-latest'
91 | - name: 'Add NuGet source'
92 | run: dotnet nuget add source https://nuget.pkg.github.com/TurnerSoftware/index.json --name GitHub --username Turnerj --password ${{secrets.GITHUB_TOKEN}} --store-password-in-clear-text
93 | - name: 'Upload NuGet package'
94 | run: dotnet nuget push *.nupkg --api-key ${{secrets.GH_PACKAGE_REGISTRY_API_KEY}} --source GitHub --skip-duplicate
95 |
96 | push-to-nuget:
97 | name: 'Push NuGet Packages'
98 | needs: build
99 | if: github.event_name == 'release'
100 | environment:
101 | name: 'NuGet'
102 | url: https://www.nuget.org/packages/InfinityCrawler
103 | runs-on: ubuntu-latest
104 | steps:
105 | - name: 'Download build'
106 | uses: actions/download-artifact@v2
107 | with:
108 | name: 'ubuntu-latest'
109 | - name: 'Upload NuGet package and symbols'
110 | run: dotnet nuget push *.nupkg --source https://api.nuget.org/v3/index.json --skip-duplicate --api-key ${{secrets.NUGET_API_KEY}}
111 |
--------------------------------------------------------------------------------
/.github/workflows/release-drafter.yml:
--------------------------------------------------------------------------------
1 | name: Release Drafter
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | update_release_draft:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: release-drafter/release-drafter@v5
13 | env:
14 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.suo
8 | *.user
9 | *.userosscache
10 | *.sln.docstates
11 |
12 | # User-specific files (MonoDevelop/Xamarin Studio)
13 | *.userprefs
14 |
15 | # Build results
16 | [Dd]ebug/
17 | [Dd]ebugPublic/
18 | [Rr]elease/
19 | [Rr]eleases/
20 | x64/
21 | x86/
22 | bld/
23 | [Bb]in/
24 | [Oo]bj/
25 | [Ll]og/
26 |
27 | # Visual Studio 2015/2017 cache/options directory
28 | .vs/
29 | # Uncomment if you have tasks that create the project's static files in wwwroot
30 | #wwwroot/
31 |
32 | # Visual Studio 2017 auto generated files
33 | Generated\ Files/
34 |
35 | # MSTest test Results
36 | [Tt]est[Rr]esult*/
37 | [Bb]uild[Ll]og.*
38 |
39 | # NUNIT
40 | *.VisualState.xml
41 | TestResult.xml
42 |
43 | # Build Results of an ATL Project
44 | [Dd]ebugPS/
45 | [Rr]eleasePS/
46 | dlldata.c
47 |
48 | # Benchmark Results
49 | BenchmarkDotNet.Artifacts/
50 |
51 | # .NET Core
52 | project.lock.json
53 | project.fragment.lock.json
54 | artifacts/
55 | **/Properties/launchSettings.json
56 |
57 | # StyleCop
58 | StyleCopReport.xml
59 |
60 | # Files built by Visual Studio
61 | *_i.c
62 | *_p.c
63 | *_i.h
64 | *.ilk
65 | *.meta
66 | *.obj
67 | *.iobj
68 | *.pch
69 | *.pdb
70 | *.ipdb
71 | *.pgc
72 | *.pgd
73 | *.rsp
74 | *.sbr
75 | *.tlb
76 | *.tli
77 | *.tlh
78 | *.tmp
79 | *.tmp_proj
80 | *.log
81 | *.vspscc
82 | *.vssscc
83 | .builds
84 | *.pidb
85 | *.svclog
86 | *.scc
87 |
88 | # Chutzpah Test files
89 | _Chutzpah*
90 |
91 | # Visual C++ cache files
92 | ipch/
93 | *.aps
94 | *.ncb
95 | *.opendb
96 | *.opensdf
97 | *.sdf
98 | *.cachefile
99 | *.VC.db
100 | *.VC.VC.opendb
101 |
102 | # Visual Studio profiler
103 | *.psess
104 | *.vsp
105 | *.vspx
106 | *.sap
107 |
108 | # Visual Studio Trace Files
109 | *.e2e
110 |
111 | # TFS 2012 Local Workspace
112 | $tf/
113 |
114 | # Guidance Automation Toolkit
115 | *.gpState
116 |
117 | # ReSharper is a .NET coding add-in
118 | _ReSharper*/
119 | *.[Rr]e[Ss]harper
120 | *.DotSettings.user
121 |
122 | # JustCode is a .NET coding add-in
123 | .JustCode
124 |
125 | # TeamCity is a build add-in
126 | _TeamCity*
127 |
128 | # DotCover is a Code Coverage Tool
129 | *.dotCover
130 |
131 | # AxoCover is a Code Coverage Tool
132 | .axoCover/*
133 | !.axoCover/settings.json
134 |
135 | # Visual Studio code coverage results
136 | *.coverage
137 | *.coveragexml
138 |
139 | # NCrunch
140 | _NCrunch_*
141 | .*crunch*.local.xml
142 | nCrunchTemp_*
143 |
144 | # MightyMoose
145 | *.mm.*
146 | AutoTest.Net/
147 |
148 | # Web workbench (sass)
149 | .sass-cache/
150 |
151 | # Installshield output folder
152 | [Ee]xpress/
153 |
154 | # DocProject is a documentation generator add-in
155 | DocProject/buildhelp/
156 | DocProject/Help/*.HxT
157 | DocProject/Help/*.HxC
158 | DocProject/Help/*.hhc
159 | DocProject/Help/*.hhk
160 | DocProject/Help/*.hhp
161 | DocProject/Help/Html2
162 | DocProject/Help/html
163 |
164 | # Click-Once directory
165 | publish/
166 |
167 | # Publish Web Output
168 | *.[Pp]ublish.xml
169 | *.azurePubxml
170 | # Note: Comment the next line if you want to checkin your web deploy settings,
171 | # but database connection strings (with potential passwords) will be unencrypted
172 | *.pubxml
173 | *.publishproj
174 |
175 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
176 | # checkin your Azure Web App publish settings, but sensitive information contained
177 | # in these scripts will be unencrypted
178 | PublishScripts/
179 |
180 | # NuGet Packages
181 | *.nupkg
182 | # The packages folder can be ignored because of Package Restore
183 | **/[Pp]ackages/*
184 | # except build/, which is used as an MSBuild target.
185 | !**/[Pp]ackages/build/
186 | # Uncomment if necessary however generally it will be regenerated when needed
187 | #!**/[Pp]ackages/repositories.config
188 | # NuGet v3's project.json files produces more ignorable files
189 | *.nuget.props
190 | *.nuget.targets
191 |
192 | # Microsoft Azure Build Output
193 | csx/
194 | *.build.csdef
195 |
196 | # Microsoft Azure Emulator
197 | ecf/
198 | rcf/
199 |
200 | # Windows Store app package directories and files
201 | AppPackages/
202 | BundleArtifacts/
203 | Package.StoreAssociation.xml
204 | _pkginfo.txt
205 | *.appx
206 |
207 | # Visual Studio cache files
208 | # files ending in .cache can be ignored
209 | *.[Cc]ache
210 | # but keep track of directories ending in .cache
211 | !*.[Cc]ache/
212 |
213 | # Others
214 | ClientBin/
215 | ~$*
216 | *~
217 | *.dbmdl
218 | *.dbproj.schemaview
219 | *.jfm
220 | *.pfx
221 | *.publishsettings
222 | orleans.codegen.cs
223 |
224 | # Including strong name files can present a security risk
225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
226 | #*.snk
227 |
228 | # Since there are multiple workflows, uncomment next line to ignore bower_components
229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
230 | #bower_components/
231 |
232 | # RIA/Silverlight projects
233 | Generated_Code/
234 |
235 | # Backup & report files from converting an old project file
236 | # to a newer Visual Studio version. Backup files are not needed,
237 | # because we have git ;-)
238 | _UpgradeReport_Files/
239 | Backup*/
240 | UpgradeLog*.XML
241 | UpgradeLog*.htm
242 | ServiceFabricBackup/
243 | *.rptproj.bak
244 |
245 | # SQL Server files
246 | *.mdf
247 | *.ldf
248 | *.ndf
249 |
250 | # Business Intelligence projects
251 | *.rdl.data
252 | *.bim.layout
253 | *.bim_*.settings
254 | *.rptproj.rsuser
255 |
256 | # Microsoft Fakes
257 | FakesAssemblies/
258 |
259 | # GhostDoc plugin setting file
260 | *.GhostDoc.xml
261 |
262 | # Node.js Tools for Visual Studio
263 | .ntvs_analysis.dat
264 | node_modules/
265 |
266 | # Visual Studio 6 build log
267 | *.plg
268 |
269 | # Visual Studio 6 workspace options file
270 | *.opt
271 |
272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
273 | *.vbw
274 |
275 | # Visual Studio LightSwitch build output
276 | **/*.HTMLClient/GeneratedArtifacts
277 | **/*.DesktopClient/GeneratedArtifacts
278 | **/*.DesktopClient/ModelManifest.xml
279 | **/*.Server/GeneratedArtifacts
280 | **/*.Server/ModelManifest.xml
281 | _Pvt_Extensions
282 |
283 | # Paket dependency manager
284 | .paket/paket.exe
285 | paket-files/
286 |
287 | # FAKE - F# Make
288 | .fake/
289 |
290 | # JetBrains Rider
291 | .idea/
292 | *.sln.iml
293 |
294 | # CodeRush
295 | .cr/
296 |
297 | # Python Tools for Visual Studio (PTVS)
298 | __pycache__/
299 | *.pyc
300 |
301 | # Cake - Uncomment if you are using it
302 | # tools/**
303 | # !tools/packages.config
304 |
305 | # Tabs Studio
306 | *.tss
307 |
308 | # Telerik's JustMock configuration file
309 | *.jmconfig
310 |
311 | # BizTalk build output
312 | *.btp.cs
313 | *.btm.cs
314 | *.odx.cs
315 | *.xsd.cs
316 |
317 | # OpenCover UI analysis results
318 | OpenCover/
319 |
320 | # Azure Stream Analytics local run output
321 | ASALocalRun/
322 |
323 | # MSBuild Binary and Structured Log
324 | *.binlog
325 |
326 | # NVidia Nsight GPU debugger configuration file
327 | *.nvuser
328 |
329 | # MFractors (Xamarin productivity tool) working folder
330 | .mfractor/
331 |
332 | /build-artifacts
--------------------------------------------------------------------------------
/CodeCoverage.runsettings:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | cobertura
8 | [InfinityCrawler.Tests]*
9 | [InfinityCrawler]*
10 | Obsolete,GeneratedCodeAttribute,CompilerGeneratedAttribute
11 | true
12 | true
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/InfinityCrawler.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.0.31808.319
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "global", "global", "{F0B4D082-200A-4DD3-9291-872B7F2A991E}"
7 | ProjectSection(SolutionItems) = preProject
8 | .codecov.yml = .codecov.yml
9 | .editorconfig = .editorconfig
10 | .gitignore = .gitignore
11 | CodeCoverage.runsettings = CodeCoverage.runsettings
12 | License.txt = License.txt
13 | README.md = README.md
14 | EndProjectSection
15 | EndProject
16 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C6187826-9F4B-4E85-90D1-BC46A0F7F8F1}"
17 | ProjectSection(SolutionItems) = preProject
18 | src\Directory.build.props = src\Directory.build.props
19 | EndProjectSection
20 | EndProject
21 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{46BF0980-A8A4-492E-8652-0725ADB6A683}"
22 | EndProject
23 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler", "src\InfinityCrawler\InfinityCrawler.csproj", "{90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}"
24 | EndProject
25 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler.Tests", "tests\InfinityCrawler.Tests\InfinityCrawler.Tests.csproj", "{F30AF2A4-C53F-40FE-8083-6A82C0583255}"
26 | EndProject
27 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler.Tests.TestSite", "tests\InfinityCrawler.Tests.TestSite\InfinityCrawler.Tests.TestSite.csproj", "{483B6FC9-98E7-4BD4-BA09-80DF504E31B2}"
28 | EndProject
29 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler.Tests.Benchmarks", "tests\InfinityCrawler.Tests.Benchmarks\InfinityCrawler.Tests.Benchmarks.csproj", "{F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}"
30 | EndProject
31 | Global
32 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
33 | Debug|Any CPU = Debug|Any CPU
34 | Release|Any CPU = Release|Any CPU
35 | EndGlobalSection
36 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
37 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
38 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Debug|Any CPU.Build.0 = Debug|Any CPU
39 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Release|Any CPU.ActiveCfg = Release|Any CPU
40 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Release|Any CPU.Build.0 = Release|Any CPU
41 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
42 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Debug|Any CPU.Build.0 = Debug|Any CPU
43 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Release|Any CPU.ActiveCfg = Release|Any CPU
44 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Release|Any CPU.Build.0 = Release|Any CPU
45 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
46 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Debug|Any CPU.Build.0 = Debug|Any CPU
47 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Release|Any CPU.ActiveCfg = Release|Any CPU
48 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Release|Any CPU.Build.0 = Release|Any CPU
49 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
50 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Debug|Any CPU.Build.0 = Debug|Any CPU
51 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Release|Any CPU.ActiveCfg = Release|Any CPU
52 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Release|Any CPU.Build.0 = Release|Any CPU
53 | EndGlobalSection
54 | GlobalSection(SolutionProperties) = preSolution
55 | HideSolutionNode = FALSE
56 | EndGlobalSection
57 | GlobalSection(NestedProjects) = preSolution
58 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565} = {C6187826-9F4B-4E85-90D1-BC46A0F7F8F1}
59 | {F30AF2A4-C53F-40FE-8083-6A82C0583255} = {46BF0980-A8A4-492E-8652-0725ADB6A683}
60 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2} = {46BF0980-A8A4-492E-8652-0725ADB6A683}
61 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97} = {46BF0980-A8A4-492E-8652-0725ADB6A683}
62 | EndGlobalSection
63 | GlobalSection(ExtensibilityGlobals) = postSolution
64 | SolutionGuid = {FC9AB8BE-670B-4F26-9F17-73D2C0DECA6A}
65 | EndGlobalSection
66 | EndGlobal
67 |
--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Turner Software
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | 
4 | # Infinity Crawler
5 | A simple but powerful web crawler library for .NET
6 |
7 | 
8 | [](https://codecov.io/gh/TurnerSoftware/infinitycrawler)
9 | [](https://www.nuget.org/packages/InfinityCrawler)
10 |
11 |
12 | ## Features
13 | - Obeys robots.txt (crawl delay & allow/disallow)
14 | - Obeys in-page robots rules (`X-Robots-Tag` header and `<meta name="robots" />` tag)
15 | - Uses sitemap.xml to seed the initial crawl of the site
16 | - Built around a parallel task `async`/`await` system
17 | - Swappable request and content processors, allowing greater customisation
18 | - Auto-throttling (see below)
19 |
20 | ## Licensing and Support
21 |
22 | Infinity Crawler is licensed under the MIT license. It is free to use in personal and commercial projects.
23 |
24 | There are [support plans](https://turnersoftware.com.au/support-plans) available that cover all active [Turner Software OSS projects](https://github.com/TurnerSoftware).
25 | Support plans provide private email support, expert usage advice for our projects, priority bug fixes and more.
26 | These support plans help fund our OSS commitments to provide better software for everyone.
27 |
28 | ## Polite Crawling
29 | The crawler is built around fast but "polite" crawling of websites.
30 | This is accomplished through a number of settings that allow adjustments of delays and throttles.
31 |
32 | You can control:
33 | - Number of simultaneous requests
34 | - The delay between requests starting (Note: If a `crawl-delay` is defined for the User-agent, that will be the minimum)
35 | - Artificial "jitter" in request delays (requests seem less "robotic")
36 | - Timeout for a request before throttling will apply for new requests
37 | - Throttling request backoff: The amount of time added to the delay to throttle requests (this is cumulative)
38 | - Minimum number of requests under the throttle timeout before the throttle is gradually removed
39 |
40 | ## Other Settings
41 | - Control the UserAgent used in the crawling process
42 | - Set additional host aliases you want the crawling process to follow (for example, subdomains)
43 | - The max number of retries for a specific URI
44 | - The max number of redirects to follow
45 | - The max number of pages to crawl
46 |
47 | ## Example Usage
48 | ```csharp
49 | using InfinityCrawler;
50 |
51 | var crawler = new Crawler();
52 | var result = await crawler.Crawl(new Uri("http://example.org/"), new CrawlSettings {
53 | UserAgent = "MyVeryOwnWebCrawler/1.0",
54 | RequestProcessorOptions = new RequestProcessorOptions
55 | {
56 | MaxNumberOfSimultaneousRequests = 5
57 | }
58 | });
59 | ```
--------------------------------------------------------------------------------
/images/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TurnerSoftware/InfinityCrawler/4b56f68b5ea90afb9b711224a053ce658f03ac3b/images/icon.png
--------------------------------------------------------------------------------
/src/Directory.Build.props:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | InfinityCrawler
5 |
6 | Turner Software
7 |
8 | $(AssemblyName)
9 | true
10 | MIT
11 | icon.png
12 | https://github.com/TurnerSoftware/InfinityCrawler
13 | crawler;robot;spider
14 |
15 |
16 | true
17 | true
18 | embedded
19 |
20 | Latest
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/CrawlLink.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace InfinityCrawler
6 | {
// Represents a single hyperlink discovered on a crawled page.
7 | public class CrawlLink
8 | {
// Target URI the link points to.
9 | public Uri Location { get; set; }
// NOTE(review): presumably the anchor's "title" attribute — confirm against the content processor.
10 | public string Title { get; set; }
// NOTE(review): presumably the anchor's inner text — confirm against the content processor.
11 | public string Text { get; set; }
// NOTE(review): presumably the anchor's "rel" attribute (e.g. "nofollow") — confirm against the content processor.
12 | public string Relationship { get; set; }
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/CrawlResult.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace InfinityCrawler
6 | {
// Aggregate outcome of a full crawl run.
7 | public class CrawlResult
8 | {
// Timestamp at which the crawl began.
9 | public DateTime CrawlStart { get; set; }
// Total wall-clock time the crawl took.
10 | public TimeSpan ElapsedTime { get; set; }
// All URIs visited during the crawl.
// NOTE(review): generic type arguments appear stripped in this dump — presumably IEnumerable<CrawledUri>; verify against the original source.
11 | public IEnumerable CrawledUris { get; set; }
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/CrawlSettings.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Net.Http;
4 | using System.Text;
5 | using InfinityCrawler.Processing.Content;
6 | using InfinityCrawler.Processing.Requests;
7 |
8 | namespace InfinityCrawler
9 | {
// Configuration for a crawl run: identity, limits, and swappable processors.
10 | public class CrawlSettings
11 | {
// User-agent string sent with requests; defaults to a browser-like Gecko UA.
12 | public string UserAgent { get; set; } = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201";
// Additional hosts treated as the same site (e.g. subdomains).
// NOTE(review): generic type arguments appear stripped in this dump — presumably IEnumerable<string>; verify against the original source.
13 | public IEnumerable HostAliases { get; set; }
// Maximum retry attempts for a single URI before giving up.
14 | public int NumberOfRetries { get; set; } = 3;
// Maximum redirects followed for a single URI before giving up.
15 | public int MaxNumberOfRedirects { get; set; } = 3;
// Upper bound on pages crawled. NOTE(review): no default is set, so 0 presumably means "no limit" — confirm against the crawl runner.
16 | public int MaxNumberOfPagesToCrawl { get; set; }
17 |
// Parses response content into links/metadata; replaceable for custom parsing.
18 | public IContentProcessor ContentProcessor { get; set; } = new DefaultContentProcessor();
// Issues HTTP requests (throttling, parallelism); replaceable for custom behavior.
19 | public IRequestProcessor RequestProcessor { get; set; } = new DefaultRequestProcessor();
// Tuning options (delays, jitter, throttling) consumed by the request processor.
20 | public RequestProcessorOptions RequestProcessorOptions { get; set; } = new RequestProcessorOptions();
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/CrawledUri.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Net;
6 | using System.Text;
7 |
8 | namespace InfinityCrawler
9 | {
10 | public class CrawledUri
11 | {
12 | public Uri Location { get; set; }
13 |
14 | public CrawlStatus Status { get; set; }
15 |
16 | public IList RedirectChain { get; set; }
17 | public IList Requests { get; set; }
18 |
19 | public CrawledContent Content { get; set; }
20 | }
21 |
/// <summary>
/// The final outcome recorded for a crawled URI.
/// </summary>
public enum CrawlStatus
{
	//A final response was recorded for the URI (including non-2xx responses that are not retried)
	Crawled,
	//Blocked by Robots.txt or by an in-page robots rule
	RobotsBlocked,
	//Request attempts reached the configured retry limit
	MaxRetries,
	//Redirects reached the configured redirect limit
	MaxRedirects
}
29 |
/// <summary>
/// One hop in a redirect chain: the location that redirected and the
/// requests that were made against it.
/// </summary>
public class CrawledUriRedirect
{
	/// <summary>The URI that responded with a redirect.</summary>
	public Uri Location { get; set; }
	/// <summary>The request attempts made against <see cref="Location"/>.</summary>
	public IList<CrawlRequest> Requests { get; set; }
}
35 |
/// <summary>
/// Captures the timing and status outcome of a single HTTP request made while crawling.
/// </summary>
public class CrawlRequest
{
	/// <summary>When the request started (UTC).</summary>
	public DateTime RequestStart { get; set; }

	/// <summary>How long the request took to complete.</summary>
	public TimeSpan ElapsedTime { get; set; }

	/// <summary>The HTTP status code received; null when no response arrived (e.g. the request faulted).</summary>
	public HttpStatusCode? StatusCode { get; set; }

	/// <summary>True when <see cref="StatusCode"/> was in the 2xx range.</summary>
	public bool IsSuccessfulStatus { get; set; }
}
43 |
/// <summary>
/// The parsed content of a successfully crawled page.
/// </summary>
public class CrawledContent
{
	/// <summary>Media type from the Content-Type header (e.g. "text/html").</summary>
	public string ContentType { get; set; }
	/// <summary>Character set from the Content-Type header, if declared.</summary>
	public string CharacterSet { get; set; }
	/// <summary>Comma-joined Content-Encoding header values, if any.</summary>
	public string ContentEncoding { get; set; }

	/// <summary>Robots rules gathered from the X-Robots-Tag header and the robots meta tag.</summary>
	public IEnumerable<string> PageRobotRules { get; set; }

	/// <summary>The raw response body as text.</summary>
	public string RawContent { get; set; }

	/// <summary>The canonical URI declared by the page, if any.</summary>
	public Uri CanonicalUri { get; set; }
	/// <summary>Outgoing links discovered on the page; never null.</summary>
	public IEnumerable<CrawlLink> Links { get; set; } = Enumerable.Empty<CrawlLink>();
}
57 | }
58 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Crawler.cs:
--------------------------------------------------------------------------------
1 | using InfinityCrawler.Internal;
2 | using System;
3 | using System.Collections.Concurrent;
4 | using System.Collections.Generic;
5 | using System.Diagnostics;
6 | using System.IO;
7 | using System.Linq;
8 | using System.Net.Http;
9 | using System.Threading;
10 | using System.Threading.Tasks;
11 | using InfinityCrawler.Processing.Requests;
12 | using TurnerSoftware.RobotsExclusionTools;
13 | using TurnerSoftware.SitemapTools;
14 | using Microsoft.Extensions.Logging;
15 | using InfinityCrawler.Processing.Content;
16 |
17 | namespace InfinityCrawler
18 | {
/// <summary>
/// Entry point for crawling a site. Seeds the crawl from the site's sitemap(s),
/// honours Robots.txt rules and returns every crawled URI with its content.
/// </summary>
public class Crawler
{
	private HttpClient HttpClient { get; }
	private ILogger Logger { get; }

	/// <summary>
	/// Creates a crawler with its own <see cref="System.Net.Http.HttpClient"/>.
	/// Auto-redirect is disabled because redirects are tracked per-hop by the crawl runner.
	/// </summary>
	public Crawler()
	{
		HttpClient = new HttpClient(new HttpClientHandler
		{
			AllowAutoRedirect = false,
			UseCookies = false
		});
	}

	/// <summary>
	/// Creates a crawler with a caller-supplied client (e.g. for testing).
	/// The client should have auto-redirect disabled so redirects can be tracked.
	/// </summary>
	public Crawler(HttpClient httpClient, ILogger logger = null)
	{
		HttpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
		Logger = logger;
	}

	/// <summary>
	/// Crawls the site containing <paramref name="siteUri"/> according to <paramref name="settings"/>.
	/// </summary>
	/// <param name="siteUri">Any URI on the site; the authority part determines the base URI.</param>
	/// <param name="settings">Crawl limits, user agent and processors.</param>
	/// <returns>The crawl result with timing and every crawled URI.</returns>
	public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
	{
		var result = new CrawlResult
		{
			CrawlStart = DateTime.UtcNow
		};
		var overallCrawlStopwatch = new Stopwatch();
		overallCrawlStopwatch.Start();

		var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
		var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

		UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

		var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

		//Use any links referred to by the sitemap as a starting point
		var urisFromSitemap = (await new SitemapQuery(HttpClient)
			.GetAllSitemapsForDomainAsync(siteUri.Host))
			.SelectMany(s => s.Urls.Select(u => u.Location).Distinct());
		foreach (var uri in urisFromSitemap)
		{
			crawlRunner.AddRequest(uri);
		}

		result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
		{
			using (requestResult.Content)
			{
				var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
				var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);
				//The content processor consumed the stream; rewind it to capture the raw text too
				requestResult.Content.Seek(0, SeekOrigin.Begin);
				content.RawContent = await new StreamReader(requestResult.Content).ReadToEndAsync();
				crawlRunner.AddResult(crawlState.Location, content);
			}
		});

		overallCrawlStopwatch.Stop();
		result.ElapsedTime = overallCrawlStopwatch.Elapsed;
		return result;
	}

	/// <summary>
	/// Raises the configured delay between requests to at least the Robots.txt
	/// crawl-delay (when one is defined for the user agent).
	/// </summary>
	private void UpdateCrawlDelay(RobotsFile robotsFile, string userAgent, RequestProcessorOptions requestProcessorOptions)
	{
		var minimumCrawlDelayInMilliseconds = 0;

		//Apply Robots.txt crawl-delay (if defined)
		if (robotsFile.TryGetEntryForUserAgent(userAgent, out var accessEntry))
		{
			//Crawl-delay is in seconds. The parentheses matter: without them the
			//expression parsed as "CrawlDelay ?? (0 * 1000)" and the conversion
			//to milliseconds was silently dropped.
			minimumCrawlDelayInMilliseconds = (accessEntry.CrawlDelay ?? 0) * 1000;
		}

		var taskDelay = Math.Max(minimumCrawlDelayInMilliseconds, requestProcessorOptions.DelayBetweenRequestStart.TotalMilliseconds);
		requestProcessorOptions.DelayBetweenRequestStart = new TimeSpan(0, 0, 0, 0, (int)taskDelay);
	}
}
95 | }
96 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/InfinityCrawler.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netstandard2.0;net6.0
5 | InfinityCrawler
6 | A simple but powerful web crawler library
7 | $(PackageBaseTags)
8 | James Turner
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Internal/CrawlRunner.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Concurrent;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Net;
6 | using System.Net.Http;
7 | using System.Text;
8 | using System.Threading;
9 | using System.Threading.Tasks;
10 | using InfinityCrawler.Processing.Requests;
11 | using Microsoft.Extensions.Logging;
12 | using TurnerSoftware.RobotsExclusionTools;
13 |
14 | namespace InfinityCrawler.Internal
15 | {
/// <summary>
/// Coordinates a crawl: de-duplicates URIs, enforces the retry/redirect/page limits,
/// applies robots rules and feeds requests into the configured request processor.
/// </summary>
internal class CrawlRunner
{
	public Uri BaseUri { get; }
	public CrawlSettings Settings { get; }

	private RobotsFile RobotsFile { get; }
	private HttpClient HttpClient { get; }

	private ILogger Logger { get; }

	private RobotsPageParser RobotsPageParser { get; }

	//Request/redirect history per in-flight URI
	private ConcurrentDictionary<Uri, UriCrawlState> UriCrawlStates { get; } = new ConcurrentDictionary<Uri, UriCrawlState>();
	//Every URI ever queued (used as a set; the byte value is unused)
	private ConcurrentDictionary<Uri, byte> SeenUris { get; } = new ConcurrentDictionary<Uri, byte>();
	private ConcurrentBag<CrawledUri> CrawledUris { get; } = new ConcurrentBag<CrawledUri>();

	public CrawlRunner(Uri baseUri, RobotsFile robotsFile, HttpClient httpClient, CrawlSettings crawlSettings, ILogger logger = null)
	{
		BaseUri = baseUri;
		RobotsFile = robotsFile;
		HttpClient = httpClient;
		Settings = crawlSettings;

		Logger = logger;
		RobotsPageParser = new RobotsPageParser();

		//The base URI is always the first request of the crawl
		AddRequest(baseUri);
	}

	//Fragments don't change the retrieved resource, so they are stripped before de-duplication
	private Uri StripFragment(Uri uri)
	{
		return new UriBuilder(uri)
		{
			Fragment = null
		}.Uri;
	}

	private void AddLink(CrawlLink crawlLink)
	{
		//Respect rel="nofollow" on individual links
		if (crawlLink.Relationship != null && crawlLink.Relationship.Equals("nofollow", StringComparison.InvariantCultureIgnoreCase))
		{
			return;
		}

		var uriWithoutFragment = StripFragment(crawlLink.Location);
		if (SeenUris.ContainsKey(uriWithoutFragment))
		{
			return;
		}

		AddRequest(uriWithoutFragment, false);
	}

	private void AddRedirect(Uri requestUri, Uri redirectUri)
	{
		if (UriCrawlStates.TryRemove(requestUri, out var crawlState))
		{
			//Redirect targets may be relative to the URI that was requested
			var absoluteRedirectUri = new Uri(requestUri, redirectUri);
			absoluteRedirectUri = StripFragment(absoluteRedirectUri);

			//Move the crawl state to the redirect target, carrying the chain so far
			var redirectCrawlState = new UriCrawlState
			{
				Location = absoluteRedirectUri,
				Redirects = crawlState.Redirects ?? new List<CrawledUriRedirect>()
			};
			redirectCrawlState.Redirects.Add(new CrawledUriRedirect
			{
				Location = crawlState.Location,
				Requests = crawlState.Requests
			});

			UriCrawlStates.TryAdd(redirectCrawlState.Location, redirectCrawlState);
			//Skip the max page check - a redirect continues an existing page, it isn't a new one
			AddRequest(redirectCrawlState.Location, true);
		}
	}

	/// <summary>
	/// Records the parsed content for a URI, honouring in-page robots rules,
	/// and queues any followable links that were found.
	/// </summary>
	public void AddResult(Uri requestUri, CrawledContent content)
	{
		if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
		{
			var robotsPageDefinition = RobotsPageParser.FromRules(content.PageRobotRules);
			if (!robotsPageDefinition.CanIndex(Settings.UserAgent))
			{
				//noindex: record the crawl but discard the content
				Logger?.LogDebug($"Result content for {requestUri} has been blocked by an in-page Robots rule.");
				AddResult(new CrawledUri
				{
					Location = crawlState.Location,
					Status = CrawlStatus.RobotsBlocked,
					Requests = crawlState.Requests,
					RedirectChain = crawlState.Redirects
				});
			}
			else
			{
				Logger?.LogDebug($"Result for {requestUri} has completed successfully with content.");

				AddResult(new CrawledUri
				{
					Location = crawlState.Location,
					Status = CrawlStatus.Crawled,
					RedirectChain = crawlState.Redirects,
					Requests = crawlState.Requests,
					Content = content
				});

				//nofollow (page-level) suppresses queueing of the page's links
				if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent))
				{
					foreach (var crawlLink in content.Links)
					{
						AddLink(crawlLink);
					}
				}
			}
		}
	}

	/// <summary>
	/// Queues a URI for crawling (fragment stripped, subject to host and page-limit checks).
	/// </summary>
	public void AddRequest(Uri requestUri)
	{
		var uriWithoutFragment = StripFragment(requestUri);
		AddRequest(uriWithoutFragment, false);
	}

	private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
	{
		//Only crawl hosts that match the base URI or a configured alias
		if (Settings.HostAliases != null)
		{
			if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
			{
				Logger?.LogDebug($"Request containing host {requestUri.Host} is not in the list of allowed hosts. This request will be ignored.");
				return;
			}
		}
		else if (requestUri.Host != BaseUri.Host)
		{
			Logger?.LogDebug($"Request containing host {requestUri.Host} doesn't match the base host. This request will be ignored.");
			return;
		}

		if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
		{
			var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;
			//">=" rather than "==" so the limit still holds if the count ever steps past it
			if (expectedCrawlCount >= Settings.MaxNumberOfPagesToCrawl)
			{
				Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}. This request will be ignored.");
				return;
			}
		}

		SeenUris.TryAdd(requestUri, 0);

		if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
		{
			//Don't re-request a URI that already completed successfully
			var lastRequest = crawlState.Requests.LastOrDefault();
			if (lastRequest != null && lastRequest.IsSuccessfulStatus)
			{
				return;
			}

			if (crawlState.Requests.Count >= Settings.NumberOfRetries)
			{
				Logger?.LogDebug($"Request for {requestUri} has hit the maximum retry limit ({Settings.NumberOfRetries}).");
				AddResult(new CrawledUri
				{
					Location = crawlState.Location,
					Status = CrawlStatus.MaxRetries,
					Requests = crawlState.Requests,
					RedirectChain = crawlState.Redirects
				});
				return;
			}

			if (crawlState.Redirects != null && crawlState.Redirects.Count >= Settings.MaxNumberOfRedirects)
			{
				Logger?.LogDebug($"Request for {requestUri} has hit the maximum redirect limit ({Settings.MaxNumberOfRedirects}).");
				AddResult(new CrawledUri
				{
					Location = crawlState.Location,
					RedirectChain = crawlState.Redirects,
					Status = CrawlStatus.MaxRedirects
				});
				return;
			}
		}

		if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
		{
			Logger?.LogDebug($"Added {requestUri} to request queue.");
			Settings.RequestProcessor.Add(requestUri);
		}
		else
		{
			Logger?.LogDebug($"Request for {requestUri} has been blocked by the Robots.txt file.");
			AddResult(new CrawledUri
			{
				Location = requestUri,
				Status = CrawlStatus.RobotsBlocked
			});
		}
	}

	private void AddResult(CrawledUri result)
	{
		CrawledUris.Add(result);
	}

	/// <summary>
	/// Drives the request processor until the queue is exhausted, invoking
	/// <paramref name="responseSuccessAction"/> for each successful (2xx) response.
	/// Redirects, server errors and exceptions are re-queued here as appropriate.
	/// </summary>
	public async Task<IEnumerable<CrawledUri>> ProcessAsync(
		Func<RequestResult, UriCrawlState, Task> responseSuccessAction,
		CancellationToken cancellationToken = default
	)
	{
		await Settings.RequestProcessor.ProcessAsync(
			HttpClient,
			async (requestResult) =>
			{
				var crawlState = UriCrawlStates.GetOrAdd(requestResult.RequestUri, new UriCrawlState
				{
					Location = requestResult.RequestUri
				});

				if (requestResult.Exception != null)
				{
					//Retry failed requests
					Logger?.LogDebug($"An exception occurred while requesting {crawlState.Location}. This URL will be added to the request queue to be attempted again later.");
					crawlState.Requests.Add(new CrawlRequest
					{
						RequestStart = requestResult.RequestStart,
						ElapsedTime = requestResult.ElapsedTime
					});
					AddRequest(requestResult.RequestUri);
				}
				else
				{
					var crawlRequest = new CrawlRequest
					{
						RequestStart = requestResult.RequestStart,
						ElapsedTime = requestResult.ElapsedTime,
						StatusCode = requestResult.StatusCode,
						IsSuccessfulStatus = (int)requestResult.StatusCode is >= 200 and <= 299
					};
					crawlState.Requests.Add(crawlRequest);

					var redirectStatusCodes = new[]
					{
						HttpStatusCode.MovedPermanently,
						HttpStatusCode.Redirect,
						HttpStatusCode.TemporaryRedirect
					};
					if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
					{
						Logger?.LogDebug($"Result for {crawlState.Location} was a redirect ({requestResult.ResponseHeaders.Location}). This URL will be added to the request queue.");
						AddRedirect(crawlState.Location, requestResult.ResponseHeaders.Location);
					}
					else if (crawlRequest.IsSuccessfulStatus)
					{
						await responseSuccessAction(requestResult, crawlState);
					}
					else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
					{
						//On server errors, try to crawl the page again later
						Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). This URL will be added to the request queue to be attempted again later.");
						AddRequest(crawlState.Location);
					}
					else
					{
						//On any other error, just save what we have seen and move on
						//Consider the content of the request irrelevant
						Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). No further requests will be attempted.");
						AddResult(new CrawledUri
						{
							Location = crawlState.Location,
							Status = CrawlStatus.Crawled,
							RedirectChain = crawlState.Redirects,
							Requests = crawlState.Requests
						});
					}
				}
			},
			Settings.RequestProcessorOptions,
			cancellationToken
		);

		Logger?.LogDebug($"Completed crawling {CrawledUris.Count} pages.");

		return CrawledUris.ToArray();
	}
}
302 | }
303 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Internal/UriExtensions.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace InfinityCrawler.Internal
6 | {
internal static class UriExtensions
{
	/// <summary>
	/// Resolves an href against the page it appeared on, optionally honouring a
	/// document-level base href, and reattaches any fragment carried by the href.
	/// </summary>
	/// <param name="pageUri">The page the href was found on.</param>
	/// <param name="href">The raw href value (absolute or relative, may include a fragment).</param>
	/// <param name="baseHref">An optional base href declared by the document.</param>
	/// <returns>The resolved absolute URI, or null when the href is not a well-formed URI.</returns>
	public static Uri BuildUriFromHref(this Uri pageUri, string href, string baseHref = null)
	{
		//Separate the fragment (everything after the first '#') from the resolvable part
		var hashIndex = href.IndexOf('#');
		var resolvablePart = hashIndex < 0 ? href : href.Substring(0, hashIndex);
		var fragmentPart = hashIndex < 0 ? null : href.Substring(hashIndex + 1);

		if (!Uri.IsWellFormedUriString(resolvablePart, UriKind.RelativeOrAbsolute))
		{
			return null;
		}

		var resolutionBase = pageUri;

		//Honour the document's base href (e.g. a <base href="..."> element) when valid
		if (Uri.IsWellFormedUriString(baseHref, UriKind.RelativeOrAbsolute))
		{
			resolutionBase = new Uri(pageUri, baseHref);
		}

		var builder = new UriBuilder(new Uri(resolutionBase, resolvablePart))
		{
			Fragment = fragmentPart
		};
		return builder.Uri;
	}
}
34 | }
35 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Content/CrawlHeaders.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Net.Http.Headers;
4 | using System.Text;
5 |
6 | namespace InfinityCrawler.Processing.Content
7 | {
/// <summary>
/// Bundles the response and content headers captured for a crawled HTTP response,
/// as handed to an <c>IContentProcessor</c>.
/// </summary>
public class CrawlHeaders
{
	/// <summary>Headers from the HTTP response itself.</summary>
	public HttpResponseHeaders ResponseHeaders { get; }

	/// <summary>Headers describing the response content (e.g. Content-Type).</summary>
	public HttpContentHeaders ContentHeaders { get; }

	public CrawlHeaders(HttpResponseHeaders responseHeaders, HttpContentHeaders contentHeaders)
	{
		ResponseHeaders = responseHeaders;
		ContentHeaders = contentHeaders;
	}
}
19 | }
20 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Content/DefaultContentProcessor.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Net.Http.Headers;
6 | using System.Text;
7 | using System.Threading.Tasks;
8 | using HtmlAgilityPack;
9 | using InfinityCrawler.Internal;
10 |
11 | namespace InfinityCrawler.Processing.Content
12 | {
/// <summary>
/// Default HTML content processor: extracts robots rules, the canonical URI
/// and the page's outgoing links using HtmlAgilityPack.
/// </summary>
public class DefaultContentProcessor : IContentProcessor
{
	/// <summary>
	/// Parses a crawled response into structured content. The stream is read but not
	/// disposed; the caller may rewind and re-read it afterwards.
	/// </summary>
	public CrawledContent Parse(Uri requestUri, CrawlHeaders headers, Stream contentStream)
	{
		var crawledContent = new CrawledContent
		{
			ContentType = headers.ContentHeaders.ContentType?.MediaType,
			CharacterSet = headers.ContentHeaders.ContentType?.CharSet,
			ContentEncoding = headers.ContentHeaders.ContentEncoding != null ? string.Join(",", headers.ContentHeaders.ContentEncoding) : null
		};

		var document = new HtmlDocument();
		document.Load(contentStream);

		//Robots rules can arrive via the X-Robots-Tag response header...
		var pageRobotRules = new List<string>();
		if (headers.ResponseHeaders.Contains("X-Robots-Tag"))
		{
			var robotsHeaderValues = headers.ResponseHeaders.GetValues("X-Robots-Tag");
			pageRobotRules.AddRange(robotsHeaderValues);
		}

		//...and via the robots meta tag (only the first matching tag is used)
		var metaNodes = document.DocumentNode.SelectNodes("html/head/meta");
		if (metaNodes != null)
		{
			var robotsMetaValue = metaNodes
				.Where(n => n.Attributes.Any(a => a.Name == "name" && a.Value.Equals("robots", StringComparison.InvariantCultureIgnoreCase)))
				.SelectMany(n => n.Attributes.Where(a => a.Name == "content").Select(a => a.Value))
				.FirstOrDefault();
			if (robotsMetaValue != null)
			{
				pageRobotRules.Add(robotsMetaValue);
			}
		}

		crawledContent.PageRobotRules = pageRobotRules.ToArray();
		crawledContent.CanonicalUri = GetCanonicalUri(document, requestUri);
		crawledContent.Links = GetLinks(document, requestUri).ToArray();

		return crawledContent;
	}

	//Returns the document's base href, or an empty string when none is declared
	private string GetBaseHref(HtmlDocument document)
	{
		var baseNode = document.DocumentNode.SelectSingleNode("html/head/base");
		return baseNode?.GetAttributeValue("href", string.Empty) ?? string.Empty;
	}

	//Resolves the rel="canonical" link in the document head, if present
	private Uri GetCanonicalUri(HtmlDocument document, Uri requestUri)
	{
		var linkNodes = document.DocumentNode.SelectNodes("html/head/link");
		if (linkNodes != null)
		{
			var canonicalNode = linkNodes
				.Where(n => n.Attributes.Any(a => a.Name == "rel" && a.Value.Equals("canonical", StringComparison.InvariantCultureIgnoreCase)))
				.FirstOrDefault();
			if (canonicalNode != null)
			{
				var baseHref = GetBaseHref(document);
				var canonicalHref = canonicalNode.GetAttributeValue("href", null);
				return requestUri.BuildUriFromHref(canonicalHref, baseHref);
			}
		}

		return null;
	}

	//Yields every resolvable HTTP/HTTPS anchor on the page (empty when there are no anchors)
	private IEnumerable<CrawlLink> GetLinks(HtmlDocument document, Uri requestUri)
	{
		var anchorNodes = document.DocumentNode.SelectNodes("//a");
		if (anchorNodes != null)
		{
			var baseHref = GetBaseHref(document);

			foreach (var anchor in anchorNodes)
			{
				var href = anchor.GetAttributeValue("href", null);
				if (href == null)
				{
					continue;
				}

				var anchorLocation = requestUri.BuildUriFromHref(href, baseHref);
				if (anchorLocation == null)
				{
					//Invalid links are ignored
					continue;
				}

				if (anchorLocation.Scheme != Uri.UriSchemeHttp && anchorLocation.Scheme != Uri.UriSchemeHttps)
				{
					//Skip non-HTTP links (mailto:, javascript:, etc.)
					continue;
				}

				yield return new CrawlLink
				{
					Location = anchorLocation,
					Title = anchor.GetAttributeValue("title", null),
					Text = anchor.InnerText,
					Relationship = anchor.GetAttributeValue("rel", null),
				};
			}
		}
	}
}
118 | }
119 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Content/IContentProcessor.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Net.Http.Headers;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace InfinityCrawler.Processing.Content
9 | {
/// <summary>
/// Parses the response of a crawled URI into structured content.
/// </summary>
public interface IContentProcessor
{
	/// <summary>
	/// Parses a crawled response body.
	/// </summary>
	/// <param name="requestUri">The URI the content was retrieved from.</param>
	/// <param name="headers">The response and content headers of the crawl.</param>
	/// <param name="contentStream">The response body. NOTE(review): the crawler rewinds and re-reads this stream after parsing, so implementations should not dispose it.</param>
	/// <returns>The parsed content for the page.</returns>
	CrawledContent Parse(Uri requestUri, CrawlHeaders headers, Stream contentStream);
}
14 | }
15 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Requests/DefaultRequestProcessor.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Concurrent;
3 | using System.Collections.Generic;
4 | using System.Diagnostics;
5 | using System.IO;
6 | using System.Linq;
7 | using System.Net.Http;
8 | using System.Runtime.ExceptionServices;
9 | using System.Text;
10 | using System.Threading;
11 | using System.Threading.Tasks;
12 | using Microsoft.Extensions.Logging;
13 |
14 | namespace InfinityCrawler.Processing.Requests
15 | {
/// <summary>
/// Default request processor: performs queued requests with a bounded number of
/// simultaneous requests, per-request start delays with jitter, and adaptive
/// backoff when responses become slow.
/// </summary>
public class DefaultRequestProcessor : IRequestProcessor
{
	private ILogger Logger { get; }
	private ConcurrentQueue<Uri> RequestQueue { get; } = new ConcurrentQueue<Uri>();

	public DefaultRequestProcessor(ILogger logger = null)
	{
		Logger = logger;
	}

	/// <summary>Enqueues a URI to be requested.</summary>
	public void Add(Uri uri)
	{
		RequestQueue.Enqueue(uri);
		PendingRequests++;
	}

	//Queued + in-flight requests not yet handed to the response action.
	//NOTE(review): incremented/decremented without interlocking - assumes Add/ProcessAsync
	//callbacks run on the processing loop; confirm before adding other callers.
	public int PendingRequests { get; private set; }

	/// <summary>
	/// Processes the queue until empty, invoking <paramref name="responseAction"/>
	/// for every completed request (successful or errored).
	/// </summary>
	public async Task ProcessAsync(HttpClient httpClient, Func<RequestResult, Task> responseAction, RequestProcessorOptions options, CancellationToken cancellationToken = default)
	{
		if (options == null)
		{
			throw new ArgumentNullException(nameof(options));
		}

		var random = new Random();
		var activeRequests = new ConcurrentDictionary<Task<RequestResult>, RequestContext>(options.MaxNumberOfSimultaneousRequests, options.MaxNumberOfSimultaneousRequests);

		var currentBackoff = 0;
		var successesSinceLastThrottle = 0;
		var requestCount = 0;

		while (activeRequests.Count > 0 || !RequestQueue.IsEmpty)
		{
			cancellationToken.ThrowIfCancellationRequested();

			//Fill the active set up to the simultaneous-request limit
			while (!RequestQueue.IsEmpty)
			{
				cancellationToken.ThrowIfCancellationRequested();

				if (RequestQueue.TryDequeue(out var requestUri))
				{
					var requestStartDelay = 0d;
					//Request delaying and backoff
					if (options.DelayBetweenRequestStart.TotalMilliseconds > 0)
					{
						requestStartDelay = options.DelayBetweenRequestStart.TotalMilliseconds;
						requestStartDelay += random.NextDouble() * options.DelayJitter.TotalMilliseconds;
					}

					requestStartDelay += currentBackoff;

					var requestContext = new RequestContext
					{
						RequestNumber = requestCount + 1,
						RequestUri = requestUri,
						Timer = new Stopwatch(),
						RequestStartDelay = requestStartDelay,
						RequestTimeout = options.RequestTimeout,
						CancellationToken = cancellationToken
					};

					Logger?.LogDebug($"Request #{requestContext.RequestNumber} ({requestUri}) starting with a {requestStartDelay}ms delay.");

					var task = PerformRequestAsync(httpClient, requestContext);

					activeRequests.TryAdd(task, requestContext);
					requestCount++;

					if (activeRequests.Count == options.MaxNumberOfSimultaneousRequests)
					{
						break;
					}
				}
			}

			await Task.WhenAny(activeRequests.Keys).ConfigureAwait(false);

			cancellationToken.ThrowIfCancellationRequested();

			var completedRequests = activeRequests.Keys.Where(t => t.IsCompleted);
			foreach (var completedRequest in completedRequests)
			{
				activeRequests.TryRemove(completedRequest, out var requestContext);
				PendingRequests--;

				if (completedRequest.IsFaulted)
				{
					var aggregateException = completedRequest.Exception;

					//Keep the existing stack trace when re-throwing
					ExceptionDispatchInfo.Capture(aggregateException.InnerException).Throw();
				}

				await responseAction(completedRequest.Result);

				//Manage the throttling based on timeouts and successes
				var throttlePoint = options.TimeoutBeforeThrottle;
				if (throttlePoint.TotalMilliseconds > 0 && requestContext.Timer.Elapsed > throttlePoint)
				{
					successesSinceLastThrottle = 0;
					currentBackoff += (int)options.ThrottlingRequestBackoff.TotalMilliseconds;
					Logger?.LogInformation($"Increased backoff to {currentBackoff}ms.");
				}
				else if (currentBackoff > 0)
				{
					successesSinceLastThrottle += 1;
					if (successesSinceLastThrottle == options.MinSequentialSuccessesToMinimiseThrottling)
					{
						var newBackoff = currentBackoff - options.ThrottlingRequestBackoff.TotalMilliseconds;
						currentBackoff = Math.Max(0, (int)newBackoff);
						successesSinceLastThrottle = 0;
						Logger?.LogInformation($"Decreased backoff to {currentBackoff}ms.");
					}
				}
			}
		}

		Logger?.LogDebug($"Completed processing {requestCount} requests.");
	}

	//Performs a single request, buffering the response body into a MemoryStream.
	//Returns a RequestResult carrying the exception on expected failures; returns
	//null only when the overall crawl was cancelled.
	private async Task<RequestResult> PerformRequestAsync(HttpClient httpClient, RequestContext context)
	{
		if (context.RequestStartDelay > 0)
		{
			await Task.Delay((int)context.RequestStartDelay);
		}

		var requestStart = DateTime.UtcNow;
		context.Timer.Start();

		try
		{
			//Dispose the token sources so timer/registration resources aren't leaked per request
			using var timeoutTokenSource = new CancellationTokenSource(context.RequestTimeout);
			using var combinedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken, timeoutTokenSource.Token);
			using (var response = await httpClient.GetAsync(context.RequestUri, combinedTokenSource.Token))
			{
				var contentStream = new MemoryStream();
				await response.Content.CopyToAsync(contentStream);
				contentStream.Seek(0, SeekOrigin.Begin);

				//We only want to time the request, not the handling of the response
				context.Timer.Stop();

				context.CancellationToken.ThrowIfCancellationRequested();

				Logger?.LogDebug($"Request #{context.RequestNumber} completed successfully in {context.Timer.ElapsedMilliseconds}ms.");

				return new RequestResult
				{
					RequestUri = context.RequestUri,
					RequestStart = requestStart,
					RequestStartDelay = context.RequestStartDelay,
					StatusCode = response.StatusCode,
					ResponseHeaders = response.Headers,
					ContentHeaders = response.Content.Headers,
					Content = contentStream,
					ElapsedTime = context.Timer.Elapsed
				};
			}
		}
		catch (OperationCanceledException) when (context.CancellationToken.IsCancellationRequested)
		{
			//Crawl-level cancellation (as opposed to a per-request timeout)
			Logger?.LogDebug($"Request #{context.RequestNumber} cancelled.");
			return null;
		}
		catch (Exception ex) when (ex is HttpRequestException || ex is OperationCanceledException)
		{
			//Network failure or per-request timeout: report it so the caller can retry
			context.Timer.Stop();

			Logger?.LogDebug($"Request #{context.RequestNumber} completed with error in {context.Timer.ElapsedMilliseconds}ms.");
			Logger?.LogTrace(ex, $"Request #{context.RequestNumber} Exception: {ex.Message}");

			return new RequestResult
			{
				RequestUri = context.RequestUri,
				RequestStart = requestStart,
				RequestStartDelay = context.RequestStartDelay,
				ElapsedTime = context.Timer.Elapsed,
				Exception = ex
			};
		}
	}
}
200 | }
201 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Requests/IRequestProcessor.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Concurrent;
3 | using System.Collections.Generic;
4 | using System.Net.Http;
5 | using System.Text;
6 | using System.Threading;
7 | using System.Threading.Tasks;
8 |
9 | namespace InfinityCrawler.Processing.Requests
10 | {
/// <summary>
/// Queues URIs and performs the HTTP requests for a crawl.
/// </summary>
public interface IRequestProcessor
{
	/// <summary>Enqueues a URI to be requested.</summary>
	void Add(Uri requestUri);

	/// <summary>Number of queued or in-flight requests not yet completed.</summary>
	int PendingRequests { get; }

	/// <summary>
	/// Processes queued requests until the queue is exhausted, invoking
	/// <paramref name="responseAction"/> for each completed request.
	/// </summary>
	Task ProcessAsync(
		HttpClient httpClient,
		Func<RequestResult, Task> responseAction,
		RequestProcessorOptions options,
		CancellationToken cancellationToken = default
	);
}
24 | }
25 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Requests/RequestContext.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Text;
5 | using System.Threading;
6 |
7 | namespace InfinityCrawler.Processing.Requests
8 | {
/// <summary>
/// Per-request state tracked by the request processor while a request is in flight.
/// </summary>
public class RequestContext
{
	/// <summary>Sequential number of this request within the processing run.</summary>
	public int RequestNumber { get; set; }

	/// <summary>The URI being requested.</summary>
	public Uri RequestUri { get; set; }

	/// <summary>Times the request itself (not the response handling).</summary>
	public Stopwatch Timer { get; set; }

	/// <summary>Delay (in milliseconds) applied before the request starts.</summary>
	public double RequestStartDelay { get; set; }

	/// <summary>How long the request may run before it is timed out.</summary>
	public TimeSpan RequestTimeout { get; set; }

	/// <summary>Token that cancels the overall processing run.</summary>
	public CancellationToken CancellationToken { get; set; }
}
18 | }
19 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Requests/RequestProcessorOptions.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace InfinityCrawler.Processing.Requests
6 | {
/// <summary>
/// Tuning options for a request processor: concurrency, pacing, throttling and timeouts.
/// </summary>
public class RequestProcessorOptions
{
	/// <summary>
	/// Maximum number of simultaneous asynchronous requests to run at once.
	/// </summary>
	public int MaxNumberOfSimultaneousRequests { get; set; } = 10;

	/// <summary>
	/// Delay between one request starting and the next.
	/// </summary>
	public TimeSpan DelayBetweenRequestStart { get; set; } = TimeSpan.FromMilliseconds(1000);

	/// <summary>
	/// Maximum jitter applied to a request delay.
	/// </summary>
	public TimeSpan DelayJitter { get; set; } = TimeSpan.FromMilliseconds(1000);

	/// <summary>
	/// The request timeout length before throttling sets in.
	/// </summary>
	public TimeSpan TimeoutBeforeThrottle { get; set; } = TimeSpan.FromMilliseconds(2500);

	/// <summary>
	/// The amount of throttling delay to add to subsequent requests. This is added every time the timeout is hit.
	/// </summary>
	public TimeSpan ThrottlingRequestBackoff { get; set; } = TimeSpan.FromSeconds(5);

	/// <summary>
	/// Minimum number of requests below the timeout before minimising the applied throttling.
	/// </summary>
	public int MinSequentialSuccessesToMinimiseThrottling { get; set; } = 5;

	/// <summary>
	/// The amount of time before a request is cancelled and retried.
	/// </summary>
	public TimeSpan RequestTimeout { get; set; } = TimeSpan.FromSeconds(30);
}
38 | }
39 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/Processing/Requests/RequestResult.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Net;
5 | using System.Net.Http;
6 | using System.Net.Http.Headers;
7 | using System.Text;
8 |
9 | namespace InfinityCrawler.Processing.Requests
10 | {
/// <summary>
/// The outcome of a single HTTP request performed by a request processor.
/// Either the response fields or <see cref="Exception"/> are populated.
/// </summary>
public class RequestResult
{
	/// <summary>The URI that was requested.</summary>
	public Uri RequestUri { get; set; }

	/// <summary>When the request started (UTC).</summary>
	public DateTime RequestStart { get; set; }

	/// <summary>The delay (in milliseconds) applied before the request started.</summary>
	public double RequestStartDelay { get; set; }

	/// <summary>The HTTP status code; null when the request failed without a response.</summary>
	public HttpStatusCode? StatusCode { get; set; }

	/// <summary>Headers of the HTTP response, when one was received.</summary>
	public HttpResponseHeaders ResponseHeaders { get; set; }

	/// <summary>Headers describing the response content, when one was received.</summary>
	public HttpContentHeaders ContentHeaders { get; set; }

	/// <summary>The buffered response body, when one was received.</summary>
	public Stream Content { get; set; }

	/// <summary>How long the request took.</summary>
	public TimeSpan ElapsedTime { get; set; }

	/// <summary>The failure, when the request errored instead of completing.</summary>
	public Exception Exception { get; set; }
}
23 | }
24 |
--------------------------------------------------------------------------------
/src/InfinityCrawler/UriCrawlState.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace InfinityCrawler
6 | {
7 | public class UriCrawlState
8 | {
9 | public Uri Location { get; set; }
10 | public IList Requests { get; set; } = new List();
11 | public IList Redirects { get; set; }
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.Benchmarks/BasicSiteCrawlBenchmark.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Threading.Tasks;
3 | using BenchmarkDotNet.Attributes;
4 | using BenchmarkDotNet.Jobs;
5 | using InfinityCrawler.Processing.Requests;
6 | using InfinityCrawler.Tests.TestSite;
7 |
8 | namespace InfinityCrawler.Tests.Benchmarks
9 | {
10 | [SimpleJob(RuntimeMoniker.Net60)]
11 | [MemoryDiagnoser]
12 | public class BasicSiteCrawlBenchmark
13 | {
14 | private TestSiteManager TestSite { get; }
15 | private Crawler Crawler { get; }
16 | private Uri Uri { get; } = new Uri("http://localhost/");
17 |
18 | public BasicSiteCrawlBenchmark()
19 | {
20 | TestSite = new TestSiteManager(new SiteContext
21 | {
22 | SiteFolder = "BasicSite"
23 | });
24 |
25 | var client = TestSite.GetHttpClient();
26 | Crawler = new Crawler(client);
27 | }
28 |
29 | [GlobalSetup]
30 | public async Task Setup()
31 | {
32 | await CrawlSite(); // benchmark warmup as a workaround for https://github.com/dotnet/BenchmarkDotNet/issues/837
33 | }
34 |
35 | [Benchmark]
36 | public async Task CrawlSite()
37 | {
38 | _ = await Crawler.Crawl(Uri, new CrawlSettings
39 | {
40 | RequestProcessorOptions = new RequestProcessorOptions
41 | {
42 | MaxNumberOfSimultaneousRequests = 5,
43 | DelayBetweenRequestStart = new TimeSpan(),
44 | DelayJitter = new TimeSpan(),
45 | TimeoutBeforeThrottle = new TimeSpan()
46 | }
47 | });
48 | }
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.Benchmarks/InfinityCrawler.Tests.Benchmarks.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net6.0
6 | false
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.Benchmarks/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using BenchmarkDotNet.Running;
3 |
4 | namespace InfinityCrawler.Tests.Benchmarks
5 | {
6 | class Program
7 | {
8 | static void Main(string[] args)
9 | {
10 | BenchmarkRunner.Run();
11 | }
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Controllers/HelperController.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Net;
4 | using System.Text;
5 | using System.Threading;
6 | using System.Threading.Tasks;
7 | using Microsoft.AspNetCore.Mvc;
8 |
9 | namespace InfinityCrawler.Tests.TestSite.Controllers
10 | {
11 | [Route("/")]
12 | public class HelperController : ControllerBase
13 | {
14 | private SiteContext Context { get; }
15 |
16 | public HelperController(SiteContext context)
17 | {
18 | Context = context;
19 | }
20 |
21 | [Route("delay/{delay}/{path}")]
22 | public async Task Delay(int delay, string path)
23 | {
24 | await Task.Delay(delay);
25 | return new ContentResult
26 | {
27 | Content = path
28 | };
29 | }
30 |
31 | [Route("status/{statusCode}")]
32 | public IActionResult ReturnError(HttpStatusCode statusCode)
33 | {
34 | return new ContentResult
35 | {
36 | StatusCode = (int)statusCode,
37 | Content = statusCode.ToString()
38 | };
39 | }
40 |
41 | [Route("redirect/{depth}/{path}")]
42 | public IActionResult Redirect(int depth, string path)
43 | {
44 | if (depth <= 0)
45 | {
46 | return new ContentResult
47 | {
48 | Content = path
49 | };
50 | }
51 |
52 | return RedirectToAction("Redirect", new { depth = depth - 1, path });
53 | }
54 |
55 | [Route("sitemap.xml")]
56 | public IActionResult DynamicSitemap()
57 | {
58 | var defaultFile = "index.html";
59 |
60 | if (!string.IsNullOrEmpty(Context.EntryPath))
61 | {
62 | defaultFile = Context.EntryPath + defaultFile;
63 | }
64 |
65 | return new ContentResult
66 | {
67 | ContentType = "text/xml",
68 | Content = $@"
69 |
70 |
71 | http://localhost/{defaultFile}
72 |
73 | "
74 | };
75 | }
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Controllers/RobotsController.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Net;
4 | using System.Text;
5 | using Microsoft.AspNetCore.Mvc;
6 |
7 | namespace InfinityCrawler.Tests.TestSite.Controllers
8 | {
	/// <summary>
	/// Endpoints that serve pages with various X-Robots-Tag headers so tests can
	/// verify header-based robots rule handling.
	/// </summary>
	[Route("/robots/")]
	public class RobotsController : ControllerBase
	{
		// NOTE(review): the HTML markup inside this literal appears to have been
		// stripped by extraction (only the link text "Test Path" survives) -
		// restore the original markup from version control, not from this copy.
		private string GetHtml(string path)
		{
			return $@"




Test Path

";
		}

		// Wraps the generated HTML in a 200 OK text/html response.
		private ContentResult GetResult(string path)
		{
			return new ContentResult
			{
				StatusCode = (int)HttpStatusCode.OK,
				ContentType = "text/html",
				Content = GetHtml(path)
			};
		}

		// Page marked noindex for every bot via the X-Robots-Tag header.
		[Route("header-page-noindex")]
		public IActionResult AllNoIndex()
		{
			Response.Headers.Add("X-Robots-Tag", "noindex");
			return GetResult("header-page-no-index");
		}
		// Page marked nofollow for every bot via the X-Robots-Tag header.
		[Route("header-page-nofollow")]
		public IActionResult AllNoFollow()
		{
			Response.Headers.Add("X-Robots-Tag", "nofollow");
			return GetResult("header-page-no-follow");
		}
		// Page marked "none" (noindex + nofollow) for every bot.
		[Route("header-page-none")]
		public IActionResult AllNone()
		{
			Response.Headers.Add("X-Robots-Tag", "none");
			return GetResult("header-page-none");
		}
		// Page with per-bot robots rules sent as multiple header values.
		[Route("header-bot-specific")]
		public IActionResult BotSpecific()
		{
			Response.Headers.Add("X-Robots-Tag", new[]
			{
				"onebot: noindex",
				"twobot: nofollow"
			});
			return GetResult("header-bot-specific");
		}
	}
63 | }
64 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/InfinityCrawler.Tests.TestSite.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netstandard2.0;net6.0
5 | false
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | Always
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/basic-page.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Basic Page
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Index
5 |
6 |
7 | Basic HTML Page
8 | Robots Blocked Page
9 | Looping Links Page
10 | Index Page with Query String
11 | 500 Error Page
12 | 404 Error Page
13 | 403 Error Page
14 | 401 Error Page
15 | 305 Error Page
16 | Fragment Link
17 | Not-allowed External Site
18 | Allowed Domain
19 | Two Redirects
20 | Five Redirects
21 | Rel NoFollow Link
22 | Alternative URL scheme
23 |
24 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/looping-links.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Looping Links
5 |
6 |
7 | Index Page
8 | Malformed HTML Page
9 |
10 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/robots-blocked-child.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Robots Blocked Child
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/robots-blocked.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Robots Blocked
5 |
6 |
7 | Child Page
8 |
9 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /robots-blocked.html
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/AbsoluteCanonicalUri.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Absolute Canonical Uri
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/BaseHrefCrawlLink.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Base Href Crawl Link
7 |
8 |
9 |
10 | External Href
11 | Relative Fragment
12 | Relative File
13 | Relative File with Fragment
14 | Relative Base File
15 | Absolute File
16 |
17 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/CrawlLinkContent.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Crawl Link Content
7 |
8 |
9 | No Href
10 | Invalid Href
11 | Relative Fragment
12 | Relative File
13 | Same Relative File with Fragment
14 | Different Relative File with Fragment
15 | Title Attribute
16 | Rel No Follow
17 |
18 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNoFollow.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Meta No Index
7 |
8 |
9 |
10 | A Link
11 |
12 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNoIndex.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Meta No Index
7 |
8 |
9 |
10 | A Link
11 |
12 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNoIndexNoFollow.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Meta No Index
7 |
8 |
9 |
10 | A Link
11 |
12 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNone.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Meta No Index
7 |
8 |
9 |
10 | A Link
11 |
12 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/NoCanonicalUri.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | No Canonical Uri
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/RelativeCanonicalUri.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Relative Canonical Uri
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/DefaultRequestProcessor/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Resources/EmptySite/readme.txt:
--------------------------------------------------------------------------------
1 | This site is intentionally empty. This file exists for version control to store the folder.
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/SiteContext.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace InfinityCrawler.Tests.TestSite
6 | {
	/// <summary>
	/// Describes which test site to host and where crawling should start.
	/// </summary>
	public class SiteContext
	{
		// Name of the folder under Resources/ whose files the test site serves.
		public string SiteFolder { get; set; }
		// Optional path prefix prepended to the default entry file
		// (used by HelperController.DynamicSitemap when building the sitemap URL).
		public string EntryPath { get; set; }
	}
12 | }
13 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/Startup.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using Microsoft.AspNetCore.Builder;
4 | using Microsoft.Extensions.DependencyInjection;
5 | using Microsoft.Extensions.FileProviders;
6 |
7 | namespace InfinityCrawler.Tests.TestSite
8 | {
9 | public class Startup
10 | {
11 | private SiteContext Context { get; }
12 |
13 | public Startup(SiteContext context)
14 | {
15 | Context = context;
16 | }
17 |
18 | public void ConfigureServices(IServiceCollection services)
19 | {
20 | services.AddMvcCore();
21 | }
22 |
23 | public void Configure(IApplicationBuilder app)
24 | {
25 | app.UseStaticFiles(new StaticFileOptions
26 | {
27 | FileProvider = new PhysicalFileProvider(
28 | Path.Combine(Directory.GetCurrentDirectory(), $"Resources/{Context.SiteFolder}"))
29 | });
30 |
31 | #if NET6_0_OR_GREATER
32 | app.UseRouting();
33 | app.UseEndpoints(endpoints =>
34 | {
35 | endpoints.MapControllers();
36 | });
37 | #else
38 | app.UseMvc();
39 | #endif
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/TestHttpMessageHandler.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Net;
5 | using System.Net.Http;
6 | using System.Reflection;
7 | using System.Text;
8 | using System.Threading;
9 | using System.Threading.Tasks;
10 | using Microsoft.AspNetCore.TestHost;
11 |
12 | namespace InfinityCrawler.Tests.TestSite
13 | {
14 | public class TestHttpMessageHandler : HttpMessageHandler
15 | {
16 | private HttpMessageHandler InternalHandler { get; }
17 |
18 | public TestHttpMessageHandler(HttpMessageHandler internalHandler)
19 | {
20 | InternalHandler = internalHandler;
21 | }
22 |
23 | protected override async Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
24 | {
25 | try
26 | {
27 | if (request.RequestUri.Host == "test-domain.com")
28 | {
29 | //This is the only "remote" host allowed and even then, the response is always empty.
30 | var stream = new MemoryStream();
31 | return new HttpResponseMessage(HttpStatusCode.OK)
32 | {
33 | RequestMessage = request,
34 | Version = HttpVersion.Version11,
35 | Content = new StreamContent(stream)
36 | };
37 | }
38 |
39 | return await InternalSendAsync(request, cancellationToken);
40 | }
41 | catch (IOException ex) when (ex.Message == "The request was aborted or the pipeline has finished")
42 | {
43 | //This error only happens because the test server isn't actually called via HTTP, it is called directly
44 | //In reality, it would actually throw a `TaskCanceledException`
45 | throw new TaskCanceledException(null, ex);
46 | }
47 | }
48 |
49 | private async Task InternalSendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
50 | {
51 | var method = typeof(HttpMessageHandler).GetMethod("SendAsync", BindingFlags.NonPublic | BindingFlags.Instance);
52 | var invokedTask = (Task)method.Invoke(InternalHandler, new object[] { request, cancellationToken });
53 | return await invokedTask;
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests.TestSite/TestSiteManager.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Net.Http;
4 | using System.Text;
5 | using Microsoft.AspNetCore.Hosting;
6 | using Microsoft.AspNetCore.TestHost;
7 | using Microsoft.Extensions.DependencyInjection;
8 |
9 | namespace InfinityCrawler.Tests.TestSite
10 | {
11 | public class TestSiteManager : IDisposable
12 | {
13 | private TestServer Server { get; set; }
14 | private HttpClient Client { get; set; }
15 |
16 | public TestSiteManager(SiteContext context)
17 | {
18 | var builder = new WebHostBuilder()
19 | .ConfigureServices(s =>
20 | {
21 | s.AddSingleton(context);
22 | })
23 | .UseStartup();
24 |
25 | Server = new TestServer(builder);
26 |
27 | var internalHandler = Server.CreateHandler();
28 | Client = new HttpClient(new TestHttpMessageHandler(internalHandler));
29 | }
30 |
31 | public HttpClient GetHttpClient()
32 | {
33 | return Client;
34 | }
35 |
36 | public void Dispose()
37 | {
38 | if (Server != null)
39 | {
40 | Server.Dispose();
41 | Server = null;
42 |
43 | Client.Dispose();
44 | Client = null;
45 | }
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/BasicSiteTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Linq;
3 | using System.Net;
4 | using System.Threading.Tasks;
5 | using InfinityCrawler.Processing.Requests;
6 | using InfinityCrawler.Tests.TestSite;
7 | using Microsoft.VisualStudio.TestTools.UnitTesting;
8 |
9 | namespace InfinityCrawler.Tests
10 | {
11 | [TestClass]
12 | public class BasicSiteTests : CrawlerTestBase
13 | {
14 | private async Task GetCrawlResult()
15 | {
16 | var crawler = GetTestSiteCrawler(new SiteContext
17 | {
18 | SiteFolder = "BasicSite"
19 | });
20 | var settings = new CrawlSettings
21 | {
22 | RequestProcessor = GetLoggedRequestProcessor(),
23 | RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
24 | };
25 | return await crawler.Crawl(new Uri("http://localhost/"), settings);
26 | }
27 |
28 | [TestMethod]
29 | public async Task DiscoverIndexPageFromSitemap()
30 | {
31 | var result = await GetCrawlResult();
32 | var uri = new Uri("http://localhost/index.html");
33 | Assert.IsTrue(result.CrawledUris.Any(c => c.Location == uri));
34 | }
35 |
36 | [TestMethod]
37 | public async Task CrawledLinksOnIndexPage()
38 | {
39 | var result = await GetCrawlResult();
40 | var uri = new Uri("http://localhost/basic-page.html");
41 | Assert.IsTrue(result.CrawledUris.Any(c => c.Location == uri));
42 | }
43 |
44 | [TestMethod]
45 | public async Task ObeysRobotsBlocking()
46 | {
47 | var result = await GetCrawlResult();
48 | var uri = new Uri("http://localhost/robots-blocked.html");
49 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault();
50 |
51 | var robotsChildUri = new Uri("http://localhost/robots-blocked-childs.html");
52 |
53 | Assert.AreEqual(CrawlStatus.RobotsBlocked, crawledUri.Status);
54 | Assert.IsFalse(result.CrawledUris.Any(c => c.Location == robotsChildUri));
55 | }
56 |
57 | [TestMethod]
58 | public async Task UrisOnlyAppearOnceInResults()
59 | {
60 | var result = await GetCrawlResult();
61 | var uri = new Uri("http://localhost/index.html");
62 | Assert.AreEqual(1, result.CrawledUris.Count(c => c.Location == uri));
63 | }
64 |
65 | [TestMethod]
66 | public async Task UrisAreRetriedOnServerErrors()
67 | {
68 | var result = await GetCrawlResult();
69 | var uri = new Uri("http://localhost/status/500");
70 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault();
71 | Assert.AreEqual(3, crawledUri.Requests.Count);
72 | }
73 |
74 | [TestMethod]
75 | public async Task UrisAreNotRetriedOn4xxErrors()
76 | {
77 | var result = await GetCrawlResult();
78 | var uris = new[]
79 | {
80 | new Uri("http://localhost/status/404"),
81 | new Uri("http://localhost/status/403"),
82 | new Uri("http://localhost/status/401")
83 | };
84 | Assert.IsTrue(uris.All(uri => result.CrawledUris.Any(c => c.Location == uri && c.Requests.Count == 1)));
85 | }
86 |
87 | [TestMethod]
88 | public async Task ExternalSitesAreNotCrawled()
89 | {
90 | var result = await GetCrawlResult();
91 | var uri = new Uri("http://localhost/index.html");
92 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault();
93 |
94 | var externalUri = new Uri("http://not-allowed-domain.com");
95 |
96 | Assert.IsTrue(crawledUri.Content.Links.Any(l => l.Location == externalUri));
97 | Assert.IsFalse(result.CrawledUris.Any(c => c.Location == externalUri));
98 | }
99 |
100 | [TestMethod]
101 | public async Task AllowedExternalSitesAreCrawled()
102 | {
103 | var crawler = GetTestSiteCrawler(new SiteContext
104 | {
105 | SiteFolder = "BasicSite"
106 | });
107 | var settings = new CrawlSettings
108 | {
109 | HostAliases = new[] { "test-domain.com" },
110 | RequestProcessor = GetLoggedRequestProcessor(),
111 | RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
112 | };
113 | var result = await crawler.Crawl(new Uri("http://localhost/"), settings);
114 | var uri = new Uri("http://localhost/index.html");
115 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault();
116 |
117 | var externalUri = new Uri("http://test-domain.com");
118 |
119 | Assert.IsTrue(crawledUri.Content.Links.Any(l => l.Location == externalUri));
120 |
121 | var externalCrawl = result.CrawledUris.FirstOrDefault(c => c.Location == externalUri);
122 | Assert.IsNotNull(externalCrawl);
123 | Assert.AreEqual(HttpStatusCode.OK, externalCrawl.Requests.LastOrDefault().StatusCode);
124 | }
125 |
126 | [TestMethod]
127 | public async Task RelNoFollowLinksAreIgnored()
128 | {
129 | var result = await GetCrawlResult();
130 | var uri = new Uri("http://localhost/index.html?v=rel-no-follow");
131 | Assert.AreEqual(0, result.CrawledUris.Count(c => c.Location == uri));
132 | }
133 |
134 | [TestMethod]
135 | public async Task MaximumRedirectLimitFollowed()
136 | {
137 | var result = await GetCrawlResult();
138 | var uri = new Uri("http://localhost/redirect/2/five-redirects");
139 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault();
140 |
141 | Assert.AreEqual(CrawlStatus.MaxRedirects, crawledUri.Status);
142 | Assert.AreEqual(3, crawledUri.RedirectChain.Count);
143 | }
144 |
145 | [DataRow(2)]
146 | [DataRow(4)]
147 | [DataTestMethod]
148 | public async Task MaximumPagesCrawledFollowed(int maxPages)
149 | {
150 | var crawler = GetTestSiteCrawler(new SiteContext
151 | {
152 | SiteFolder = "BasicSite"
153 | });
154 | var settings = new CrawlSettings
155 | {
156 | RequestProcessor = GetLoggedRequestProcessor(),
157 | RequestProcessorOptions = GetNoDelayRequestProcessorOptions()
158 | };
159 |
160 | settings.MaxNumberOfPagesToCrawl = maxPages;
161 | var result = await crawler.Crawl(new Uri("http://localhost/"), settings);
162 | Assert.AreEqual(maxPages, result.CrawledUris.Count());
163 | }
164 |
165 | [TestMethod]
166 | public async Task AutoRetryOnFailure()
167 | {
168 | var crawler = GetTestSiteCrawler(new SiteContext
169 | {
170 | SiteFolder = "EmptySite"
171 | });
172 | var settings = new CrawlSettings
173 | {
174 | NumberOfRetries = 3,
175 | RequestProcessor = GetLoggedRequestProcessor(),
176 | RequestProcessorOptions = new RequestProcessorOptions
177 | {
178 | DelayBetweenRequestStart = new TimeSpan(),
179 | MaxNumberOfSimultaneousRequests = 4,
180 | TimeoutBeforeThrottle = new TimeSpan(),
181 | DelayJitter = new TimeSpan(),
182 | RequestTimeout = new TimeSpan(0, 0, 0, 0, 150)
183 | }
184 | };
185 |
186 | settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-1"));
187 | settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-2"));
188 | settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-3"));
189 | settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-4"));
190 |
191 | var results = await crawler.Crawl(new Uri("http://localhost/"), settings);
192 | var delayedCrawls = results.CrawledUris.Where(c => c.Location.PathAndQuery.Contains("delay")).ToArray();
193 |
194 | foreach (var crawledUri in delayedCrawls)
195 | {
196 | Assert.AreEqual(CrawlStatus.MaxRetries, crawledUri.Status);
197 | Assert.IsNull(crawledUri.Content);
198 | }
199 | }
200 | }
201 | }
202 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/ContentProcessorTestBase.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 | using InfinityCrawler.Processing.Content;
7 | using InfinityCrawler.Tests.TestSite;
8 |
9 | namespace InfinityCrawler.Tests
10 | {
11 | public class ContentProcessorTestBase : TestBase
12 | {
13 | protected async Task RequestAndProcessContentAsync(SiteContext siteContext, Uri requestUri, IContentProcessor contentProcessor)
14 | {
15 | var httpClient = TestSiteConfiguration.GetHttpClient(siteContext);
16 | using (var response = await httpClient.GetAsync(requestUri))
17 | {
18 | await response.Content.LoadIntoBufferAsync();
19 | using (var contentStream = await response.Content.ReadAsStreamAsync())
20 | {
21 | var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
22 | return contentProcessor.Parse(requestUri, headers, contentStream);
23 | }
24 | }
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/CrawlerTestBase.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 | using InfinityCrawler.Processing.Requests;
7 | using InfinityCrawler.Tests.TestSite;
8 |
9 | namespace InfinityCrawler.Tests
10 | {
	/// <summary>
	/// Base class for crawler tests: builds crawlers against the in-memory test
	/// site and provides common request-processor configurations.
	/// </summary>
	public class CrawlerTestBase : TestBase
	{
		// Creates a crawler whose HttpClient talks directly to the in-memory test site.
		protected Crawler GetTestSiteCrawler(SiteContext siteContext)
		{
			var httpClient = TestSiteConfiguration.GetHttpClient(siteContext);
			// NOTE(review): GetLogger() may have had a generic type argument
			// (e.g. GetLogger<Crawler>()) stripped during extraction - confirm against TestBase.
			return new Crawler(httpClient, GetLogger());
		}

		// Options with all pacing/throttling delays zeroed so tests run fast.
		protected RequestProcessorOptions GetNoDelayRequestProcessorOptions()
		{
			return new RequestProcessorOptions
			{
				MaxNumberOfSimultaneousRequests = 10,
				DelayBetweenRequestStart = new TimeSpan(),
				DelayJitter = new TimeSpan(),
				TimeoutBeforeThrottle = new TimeSpan()
			};
		}

		// A DefaultRequestProcessor wired with a test logger.
		protected DefaultRequestProcessor GetLoggedRequestProcessor()
		{
			// NOTE(review): GetLogger() may have had a generic type argument
			// (e.g. GetLogger<DefaultRequestProcessor>()) stripped during extraction.
			var requestProcessorLogger = GetLogger();
			return new DefaultRequestProcessor(requestProcessorLogger);
		}
	}
36 | }
37 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/DefaultContentProcessorTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 | using InfinityCrawler.Processing.Content;
7 | using InfinityCrawler.Tests.TestSite;
8 | using Microsoft.VisualStudio.TestTools.UnitTesting;
9 |
10 | namespace InfinityCrawler.Tests
11 | {
12 | [TestClass]
13 | public class DefaultContentProcessorTests : ContentProcessorTestBase
14 | {
15 | private async Task PerformRequestAsync(string path)
16 | {
17 | var requestUri = new UriBuilder("http://localhost/")
18 | {
19 | Path = path
20 | }.Uri;
21 |
22 | return await RequestAndProcessContentAsync(new SiteContext
23 | {
24 | SiteFolder = "DefaultContentProcessor"
25 | }, requestUri, new DefaultContentProcessor());
26 | }
27 |
28 | [TestMethod]
29 | public async Task NoMetaParsed()
30 | {
31 | var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
32 | Assert.AreEqual(0, crawledContent.PageRobotRules.Count());
33 | }
34 |
35 | [TestMethod]
36 | public async Task MissingHrefLinksAreIgnored()
37 | {
38 | var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
39 | Assert.AreEqual(6, crawledContent.Links.Count());
40 | Assert.IsFalse(crawledContent.Links.Any(l => l.Text == "No Href"));
41 | }
42 |
43 | [TestMethod]
44 | public async Task InvalidHrefLinksAreIgnored()
45 | {
46 | var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
47 | Assert.AreEqual(6, crawledContent.Links.Count());
48 | Assert.IsFalse(crawledContent.Links.Any(l => l.Text == "Invalid Href"));
49 | }
50 |
51 | [TestMethod]
52 | public async Task TitleAttributeIsParsed()
53 | {
54 | var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
55 |
56 | Assert.IsTrue(crawledContent.Links.Any(l => l.Title == "Title Attribute"));
57 | Assert.IsNull(crawledContent.Links.FirstOrDefault(l => l.Text == "Relative File").Title);
58 | }
59 |
60 | [TestMethod]
61 | public async Task RelAttributeIsParsed()
62 | {
63 | var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
64 |
65 | Assert.IsTrue(crawledContent.Links.Any(l => l.Relationship == "nofollow"));
66 | Assert.IsNull(crawledContent.Links.FirstOrDefault(l => l.Text == "Relative File").Relationship);
67 | }
68 |
69 | [TestMethod]
70 | public async Task MetaRobotsParsed()
71 | {
72 | var crawledContent = await PerformRequestAsync("MetaNoFollow.html");
73 | Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("nofollow", StringComparison.InvariantCultureIgnoreCase)));
74 |
75 | crawledContent = await PerformRequestAsync("MetaNoIndex.html");
76 | Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("noindex", StringComparison.InvariantCultureIgnoreCase)));
77 |
78 | crawledContent = await PerformRequestAsync("MetaNoIndexNoFollow.html");
79 | Assert.IsTrue(crawledContent.PageRobotRules.Any(r =>
80 | r.IndexOf("noindex", StringComparison.InvariantCultureIgnoreCase) != -1 &&
81 | r.IndexOf("nofollow", StringComparison.InvariantCultureIgnoreCase) != -1
82 | ));
83 |
84 | crawledContent = await PerformRequestAsync("MetaNone.html");
85 | Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("none", StringComparison.InvariantCultureIgnoreCase)));
86 | }
87 | [TestMethod]
88 | public async Task HeaderRobotsParsed()
89 | {
90 | var crawledContent = await PerformRequestAsync("robots/header-page-noindex");
91 | Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("noindex", StringComparison.InvariantCultureIgnoreCase)));
92 |
93 | crawledContent = await PerformRequestAsync("robots/header-bot-specific");
94 | Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Contains("onebot")));
95 | Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Contains("twobot")));
96 | }
97 |
98 | [TestMethod]
99 | public async Task CanonicalUriParsing()
100 | {
101 | var crawledContent = await PerformRequestAsync("NoCanonicalUri.html");
102 | Assert.IsNull(crawledContent.CanonicalUri);
103 |
104 | crawledContent = await PerformRequestAsync("RelativeCanonicalUri.html");
105 | Assert.AreEqual(new Uri("http://localhost/RelativeCanonicalUri.html"), crawledContent.CanonicalUri);
106 |
107 | crawledContent = await PerformRequestAsync("AbsoluteCanonicalUri.html");
108 | Assert.AreEqual(new Uri("http://localhost/AbsoluteCanonicalUri.html"), crawledContent.CanonicalUri);
109 | }
110 | [TestMethod]
111 | public async Task BaseHrefLinks()
112 | {
113 | var crawledContent = await PerformRequestAsync("BaseHrefCrawlLink.html");
114 | var links = crawledContent.Links.ToArray();
115 |
116 | Assert.AreEqual(new Uri("http://test-domain.com/"), links[0].Location);
117 | Assert.AreEqual(new Uri("http://localhost/base/#RelativeFragment"), links[1].Location);
118 | Assert.AreEqual(new Uri("http://localhost/base/relative/RelativeFile.html"), links[2].Location);
119 | Assert.AreEqual(new Uri("http://localhost/base/relative/RelativeFile.html#Fragment"), links[3].Location);
120 | Assert.AreEqual(new Uri("http://localhost/RelativeBaseFile.html"), links[4].Location);
121 | Assert.AreEqual(new Uri("http://localhost/absolute/AbsoluteBaseFile.html"), links[5].Location);
122 | }
123 | }
124 | }
125 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/DefaultRequestProcessorTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Concurrent;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading;
7 | using System.Threading.Tasks;
8 | using InfinityCrawler.Processing.Requests;
9 | using InfinityCrawler.Tests.TestSite;
10 | using Microsoft.VisualStudio.TestTools.UnitTesting;
11 |
12 | namespace InfinityCrawler.Tests
13 | {
14 | [TestClass]
15 | public class DefaultRequestProcessorTests : TestBase
16 | {
17 | [TestMethod]
18 | public async Task ThrottlingTest()
19 | {
20 | var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
21 | {
22 | SiteFolder = "DefaultRequestProcessor"
23 | });
24 |
25 | var processor = new DefaultRequestProcessor(GetLogger());
26 |
27 | //Warmup
28 | processor.Add(new Uri("http://localhost/delay/50/warmup"));
29 | await processor.ProcessAsync(httpClient, requestResult => Task.CompletedTask, new RequestProcessorOptions
30 | {
31 | DelayJitter = new TimeSpan(),
32 | DelayBetweenRequestStart = new TimeSpan(0, 0, 0, 0, 50)
33 | });
34 |
35 | processor.Add(new Uri("http://localhost/delay/50/50ms-delay-1"));
36 | processor.Add(new Uri("http://localhost/delay/50/50ms-delay-2"));
37 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-1"));
38 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-2"));
39 | processor.Add(new Uri("http://localhost/delay/50/50ms-delay-3"));
40 | processor.Add(new Uri("http://localhost/delay/50/50ms-delay-4"));
41 | processor.Add(new Uri("http://localhost/delay/50/50ms-delay-5"));
42 | processor.Add(new Uri("http://localhost/delay/50/50ms-delay-6"));
43 | processor.Add(new Uri("http://localhost/delay/50/50ms-delay-7"));
44 |
45 | var results = new List();
46 | await processor.ProcessAsync(httpClient, requestResult =>
47 | {
48 | results.Add(requestResult);
49 | return Task.CompletedTask;
50 | }, new RequestProcessorOptions
51 | {
52 | MaxNumberOfSimultaneousRequests = 1,
53 | MinSequentialSuccessesToMinimiseThrottling = 2,
54 | DelayBetweenRequestStart = new TimeSpan(),
55 | DelayJitter = new TimeSpan(),
56 | TimeoutBeforeThrottle = new TimeSpan(0, 0, 0, 0, 270),
57 | ThrottlingRequestBackoff = new TimeSpan(0, 0, 0, 0, 100)
58 | });
59 |
60 | Assert.AreEqual(0, results[0].RequestStartDelay);
61 | Assert.AreEqual(0, results[1].RequestStartDelay);
62 | Assert.AreEqual(0, results[2].RequestStartDelay);
63 | Assert.AreEqual(100, results[3].RequestStartDelay);
64 | Assert.AreEqual(200, results[4].RequestStartDelay);
65 | Assert.AreEqual(200, results[5].RequestStartDelay);
66 | Assert.AreEqual(100, results[6].RequestStartDelay);
67 | Assert.AreEqual(100, results[7].RequestStartDelay);
68 | Assert.AreEqual(0, results[8].RequestStartDelay);
69 | }
70 |
71 | [TestMethod]
72 | public async Task ProcessCancellationTest()
73 | {
74 | var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
75 | {
76 | SiteFolder = "DefaultRequestProcessor"
77 | });
78 |
79 | var processor = new DefaultRequestProcessor(GetLogger());
80 |
81 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-1"));
82 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-2"));
83 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-3"));
84 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-4"));
85 |
86 | var results = new ConcurrentBag();
87 | var tokenSource = new CancellationTokenSource(300);
88 |
89 | try
90 | {
91 | await processor.ProcessAsync(httpClient, requestResult =>
92 | {
93 | results.Add(requestResult);
94 | return Task.CompletedTask;
95 | }, new RequestProcessorOptions
96 | {
97 | DelayBetweenRequestStart = new TimeSpan(),
98 | MaxNumberOfSimultaneousRequests = 2,
99 | TimeoutBeforeThrottle = new TimeSpan(),
100 | DelayJitter = new TimeSpan()
101 | }, tokenSource.Token);
102 | }
103 | catch (OperationCanceledException)
104 | {
105 |
106 | }
107 |
108 | Assert.AreNotEqual(3, results.Count);
109 | Assert.AreNotEqual(4, results.Count);
110 | }
111 |
112 | [TestMethod]
113 | public async Task RequestTimeoutTest()
114 | {
115 | var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
116 | {
117 | SiteFolder = "DefaultRequestProcessor"
118 | });
119 |
120 | var processor = new DefaultRequestProcessor(GetLogger());
121 |
122 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-1"));
123 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-2"));
124 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-3"));
125 | processor.Add(new Uri("http://localhost/delay/300/300ms-delay-4"));
126 |
127 | var results = new ConcurrentBag();
128 |
129 | await processor.ProcessAsync(httpClient, requestResult =>
130 | {
131 | results.Add(requestResult);
132 | return Task.CompletedTask;
133 | }, new RequestProcessorOptions
134 | {
135 | DelayBetweenRequestStart = new TimeSpan(),
136 | MaxNumberOfSimultaneousRequests = 4,
137 | TimeoutBeforeThrottle = new TimeSpan(),
138 | DelayJitter = new TimeSpan(),
139 | RequestTimeout = new TimeSpan(0, 0, 0, 0, 150)
140 | });
141 |
142 | Assert.AreEqual(4, results.Count);
143 |
144 | foreach (var requestResult in results)
145 | {
146 | Assert.IsInstanceOfType(requestResult.Exception, typeof(OperationCanceledException));
147 | }
148 | }
149 | [TestMethod, ExpectedExceptionPattern(typeof(Exception), nameof(FaultedTaskThrowsException))]
150 | public async Task FaultedTaskThrowsException()
151 | {
152 | var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
153 | {
154 | SiteFolder = "DefaultRequestProcessor"
155 | });
156 |
157 | var processor = new DefaultRequestProcessor(GetLogger());
158 |
159 | processor.Add(new Uri("http://localhost/"));
160 |
161 | await processor.ProcessAsync(httpClient, requestResult =>
162 | {
163 | throw new Exception(nameof(FaultedTaskThrowsException));
164 | }, new RequestProcessorOptions
165 | {
166 | DelayBetweenRequestStart = new TimeSpan(),
167 | DelayJitter = new TimeSpan()
168 | });
169 | }
170 | }
171 | }
172 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/ExpectedExceptionPatternAttribute.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Text.RegularExpressions;
6 | using System.Threading.Tasks;
7 | using Microsoft.VisualStudio.TestTools.UnitTesting;
8 |
9 | namespace InfinityCrawler.Tests
10 | {
11 | public class ExpectedExceptionPatternAttribute : ExpectedExceptionBaseAttribute
12 | {
13 | private Type ExpectedExceptionType { get; }
14 | private Regex MessagePattern { get; }
15 | private string RawPattern { get; }
16 |
17 | public ExpectedExceptionPatternAttribute(Type expectedExceptionType, string exceptionMessagePattern)
18 | {
19 | ExpectedExceptionType = expectedExceptionType;
20 | MessagePattern = new Regex(exceptionMessagePattern);
21 | RawPattern = exceptionMessagePattern;
22 | }
23 |
24 | protected override void Verify(Exception exception)
25 | {
26 | Assert.IsNotNull(exception, $"\"{nameof(exception)}\" is null");
27 |
28 | var thrownExceptionType = exception.GetType();
29 |
30 | if (ExpectedExceptionType != thrownExceptionType)
31 | {
32 | throw new Exception($"Test method threw exception {thrownExceptionType.FullName}, but exception {ExpectedExceptionType.FullName} was expected. Exception message: {exception.Message}");
33 | }
34 |
35 | if (!MessagePattern.IsMatch(exception.Message))
36 | {
37 | throw new Exception($"Thrown exception message \"{exception.Message}\" does not match pattern \"{RawPattern}\".");
38 | }
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/InfinityCrawler.Tests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net461;netcoreapp3.1;net5.0;net6.0;
5 |
6 | false
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | all
15 | runtime; build; native; contentfiles; analyzers; buildtransitive
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/TestBase.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Text;
5 | using InfinityCrawler.Tests.TestSite;
6 | using Microsoft.Extensions.DependencyInjection;
7 | using Microsoft.Extensions.Logging;
8 | using Microsoft.VisualStudio.TestTools.UnitTesting;
9 |
10 | namespace InfinityCrawler.Tests
11 | {
12 | [TestClass]
13 | public class TestBase
14 | {
15 | private ILoggerFactory LoggerFactory { get; }
16 |
17 | public TestBase()
18 | {
19 | var serviceProvider = new ServiceCollection()
20 | .AddLogging(builder =>
21 | {
22 | builder.AddFilter("InfinityCrawler", LogLevel.Trace);
23 | builder.AddConsole();
24 | builder.AddDebug();
25 | })
26 | .BuildServiceProvider();
27 |
28 | LoggerFactory = serviceProvider.GetService();
29 | }
30 |
31 | protected ILogger GetLogger()
32 | {
33 | return LoggerFactory.CreateLogger();
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/TestSiteConfiguration.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Net.Http;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 | using InfinityCrawler.Tests.TestSite;
8 | using Microsoft.VisualStudio.TestTools.UnitTesting;
9 |
10 | namespace InfinityCrawler.Tests
11 | {
12 | [TestClass]
13 | public static class TestSiteConfiguration
14 | {
15 | private static Dictionary TestSites { get; } = new Dictionary();
16 |
17 | public static HttpClient GetHttpClient(SiteContext siteContext)
18 | {
19 | if (!TestSites.ContainsKey(siteContext.SiteFolder))
20 | {
21 | var testSiteManager = new TestSiteManager(siteContext);
22 | TestSites.Add(siteContext.SiteFolder, testSiteManager);
23 | }
24 |
25 | return TestSites[siteContext.SiteFolder].GetHttpClient();
26 | }
27 |
28 | public static void ShutdownSites()
29 | {
30 | foreach (var site in TestSites.Values)
31 | {
32 | site.Dispose();
33 | }
34 |
35 | TestSites.Clear();
36 | }
37 |
38 | [AssemblyCleanup]
39 | public static void AssemblyCleanup()
40 | {
41 | ShutdownSites();
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------