├── .codecov.yml ├── .editorconfig ├── .github ├── FUNDING.yml ├── dependabot.yml ├── release-drafter.yml └── workflows │ ├── build.yml │ └── release-drafter.yml ├── .gitignore ├── CodeCoverage.runsettings ├── InfinityCrawler.sln ├── License.txt ├── README.md ├── images └── icon.png ├── src ├── Directory.Build.props └── InfinityCrawler │ ├── CrawlLink.cs │ ├── CrawlResult.cs │ ├── CrawlSettings.cs │ ├── CrawledUri.cs │ ├── Crawler.cs │ ├── InfinityCrawler.csproj │ ├── Internal │ ├── CrawlRunner.cs │ └── UriExtensions.cs │ ├── Processing │ ├── Content │ │ ├── CrawlHeaders.cs │ │ ├── DefaultContentProcessor.cs │ │ └── IContentProcessor.cs │ └── Requests │ │ ├── DefaultRequestProcessor.cs │ │ ├── IRequestProcessor.cs │ │ ├── RequestContext.cs │ │ ├── RequestProcessorOptions.cs │ │ └── RequestResult.cs │ └── UriCrawlState.cs └── tests ├── InfinityCrawler.Tests.Benchmarks ├── BasicSiteCrawlBenchmark.cs ├── InfinityCrawler.Tests.Benchmarks.csproj └── Program.cs ├── InfinityCrawler.Tests.TestSite ├── Controllers │ ├── HelperController.cs │ └── RobotsController.cs ├── InfinityCrawler.Tests.TestSite.csproj ├── Resources │ ├── BasicSite │ │ ├── basic-page.html │ │ ├── index.html │ │ ├── looping-links.html │ │ ├── robots-blocked-child.html │ │ ├── robots-blocked.html │ │ └── robots.txt │ ├── DefaultContentProcessor │ │ ├── AbsoluteCanonicalUri.html │ │ ├── BaseHrefCrawlLink.html │ │ ├── CrawlLinkContent.html │ │ ├── MetaNoFollow.html │ │ ├── MetaNoIndex.html │ │ ├── MetaNoIndexNoFollow.html │ │ ├── MetaNone.html │ │ ├── NoCanonicalUri.html │ │ └── RelativeCanonicalUri.html │ ├── DefaultRequestProcessor │ │ └── index.html │ └── EmptySite │ │ └── readme.txt ├── SiteContext.cs ├── Startup.cs ├── TestHttpMessageHandler.cs └── TestSiteManager.cs └── InfinityCrawler.Tests ├── BasicSiteTests.cs ├── ContentProcessorTestBase.cs ├── CrawlerTestBase.cs ├── DefaultContentProcessorTests.cs ├── DefaultRequestProcessorTests.cs ├── ExpectedExceptionPatternAttribute.cs ├── 
InfinityCrawler.Tests.csproj ├── TestBase.cs └── TestSiteConfiguration.cs /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: off -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # Based on the EditorConfig from Roslyn 2 | # top-most EditorConfig file 3 | root = true 4 | 5 | [*.cs] 6 | indent_style = tab 7 | 8 | # Sort using and Import directives with System.* appearing first 9 | dotnet_sort_system_directives_first = true 10 | # Avoid "this." and "Me." if not necessary 11 | dotnet_style_qualification_for_field = false:suggestion 12 | dotnet_style_qualification_for_property = false:suggestion 13 | dotnet_style_qualification_for_method = false:suggestion 14 | dotnet_style_qualification_for_event = false:suggestion 15 | 16 | # Use language keywords instead of framework type names for type references 17 | dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion 18 | dotnet_style_predefined_type_for_member_access = true:suggestion 19 | 20 | # Suggest more modern language features when available 21 | dotnet_style_object_initializer = true:suggestion 22 | dotnet_style_collection_initializer = true:suggestion 23 | dotnet_style_coalesce_expression = true:suggestion 24 | dotnet_style_null_propagation = true:suggestion 25 | dotnet_style_explicit_tuple_names = true:suggestion 26 | 27 | # Prefer "var" everywhere 28 | csharp_style_var_for_built_in_types = true:suggestion 29 | csharp_style_var_when_type_is_apparent = true:suggestion 30 | csharp_style_var_elsewhere = true:suggestion 31 | 32 | # Prefer method-like constructs to have a block body 33 | csharp_style_expression_bodied_methods = false:none 34 | csharp_style_expression_bodied_constructors = false:none 35 | csharp_style_expression_bodied_operators = false:none 36 | 37 | # Prefer property-like constructs to have an 
expression-body 38 | csharp_style_expression_bodied_properties = when_on_single_line:suggestion 39 | csharp_style_expression_bodied_indexers = true:none 40 | csharp_style_expression_bodied_accessors = when_on_single_line:suggestion 41 | 42 | # Suggest more modern language features when available 43 | csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion 44 | csharp_style_pattern_matching_over_as_with_null_check = true:suggestion 45 | csharp_style_inlined_variable_declaration = true:suggestion 46 | csharp_style_throw_expression = true:suggestion 47 | csharp_style_conditional_delegate_call = true:suggestion 48 | 49 | # Newline settings 50 | csharp_new_line_before_open_brace = all 51 | csharp_new_line_before_else = true 52 | csharp_new_line_before_catch = true 53 | csharp_new_line_before_finally = true 54 | csharp_new_line_before_members_in_object_initializers = true 55 | csharp_new_line_before_members_in_anonymous_types = true 56 | 57 | # Misc 58 | csharp_space_after_keywords_in_control_flow_statements = true 59 | csharp_space_between_method_declaration_parameter_list_parentheses = false 60 | csharp_space_between_method_call_parameter_list_parentheses = false 61 | csharp_space_between_parentheses = false 62 | csharp_preserve_single_line_statements = false 63 | csharp_preserve_single_line_blocks = true 64 | csharp_indent_case_contents = true 65 | csharp_indent_switch_labels = true 66 | csharp_indent_labels = no_change 67 | 68 | # Custom naming conventions 69 | dotnet_naming_rule.non_field_members_must_be_capitalized.symbols = non_field_member_symbols 70 | dotnet_naming_symbols.non_field_member_symbols.applicable_kinds = property,method,event,delegate 71 | dotnet_naming_symbols.non_field_member_symbols.applicable_accessibilities = * 72 | 73 | dotnet_naming_rule.non_field_members_must_be_capitalized.style = pascal_case_style 74 | dotnet_naming_style.pascal_case_style.capitalization = pascal_case 75 | 76 | 
dotnet_naming_rule.non_field_members_must_be_capitalized.severity = suggestion -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: Turnerj -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: nuget 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: '$RESOLVED_VERSION' 2 | tag-template: '$RESOLVED_VERSION' 3 | categories: 4 | - title: '🚀 Features' 5 | labels: 6 | - 'feature' 7 | - 'enhancement' 8 | - title: '🐛 Bug Fixes' 9 | labels: 10 | - 'bug' 11 | - 'bugfix' 12 | - title: '🧰 Maintenance' 13 | label: 14 | - 'dependencies' 15 | - 'maintenance' 16 | change-template: '- $TITLE by @$AUTHOR (#$NUMBER)' 17 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. 18 | version-resolver: 19 | major: 20 | labels: 21 | - 'major' 22 | minor: 23 | labels: 24 | - 'minor' 25 | patch: 26 | labels: 27 | - 'patch' 28 | default: patch 29 | template: | 30 | ## Changes 31 | 32 | $CHANGES 33 | 34 | ## 👨🏼‍💻 Contributors 35 | 36 | $CONTRIBUTORS -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | release: 8 | types: [ published ] 9 | 10 | env: 11 | # Disable the .NET logo in the console output. 
12 | DOTNET_NOLOGO: true 13 | # Disable the .NET first time experience to skip caching NuGet packages and speed up the build. 14 | DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true 15 | # Disable sending .NET CLI telemetry to Microsoft. 16 | DOTNET_CLI_TELEMETRY_OPTOUT: true 17 | 18 | BUILD_ARTIFACT_PATH: ${{github.workspace}}/build-artifacts 19 | 20 | jobs: 21 | 22 | build: 23 | name: Build ${{matrix.os}} 24 | runs-on: ${{matrix.os}} 25 | strategy: 26 | matrix: 27 | os: [ubuntu-latest, windows-latest, macOS-latest] 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v2 31 | - name: Setup dotnet 3.1 5.0 6.0 32 | uses: actions/setup-dotnet@v1 33 | with: 34 | dotnet-version: | 35 | 3.1.x 36 | 5.0.x 37 | 6.0.x 38 | - name: Install dependencies 39 | run: dotnet restore 40 | - name: Build 41 | run: dotnet build --no-restore -c Release /p:ContinuousIntegrationBuild=true 42 | - name: Test with Coverage 43 | run: dotnet test --no-restore --logger trx --results-directory ${{env.BUILD_ARTIFACT_PATH}}/coverage --collect "XPlat Code Coverage" --settings CodeCoverage.runsettings /p:SkipBuildVersioning=true 44 | - name: Pack 45 | run: dotnet pack --no-build -c Release /p:PackageOutputPath=${{env.BUILD_ARTIFACT_PATH}} /p:ContinuousIntegrationBuild=true 46 | - name: Publish artifacts 47 | uses: actions/upload-artifact@v2 48 | with: 49 | name: ${{matrix.os}} 50 | path: ${{env.BUILD_ARTIFACT_PATH}} 51 | 52 | coverage: 53 | name: Process code coverage 54 | runs-on: ubuntu-latest 55 | needs: build 56 | steps: 57 | - name: Checkout 58 | uses: actions/checkout@v2 59 | - name: Download coverage reports 60 | uses: actions/download-artifact@v2 61 | - name: Install ReportGenerator tool 62 | run: dotnet tool install -g dotnet-reportgenerator-globaltool 63 | - name: Prepare coverage reports 64 | run: reportgenerator -reports:*/coverage/*/coverage.cobertura.xml -targetdir:./ -reporttypes:Cobertura 65 | - name: Upload coverage report 66 | uses: codecov/codecov-action@v1.0.13 67 | with: 68 | 
file: Cobertura.xml 69 | fail_ci_if_error: false 70 | - name: Save combined coverage report as artifact 71 | uses: actions/upload-artifact@v2 72 | with: 73 | name: coverage-report 74 | path: Cobertura.xml 75 | 76 | push-to-github-packages: 77 | name: 'Push GitHub Packages' 78 | needs: build 79 | if: github.ref == 'refs/heads/main' || github.event_name == 'release' 80 | environment: 81 | name: 'GitHub Packages' 82 | url: https://github.com/TurnerSoftware/InfinityCrawler/packages 83 | permissions: 84 | packages: write 85 | runs-on: ubuntu-latest 86 | steps: 87 | - name: 'Download build' 88 | uses: actions/download-artifact@v2 89 | with: 90 | name: 'ubuntu-latest' 91 | - name: 'Add NuGet source' 92 | run: dotnet nuget add source https://nuget.pkg.github.com/TurnerSoftware/index.json --name GitHub --username Turnerj --password ${{secrets.GITHUB_TOKEN}} --store-password-in-clear-text 93 | - name: 'Upload NuGet package' 94 | run: dotnet nuget push *.nupkg --api-key ${{secrets.GH_PACKAGE_REGISTRY_API_KEY}} --source GitHub --skip-duplicate 95 | 96 | push-to-nuget: 97 | name: 'Push NuGet Packages' 98 | needs: build 99 | if: github.event_name == 'release' 100 | environment: 101 | name: 'NuGet' 102 | url: https://www.nuget.org/packages/InfinityCrawler 103 | runs-on: ubuntu-latest 104 | steps: 105 | - name: 'Download build' 106 | uses: actions/download-artifact@v2 107 | with: 108 | name: 'ubuntu-latest' 109 | - name: 'Upload NuGet package and symbols' 110 | run: dotnet nuget push *.nupkg --source https://api.nuget.org/v3/index.json --skip-duplicate --api-key ${{secrets.NUGET_API_KEY}} 111 | -------------------------------------------------------------------------------- /.github/workflows/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | update_release_draft: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: 
release-drafter/release-drafter@v5 13 | env: 14 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | 
*.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. 
Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln.iml 293 | 294 | # CodeRush 295 | .cr/ 296 | 297 | # Python Tools for Visual Studio (PTVS) 298 | __pycache__/ 299 | *.pyc 300 | 301 | # Cake - Uncomment if you are using it 302 | # tools/** 303 | # !tools/packages.config 304 | 305 | # Tabs Studio 306 | *.tss 307 | 308 | # Telerik's JustMock configuration file 309 | *.jmconfig 310 | 311 | # BizTalk build output 312 | *.btp.cs 313 | *.btm.cs 314 | *.odx.cs 315 | *.xsd.cs 316 | 317 | # OpenCover UI analysis results 318 | OpenCover/ 319 | 320 | # Azure Stream Analytics local run output 321 | ASALocalRun/ 322 | 323 | # MSBuild Binary and Structured Log 324 | *.binlog 325 | 326 | # NVidia Nsight GPU debugger configuration file 327 | *.nvuser 328 | 329 | # MFractors 
(Xamarin productivity tool) working folder 330 | .mfractor/ 331 | 332 | /build-artifacts -------------------------------------------------------------------------------- /CodeCoverage.runsettings: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | cobertura 8 | [InfinityCrawler.Tests]* 9 | [InfinityCrawler]* 10 | Obsolete,GeneratedCodeAttribute,CompilerGeneratedAttribute 11 | true 12 | true 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /InfinityCrawler.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.0.31808.319 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "global", "global", "{F0B4D082-200A-4DD3-9291-872B7F2A991E}" 7 | ProjectSection(SolutionItems) = preProject 8 | .codecov.yml = .codecov.yml 9 | .editorconfig = .editorconfig 10 | .gitignore = .gitignore 11 | CodeCoverage.runsettings = CodeCoverage.runsettings 12 | License.txt = License.txt 13 | README.md = README.md 14 | EndProjectSection 15 | EndProject 16 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C6187826-9F4B-4E85-90D1-BC46A0F7F8F1}" 17 | ProjectSection(SolutionItems) = preProject 18 | src\Directory.build.props = src\Directory.build.props 19 | EndProjectSection 20 | EndProject 21 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{46BF0980-A8A4-492E-8652-0725ADB6A683}" 22 | EndProject 23 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler", "src\InfinityCrawler\InfinityCrawler.csproj", "{90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}" 24 | EndProject 25 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler.Tests", "tests\InfinityCrawler.Tests\InfinityCrawler.Tests.csproj", 
"{F30AF2A4-C53F-40FE-8083-6A82C0583255}" 26 | EndProject 27 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler.Tests.TestSite", "tests\InfinityCrawler.Tests.TestSite\InfinityCrawler.Tests.TestSite.csproj", "{483B6FC9-98E7-4BD4-BA09-80DF504E31B2}" 28 | EndProject 29 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "InfinityCrawler.Tests.Benchmarks", "tests\InfinityCrawler.Tests.Benchmarks\InfinityCrawler.Tests.Benchmarks.csproj", "{F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}" 30 | EndProject 31 | Global 32 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 33 | Debug|Any CPU = Debug|Any CPU 34 | Release|Any CPU = Release|Any CPU 35 | EndGlobalSection 36 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 37 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 38 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Debug|Any CPU.Build.0 = Debug|Any CPU 39 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Release|Any CPU.ActiveCfg = Release|Any CPU 40 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565}.Release|Any CPU.Build.0 = Release|Any CPU 41 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 42 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Debug|Any CPU.Build.0 = Debug|Any CPU 43 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Release|Any CPU.ActiveCfg = Release|Any CPU 44 | {F30AF2A4-C53F-40FE-8083-6A82C0583255}.Release|Any CPU.Build.0 = Release|Any CPU 45 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 46 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Debug|Any CPU.Build.0 = Debug|Any CPU 47 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Release|Any CPU.ActiveCfg = Release|Any CPU 48 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2}.Release|Any CPU.Build.0 = Release|Any CPU 49 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 50 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Debug|Any CPU.Build.0 = Debug|Any CPU 51 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Release|Any 
CPU.ActiveCfg = Release|Any CPU 52 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97}.Release|Any CPU.Build.0 = Release|Any CPU 53 | EndGlobalSection 54 | GlobalSection(SolutionProperties) = preSolution 55 | HideSolutionNode = FALSE 56 | EndGlobalSection 57 | GlobalSection(NestedProjects) = preSolution 58 | {90361E0D-CB4C-4BCC-AAF2-70DAF87D5565} = {C6187826-9F4B-4E85-90D1-BC46A0F7F8F1} 59 | {F30AF2A4-C53F-40FE-8083-6A82C0583255} = {46BF0980-A8A4-492E-8652-0725ADB6A683} 60 | {483B6FC9-98E7-4BD4-BA09-80DF504E31B2} = {46BF0980-A8A4-492E-8652-0725ADB6A683} 61 | {F17C5CAC-DF32-434B-A4C5-21CBDDA86B97} = {46BF0980-A8A4-492E-8652-0725ADB6A683} 62 | EndGlobalSection 63 | GlobalSection(ExtensibilityGlobals) = postSolution 64 | SolutionGuid = {FC9AB8BE-670B-4F26-9F17-73D2C0DECA6A} 65 | EndGlobalSection 66 | EndGlobal 67 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Turner Software 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ![Icon](images/icon.png) 4 | # Infinity Crawler 5 | A simple but powerful web crawler library for .NET 6 | 7 | ![Build](https://img.shields.io/github/workflow/status/TurnerSoftware/infinitycrawler/Build) 8 | [![Codecov](https://img.shields.io/codecov/c/github/turnersoftware/infinitycrawler/main.svg)](https://codecov.io/gh/TurnerSoftware/infinitycrawler) 9 | [![NuGet](https://img.shields.io/nuget/v/InfinityCrawler.svg)](https://www.nuget.org/packages/InfinityCrawler) 10 |
11 | 12 | ## Features 13 | - Obeys robots.txt (crawl delay & allow/disallow) 14 | - Obeys in-page robots rules (`X-Robots-Tag` header and `` tag) 15 | - Uses sitemap.xml to seed the initial crawl of the site 16 | - Built around a parallel task `async`/`await` system 17 | - Swappable request and content processors, allowing greater customisation 18 | - Auto-throttling (see below) 19 | 20 | ## Licensing and Support 21 | 22 | Infinity Crawler is licensed under the MIT license. It is free to use in personal and commercial projects. 23 | 24 | There are [support plans](https://turnersoftware.com.au/support-plans) available that cover all active [Turner Software OSS projects](https://github.com/TurnerSoftware). 25 | Support plans provide private email support, expert usage advice for our projects, priority bug fixes and more. 26 | These support plans help fund our OSS commitments to provide better software for everyone. 27 | 28 | ## Polite Crawling 29 | The crawler is built around fast but "polite" crawling of website. 30 | This is accomplished through a number of settings that allow adjustments of delays and throttles. 
31 | 32 | You can control: 33 | - Number of simulatenous requests 34 | - The delay between requests starting (Note: If a `crawl-delay` is defined for the User-agent, that will be the minimum) 35 | - Artificial "jitter" in request delays (requests seem less "robotic") 36 | - Timeout for a request before throttling will apply for new requests 37 | - Throttling request backoff: The amount of time added to the delay to throttle requests (this is cumulative) 38 | - Minimum number of requests under the throttle timeout before the throttle is gradually removed 39 | 40 | ## Other Settings 41 | - Control the UserAgent used in the crawling process 42 | - Set additional host aliases you want the crawling process to follow (for example, subdomains) 43 | - The max number of retries for a specific URI 44 | - The max number of redirects to follow 45 | - The max number of pages to crawl 46 | 47 | ## Example Usage 48 | ```csharp 49 | using InfinityCrawler; 50 | 51 | var crawler = new Crawler(); 52 | var result = await crawler.Crawl(new Uri("http://example.org/"), new CrawlSettings { 53 | UserAgent = "MyVeryOwnWebCrawler/1.0", 54 | RequestProcessorOptions = new RequestProcessorOptions 55 | { 56 | MaxNumberOfSimultaneousRequests = 5 57 | } 58 | }); 59 | ``` -------------------------------------------------------------------------------- /images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TurnerSoftware/InfinityCrawler/4b56f68b5ea90afb9b711224a053ce658f03ac3b/images/icon.png -------------------------------------------------------------------------------- /src/Directory.Build.props: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | InfinityCrawler 5 | 6 | Turner Software 7 | 8 | $(AssemblyName) 9 | true 10 | MIT 11 | icon.png 12 | https://github.com/TurnerSoftware/InfinityCrawler 13 | crawler;robot;spider 14 | 15 | 16 | true 17 | true 18 | embedded 19 | 20 | 
Latest 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/InfinityCrawler/CrawlLink.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace InfinityCrawler 6 | { 7 | public class CrawlLink 8 | { 9 | public Uri Location { get; set; } 10 | public string Title { get; set; } 11 | public string Text { get; set; } 12 | public string Relationship { get; set; } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/InfinityCrawler/CrawlResult.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace InfinityCrawler 6 | { 7 | public class CrawlResult 8 | { 9 | public DateTime CrawlStart { get; set; } 10 | public TimeSpan ElapsedTime { get; set; } 11 | public IEnumerable CrawledUris { get; set; } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/InfinityCrawler/CrawlSettings.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Net.Http; 4 | using System.Text; 5 | using InfinityCrawler.Processing.Content; 6 | using InfinityCrawler.Processing.Requests; 7 | 8 | namespace InfinityCrawler 9 | { 10 | public class CrawlSettings 11 | { 12 | public string UserAgent { get; set; } = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201"; 13 | public IEnumerable HostAliases { get; set; } 14 | public int NumberOfRetries { get; set; } = 3; 15 | public int MaxNumberOfRedirects { get; set; } = 3; 16 | public int MaxNumberOfPagesToCrawl { get; set; } 17 | 18 | public IContentProcessor ContentProcessor { get; set; } = new 
DefaultContentProcessor(); 19 | public IRequestProcessor RequestProcessor { get; set; } = new DefaultRequestProcessor(); 20 | public RequestProcessorOptions RequestProcessorOptions { get; set; } = new RequestProcessorOptions(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/InfinityCrawler/CrawledUri.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Net; 6 | using System.Text; 7 | 8 | namespace InfinityCrawler 9 | { 10 | public class CrawledUri 11 | { 12 | public Uri Location { get; set; } 13 | 14 | public CrawlStatus Status { get; set; } 15 | 16 | public IList RedirectChain { get; set; } 17 | public IList Requests { get; set; } 18 | 19 | public CrawledContent Content { get; set; } 20 | } 21 | 22 | public enum CrawlStatus 23 | { 24 | Crawled, 25 | RobotsBlocked, 26 | MaxRetries, 27 | MaxRedirects 28 | } 29 | 30 | public class CrawledUriRedirect 31 | { 32 | public Uri Location { get; set; } 33 | public IList Requests { get; set; } 34 | } 35 | 36 | public class CrawlRequest 37 | { 38 | public DateTime RequestStart { get; set; } 39 | public TimeSpan ElapsedTime { get; set; } 40 | public HttpStatusCode? 
StatusCode { get; set; } 41 | public bool IsSuccessfulStatus { get; set; } 42 | } 43 | 44 | public class CrawledContent 45 | { 46 | public string ContentType { get; set; } 47 | public string CharacterSet { get; set; } 48 | public string ContentEncoding { get; set; } 49 | 50 | public IEnumerable PageRobotRules { get; set; } 51 | 52 | public string RawContent { get; set; } 53 | 54 | public Uri CanonicalUri { get; set; } 55 | public IEnumerable Links { get; set; } = Enumerable.Empty(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Crawler.cs: -------------------------------------------------------------------------------- 1 | using InfinityCrawler.Internal; 2 | using System; 3 | using System.Collections.Concurrent; 4 | using System.Collections.Generic; 5 | using System.Diagnostics; 6 | using System.IO; 7 | using System.Linq; 8 | using System.Net.Http; 9 | using System.Threading; 10 | using System.Threading.Tasks; 11 | using InfinityCrawler.Processing.Requests; 12 | using TurnerSoftware.RobotsExclusionTools; 13 | using TurnerSoftware.SitemapTools; 14 | using Microsoft.Extensions.Logging; 15 | using InfinityCrawler.Processing.Content; 16 | 17 | namespace InfinityCrawler 18 | { 19 | public class Crawler 20 | { 21 | private HttpClient HttpClient { get; } 22 | private ILogger Logger { get; } 23 | 24 | public Crawler() 25 | { 26 | HttpClient = new HttpClient(new HttpClientHandler 27 | { 28 | AllowAutoRedirect = false, 29 | UseCookies = false 30 | }); 31 | } 32 | 33 | public Crawler(HttpClient httpClient, ILogger logger = null) 34 | { 35 | HttpClient = httpClient ?? 
throw new ArgumentNullException(nameof(httpClient)); 36 | Logger = logger; 37 | } 38 | 39 | public async Task Crawl(Uri siteUri, CrawlSettings settings) 40 | { 41 | var result = new CrawlResult 42 | { 43 | CrawlStart = DateTime.UtcNow 44 | }; 45 | var overallCrawlStopwatch = new Stopwatch(); 46 | overallCrawlStopwatch.Start(); 47 | 48 | var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority)); 49 | var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri); 50 | 51 | UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions); 52 | 53 | var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger); 54 | 55 | //Use any links referred to by the sitemap as a starting point 56 | var urisFromSitemap = (await new SitemapQuery(HttpClient) 57 | .GetAllSitemapsForDomainAsync(siteUri.Host)) 58 | .SelectMany(s => s.Urls.Select(u => u.Location).Distinct()); 59 | foreach (var uri in urisFromSitemap) 60 | { 61 | crawlRunner.AddRequest(uri); 62 | } 63 | 64 | result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) => 65 | { 66 | using (requestResult.Content) 67 | { 68 | var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders); 69 | var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content); 70 | requestResult.Content.Seek(0, SeekOrigin.Begin); 71 | content.RawContent = await new StreamReader(requestResult.Content).ReadToEndAsync(); 72 | crawlRunner.AddResult(crawlState.Location, content); 73 | } 74 | }); 75 | 76 | overallCrawlStopwatch.Stop(); 77 | result.ElapsedTime = overallCrawlStopwatch.Elapsed; 78 | return result; 79 | } 80 | 81 | private void UpdateCrawlDelay(RobotsFile robotsFile, string userAgent, RequestProcessorOptions requestProcessorOptions) 82 | { 83 | var minimumCrawlDelayInMilliseconds = 0; 84 | 85 | //Apply Robots.txt crawl-delay (if defined) 86 | if 
(robotsFile.TryGetEntryForUserAgent(userAgent, out var accessEntry)) 87 | { 88 | /* Robots.txt crawl-delay is expressed in seconds; convert to milliseconds. BUG FIX: the previous expression 'accessEntry.CrawlDelay ?? 0 * 1000' bound as 'CrawlDelay ?? (0 * 1000)' because '*' has higher precedence than '??', so the crawl-delay was treated as milliseconds and effectively ignored. */ minimumCrawlDelayInMilliseconds = (accessEntry.CrawlDelay ?? 0) * 1000; 89 | } 90 | 91 | /* Honour whichever delay is larger: the site's robots.txt crawl-delay or the caller-configured delay. */ var taskDelay = Math.Max(minimumCrawlDelayInMilliseconds, requestProcessorOptions.DelayBetweenRequestStart.TotalMilliseconds); 92 | requestProcessorOptions.DelayBetweenRequestStart = new TimeSpan(0, 0, 0, 0, (int)taskDelay); 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/InfinityCrawler/InfinityCrawler.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netstandard2.0;net6.0 5 | InfinityCrawler 6 | A simple but powerful web crawler library 7 | $(PackageBaseTags) 8 | James Turner 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Internal/CrawlRunner.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Concurrent; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Net; 6 | using System.Net.Http; 7 | using System.Text; 8 | using System.Threading; 9 | using System.Threading.Tasks; 10 | using InfinityCrawler.Processing.Requests; 11 | using Microsoft.Extensions.Logging; 12 | using TurnerSoftware.RobotsExclusionTools; 13 | 14 | namespace InfinityCrawler.Internal 15 | { 16 | internal class CrawlRunner 17 | { 18 | public Uri BaseUri { get; } 19 | public CrawlSettings Settings { get; } 20 | 21 | private RobotsFile RobotsFile { get; } 22 | private HttpClient HttpClient { get; } 23 | 24 | private ILogger Logger { get; } 25 | 26 | private RobotsPageParser RobotsPageParser { get; } 27 | 28 | private ConcurrentDictionary UriCrawlStates { get; } = new ConcurrentDictionary(); 29 | private ConcurrentDictionary SeenUris { get; } = new
ConcurrentDictionary(); 30 | private ConcurrentBag CrawledUris { get; } = new ConcurrentBag(); 31 | 32 | public CrawlRunner(Uri baseUri, RobotsFile robotsFile, HttpClient httpClient, CrawlSettings crawlSettings, ILogger logger = null) 33 | { 34 | BaseUri = baseUri; 35 | RobotsFile = robotsFile; 36 | HttpClient = httpClient; 37 | Settings = crawlSettings; 38 | 39 | Logger = logger; 40 | RobotsPageParser = new RobotsPageParser(); 41 | 42 | AddRequest(baseUri); 43 | } 44 | 45 | private Uri StripFragment(Uri uri) 46 | { 47 | return new UriBuilder(uri) 48 | { 49 | Fragment = null 50 | }.Uri; 51 | } 52 | 53 | private void AddLink(CrawlLink crawlLink) 54 | { 55 | if (crawlLink.Relationship != null && crawlLink.Relationship.Equals("nofollow", StringComparison.InvariantCultureIgnoreCase)) 56 | { 57 | return; 58 | } 59 | 60 | var uriWithoutFragment = StripFragment(crawlLink.Location); 61 | if (SeenUris.ContainsKey(uriWithoutFragment)) 62 | { 63 | return; 64 | } 65 | 66 | AddRequest(uriWithoutFragment, false); 67 | } 68 | 69 | private void AddRedirect(Uri requestUri, Uri redirectUri) 70 | { 71 | if (UriCrawlStates.TryRemove(requestUri, out var crawlState)) 72 | { 73 | var absoluteRedirectUri = new Uri(requestUri, redirectUri); 74 | absoluteRedirectUri = StripFragment(absoluteRedirectUri); 75 | 76 | var redirectCrawlState = new UriCrawlState 77 | { 78 | Location = absoluteRedirectUri, 79 | Redirects = crawlState.Redirects ?? 
new List() 80 | }; 81 | redirectCrawlState.Redirects.Add(new CrawledUriRedirect 82 | { 83 | Location = crawlState.Location, 84 | Requests = crawlState.Requests 85 | }); 86 | 87 | UriCrawlStates.TryAdd(redirectCrawlState.Location, redirectCrawlState); 88 | AddRequest(redirectCrawlState.Location, true); 89 | } 90 | } 91 | 92 | public void AddResult(Uri requestUri, CrawledContent content) 93 | { 94 | if (UriCrawlStates.TryGetValue(requestUri, out var crawlState)) 95 | { 96 | var robotsPageDefinition = RobotsPageParser.FromRules(content.PageRobotRules); 97 | if (!robotsPageDefinition.CanIndex(Settings.UserAgent)) 98 | { 99 | Logger?.LogDebug($"Result content for {requestUri} has been blocked by an in-page Robots rule."); 100 | AddResult(new CrawledUri 101 | { 102 | Location = crawlState.Location, 103 | Status = CrawlStatus.RobotsBlocked, 104 | Requests = crawlState.Requests, 105 | RedirectChain = crawlState.Redirects 106 | }); 107 | } 108 | else 109 | { 110 | Logger?.LogDebug($"Result for {requestUri} has completed successfully with content."); 111 | 112 | AddResult(new CrawledUri 113 | { 114 | Location = crawlState.Location, 115 | Status = CrawlStatus.Crawled, 116 | RedirectChain = crawlState.Redirects, 117 | Requests = crawlState.Requests, 118 | Content = content 119 | }); 120 | 121 | if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent)) 122 | { 123 | foreach (var crawlLink in content.Links) 124 | { 125 | AddLink(crawlLink); 126 | } 127 | } 128 | } 129 | } 130 | } 131 | 132 | public void AddRequest(Uri requestUri) 133 | { 134 | var uriWithoutFragment = StripFragment(requestUri); 135 | AddRequest(uriWithoutFragment, false); 136 | } 137 | 138 | private void AddRequest(Uri requestUri, bool skipMaxPageCheck) 139 | { 140 | if (Settings.HostAliases != null) 141 | { 142 | if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host))) 143 | { 144 | Logger?.LogDebug($"Request containing host {requestUri.Host} is not in the list of allowed 
hosts. This request will be ignored."); 145 | return; 146 | } 147 | } 148 | else if (requestUri.Host != BaseUri.Host) 149 | { 150 | Logger?.LogDebug($"Request containing host {requestUri.Host} doesn't match the base host. This request will be ignored."); 151 | return; 152 | } 153 | 154 | if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0) 155 | { 156 | var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests; 157 | /* BUG FIX: use >= instead of ==. Redirect requests bypass this check (skipMaxPageCheck), so the count can jump past the limit; an equality test would then never fire and the crawl would be unbounded. */ if (expectedCrawlCount >= Settings.MaxNumberOfPagesToCrawl) 158 | { 159 | Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}. This request will be ignored."); 160 | return; 161 | } 162 | } 163 | 164 | SeenUris.TryAdd(requestUri, 0); 165 | 166 | if (UriCrawlStates.TryGetValue(requestUri, out var crawlState)) 167 | { 168 | var lastRequest = crawlState.Requests.LastOrDefault(); 169 | if (lastRequest != null && lastRequest.IsSuccessfulStatus) 170 | { 171 | return; 172 | } 173 | 174 | /* >= rather than == so an overshoot of the retry count can never slip past the limit. */ if (crawlState.Requests.Count() >= Settings.NumberOfRetries) 175 | { 176 | Logger?.LogDebug($"Request for {requestUri} has hit the maximum retry limit ({Settings.NumberOfRetries})."); 177 | AddResult(new CrawledUri 178 | { 179 | Location = crawlState.Location, 180 | Status = CrawlStatus.MaxRetries, 181 | Requests = crawlState.Requests, 182 | RedirectChain = crawlState.Redirects 183 | }); 184 | return; 185 | } 186 | 187 | /* >= rather than == for the same overshoot-safety reason as the retry limit above. */ if (crawlState.Redirects != null && crawlState.Redirects.Count >= Settings.MaxNumberOfRedirects) 188 | { 189 | Logger?.LogDebug($"Request for {requestUri} has hit the maximum redirect limit ({Settings.MaxNumberOfRedirects})."); 190 | AddResult(new CrawledUri 191 | { 192 | Location = crawlState.Location, 193 | RedirectChain = crawlState.Redirects, 194 | Status = CrawlStatus.MaxRedirects 195 | }); 196 | return; 197 | } 198 | } 199 | 200 | if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent)) 201 | { 202 | Logger?.LogDebug($"Added {requestUri} to request queue."); 203 |
Settings.RequestProcessor.Add(requestUri); 204 | } 205 | else 206 | { 207 | Logger?.LogDebug($"Request for {requestUri} has been blocked by the Robots.txt file."); 208 | AddResult(new CrawledUri 209 | { 210 | Location = requestUri, 211 | Status = CrawlStatus.RobotsBlocked 212 | }); 213 | } 214 | } 215 | 216 | private void AddResult(CrawledUri result) 217 | { 218 | CrawledUris.Add(result); 219 | } 220 | 221 | public async Task> ProcessAsync( 222 | Func responseSuccessAction, 223 | CancellationToken cancellationToken = default 224 | ) 225 | { 226 | await Settings.RequestProcessor.ProcessAsync( 227 | HttpClient, 228 | async (requestResult) => 229 | { 230 | var crawlState = UriCrawlStates.GetOrAdd(requestResult.RequestUri, new UriCrawlState 231 | { 232 | Location = requestResult.RequestUri 233 | }); 234 | 235 | if (requestResult.Exception != null) 236 | { 237 | //Retry failed requests 238 | Logger?.LogDebug($"An exception occurred while requesting {crawlState.Location}. This URL will be added to the request queue to be attempted again later."); 239 | crawlState.Requests.Add(new CrawlRequest 240 | { 241 | RequestStart = requestResult.RequestStart, 242 | ElapsedTime = requestResult.ElapsedTime 243 | }); 244 | AddRequest(requestResult.RequestUri); 245 | } 246 | else 247 | { 248 | var crawlRequest = new CrawlRequest 249 | { 250 | RequestStart = requestResult.RequestStart, 251 | ElapsedTime = requestResult.ElapsedTime, 252 | StatusCode = requestResult.StatusCode, 253 | IsSuccessfulStatus = (int)requestResult.StatusCode is >= 200 and <= 299 254 | }; 255 | crawlState.Requests.Add(crawlRequest); 256 | 257 | var redirectStatusCodes = new[] 258 | { 259 | HttpStatusCode.MovedPermanently, 260 | HttpStatusCode.Redirect, 261 | HttpStatusCode.TemporaryRedirect 262 | }; 263 | if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value)) 264 | { 265 | Logger?.LogDebug($"Result for {crawlState.Location} was a redirect ({requestResult.ResponseHeaders.Location}). 
This URL will be added to the request queue."); 266 | AddRedirect(crawlState.Location, requestResult.ResponseHeaders.Location); 267 | } 268 | else if (crawlRequest.IsSuccessfulStatus) 269 | { 270 | await responseSuccessAction(requestResult, crawlState); 271 | } 272 | else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599) 273 | { 274 | //On server errors, try to crawl the page again later 275 | Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). This URL will be added to the request queue to be attempted again later."); 276 | AddRequest(crawlState.Location); 277 | } 278 | else 279 | { 280 | //On any other error, just save what we have seen and move on 281 | //Consider the content of the request irrelevant 282 | Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). No further requests will be attempted."); 283 | AddResult(new CrawledUri 284 | { 285 | Location = crawlState.Location, 286 | Status = CrawlStatus.Crawled, 287 | RedirectChain = crawlState.Redirects, 288 | Requests = crawlState.Requests 289 | }); 290 | } 291 | } 292 | }, 293 | Settings.RequestProcessorOptions, 294 | cancellationToken 295 | ); 296 | 297 | Logger?.LogDebug($"Completed crawling {CrawledUris.Count} pages."); 298 | 299 | return CrawledUris.ToArray(); 300 | } 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Internal/UriExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace InfinityCrawler.Internal 6 | { 7 | internal static class UriExtensions 8 | { 9 | public static Uri BuildUriFromHref(this Uri pageUri, string href, string baseHref = null) 10 | { 11 | var hrefPieces = href.Split(new[] { '#' }, 2); 12 | var hrefWithoutFragment = hrefPieces[0]; 13 | var hrefFragment = 
hrefPieces.Length > 1 ? hrefPieces[1] : null; 14 | 15 | if (Uri.IsWellFormedUriString(hrefWithoutFragment, UriKind.RelativeOrAbsolute)) 16 | { 17 | var baseUri = pageUri; 18 | 19 | //Allows to work 20 | if (Uri.IsWellFormedUriString(baseHref, UriKind.RelativeOrAbsolute)) 21 | { 22 | baseUri = new Uri(pageUri, baseHref); 23 | } 24 | 25 | return new UriBuilder(new Uri(baseUri, hrefWithoutFragment)) 26 | { 27 | Fragment = hrefFragment 28 | }.Uri; 29 | } 30 | 31 | return null; 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Content/CrawlHeaders.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Net.Http.Headers; 4 | using System.Text; 5 | 6 | namespace InfinityCrawler.Processing.Content 7 | { 8 | public class CrawlHeaders 9 | { 10 | public HttpResponseHeaders ResponseHeaders { get; } 11 | public HttpContentHeaders ContentHeaders { get; } 12 | 13 | public CrawlHeaders(HttpResponseHeaders responseHeaders, HttpContentHeaders contentHeaders) 14 | { 15 | ResponseHeaders = responseHeaders; 16 | ContentHeaders = contentHeaders; 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Content/DefaultContentProcessor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Net.Http.Headers; 6 | using System.Text; 7 | using System.Threading.Tasks; 8 | using HtmlAgilityPack; 9 | using InfinityCrawler.Internal; 10 | 11 | namespace InfinityCrawler.Processing.Content 12 | { 13 | public class DefaultContentProcessor : IContentProcessor 14 | { 15 | public CrawledContent Parse(Uri requestUri, CrawlHeaders headers, Stream contentStream) 16 | { 17 | var crawledContent = 
new CrawledContent 18 | { 19 | ContentType = headers.ContentHeaders.ContentType?.MediaType, 20 | CharacterSet = headers.ContentHeaders.ContentType?.CharSet, 21 | ContentEncoding = headers.ContentHeaders.ContentEncoding != null ? string.Join(",", headers.ContentHeaders.ContentEncoding) : null 22 | }; 23 | 24 | var document = new HtmlDocument(); 25 | document.Load(contentStream); 26 | 27 | var pageRobotRules = new List(); 28 | if (headers.ResponseHeaders.Contains("X-Robots-Tag")) 29 | { 30 | var robotsHeaderValues = headers.ResponseHeaders.GetValues("X-Robots-Tag"); 31 | pageRobotRules.AddRange(robotsHeaderValues); 32 | } 33 | 34 | var metaNodes = document.DocumentNode.SelectNodes("html/head/meta"); 35 | if (metaNodes != null) 36 | { 37 | var robotsMetaValue = metaNodes 38 | .Where(n => n.Attributes.Any(a => a.Name == "name" && a.Value.Equals("robots", StringComparison.InvariantCultureIgnoreCase))) 39 | .SelectMany(n => n.Attributes.Where(a => a.Name == "content").Select(a => a.Value)) 40 | .FirstOrDefault(); 41 | if (robotsMetaValue != null) 42 | { 43 | pageRobotRules.Add(robotsMetaValue); 44 | } 45 | } 46 | 47 | crawledContent.PageRobotRules = pageRobotRules.ToArray(); 48 | crawledContent.CanonicalUri = GetCanonicalUri(document, requestUri); 49 | crawledContent.Links = GetLinks(document, requestUri).ToArray(); 50 | 51 | return crawledContent; 52 | } 53 | 54 | private string GetBaseHref(HtmlDocument document) 55 | { 56 | var baseNode = document.DocumentNode.SelectSingleNode("html/head/base"); 57 | return baseNode?.GetAttributeValue("href", string.Empty) ?? 
string.Empty; 58 | } 59 | 60 | private Uri GetCanonicalUri(HtmlDocument document, Uri requestUri) 61 | { 62 | var linkNodes = document.DocumentNode.SelectNodes("html/head/link"); 63 | if (linkNodes != null) 64 | { 65 | var canonicalNode = linkNodes 66 | .Where(n => n.Attributes.Any(a => a.Name == "rel" && a.Value.Equals("canonical", StringComparison.InvariantCultureIgnoreCase))) 67 | .FirstOrDefault(); 68 | if (canonicalNode != null) 69 | { 70 | var baseHref = GetBaseHref(document); 71 | var canonicalHref = canonicalNode.GetAttributeValue("href", null); 72 | return requestUri.BuildUriFromHref(canonicalHref, baseHref); 73 | } 74 | } 75 | 76 | return null; 77 | } 78 | 79 | private IEnumerable GetLinks(HtmlDocument document, Uri requestUri) 80 | { 81 | var anchorNodes = document.DocumentNode.SelectNodes("//a"); 82 | if (anchorNodes != null) 83 | { 84 | var baseHref = GetBaseHref(document); 85 | 86 | foreach (var anchor in anchorNodes) 87 | { 88 | var href = anchor.GetAttributeValue("href", null); 89 | if (href == null) 90 | { 91 | continue; 92 | } 93 | 94 | var anchorLocation = requestUri.BuildUriFromHref(href, baseHref); 95 | if (anchorLocation == null) 96 | { 97 | //Invalid links are ignored 98 | continue; 99 | } 100 | 101 | if (anchorLocation.Scheme != Uri.UriSchemeHttp && anchorLocation.Scheme != Uri.UriSchemeHttps) 102 | { 103 | //Skip non-HTTP links 104 | continue; 105 | } 106 | 107 | yield return new CrawlLink 108 | { 109 | Location = anchorLocation, 110 | Title = anchor.GetAttributeValue("title", null), 111 | Text = anchor.InnerText, 112 | Relationship = anchor.GetAttributeValue("rel", null), 113 | }; 114 | } 115 | } 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Content/IContentProcessor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using 
System.Net.Http.Headers; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace InfinityCrawler.Processing.Content 9 | { 10 | public interface IContentProcessor 11 | { 12 | CrawledContent Parse(Uri requestUri, CrawlHeaders headers, Stream contentStream); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Requests/DefaultRequestProcessor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Concurrent; 3 | using System.Collections.Generic; 4 | using System.Diagnostics; 5 | using System.IO; 6 | using System.Linq; 7 | using System.Net.Http; 8 | using System.Runtime.ExceptionServices; 9 | using System.Text; 10 | using System.Threading; 11 | using System.Threading.Tasks; 12 | using Microsoft.Extensions.Logging; 13 | 14 | namespace InfinityCrawler.Processing.Requests 15 | { 16 | public class DefaultRequestProcessor : IRequestProcessor 17 | { 18 | private ILogger Logger { get; } 19 | private ConcurrentQueue RequestQueue { get; } = new ConcurrentQueue(); 20 | 21 | public DefaultRequestProcessor(ILogger logger = null) 22 | { 23 | Logger = logger; 24 | } 25 | 26 | public void Add(Uri uri) 27 | { 28 | RequestQueue.Enqueue(uri); 29 | PendingRequests++; 30 | } 31 | 32 | public int PendingRequests { get; private set; } 33 | 34 | public async Task ProcessAsync(HttpClient httpClient, Func responseAction, RequestProcessorOptions options, CancellationToken cancellationToken = default) 35 | { 36 | if (options == null) 37 | { 38 | throw new ArgumentNullException(nameof(options)); 39 | } 40 | 41 | var random = new Random(); 42 | var activeRequests = new ConcurrentDictionary, RequestContext>(options.MaxNumberOfSimultaneousRequests, options.MaxNumberOfSimultaneousRequests); 43 | 44 | var currentBackoff = 0; 45 | var successesSinceLastThrottle = 0; 46 | var requestCount = 0; 47 | 48 | while (activeRequests.Count 
> 0 || !RequestQueue.IsEmpty) 49 | { 50 | cancellationToken.ThrowIfCancellationRequested(); 51 | 52 | while (!RequestQueue.IsEmpty) 53 | { 54 | cancellationToken.ThrowIfCancellationRequested(); 55 | 56 | if (RequestQueue.TryDequeue(out var requestUri)) 57 | { 58 | var requestStartDelay = 0d; 59 | //Request delaying and backoff 60 | if (options.DelayBetweenRequestStart.TotalMilliseconds > 0) 61 | { 62 | requestStartDelay = options.DelayBetweenRequestStart.TotalMilliseconds; 63 | requestStartDelay += random.NextDouble() * options.DelayJitter.TotalMilliseconds; 64 | } 65 | 66 | requestStartDelay += currentBackoff; 67 | 68 | var requestContext = new RequestContext 69 | { 70 | RequestNumber = requestCount + 1, 71 | RequestUri = requestUri, 72 | Timer = new Stopwatch(), 73 | RequestStartDelay = requestStartDelay, 74 | RequestTimeout = options.RequestTimeout, 75 | CancellationToken = cancellationToken 76 | }; 77 | 78 | Logger?.LogDebug($"Request #{requestContext.RequestNumber} ({requestUri}) starting with a {requestStartDelay}ms delay."); 79 | 80 | var task = PerformRequestAsync(httpClient, requestContext); 81 | 82 | activeRequests.TryAdd(task, requestContext); 83 | requestCount++; 84 | 85 | if (activeRequests.Count == options.MaxNumberOfSimultaneousRequests) 86 | { 87 | break; 88 | } 89 | } 90 | } 91 | 92 | await Task.WhenAny(activeRequests.Keys).ConfigureAwait(false); 93 | 94 | cancellationToken.ThrowIfCancellationRequested(); 95 | 96 | var completedRequests = activeRequests.Keys.Where(t => t.IsCompleted); 97 | foreach (var completedRequest in completedRequests) 98 | { 99 | activeRequests.TryRemove(completedRequest, out var requestContext); 100 | PendingRequests--; 101 | 102 | if (completedRequest.IsFaulted) 103 | { 104 | var aggregateException = completedRequest.Exception; 105 | 106 | //Keep the existing stack trace when re-throwing 107 | ExceptionDispatchInfo.Capture(aggregateException.InnerException).Throw(); 108 | } 109 | 110 | await 
responseAction(completedRequest.Result); 111 | 112 | //Manage the throttling based on timeouts and successes 113 | var throttlePoint = options.TimeoutBeforeThrottle; 114 | if (throttlePoint.TotalMilliseconds > 0 && requestContext.Timer.Elapsed > throttlePoint) 115 | { 116 | successesSinceLastThrottle = 0; 117 | currentBackoff += (int)options.ThrottlingRequestBackoff.TotalMilliseconds; 118 | Logger?.LogInformation($"Increased backoff to {currentBackoff}ms."); 119 | } 120 | else if (currentBackoff > 0) 121 | { 122 | successesSinceLastThrottle += 1; 123 | if (successesSinceLastThrottle == options.MinSequentialSuccessesToMinimiseThrottling) 124 | { 125 | var newBackoff = currentBackoff - options.ThrottlingRequestBackoff.TotalMilliseconds; 126 | currentBackoff = Math.Max(0, (int)newBackoff); 127 | successesSinceLastThrottle = 0; 128 | Logger?.LogInformation($"Decreased backoff to {currentBackoff}ms."); 129 | } 130 | } 131 | } 132 | } 133 | 134 | Logger?.LogDebug($"Completed processing {requestCount} requests."); 135 | } 136 | 137 | private async Task PerformRequestAsync(HttpClient httpClient, RequestContext context) 138 | { 139 | if (context.RequestStartDelay > 0) 140 | { 141 | await Task.Delay((int)context.RequestStartDelay); 142 | } 143 | 144 | var requestStart = DateTime.UtcNow; 145 | context.Timer.Start(); 146 | 147 | try 148 | { 149 | /* BUG FIX: both CancellationTokenSource instances were previously created inline and never disposed — the timeout CTS leaks its internal timer and the linked CTS leaks its registration on the caller's long-lived token. 'using var' disposes both when the try block exits. */ using var timeoutTokenSource = new CancellationTokenSource(context.RequestTimeout); 150 | using var combinedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken, timeoutTokenSource.Token); var combinedToken = combinedTokenSource.Token; 151 | using (var response = await httpClient.GetAsync(context.RequestUri, combinedToken)) 152 | { 153 | var contentStream = new MemoryStream(); 154 | await response.Content.CopyToAsync(contentStream); 155 | contentStream.Seek(0, SeekOrigin.Begin); 156 | 157 | //We only want to time the request, not the handling of the response 158 | context.Timer.Stop(); 159 | 160 | context.CancellationToken.ThrowIfCancellationRequested(); 161 | 162 |
Logger?.LogDebug($"Request #{context.RequestNumber} completed successfully in {context.Timer.ElapsedMilliseconds}ms."); 163 | 164 | return new RequestResult 165 | { 166 | RequestUri = context.RequestUri, 167 | RequestStart = requestStart, 168 | RequestStartDelay = context.RequestStartDelay, 169 | StatusCode = response.StatusCode, 170 | ResponseHeaders = response.Headers, 171 | ContentHeaders = response.Content.Headers, 172 | Content = contentStream, 173 | ElapsedTime = context.Timer.Elapsed 174 | }; 175 | } 176 | } 177 | catch (OperationCanceledException) when (context.CancellationToken.IsCancellationRequested) 178 | { 179 | Logger?.LogDebug($"Request #{context.RequestNumber} cancelled."); 180 | return null; 181 | } 182 | catch (Exception ex) when (ex is HttpRequestException || ex is OperationCanceledException) 183 | { 184 | context.Timer.Stop(); 185 | 186 | Logger?.LogDebug($"Request #{context.RequestNumber} completed with error in {context.Timer.ElapsedMilliseconds}ms."); 187 | Logger?.LogTrace(ex, $"Request #{context.RequestNumber} Exception: {ex.Message}"); 188 | 189 | return new RequestResult 190 | { 191 | RequestUri = context.RequestUri, 192 | RequestStart = requestStart, 193 | RequestStartDelay = context.RequestStartDelay, 194 | ElapsedTime = context.Timer.Elapsed, 195 | Exception = ex 196 | }; 197 | } 198 | } 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Requests/IRequestProcessor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Concurrent; 3 | using System.Collections.Generic; 4 | using System.Net.Http; 5 | using System.Text; 6 | using System.Threading; 7 | using System.Threading.Tasks; 8 | 9 | namespace InfinityCrawler.Processing.Requests 10 | { 11 | public interface IRequestProcessor 12 | { 13 | void Add(Uri requestUri); 14 | 15 | int PendingRequests { get; } 16 | 17 | Task 
ProcessAsync( 18 | HttpClient httpClient, 19 | Func responseAction, 20 | RequestProcessorOptions options, 21 | CancellationToken cancellationToken = default 22 | ); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Requests/RequestContext.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Text; 5 | using System.Threading; 6 | 7 | namespace InfinityCrawler.Processing.Requests 8 | { 9 | public class RequestContext 10 | { 11 | public int RequestNumber { get; set; } 12 | public Uri RequestUri { get; set; } 13 | public Stopwatch Timer { get; set; } 14 | public double RequestStartDelay { get; set; } 15 | public TimeSpan RequestTimeout { get; set; } 16 | public CancellationToken CancellationToken { get; set; } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Requests/RequestProcessorOptions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace InfinityCrawler.Processing.Requests 6 | { 7 | public class RequestProcessorOptions 8 | { 9 | /// 10 | /// Maximum number of simultaneous asynchronous requests to run at once. 11 | /// 12 | public int MaxNumberOfSimultaneousRequests { get; set; } = 10; 13 | /// 14 | /// Delay between one request starting and the next. 15 | /// 16 | public TimeSpan DelayBetweenRequestStart { get; set; } = new TimeSpan(0, 0, 0, 0, 1000); 17 | /// 18 | /// Maximum jitter applied to a request delay. 19 | /// 20 | public TimeSpan DelayJitter { get; set; } = new TimeSpan(0, 0, 0, 0, 1000); 21 | /// 22 | /// The request timeout length before throttling sets in. 
23 | /// 24 | public TimeSpan TimeoutBeforeThrottle { get; set; } = new TimeSpan(0, 0, 0, 0, 2500); 25 | /// 26 | /// The amount of throttling delay to add to subsequent requests. This is added every time the timeout is hit. 27 | /// 28 | public TimeSpan ThrottlingRequestBackoff { get; set; } = new TimeSpan(0, 0, 0, 5); 29 | /// 30 | /// Minimum number of requests below the timeout before minimising the applied throttling. 31 | /// 32 | public int MinSequentialSuccessesToMinimiseThrottling { get; set; } = 5; 33 | /// 34 | /// The amount of time before a request is cancelled and retried. 35 | /// 36 | public TimeSpan RequestTimeout { get; set; } = new TimeSpan(0, 0, 30); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/InfinityCrawler/Processing/Requests/RequestResult.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Net; 5 | using System.Net.Http; 6 | using System.Net.Http.Headers; 7 | using System.Text; 8 | 9 | namespace InfinityCrawler.Processing.Requests 10 | { 11 | public class RequestResult 12 | { 13 | public Uri RequestUri { get; set; } 14 | public DateTime RequestStart { get; set; } 15 | public double RequestStartDelay { get; set; } 16 | public HttpStatusCode? 
StatusCode { get; set; } 17 | public HttpResponseHeaders ResponseHeaders { get; set; } 18 | public HttpContentHeaders ContentHeaders { get; set; } 19 | public Stream Content { get; set; } 20 | public TimeSpan ElapsedTime { get; set; } 21 | public Exception Exception { get; set; } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/InfinityCrawler/UriCrawlState.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace InfinityCrawler 6 | { 7 | public class UriCrawlState 8 | { 9 | public Uri Location { get; set; } 10 | public IList Requests { get; set; } = new List(); 11 | public IList Redirects { get; set; } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.Benchmarks/BasicSiteCrawlBenchmark.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Threading.Tasks; 3 | using BenchmarkDotNet.Attributes; 4 | using BenchmarkDotNet.Jobs; 5 | using InfinityCrawler.Processing.Requests; 6 | using InfinityCrawler.Tests.TestSite; 7 | 8 | namespace InfinityCrawler.Tests.Benchmarks 9 | { 10 | [SimpleJob(RuntimeMoniker.Net60)] 11 | [MemoryDiagnoser] 12 | public class BasicSiteCrawlBenchmark 13 | { 14 | private TestSiteManager TestSite { get; } 15 | private Crawler Crawler { get; } 16 | private Uri Uri { get; } = new Uri("http://localhost/"); 17 | 18 | public BasicSiteCrawlBenchmark() 19 | { 20 | TestSite = new TestSiteManager(new SiteContext 21 | { 22 | SiteFolder = "BasicSite" 23 | }); 24 | 25 | var client = TestSite.GetHttpClient(); 26 | Crawler = new Crawler(client); 27 | } 28 | 29 | [GlobalSetup] 30 | public async Task Setup() 31 | { 32 | await CrawlSite(); // benchmark warmup as a workaround for https://github.com/dotnet/BenchmarkDotNet/issues/837 33 | } 34 | 
35 | [Benchmark] 36 | public async Task CrawlSite() 37 | { 38 | _ = await Crawler.Crawl(Uri, new CrawlSettings 39 | { 40 | RequestProcessorOptions = new RequestProcessorOptions 41 | { 42 | MaxNumberOfSimultaneousRequests = 5, 43 | DelayBetweenRequestStart = new TimeSpan(), 44 | DelayJitter = new TimeSpan(), 45 | TimeoutBeforeThrottle = new TimeSpan() 46 | } 47 | }); 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.Benchmarks/InfinityCrawler.Tests.Benchmarks.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net6.0 6 | false 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.Benchmarks/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using BenchmarkDotNet.Running; 3 | 4 | namespace InfinityCrawler.Tests.Benchmarks 5 | { 6 | class Program 7 | { 8 | static void Main(string[] args) 9 | { 10 | BenchmarkRunner.Run(); 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Controllers/HelperController.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Net; 4 | using System.Text; 5 | using System.Threading; 6 | using System.Threading.Tasks; 7 | using Microsoft.AspNetCore.Mvc; 8 | 9 | namespace InfinityCrawler.Tests.TestSite.Controllers 10 | { 11 | [Route("/")] 12 | public class HelperController : ControllerBase 13 | { 14 | private SiteContext Context { get; } 15 | 16 | public HelperController(SiteContext context) 17 | { 18 | Context = context; 19 | } 20 | 21 | [Route("delay/{delay}/{path}")] 22 | public async Task Delay(int delay, string path) 23 | { 
24 | await Task.Delay(delay); 25 | return new ContentResult 26 | { 27 | Content = path 28 | }; 29 | } 30 | 31 | [Route("status/{statusCode}")] 32 | public IActionResult ReturnError(HttpStatusCode statusCode) 33 | { 34 | return new ContentResult 35 | { 36 | StatusCode = (int)statusCode, 37 | Content = statusCode.ToString() 38 | }; 39 | } 40 | 41 | [Route("redirect/{depth}/{path}")] 42 | public IActionResult Redirect(int depth, string path) 43 | { 44 | if (depth <= 0) 45 | { 46 | return new ContentResult 47 | { 48 | Content = path 49 | }; 50 | } 51 | 52 | return RedirectToAction("Redirect", new { depth = depth - 1, path }); 53 | } 54 | 55 | [Route("sitemap.xml")] 56 | public IActionResult DynamicSitemap() 57 | { 58 | var defaultFile = "index.html"; 59 | 60 | if (!string.IsNullOrEmpty(Context.EntryPath)) 61 | { 62 | defaultFile = Context.EntryPath + defaultFile; 63 | } 64 | 65 | return new ContentResult 66 | { 67 | ContentType = "text/xml", 68 | Content = $@" 69 | 70 | 71 | http://localhost/{defaultFile} 72 | 73 | " 74 | }; 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Controllers/RobotsController.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Net; 4 | using System.Text; 5 | using Microsoft.AspNetCore.Mvc; 6 | 7 | namespace InfinityCrawler.Tests.TestSite.Controllers 8 | { 9 | [Route("/robots/")] 10 | public class RobotsController : ControllerBase 11 | { 12 | private string GetHtml(string path) 13 | { 14 | return $@" 15 | 16 | 17 | 18 | 19 | Test Path 20 | 21 | "; 22 | } 23 | 24 | private ContentResult GetResult(string path) 25 | { 26 | return new ContentResult 27 | { 28 | StatusCode = (int)HttpStatusCode.OK, 29 | ContentType = "text/html", 30 | Content = GetHtml(path) 31 | }; 32 | } 33 | 34 | [Route("header-page-noindex")] 35 | public IActionResult AllNoIndex() 
36 | { 37 | Response.Headers.Add("X-Robots-Tag", "noindex"); 38 | return GetResult("header-page-no-index"); 39 | } 40 | [Route("header-page-nofollow")] 41 | public IActionResult AllNoFollow() 42 | { 43 | Response.Headers.Add("X-Robots-Tag", "nofollow"); 44 | return GetResult("header-page-no-follow"); 45 | } 46 | [Route("header-page-none")] 47 | public IActionResult AllNone() 48 | { 49 | Response.Headers.Add("X-Robots-Tag", "none"); 50 | return GetResult("header-page-none"); 51 | } 52 | [Route("header-bot-specific")] 53 | public IActionResult BotSpecific() 54 | { 55 | Response.Headers.Add("X-Robots-Tag", new[] 56 | { 57 | "onebot: noindex", 58 | "twobot: nofollow" 59 | }); 60 | return GetResult("header-bot-specific"); 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/InfinityCrawler.Tests.TestSite.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | netstandard2.0;net6.0 5 | false 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | Always 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/basic-page.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Basic Page 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/index.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Index 5 | 6 | 7 | Basic HTML Page 8 | Robots Blocked Page 9 | Looping Links Page 10 | Index Page with Query String 11 | 500 Error Page 12 | 404 Error Page 13 | 403 Error Page 14 | 401 Error Page 15 | 305 Error Page 16 | Fragment Link 17 | Not-allowed External 
Site 18 | Allowed Domain 19 | Two Redirects 20 | Five Redirects 21 | Rel NoFollow Link 22 | Alternative URL scheme 23 | 24 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/looping-links.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Looping Links 5 | 6 | 7 | Index Page 8 | Malformed HTML Page 9 | 10 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/robots-blocked-child.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Robots Blocked Child 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/robots-blocked.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Robots Blocked 5 | 6 | 7 | Child Page 8 | 9 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/BasicSite/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /robots-blocked.html -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/AbsoluteCanonicalUri.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Absolute Canonical Uri 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/BaseHrefCrawlLink.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Base Href Crawl Link 7 | 8 | 9 | 10 | 
External Href 11 | Relative Fragment 12 | Relative File 13 | Relative File with Fragment 14 | Relative Base File 15 | Absolute File 16 | 17 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/CrawlLinkContent.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Crawl Link Content 7 | 8 | 9 | No Href 10 | Invalid Href 11 | Relative Fragment 12 | Relative File 13 | Same Relative File with Fragment 14 | Different Relative File with Fragment 15 | Title Attribute 16 | Rel No Follow 17 | 18 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNoFollow.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Meta No Index 7 | 8 | 9 | 10 | A Link 11 | 12 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNoIndex.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Meta No Index 7 | 8 | 9 | 10 | A Link 11 | 12 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNoIndexNoFollow.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Meta No Index 7 | 8 | 9 | 10 | A Link 11 | 12 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/MetaNone.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Meta No Index 7 | 8 | 9 | 10 | A Link 11 | 12 | 
-------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/NoCanonicalUri.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | No Canonical Uri 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultContentProcessor/RelativeCanonicalUri.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | Relative Canonical Uri 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/DefaultRequestProcessor/index.html: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Resources/EmptySite/readme.txt: -------------------------------------------------------------------------------- 1 | This site is intentionally empty. This file exists for version control to store the folder. 
-------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/SiteContext.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace InfinityCrawler.Tests.TestSite 6 | { 7 | public class SiteContext 8 | { 9 | public string SiteFolder { get; set; } 10 | public string EntryPath { get; set; } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/Startup.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using Microsoft.AspNetCore.Builder; 4 | using Microsoft.Extensions.DependencyInjection; 5 | using Microsoft.Extensions.FileProviders; 6 | 7 | namespace InfinityCrawler.Tests.TestSite 8 | { 9 | public class Startup 10 | { 11 | private SiteContext Context { get; } 12 | 13 | public Startup(SiteContext context) 14 | { 15 | Context = context; 16 | } 17 | 18 | public void ConfigureServices(IServiceCollection services) 19 | { 20 | services.AddMvcCore(); 21 | } 22 | 23 | public void Configure(IApplicationBuilder app) 24 | { 25 | app.UseStaticFiles(new StaticFileOptions 26 | { 27 | FileProvider = new PhysicalFileProvider( 28 | Path.Combine(Directory.GetCurrentDirectory(), $"Resources/{Context.SiteFolder}")) 29 | }); 30 | 31 | #if NET6_0_OR_GREATER 32 | app.UseRouting(); 33 | app.UseEndpoints(endpoints => 34 | { 35 | endpoints.MapControllers(); 36 | }); 37 | #else 38 | app.UseMvc(); 39 | #endif 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/TestHttpMessageHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Net; 5 | 
using System.Net.Http; 6 | using System.Reflection; 7 | using System.Text; 8 | using System.Threading; 9 | using System.Threading.Tasks; 10 | using Microsoft.AspNetCore.TestHost; 11 | 12 | namespace InfinityCrawler.Tests.TestSite 13 | { 14 | public class TestHttpMessageHandler : HttpMessageHandler 15 | { 16 | private HttpMessageHandler InternalHandler { get; } 17 | 18 | public TestHttpMessageHandler(HttpMessageHandler internalHandler) 19 | { 20 | InternalHandler = internalHandler; 21 | } 22 | 23 | protected override async Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) 24 | { 25 | try 26 | { 27 | if (request.RequestUri.Host == "test-domain.com") 28 | { 29 | //This is the only "remote" host allowed and even then, the response is always empty. 30 | var stream = new MemoryStream(); 31 | return new HttpResponseMessage(HttpStatusCode.OK) 32 | { 33 | RequestMessage = request, 34 | Version = HttpVersion.Version11, 35 | Content = new StreamContent(stream) 36 | }; 37 | } 38 | 39 | return await InternalSendAsync(request, cancellationToken); 40 | } 41 | catch (IOException ex) when (ex.Message == "The request was aborted or the pipeline has finished") 42 | { 43 | //This error only happens because the test server isn't actually called via HTTP, it is called directly 44 | //In reality, it would actually throw a `TaskCanceledException` 45 | throw new TaskCanceledException(null, ex); 46 | } 47 | } 48 | 49 | private async Task InternalSendAsync(HttpRequestMessage request, CancellationToken cancellationToken) 50 | { 51 | var method = typeof(HttpMessageHandler).GetMethod("SendAsync", BindingFlags.NonPublic | BindingFlags.Instance); 52 | var invokedTask = (Task)method.Invoke(InternalHandler, new object[] { request, cancellationToken }); 53 | return await invokedTask; 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests.TestSite/TestSiteManager.cs: 
-------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Net.Http; 4 | using System.Text; 5 | using Microsoft.AspNetCore.Hosting; 6 | using Microsoft.AspNetCore.TestHost; 7 | using Microsoft.Extensions.DependencyInjection; 8 | 9 | namespace InfinityCrawler.Tests.TestSite 10 | { 11 | public class TestSiteManager : IDisposable 12 | { 13 | private TestServer Server { get; set; } 14 | private HttpClient Client { get; set; } 15 | 16 | public TestSiteManager(SiteContext context) 17 | { 18 | var builder = new WebHostBuilder() 19 | .ConfigureServices(s => 20 | { 21 | s.AddSingleton(context); 22 | }) 23 | .UseStartup(); 24 | 25 | Server = new TestServer(builder); 26 | 27 | var internalHandler = Server.CreateHandler(); 28 | Client = new HttpClient(new TestHttpMessageHandler(internalHandler)); 29 | } 30 | 31 | public HttpClient GetHttpClient() 32 | { 33 | return Client; 34 | } 35 | 36 | public void Dispose() 37 | { 38 | if (Server != null) 39 | { 40 | Server.Dispose(); 41 | Server = null; 42 | 43 | Client.Dispose(); 44 | Client = null; 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests/BasicSiteTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using System.Net; 4 | using System.Threading.Tasks; 5 | using InfinityCrawler.Processing.Requests; 6 | using InfinityCrawler.Tests.TestSite; 7 | using Microsoft.VisualStudio.TestTools.UnitTesting; 8 | 9 | namespace InfinityCrawler.Tests 10 | { 11 | [TestClass] 12 | public class BasicSiteTests : CrawlerTestBase 13 | { 14 | private async Task GetCrawlResult() 15 | { 16 | var crawler = GetTestSiteCrawler(new SiteContext 17 | { 18 | SiteFolder = "BasicSite" 19 | }); 20 | var settings = new CrawlSettings 21 | { 22 | RequestProcessor = GetLoggedRequestProcessor(), 23 | 
RequestProcessorOptions = GetNoDelayRequestProcessorOptions() 24 | }; 25 | return await crawler.Crawl(new Uri("http://localhost/"), settings); 26 | } 27 | 28 | [TestMethod] 29 | public async Task DiscoverIndexPageFromSitemap() 30 | { 31 | var result = await GetCrawlResult(); 32 | var uri = new Uri("http://localhost/index.html"); 33 | Assert.IsTrue(result.CrawledUris.Any(c => c.Location == uri)); 34 | } 35 | 36 | [TestMethod] 37 | public async Task CrawledLinksOnIndexPage() 38 | { 39 | var result = await GetCrawlResult(); 40 | var uri = new Uri("http://localhost/basic-page.html"); 41 | Assert.IsTrue(result.CrawledUris.Any(c => c.Location == uri)); 42 | } 43 | 44 | [TestMethod] 45 | public async Task ObeysRobotsBlocking() 46 | { 47 | var result = await GetCrawlResult(); 48 | var uri = new Uri("http://localhost/robots-blocked.html"); 49 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault(); 50 | 51 | var robotsChildUri = new Uri("http://localhost/robots-blocked-childs.html"); 52 | 53 | Assert.AreEqual(CrawlStatus.RobotsBlocked, crawledUri.Status); 54 | Assert.IsFalse(result.CrawledUris.Any(c => c.Location == robotsChildUri)); 55 | } 56 | 57 | [TestMethod] 58 | public async Task UrisOnlyAppearOnceInResults() 59 | { 60 | var result = await GetCrawlResult(); 61 | var uri = new Uri("http://localhost/index.html"); 62 | Assert.AreEqual(1, result.CrawledUris.Count(c => c.Location == uri)); 63 | } 64 | 65 | [TestMethod] 66 | public async Task UrisAreRetriedOnServerErrors() 67 | { 68 | var result = await GetCrawlResult(); 69 | var uri = new Uri("http://localhost/status/500"); 70 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault(); 71 | Assert.AreEqual(3, crawledUri.Requests.Count); 72 | } 73 | 74 | [TestMethod] 75 | public async Task UrisAreNotRetriedOn4xxErrors() 76 | { 77 | var result = await GetCrawlResult(); 78 | var uris = new[] 79 | { 80 | new Uri("http://localhost/status/404"), 81 | new 
Uri("http://localhost/status/403"), 82 | new Uri("http://localhost/status/401") 83 | }; 84 | Assert.IsTrue(uris.All(uri => result.CrawledUris.Any(c => c.Location == uri && c.Requests.Count == 1))); 85 | } 86 | 87 | [TestMethod] 88 | public async Task ExternalSitesAreNotCrawled() 89 | { 90 | var result = await GetCrawlResult(); 91 | var uri = new Uri("http://localhost/index.html"); 92 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault(); 93 | 94 | var externalUri = new Uri("http://not-allowed-domain.com"); 95 | 96 | Assert.IsTrue(crawledUri.Content.Links.Any(l => l.Location == externalUri)); 97 | Assert.IsFalse(result.CrawledUris.Any(c => c.Location == externalUri)); 98 | } 99 | 100 | [TestMethod] 101 | public async Task AllowedExternalSitesAreCrawled() 102 | { 103 | var crawler = GetTestSiteCrawler(new SiteContext 104 | { 105 | SiteFolder = "BasicSite" 106 | }); 107 | var settings = new CrawlSettings 108 | { 109 | HostAliases = new[] { "test-domain.com" }, 110 | RequestProcessor = GetLoggedRequestProcessor(), 111 | RequestProcessorOptions = GetNoDelayRequestProcessorOptions() 112 | }; 113 | var result = await crawler.Crawl(new Uri("http://localhost/"), settings); 114 | var uri = new Uri("http://localhost/index.html"); 115 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault(); 116 | 117 | var externalUri = new Uri("http://test-domain.com"); 118 | 119 | Assert.IsTrue(crawledUri.Content.Links.Any(l => l.Location == externalUri)); 120 | 121 | var externalCrawl = result.CrawledUris.FirstOrDefault(c => c.Location == externalUri); 122 | Assert.IsNotNull(externalCrawl); 123 | Assert.AreEqual(HttpStatusCode.OK, externalCrawl.Requests.LastOrDefault().StatusCode); 124 | } 125 | 126 | [TestMethod] 127 | public async Task RelNoFollowLinksAreIgnored() 128 | { 129 | var result = await GetCrawlResult(); 130 | var uri = new Uri("http://localhost/index.html?v=rel-no-follow"); 131 | Assert.AreEqual(0, 
result.CrawledUris.Count(c => c.Location == uri)); 132 | } 133 | 134 | [TestMethod] 135 | public async Task MaximumRedirectLimitFollowed() 136 | { 137 | var result = await GetCrawlResult(); 138 | var uri = new Uri("http://localhost/redirect/2/five-redirects"); 139 | var crawledUri = result.CrawledUris.Where(c => c.Location == uri).FirstOrDefault(); 140 | 141 | Assert.AreEqual(CrawlStatus.MaxRedirects, crawledUri.Status); 142 | Assert.AreEqual(3, crawledUri.RedirectChain.Count); 143 | } 144 | 145 | [DataRow(2)] 146 | [DataRow(4)] 147 | [DataTestMethod] 148 | public async Task MaximumPagesCrawledFollowed(int maxPages) 149 | { 150 | var crawler = GetTestSiteCrawler(new SiteContext 151 | { 152 | SiteFolder = "BasicSite" 153 | }); 154 | var settings = new CrawlSettings 155 | { 156 | RequestProcessor = GetLoggedRequestProcessor(), 157 | RequestProcessorOptions = GetNoDelayRequestProcessorOptions() 158 | }; 159 | 160 | settings.MaxNumberOfPagesToCrawl = maxPages; 161 | var result = await crawler.Crawl(new Uri("http://localhost/"), settings); 162 | Assert.AreEqual(maxPages, result.CrawledUris.Count()); 163 | } 164 | 165 | [TestMethod] 166 | public async Task AutoRetryOnFailure() 167 | { 168 | var crawler = GetTestSiteCrawler(new SiteContext 169 | { 170 | SiteFolder = "EmptySite" 171 | }); 172 | var settings = new CrawlSettings 173 | { 174 | NumberOfRetries = 3, 175 | RequestProcessor = GetLoggedRequestProcessor(), 176 | RequestProcessorOptions = new RequestProcessorOptions 177 | { 178 | DelayBetweenRequestStart = new TimeSpan(), 179 | MaxNumberOfSimultaneousRequests = 4, 180 | TimeoutBeforeThrottle = new TimeSpan(), 181 | DelayJitter = new TimeSpan(), 182 | RequestTimeout = new TimeSpan(0, 0, 0, 0, 150) 183 | } 184 | }; 185 | 186 | settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-1")); 187 | settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-2")); 188 | settings.RequestProcessor.Add(new 
Uri("http://localhost/delay/500/500ms-delay-3")); 189 | settings.RequestProcessor.Add(new Uri("http://localhost/delay/500/500ms-delay-4")); 190 | 191 | var results = await crawler.Crawl(new Uri("http://localhost/"), settings); 192 | var delayedCrawls = results.CrawledUris.Where(c => c.Location.PathAndQuery.Contains("delay")).ToArray(); 193 | 194 | foreach (var crawledUri in delayedCrawls) 195 | { 196 | Assert.AreEqual(CrawlStatus.MaxRetries, crawledUri.Status); 197 | Assert.IsNull(crawledUri.Content); 198 | } 199 | } 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests/ContentProcessorTestBase.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using InfinityCrawler.Processing.Content; 7 | using InfinityCrawler.Tests.TestSite; 8 | 9 | namespace InfinityCrawler.Tests 10 | { 11 | public class ContentProcessorTestBase : TestBase 12 | { 13 | protected async Task RequestAndProcessContentAsync(SiteContext siteContext, Uri requestUri, IContentProcessor contentProcessor) 14 | { 15 | var httpClient = TestSiteConfiguration.GetHttpClient(siteContext); 16 | using (var response = await httpClient.GetAsync(requestUri)) 17 | { 18 | await response.Content.LoadIntoBufferAsync(); 19 | using (var contentStream = await response.Content.ReadAsStreamAsync()) 20 | { 21 | var headers = new CrawlHeaders(response.Headers, response.Content.Headers); 22 | return contentProcessor.Parse(requestUri, headers, contentStream); 23 | } 24 | } 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests/CrawlerTestBase.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 
4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using InfinityCrawler.Processing.Requests; 7 | using InfinityCrawler.Tests.TestSite; 8 | 9 | namespace InfinityCrawler.Tests 10 | { 11 | public class CrawlerTestBase : TestBase 12 | { 13 | protected Crawler GetTestSiteCrawler(SiteContext siteContext) 14 | { 15 | var httpClient = TestSiteConfiguration.GetHttpClient(siteContext); 16 | return new Crawler(httpClient, GetLogger()); 17 | } 18 | 19 | protected RequestProcessorOptions GetNoDelayRequestProcessorOptions() 20 | { 21 | return new RequestProcessorOptions 22 | { 23 | MaxNumberOfSimultaneousRequests = 10, 24 | DelayBetweenRequestStart = new TimeSpan(), 25 | DelayJitter = new TimeSpan(), 26 | TimeoutBeforeThrottle = new TimeSpan() 27 | }; 28 | } 29 | 30 | protected DefaultRequestProcessor GetLoggedRequestProcessor() 31 | { 32 | var requestProcessorLogger = GetLogger(); 33 | return new DefaultRequestProcessor(requestProcessorLogger); 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/InfinityCrawler.Tests/DefaultContentProcessorTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using InfinityCrawler.Processing.Content; 7 | using InfinityCrawler.Tests.TestSite; 8 | using Microsoft.VisualStudio.TestTools.UnitTesting; 9 | 10 | namespace InfinityCrawler.Tests 11 | { 12 | [TestClass] 13 | public class DefaultContentProcessorTests : ContentProcessorTestBase 14 | { 15 | private async Task PerformRequestAsync(string path) 16 | { 17 | var requestUri = new UriBuilder("http://localhost/") 18 | { 19 | Path = path 20 | }.Uri; 21 | 22 | return await RequestAndProcessContentAsync(new SiteContext 23 | { 24 | SiteFolder = "DefaultContentProcessor" 25 | }, requestUri, new DefaultContentProcessor()); 26 | } 27 | 28 | [TestMethod] 29 | 
public async Task NoMetaParsed()
		{
			// A page without a robots meta tag must yield no page-level robot rules.
			var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
			Assert.AreEqual(0, crawledContent.PageRobotRules.Count());
		}

		[TestMethod]
		public async Task MissingHrefLinksAreIgnored()
		{
			// Anchors with no href attribute must not appear in the extracted links.
			var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
			Assert.AreEqual(6, crawledContent.Links.Count());
			Assert.IsFalse(crawledContent.Links.Any(l => l.Text == "No Href"));
		}

		[TestMethod]
		public async Task InvalidHrefLinksAreIgnored()
		{
			// Anchors with an unparseable href must not appear in the extracted links.
			var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");
			Assert.AreEqual(6, crawledContent.Links.Count());
			Assert.IsFalse(crawledContent.Links.Any(l => l.Text == "Invalid Href"));
		}

		[TestMethod]
		public async Task TitleAttributeIsParsed()
		{
			var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");

			// Title is captured when present and left null when absent.
			Assert.IsTrue(crawledContent.Links.Any(l => l.Title == "Title Attribute"));
			Assert.IsNull(crawledContent.Links.FirstOrDefault(l => l.Text == "Relative File").Title);
		}

		[TestMethod]
		public async Task RelAttributeIsParsed()
		{
			var crawledContent = await PerformRequestAsync("CrawlLinkContent.html");

			// rel is captured when present and left null when absent.
			Assert.IsTrue(crawledContent.Links.Any(l => l.Relationship == "nofollow"));
			Assert.IsNull(crawledContent.Links.FirstOrDefault(l => l.Text == "Relative File").Relationship);
		}

		[TestMethod]
		public async Task MetaRobotsParsed()
		{
			// Each robots meta variant must surface as a (case-insensitive) page rule.
			var crawledContent = await PerformRequestAsync("MetaNoFollow.html");
			Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("nofollow", StringComparison.InvariantCultureIgnoreCase)));

			crawledContent = await PerformRequestAsync("MetaNoIndex.html");
			Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("noindex", StringComparison.InvariantCultureIgnoreCase)));

			crawledContent = await PerformRequestAsync("MetaNoIndexNoFollow.html");
			Assert.IsTrue(crawledContent.PageRobotRules.Any(r =>
				r.IndexOf("noindex", StringComparison.InvariantCultureIgnoreCase) != -1 &&
				r.IndexOf("nofollow", StringComparison.InvariantCultureIgnoreCase) != -1
			));

			crawledContent = await PerformRequestAsync("MetaNone.html");
			Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("none", StringComparison.InvariantCultureIgnoreCase)));
		}

		[TestMethod]
		public async Task HeaderRobotsParsed()
		{
			// X-Robots-Tag style response headers must also surface as page rules.
			var crawledContent = await PerformRequestAsync("robots/header-page-noindex");
			Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Equals("noindex", StringComparison.InvariantCultureIgnoreCase)));

			// Bot-specific header rules are kept per-bot rather than merged.
			crawledContent = await PerformRequestAsync("robots/header-bot-specific");
			Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Contains("onebot")));
			Assert.IsTrue(crawledContent.PageRobotRules.Any(r => r.Contains("twobot")));
		}

		[TestMethod]
		public async Task CanonicalUriParsing()
		{
			// No canonical link element: CanonicalUri stays null.
			var crawledContent = await PerformRequestAsync("NoCanonicalUri.html");
			Assert.IsNull(crawledContent.CanonicalUri);

			// Relative canonical hrefs are resolved against the request URI.
			crawledContent = await PerformRequestAsync("RelativeCanonicalUri.html");
			Assert.AreEqual(new Uri("http://localhost/RelativeCanonicalUri.html"), crawledContent.CanonicalUri);

			// Absolute canonical hrefs are used as-is.
			crawledContent = await PerformRequestAsync("AbsoluteCanonicalUri.html");
			Assert.AreEqual(new Uri("http://localhost/AbsoluteCanonicalUri.html"), crawledContent.CanonicalUri);
		}

		[TestMethod]
		public async Task BaseHrefLinks()
		{
			// Links must be resolved against the document's <base href>, covering
			// absolute, fragment-only, relative and root-relative href forms.
			var crawledContent = await PerformRequestAsync("BaseHrefCrawlLink.html");
			var links = crawledContent.Links.ToArray();

			Assert.AreEqual(new Uri("http://test-domain.com/"), links[0].Location);
			Assert.AreEqual(new Uri("http://localhost/base/#RelativeFragment"), links[1].Location);
			Assert.AreEqual(new Uri("http://localhost/base/relative/RelativeFile.html"), links[2].Location);
			Assert.AreEqual(new Uri("http://localhost/base/relative/RelativeFile.html#Fragment"), links[3].Location);
			Assert.AreEqual(new Uri("http://localhost/RelativeBaseFile.html"), links[4].Location);
			Assert.AreEqual(new Uri("http://localhost/absolute/AbsoluteBaseFile.html"), links[5].Location);
		}
	}
}
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/DefaultRequestProcessorTests.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using InfinityCrawler.Processing.Requests;
using InfinityCrawler.Tests.TestSite;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace InfinityCrawler.Tests
{
	/// <summary>
	/// Exercises <see cref="DefaultRequestProcessor"/>: throttling back-off,
	/// cancellation, per-request timeouts and fault propagation. These tests rely on
	/// wall-clock delays served by the test site, so they are timing-sensitive.
	/// </summary>
	[TestClass]
	public class DefaultRequestProcessorTests : TestBase
	{
		[TestMethod]
		public async Task ThrottlingTest()
		{
			var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
			{
				SiteFolder = "DefaultRequestProcessor"
			});

			// NOTE(review): generic argument restored (extraction stripped "<...>") — TODO confirm.
			var processor = new DefaultRequestProcessor(GetLogger<DefaultRequestProcessor>());

			//Warmup
			processor.Add(new Uri("http://localhost/delay/50/warmup"));
			await processor.ProcessAsync(httpClient, requestResult => Task.CompletedTask, new RequestProcessorOptions
			{
				DelayJitter = new TimeSpan(),
				DelayBetweenRequestStart = new TimeSpan(0, 0, 0, 0, 50)
			});

			// Mix of fast (50ms) and slow (300ms) endpoints; the slow ones exceed the
			// throttle timeout below and should trigger back-off.
			processor.Add(new Uri("http://localhost/delay/50/50ms-delay-1"));
			processor.Add(new Uri("http://localhost/delay/50/50ms-delay-2"));
			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-1"));
			processor.Add(new
Uri("http://localhost/delay/300/300ms-delay-2"));
			processor.Add(new Uri("http://localhost/delay/50/50ms-delay-3"));
			processor.Add(new Uri("http://localhost/delay/50/50ms-delay-4"));
			processor.Add(new Uri("http://localhost/delay/50/50ms-delay-5"));
			processor.Add(new Uri("http://localhost/delay/50/50ms-delay-6"));
			processor.Add(new Uri("http://localhost/delay/50/50ms-delay-7"));

			// NOTE(review): element type restored (extraction stripped generics) —
			// assumed List<RequestResult>; confirm against repository.
			var results = new List<RequestResult>();
			await processor.ProcessAsync(httpClient, requestResult =>
			{
				results.Add(requestResult);
				return Task.CompletedTask;
			}, new RequestProcessorOptions
			{
				MaxNumberOfSimultaneousRequests = 1,
				MinSequentialSuccessesToMinimiseThrottling = 2,
				DelayBetweenRequestStart = new TimeSpan(),
				DelayJitter = new TimeSpan(),
				TimeoutBeforeThrottle = new TimeSpan(0, 0, 0, 0, 270),
				ThrottlingRequestBackoff = new TimeSpan(0, 0, 0, 0, 100)
			});

			// Expected back-off shape: throttling ramps up after the 300ms responses
			// exceed the 270ms throttle timeout, then eases off once two sequential
			// fast responses are observed. Timing-sensitive by design.
			Assert.AreEqual(0, results[0].RequestStartDelay);
			Assert.AreEqual(0, results[1].RequestStartDelay);
			Assert.AreEqual(0, results[2].RequestStartDelay);
			Assert.AreEqual(100, results[3].RequestStartDelay);
			Assert.AreEqual(200, results[4].RequestStartDelay);
			Assert.AreEqual(200, results[5].RequestStartDelay);
			Assert.AreEqual(100, results[6].RequestStartDelay);
			Assert.AreEqual(100, results[7].RequestStartDelay);
			Assert.AreEqual(0, results[8].RequestStartDelay);
		}

		[TestMethod]
		public async Task ProcessCancellationTest()
		{
			var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
			{
				SiteFolder = "DefaultRequestProcessor"
			});

			var processor = new DefaultRequestProcessor(GetLogger<DefaultRequestProcessor>());

			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-1"));
			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-2"));
			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-3"));
			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-4"));

			// NOTE(review): element type restored — assumed ConcurrentBag<RequestResult>; confirm.
			var results = new ConcurrentBag<RequestResult>();
			// Cancel 300ms in — mid-way through the queued 300ms requests.
			var tokenSource = new CancellationTokenSource(300);

			try
			{
				await processor.ProcessAsync(httpClient, requestResult =>
				{
					results.Add(requestResult);
					return Task.CompletedTask;
				}, new RequestProcessorOptions
				{
					DelayBetweenRequestStart = new TimeSpan(),
					MaxNumberOfSimultaneousRequests = 2,
					TimeoutBeforeThrottle = new TimeSpan(),
					DelayJitter = new TimeSpan()
				}, tokenSource.Token);
			}
			catch (OperationCanceledException)
			{
				// Cancellation is the expected outcome here; swallowing it is deliberate.
			}

			// Cancellation must prevent the later requests from completing.
			Assert.AreNotEqual(3, results.Count);
			Assert.AreNotEqual(4, results.Count);
		}

		[TestMethod]
		public async Task RequestTimeoutTest()
		{
			var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
			{
				SiteFolder = "DefaultRequestProcessor"
			});

			var processor = new DefaultRequestProcessor(GetLogger<DefaultRequestProcessor>());

			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-1"));
			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-2"));
			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-3"));
			processor.Add(new Uri("http://localhost/delay/300/300ms-delay-4"));

			var results = new ConcurrentBag<RequestResult>();

			await processor.ProcessAsync(httpClient, requestResult =>
			{
				results.Add(requestResult);
				return Task.CompletedTask;
			}, new RequestProcessorOptions
			{
				DelayBetweenRequestStart = new TimeSpan(),
				MaxNumberOfSimultaneousRequests = 4,
				TimeoutBeforeThrottle = new TimeSpan(),
				DelayJitter = new TimeSpan(),
				// 150ms timeout against 300ms responses: every request must time out.
				RequestTimeout = new TimeSpan(0, 0, 0, 0, 150)
			});

			Assert.AreEqual(4, results.Count);

			// A timed-out request surfaces as a result carrying an OperationCanceledException.
			foreach (var requestResult in results)
			{
				Assert.IsInstanceOfType(requestResult.Exception, typeof(OperationCanceledException));
			}
		}

		[TestMethod, ExpectedExceptionPattern(typeof(Exception), nameof(FaultedTaskThrowsException))]
		public async Task FaultedTaskThrowsException()
		{
			// An exception thrown from the per-result callback must propagate out of ProcessAsync.
			var httpClient = TestSiteConfiguration.GetHttpClient(new SiteContext
			{
				SiteFolder = "DefaultRequestProcessor"
			});

			var processor = new DefaultRequestProcessor(GetLogger<DefaultRequestProcessor>());

			processor.Add(new Uri("http://localhost/"));

			await processor.ProcessAsync(httpClient, requestResult =>
			{
				throw new Exception(nameof(FaultedTaskThrowsException));
			}, new RequestProcessorOptions
			{
				DelayBetweenRequestStart = new TimeSpan(),
				DelayJitter = new TimeSpan()
			});
		}
	}
}
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/ExpectedExceptionPatternAttribute.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace InfinityCrawler.Tests
{
	/// <summary>
	/// MSTest expected-exception attribute that verifies both the exact exception
	/// type and that its message matches a regular-expression pattern.
	/// </summary>
	public class ExpectedExceptionPatternAttribute : ExpectedExceptionBaseAttribute
	{
		private Type ExpectedExceptionType { get; }
		private Regex MessagePattern { get; }
		// Raw pattern text kept for failure messages.
		private string RawPattern { get; }

		public ExpectedExceptionPatternAttribute(Type expectedExceptionType, string exceptionMessagePattern)
		{
			ExpectedExceptionType = expectedExceptionType;
			MessagePattern = new Regex(exceptionMessagePattern);
			RawPattern = exceptionMessagePattern;
		}

		// Called by MSTest with the exception the test method threw (if any).
		// Throws to fail the test when type or message pattern does not match.
		protected override void Verify(Exception exception)
		{
			Assert.IsNotNull(exception, $"\"{nameof(exception)}\" is null");

			var thrownExceptionType = exception.GetType();

			// Exact type match required — derived types are rejected.
			if (ExpectedExceptionType !=
thrownExceptionType)
			{
				throw new Exception($"Test method threw exception {thrownExceptionType.FullName}, but exception {ExpectedExceptionType.FullName} was expected. Exception message: {exception.Message}");
			}

			if (!MessagePattern.IsMatch(exception.Message))
			{
				throw new Exception($"Thrown exception message \"{exception.Message}\" does not match pattern \"{RawPattern}\".");
			}
		}
	}
}
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/InfinityCrawler.Tests.csproj:
--------------------------------------------------------------------------------
NOTE(review): the XML markup of this project file was stripped during extraction —
only element text survives. Surviving values:
	TargetFrameworks: net461;netcoreapp3.1;net5.0;net6.0;
	IsPackable: false
	PrivateAssets: all
	IncludeAssets: runtime; build; native; contentfiles; analyzers; buildtransitive
Recover the full .csproj from the repository before editing; do not reconstruct tags by guesswork.
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/TestBase.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using InfinityCrawler.Tests.TestSite;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace InfinityCrawler.Tests
{
	/// <summary>
	/// Base class for all tests: builds a logger factory that writes trace-level
	/// "InfinityCrawler" output to the console and debug sinks.
	/// </summary>
	[TestClass]
	public class TestBase
	{
		private ILoggerFactory LoggerFactory { get; }

		public TestBase()
		{
			var serviceProvider = new ServiceCollection()
				.AddLogging(builder =>
				{
					builder.AddFilter("InfinityCrawler", LogLevel.Trace);
					builder.AddConsole();
					builder.AddDebug();
				})
				.BuildServiceProvider();

			// NOTE(review): generic argument restored (extraction stripped "<...>") —
			// assumed GetService<ILoggerFactory>(); confirm against repository.
			LoggerFactory = serviceProvider.GetService<ILoggerFactory>();
		}

		protected ILogger
<T> GetLogger<T>()
		{
			// NOTE(review): generic signature restored — a parameterless non-generic
			// CreateLogger() does not exist, so this was almost certainly
			// ILogger<T> GetLogger<T>(); confirm against repository.
			return LoggerFactory.CreateLogger<T>();
		}
	}
}
--------------------------------------------------------------------------------
/tests/InfinityCrawler.Tests/TestSiteConfiguration.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using InfinityCrawler.Tests.TestSite;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace InfinityCrawler.Tests
{
	/// <summary>
	/// Caches one <see cref="TestSiteManager"/> per site folder for the whole test
	/// assembly and hands out HTTP clients bound to it; sites are disposed in
	/// <see cref="AssemblyCleanup"/>.
	/// </summary>
	[TestClass]
	public static class TestSiteConfiguration
	{
		// NOTE(review): type arguments restored (extraction stripped generics) —
		// assumed Dictionary<string, TestSiteManager>; confirm against repository.
		private static Dictionary<string, TestSiteManager> TestSites { get; } = new Dictionary<string, TestSiteManager>();

		/// <summary>Returns an HTTP client for the test site, creating and caching the site on first use.</summary>
		public static HttpClient GetHttpClient(SiteContext siteContext)
		{
			// Single lookup via TryGetValue instead of ContainsKey + indexer.
			if (!TestSites.TryGetValue(siteContext.SiteFolder, out var testSiteManager))
			{
				testSiteManager = new TestSiteManager(siteContext);
				TestSites.Add(siteContext.SiteFolder, testSiteManager);
			}

			return testSiteManager.GetHttpClient();
		}

		/// <summary>Disposes and forgets every cached test site.</summary>
		public static void ShutdownSites()
		{
			foreach (var site in TestSites.Values)
			{
				site.Dispose();
			}

			TestSites.Clear();
		}

		[AssemblyCleanup]
		public static void AssemblyCleanup()
		{
			ShutdownSites();
		}
	}
}
--------------------------------------------------------------------------------