├── .github └── workflows │ └── publish-AngleParse.yml ├── .gitignore ├── AngleParse.PesterTests └── Select-HtmlContent.Tests.ps1 ├── AngleParse.Test ├── AngleParse.Test.csproj ├── Helpers │ └── ElementResourceFactory.cs ├── Resource │ └── ElementResourceTests.cs ├── Selector │ ├── AttributeSelectorTests.cs │ ├── CssSelectorTests.cs │ ├── FuncSelectorTests.cs │ ├── PropertySelectorTests.cs │ ├── RegexSelectorTests.cs │ ├── ScriptBlockSelectorTests.cs │ └── TableSelectorTests.cs └── assets │ ├── empty.html │ ├── full-attribute.html │ ├── full.html │ ├── invalid.html │ ├── no-attribute.html │ └── sentence.txt ├── AngleParse.sln ├── AngleParse ├── AngleParse.csproj ├── AngleParse.psd1 ├── Attr.cs ├── Prop.cs ├── Resource │ ├── ElementResource.cs │ ├── ObjectResource.cs │ └── StringResource.cs ├── SelectHtmlElement.cs └── Selector │ ├── AttributeSelector.cs │ ├── CssSelector.cs │ ├── FuncSelector.cs │ ├── ISelector.cs │ ├── PropertySelector.cs │ ├── RegexSelector.cs │ ├── ScriptBlockSelector.cs │ ├── SelectorFactory.cs │ └── TableSelector.cs ├── LICENSE └── README.md /.github/workflows/publish-AngleParse.yml: -------------------------------------------------------------------------------- 1 | name: Publish AngleParse 2 | on: 3 | push: 4 | branches: 5 | - main 6 | tags: 7 | - '*' 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | environment: release 12 | steps: 13 | - name: Checkout source 14 | uses: actions/checkout@v4 15 | - name: Setup .NET SDK 16 | uses: actions/setup-dotnet@v4 17 | with: 18 | dotnet-version: 9.x 19 | - name: Install Pester module 20 | shell: pwsh 21 | run: | 22 | $ErrorActionPreference = 'Stop' 23 | Install-Module -Name Pester -RequiredVersion 5.7.1 -Force -SkipPublisherCheck -Scope CurrentUser 24 | - name: dotnet test 25 | run: | 26 | dotnet test ./AngleParse.Test/AngleParse.Test.csproj -c Release 27 | - name: run pester tests 28 | shell: pwsh 29 | run: | 30 | $ErrorActionPreference = 'Stop' 31 | Import-Module Pester -Force 32 | 
Invoke-Pester -Path ./AngleParse.PesterTests/ 33 | - name: dotnet publish 34 | run: | 35 | dotnet publish ./AngleParse/AngleParse.csproj -c Release 36 | # Versioning and publishing only make sense for tag pushes: on a branch push github.ref_name is the branch name (e.g. 'main'), which is not a valid ModuleVersion. 37 | - name: Modify module metadata 38 | if: startsWith(github.ref, 'refs/tags/') 39 | shell: pwsh 40 | env: 41 | CommitTag: ${{ github.ref_name }} 42 | run: | 43 | $ErrorActionPreference = 'Stop' 44 | $moduleManifestPath = './AngleParse/bin/Release/AngleParse/AngleParse.psd1' 45 | $moduleVersion = $Env:CommitTag -replace '^v', '' 46 | 47 | filter modify($key, $value) 48 | { 49 | $_ -replace "^\s*$key\s*=.*$", "$key = '$value'" 50 | } 51 | 52 | Get-Content $moduleManifestPath | 53 | modify ModuleVersion $moduleVersion | 54 | Set-Content $moduleManifestPath 55 | - name: Publish to PowerShell Gallery 56 | if: startsWith(github.ref, 'refs/tags/') 57 | shell: pwsh 58 | env: 59 | PSGalleryApiKey: ${{ secrets.PSGalleryApiKey }} 60 | run: | 61 | $ErrorActionPreference = 'Stop' 62 | $modulePath = './AngleParse/bin/Release/AngleParse' 63 | if (-not (Test-Path $modulePath)) { 64 | throw "Module path $modulePath does not exist." 65 | } 66 | Publish-Module -Path $modulePath -NuGetApiKey $env:PSGalleryApiKey -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | AngleParse.sln.DotSettings.user 2 | 3 | obj/ 4 | bin/ 5 | dist/ 6 | artifacts/ 7 | 8 | .git/ 9 | .idea/ 10 | .vscode/ 11 | .tmp 12 | 13 | .DS_Store -------------------------------------------------------------------------------- /AngleParse.PesterTests/Select-HtmlContent.Tests.ps1: -------------------------------------------------------------------------------- 1 | BeforeAll { 2 | $solutionRoot = Split-Path -Parent $PSScriptRoot 3 | 4 | if (Get-Module -Name AngleParse) 5 | { 6 | Remove-Module AngleParse 7 | } 8 | dotnet publish (Join-Path $solutionRoot 'AngleParse' 'AngleParse.csproj') -c Release 9 | Import-Module (Join-Path $solutionRoot 'AngleParse' 'bin/Release/AngleParse/AngleParse.psd1') -Force 10 | 11 | $assetsDir = Join-Path 
$solutionRoot 'AngleParse.Test' 'assets' 12 | $cacheTable = @{ } 13 | function Get-Asset([string]$filename) 14 | { 15 | $cache = $cacheTable[$filename] 16 | if ($cache) 17 | { 18 | return $cache 19 | } 20 | $file = Join-Path $assetsDir $filename 21 | if (Test-Path $file) 22 | { 23 | $asset = Get-Content $file -Raw 24 | $cacheTable[$filename] = $asset 25 | return $asset 26 | } 27 | else 28 | { 29 | throw "File not found: $file" 30 | } 31 | } 32 | } 33 | 34 | Describe 'Select-HtmlContent' { 35 | Context 'AttributeSelector' { 36 | Context 'Truly attribute' { 37 | It 'selects the attribute value' { 38 | $attr = [AngleParse.Attr]'some-attribute' 39 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent $attr 40 | $result | should -be 'some_value' 41 | } 42 | It 'outputs empty string if the attribute does not have value' { 43 | $attr = [AngleParse.Attr]'valueless-attribute' 44 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent $attr 45 | $result | should -be '' 46 | } 47 | It 'outputs null if the attribute does not exist' { 48 | $attr = [AngleParse.Attr]'nonexistent-attribute' 49 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent $attr 50 | $result | should -BeNullOrEmpty 51 | } 52 | It 'class-defined attributes works as well' -ForEach @( 53 | @{ AttrSb = { [AngleParse.Attr]::Href }; Expected = 'https://some_url_in_japan.go.jp' } 54 | @{ AttrSb = { [AngleParse.Attr]::Src }; Expected = 'https://some_url_in_japan.go.jp/some_pic.jpg' } 55 | @{ AttrSb = { [AngleParse.Attr]::Title }; Expected = 'Some title' } 56 | @{ AttrSb = { [AngleParse.Attr]::Name }; Expected = 'some_name' } 57 | ) { 58 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent (& $AttrSb) 59 | $result | should -be $Expected 60 | } 61 | } 62 | Context 'Not truly attribute' { 63 | It 'InnerHtml works' { 64 | # If not enclosed in parentheses, it will be evaluated as a string instead of an Attr. 
65 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::InnerHtml) 66 | $result | should -be 'some link' 67 | } 68 | It 'OuterHtml works' { 69 | $result = Get-Asset 'no-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::OuterHtml) 70 | $result | should -be 'some link' 71 | } 72 | It 'TextContent works' { 73 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::TextContent) 74 | $result | should -be 'some link' 75 | } 76 | It 'Id works' { 77 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::Id) 78 | $result | should -be 'some_id' 79 | } 80 | It 'ClassName works' { 81 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::ClassName) 82 | $result | should -be 'some_class another_class' 83 | } 84 | It 'SplitClasses works' { 85 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::SplitClasses) 86 | $result | should -be 'some_class', 'another_class' 87 | } 88 | } 89 | } 90 | Context 'CssSelector' { 91 | It 'throws if the selector is not valid' { 92 | { 93 | Get-Asset 'full.html' | Select-HtmlContent 'div > p >' 94 | } | should -throw 95 | } 96 | It 'selects the element by CSS selector' { 97 | $result = Get-Asset 'full.html' | Select-HtmlContent 'section#fragment > div.some_class' 98 | $result.Length | should -be 2 99 | } 100 | It 'returns null if the selector does not match anything' { 101 | $result = Get-Asset 'full.html' | Select-HtmlContent 'section#fragment > div.nonexistent_class' 102 | $result | should -be $null 103 | } 104 | It 'returns null if the document is empty' { 105 | $result = Get-Asset 'empty.html' | Select-HtmlContent 'div' 106 | $result | should -be $null 107 | } 108 | } 109 | Context 'FuncSelector' { 110 | Context 'attribute selector' { 111 | It 'pipes between attribute and attribute throws' { 112 | { 113 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::Href) 
([AngleParse.Attr]::Src) 114 | } | should -throw 115 | } 116 | It 'pipes between attribute and css selector throws' { 117 | { 118 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::Href) 'div.some_class' 119 | } | should -throw 120 | } 121 | It 'pipes between attribute and property throws' { 122 | { 123 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::Href) ([AngleParse.Prop]::Element) 124 | } | should -throw 125 | } 126 | It 'pipes between attribute and regex works' { 127 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::ClassName) ([regex]'(\w+)') 128 | $result | should -be 'some_class', 'another_class' 129 | } 130 | It 'pipes between attribute and scriptblock works' { 131 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Attr]::ClassName) { 132 | $_ -like 'some_class*' ? 1 : 2 133 | } 134 | $result | should -be 1 135 | } 136 | } 137 | Context 'css selector' { 138 | It 'pipes between css selector and attribute works' { 139 | $result = Get-Asset 'full.html' | Select-HtmlContent 'section#fragment > div.some_class' ([AngleParse.Attr]::ClassName) 140 | $result | should -be 'some_class', 'some_class' 141 | } 142 | It 'pipes between css selector and css selector works' { 143 | $result = Get-Asset 'full.html' | Select-HtmlContent 'section#fragment > div.some_class' 'a' 144 | $result.Length | should -be 10 145 | } 146 | It 'pipes between css selector and property works' { 147 | $result = Get-Asset 'full.html' | Select-HtmlContent 'section#fragment > div.some_class' ([AngleParse.Prop]::Element) 148 | $result.Length | should -be 2 149 | $result | should -beOfType AngleSharp.Dom.IElement 150 | } 151 | It 'pipes between css selector and regex works' { 152 | $result = Get-Asset 'full.html' | 153 | Select-HtmlContent 'section#fragment > div.some_class' ([regex]'Windows Server (\d{4})') 154 | 155 | $result | should -be 2003, 2008 156 | } 157 | It 'pipes between css 
selector and scriptblock works' { 158 | $result = Get-Asset 'full.html' | Select-HtmlContent 'section#fragment > div.some_class' { 159 | $_ -match 'Windows Server' ? 1 : 2 160 | } 161 | $result | should -be 1, 2 162 | } 163 | Context 'property selector' { 164 | It 'pipes between property and attribute throws' { 165 | { 166 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]::Element) ([AngleParse.Attr]::Href) 167 | } | should -throw 168 | } 169 | It 'pipes between property and css selector throws' { 170 | { 171 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]::Element) 'div.some_class' 172 | } | should -throw 173 | } 174 | It 'pipes between property and property throws' { 175 | { 176 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]::Element) ([AngleParse.Prop]::Element) 177 | } | should -throw 178 | } 179 | It 'pipes between property and regex throws' { 180 | { 181 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]::Element) ([regex]'(\w+)') 182 | } | should -throw 183 | } 184 | It 'pipes between property and scriptblock works' { 185 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]'TextContent') { 186 | $_ -like 'some link' ? 
1 : 2 187 | } 188 | $result | should -be 1 189 | } 190 | } 191 | Context 'regex selector' { 192 | It 'pipes between regex and attribute throws' { 193 | { 194 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([regex]'(\w+)') ([AngleParse.Attr]::Href) 195 | } | should -throw 196 | } 197 | It 'pipes between regex and css selector throws' { 198 | { 199 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([regex]'(\w+)') 'div.some_class' 200 | } | should -throw 201 | } 202 | It 'pipes between regex and property throws' { 203 | { 204 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([regex]'(\w+)') ([AngleParse.Prop]::Element) 205 | } | should -throw 206 | } 207 | It 'pipes between regex and regex works' { 208 | $result = Get-Asset 'full.html' | Select-HtmlContent ([regex]'(\w+ \d{4})') ([regex]'(\d{4})') 209 | $result | should -be 2006, 2003, 2008, 2016, 2018 210 | } 211 | It 'pipes between regex and scriptblock works' { 212 | $result = Get-Asset 'full.html' | Select-HtmlContent ([regex]'(\w+ \d{4})') { 213 | $_ -like 'November*' ? 
1 : 2 214 | } 215 | $result | should -be 1, 2, 2, 2, 2 216 | } 217 | } 218 | Context 'scriptblock selector' { 219 | It 'pipes between scriptblock and attribute throws' { 220 | { 221 | Get-Asset 'full-attribute.html' | Select-HtmlContent { $_ -like 'some link' } ([AngleParse.Attr]::Href) 222 | } | should -throw 223 | } 224 | It 'pipes between scriptblock and css selector throws' { 225 | { 226 | Get-Asset 'full-attribute.html' | Select-HtmlContent { $_ -like 'some link' } 'div.some_class' 227 | } | should -throw 228 | } 229 | It 'pipes between scriptblock and property throws' { 230 | { 231 | Get-Asset 'full-attribute.html' | Select-HtmlContent { $_ -like 'some link' } ([AngleParse.Prop]::Element) 232 | } | should -throw 233 | } 234 | It 'pipes between scriptblock and regex throws' { 235 | { 236 | Get-Asset 'full-attribute.html' | Select-HtmlContent { $_ -like 'some link' } ([regex]'(\w+)') 237 | } | should -throw 238 | } 239 | It 'pipes between scriptblock and scriptblock works' { 240 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent { 241 | $_ -like 'some link' ? 
1 : 2 242 | } { $_ * 2 } 243 | $result | should -be 2 244 | } 245 | } 246 | } 247 | } 248 | Context 'PropertySelector' { 249 | It 'returns inner IElement when the selector is Prop.Element' { 250 | $result = Get-Asset 'full.html' | Select-HtmlContent ([AngleParse.Prop]::Element) 251 | $result | should -beOfType AngleSharp.Dom.IElement 252 | } 253 | It 'returns attribute table when the selector is Prop.AttributesTable' { 254 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]::AttributesTable) 255 | $result.class | should -be 'some_class another_class' 256 | $result.href | should -be 'https://some_url_in_japan.go.jp' 257 | $result.id | should -be 'some_id' 258 | $result.name | should -be 'some_name' 259 | $result.'some-attribute' | should -be 'some_value' 260 | $result.src | should -be 'https://some_url_in_japan.go.jp/some_pic.jpg' 261 | $result.title | should -be 'Some title' 262 | $result.'valueless-attribute' | should -be '' 263 | $result.nonexistent_attribute | should -be $null 264 | } 265 | It 'returns inner IElement property value with undefined property selector' { 266 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]'ClassName') 267 | $result | should -be 'some_class another_class' 268 | } 269 | It 'throws if the property does not exist' { 270 | { 271 | Get-Asset 'full-attribute.html' | Select-HtmlContent ([AngleParse.Prop]'NonexistentProperty') 272 | } | should -throw 273 | } 274 | } 275 | Context 'RegexSelector' { 276 | It 'returns captured values when the regex matches' { 277 | $result = Get-Asset 'full.html' | Select-HtmlContent ([regex]'(\w+) (\d{4})') 278 | $result | should -be @( 279 | "November", "2006", 280 | "Server", "2003", 281 | "Server", "2008", 282 | "August", "2016", 283 | "January", "2018" 284 | ) 285 | } 286 | It 'returns null when the regex has no capture groups' { 287 | $result = Get-Asset 'full.html' | Select-HtmlContent ([regex]'\w+ \d{4}') 288 | $result | should -be $null 
289 | } 290 | It 'returns null when the regex does not match anything' { 291 | $result = Get-Asset 'full.html' | Select-HtmlContent ([regex]'(nonexistent)') 292 | $result | should -be $null 293 | } 294 | } 295 | Context 'ScriptBlockSelector' { 296 | It 'binds $_ in the scriptblock to the current element' { 297 | Get-Asset 'full-attribute.html' | Select-HtmlContent { 298 | $_ | should -be 'some link' 299 | } 300 | } 301 | It 'outputs the result of the scriptblock' { 302 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent { 303 | if ($_ -like 'some link') 304 | { 305 | return 1 306 | } 307 | } 308 | $result | should -be 1 309 | } 310 | It 'outputs multiple results when the scriptblock returns multiple values' { 311 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent { 312 | if ($_ -like 'some link') 313 | { 314 | return 1, 2 315 | } 316 | } 317 | $result | should -be 1, 2 318 | } 319 | It 'outputs null when the scriptblock returns nothing' { 320 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent { } 321 | $result | should -be $null 322 | } 323 | It 'outputs the result even if the Write-Error is called in the scriptblock' { 324 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent { 325 | # Override the error action to continue which is the default in PowerShell. 
326 | Write-Error -Message "Error in scriptblock" -ErrorAction 'Continue' 2> $null 327 | return 1 328 | } 329 | $result | should -be 1 330 | } 331 | It 'throws if the scriptblock throws an exception' { 332 | { 333 | Get-Asset 'full-attribute.html' | Select-HtmlContent { 334 | throw "Error in scriptblock" 335 | } 336 | } | should -throw 337 | } 338 | } 339 | Context 'TableSelector' { 340 | It 'returns a table whose keys are the input table keys and values are the evaluated values' { 341 | $result = Get-Asset 'full-attribute.html' | Select-HtmlContent @{ 342 | ClassName = [AngleParse.Attr]::ClassName, { $_ -split ' ' } 343 | Content = 'span', [AngleParse.Attr]::TextContent, ([regex]'(\w+)') 344 | } 345 | $expected = @{ 346 | ClassName = 'some_class', 'another_class' 347 | Content = 'some', 'link' 348 | } 349 | foreach ($key in $result.Keys) 350 | { 351 | $result[$key] | should -be $expected[$key] 352 | } 353 | } 354 | It 'throws when input type does not satisfy the most strict type requirement in the table' { 355 | { 356 | Get-Asset 'full-attribute.html' | Select-HtmlContent { $_ } @{ 357 | SplitContent = { $_ -split ' ' } 358 | SomeWhat = ([regex]'some (\w+)') 359 | } 360 | } | should -throw 361 | } 362 | } 363 | } -------------------------------------------------------------------------------- /AngleParse.Test/AngleParse.Test.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | default 6 | enable 7 | false 8 | enable 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | all 17 | runtime; build; native; contentfiles; analyzers; buildtransitive 18 | 19 | 20 | all 21 | runtime; build; native; contentfiles; analyzers; buildtransitive 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /AngleParse.Test/Helpers/ElementResourceFactory.cs: -------------------------------------------------------------------------------- 1 | using System.IO; 2 | using 
System.Runtime.CompilerServices; 3 | using System.Threading.Tasks; 4 | using AngleParse.Resource; 5 | 6 | namespace AngleParse.Test.Helpers; 7 | 8 | internal static class ElementResourceFactory 9 | { 10 | public static async Task CreateElementResourceAsync( 11 | string filename, 12 | [CallerFilePath] string rootPath = "", 13 | string assetsPath = "../../assets") 14 | { 15 | var path = Path.Combine(rootPath, assetsPath, filename); 16 | var content = await File.ReadAllTextAsync(path); 17 | return await ElementResource.CreateAsync(content); 18 | } 19 | } -------------------------------------------------------------------------------- /AngleParse.Test/Resource/ElementResourceTests.cs: -------------------------------------------------------------------------------- 1 | using System.Linq; 2 | using System.Threading.Tasks; 3 | using AngleParse.Selector; 4 | using Xunit; 5 | using static AngleParse.Test.Helpers.ElementResourceFactory; 6 | 7 | namespace AngleParse.Test.Resource; 8 | 9 | public sealed class ElementResourceTests 10 | { 11 | [Fact] 12 | public async Task Create() 13 | { 14 | var resource = await CreateElementResourceAsync("full.html"); 15 | CssSelector headerSelector = SelectorFactory.CreateSelector("header"); 16 | CssSelector sectionSelector = SelectorFactory.CreateSelector("section"); 17 | CssSelector footerSelector = SelectorFactory.CreateSelector("footer"); 18 | Assert.Single(headerSelector.Select(resource)); 19 | Assert.Equal(4, sectionSelector.Select(resource).Count()); 20 | Assert.Single(footerSelector.Select(resource)); 21 | } 22 | 23 | [Fact] 24 | public async Task ResourceFromEmptyDocumentContainsEmptyDom() 25 | { 26 | var resource = await CreateElementResourceAsync("empty.html"); 27 | Assert.Empty(resource.String); 28 | } 29 | 30 | [Fact] 31 | public async Task ResourceFromInvalidDocumentContainsEmptyDom() 32 | { 33 | var resource = await CreateElementResourceAsync("invalid.html"); 34 | Assert.Empty(resource.String); 35 | } 36 | 37 | [Fact] 38 | 
public async Task ResourceFromNonHtmlSentenceDoesNotEmpty() 39 | { 40 | var resource = await CreateElementResourceAsync("sentence.txt"); 41 | Assert.NotEmpty(resource.String); 42 | } 43 | } -------------------------------------------------------------------------------- /AngleParse.Test/Selector/AttributeSelectorTests.cs: -------------------------------------------------------------------------------- 1 | using System.Linq; 2 | using System.Threading.Tasks; 3 | using AngleParse.Resource; 4 | using AngleParse.Selector; 5 | using Xunit; 6 | using static AngleParse.Test.Helpers.ElementResourceFactory; 7 | 8 | namespace AngleParse.Test.Selector; 9 | 10 | public sealed class AttributeSelectorTests 11 | { 12 | [Fact] 13 | public void Create() 14 | { 15 | var attribute = Attr.Href; 16 | var attributeSelector = SelectorFactory.CreateSelector(attribute); 17 | Assert.IsType(attributeSelector); 18 | } 19 | 20 | [Fact] 21 | public void CreateWithOriginalAttribute() 22 | { 23 | var attribute = new Attr("some-attribute"); 24 | var attributeSelector = SelectorFactory.CreateSelector(attribute); 25 | Assert.IsType(attributeSelector); 26 | } 27 | 28 | [Fact] 29 | public async Task SelectInnerHtml() 30 | { 31 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.InnerHtml); 32 | var resource = await CreateElementResourceAsync("no-attribute.html"); 33 | var result = selector.Select(resource).Select(r => r.String); 34 | const string expected = "some link"; 35 | Assert.Single(result, item => item == expected); 36 | } 37 | 38 | [Fact] 39 | public async Task SelectOuterHtml() 40 | { 41 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.OuterHtml); 42 | var resource = await CreateElementResourceAsync("no-attribute.html"); 43 | var result = selector.Select(resource).Select(r => r.String); 44 | const string expected = "some link"; 45 | Assert.Single(result, item => item == expected); 46 | } 47 | 48 | [Fact] 49 | public async Task SelectTextContent() 50 | { 51 | 
AttributeSelector selector = SelectorFactory.CreateSelector(Attr.TextContent); 52 | var resource = await CreateElementResourceAsync("no-attribute.html"); 53 | var result = selector.Select(resource).Select(r => r.String).ToList(); 54 | const string expected = "some link"; 55 | Assert.Single(result, item => item == expected); 56 | } 57 | 58 | [Fact] 59 | public async Task SelectId() 60 | { 61 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.Id); 62 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 63 | Assert.Single( 64 | selector.Select(fullAttribute).Select(r => r.String), 65 | item => item == "some_id"); 66 | Assert.Empty(selector.Select(noAttribute)); 67 | } 68 | 69 | [Fact] 70 | public async Task SelectClass() 71 | { 72 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.ClassName); 73 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 74 | Assert.Single( 75 | selector.Select(fullAttribute).Select(r => r.String), 76 | item => item == "some_class another_class"); 77 | Assert.Empty(selector.Select(noAttribute)); 78 | } 79 | 80 | [Fact] 81 | public async Task SelectSplitClasses() 82 | { 83 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.SplitClasses); 84 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 85 | Assert.Equal( 86 | ["some_class", "another_class"], 87 | selector.Select(fullAttribute).Select(r => r.String)); 88 | Assert.Empty(selector.Select(noAttribute)); 89 | } 90 | 91 | [Fact] 92 | public async Task SelectHref() 93 | { 94 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.Href); 95 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 96 | Assert.Single( 97 | selector.Select(fullAttribute).Select(r => r.String), 98 | item => item == "https://some_url_in_japan.go.jp"); 99 | Assert.Empty(selector.Select(noAttribute)); 100 | } 101 | 102 | [Fact] 103 | public async Task SelectSrc() 104 | { 105 | 
AttributeSelector selector = SelectorFactory.CreateSelector(Attr.Src); 106 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 107 | Assert.Single( 108 | selector.Select(fullAttribute).Select(r => r.String), 109 | item => item == "https://some_url_in_japan.go.jp/some_pic.jpg"); 110 | Assert.Empty(selector.Select(noAttribute)); 111 | } 112 | 113 | [Fact] 114 | public async Task SelectTitle() 115 | { 116 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.Title); 117 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 118 | Assert.Single( 119 | selector.Select(fullAttribute).Select(r => r.String), 120 | item => item == "Some title"); 121 | Assert.Empty(selector.Select(noAttribute)); 122 | } 123 | 124 | [Fact] 125 | public async Task SelectName() 126 | { 127 | AttributeSelector selector = SelectorFactory.CreateSelector(Attr.Name); 128 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 129 | Assert.Single( 130 | selector.Select(fullAttribute).Select(r => r.String), 131 | item => item == "some_name"); 132 | Assert.Empty(selector.Select(noAttribute)); 133 | } 134 | 135 | [Fact] 136 | public async Task SelectByUndefinedAttribute() 137 | { 138 | var attribute = new Attr("some-attribute"); 139 | AttributeSelector selector = SelectorFactory.CreateSelector(attribute); 140 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 141 | Assert.Single( 142 | selector.Select(fullAttribute).Select(r => r.String), 143 | item => item == "some_value"); 144 | Assert.Empty(selector.Select(noAttribute)); 145 | } 146 | 147 | [Fact] 148 | public async Task SelectingValuelessAttributeReturnsEmptyString() 149 | { 150 | var attribute = new Attr("valueless-attribute"); 151 | AttributeSelector selector = SelectorFactory.CreateSelector(attribute); 152 | var (fullAttribute, noAttribute) = await CreateElementResourcesAsync(); 153 | Assert.Single( 154 | selector.Select(fullAttribute).Select(r => 
r.String), 155 | item => item == string.Empty); 156 | Assert.Empty(selector.Select(noAttribute)); 157 | } 158 | 159 | private static async Task<(ElementResource fullAttribute, ElementResource noAttribute)> 160 | CreateElementResourcesAsync() 161 | { 162 | var fullAttribute = await CreateElementResourceAsync("full-attribute.html"); 163 | var noAttribute = await CreateElementResourceAsync("no-attribute.html"); 164 | return (fullAttribute, noAttribute); 165 | } 166 | } -------------------------------------------------------------------------------- /AngleParse.Test/Selector/CssSelectorTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using System.Threading.Tasks; 4 | using AngleParse.Selector; 5 | using Xunit; 6 | using static AngleParse.Test.Helpers.ElementResourceFactory; 7 | 8 | namespace AngleParse.Test.Selector; 9 | 10 | public sealed class CssSelectorTests 11 | { 12 | [Fact] 13 | public void CreateWithValidSelector() 14 | { 15 | const string validCssSelectorExpr = "div > p"; 16 | var cssSelector = SelectorFactory.CreateSelector(validCssSelectorExpr); 17 | Assert.IsType(cssSelector); 18 | } 19 | 20 | [Fact] 21 | public void CreatingWithInvalidSelectorThrowsException() 22 | { 23 | const string invalidCssSelectorExpr = "div > p >"; 24 | Assert.Throws(() => 25 | SelectorFactory.CreateSelector(invalidCssSelectorExpr)); 26 | } 27 | 28 | [Fact] 29 | public async Task Select() 30 | { 31 | CssSelector selector = SelectorFactory.CreateSelector("section#fragment p > a.mw-redirect"); 32 | var resource = await CreateElementResourceAsync("full.html"); 33 | var result = selector.Select(resource).Select(r => r.String); 34 | var expected = new[] 35 | { "Windows XP SP2", "Windows Server 2003 SP1", "general availability" }; 36 | Assert.Equal(expected, result); 37 | } 38 | 39 | [Fact] 40 | public async Task UnmatchedSelectorReturnsEmptySequence() 41 | { 42 | CssSelector selector = 
SelectorFactory.CreateSelector("section#fragment div > a > p > div"); 43 | var resource = await CreateElementResourceAsync("full.html"); 44 | var result = selector.Select(resource); 45 | Assert.Empty(result); 46 | } 47 | 48 | [Fact] 49 | public async Task SelectOnEmptyDocumentReturnsEmptySequence() 50 | { 51 | CssSelector selector = SelectorFactory.CreateSelector("div"); 52 | var resource = await CreateElementResourceAsync("empty.html"); 53 | var result = selector.Select(resource); 54 | Assert.Empty(result); 55 | } 56 | } -------------------------------------------------------------------------------- /AngleParse.Test/Selector/FuncSelectorTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using System.Management.Automation; 4 | using System.Text.RegularExpressions; 5 | using System.Threading.Tasks; 6 | using AngleParse.Resource; 7 | using AngleParse.Selector; 8 | using Xunit; 9 | using static AngleParse.Test.Helpers.ElementResourceFactory; 10 | 11 | namespace AngleParse.Test.Selector.FuncSelectorTests; 12 | 13 | public sealed class CreationTests 14 | { 15 | [Fact] 16 | public void CreatingWithEmptyArrayThrowsException() 17 | { 18 | object[] empty = []; 19 | Assert.Throws(() => 20 | { 21 | SelectorFactory.CreateSelector(empty); 22 | }); 23 | } 24 | 25 | [Fact] 26 | public void CreatingWithOneObjectReturnsInternalSelector() 27 | { 28 | object[] objects = ["div > p"]; 29 | var selector = SelectorFactory.CreateSelector(objects); 30 | Assert.IsType(selector); 31 | } 32 | 33 | [Fact] 34 | public void CreateWithTwoObjects() 35 | { 36 | object[] objects = 37 | [ 38 | new Regex("tag: (.*)"), 39 | ScriptBlock.Create("{ $_.Name -eq 'test' }") 40 | ]; 41 | var selector = SelectorFactory.CreateSelector(objects); 42 | Assert.IsType>(selector); 43 | } 44 | 45 | [Fact] 46 | public void CreatingPipelineWithOneObjectCastsProperly() 47 | { 48 | object[] objects = [new Regex("tag: (.*)")]; 49 | var 
selector = SelectorFactory.CreatePipeline(objects); 50 | Assert.IsType>(selector, false); 51 | } 52 | 53 | [Fact] 54 | public void CreatingPipelineWithTwoObjectsCastsProperly() 55 | { 56 | object[] objects = 57 | [ 58 | new Regex("tag: (.*)"), 59 | new Regex("(.*),") 60 | ]; 61 | var selector = SelectorFactory.CreatePipeline(objects); 62 | Assert.IsType>(selector, false); 63 | } 64 | 65 | [Fact] 66 | public void CreatingUnconnectablePipelineThrowsException() 67 | { 68 | object[] objects = 69 | [ 70 | new Regex("tag: (.*)"), 71 | Attr.Href 72 | ]; 73 | Assert.Throws(() => 74 | { 75 | SelectorFactory.CreatePipeline(objects); 76 | }); 77 | } 78 | 79 | [Fact] 80 | public void CreatingWithInvalidTypeObjectThrowsException() 81 | { 82 | object[] objects = 83 | [ 84 | new Regex("tag: (.*)"), 85 | 0 86 | ]; 87 | Assert.Throws(() => 88 | { 89 | SelectorFactory.CreateSelector(objects); 90 | }); 91 | } 92 | 93 | [Fact] 94 | public void CreatingWithInvalidExpressionThrowsException() 95 | { 96 | object[] objects = 97 | [ 98 | "div > p", 99 | "div > p >" 100 | ]; 101 | Assert.Throws(() => 102 | { 103 | SelectorFactory.CreateSelector(objects); 104 | }); 105 | } 106 | } 107 | 108 | public class SelectionTests 109 | { 110 | [Fact] 111 | public async Task Select() 112 | { 113 | var pipeline = SelectorFactory.CreatePipeline(new object?[] 114 | { 115 | "section", 116 | "div > p > a.mw-redirect", 117 | Attr.TextContent, 118 | new Regex("Windows (.+)") 119 | }); 120 | var resource = await CreateElementResourceAsync("full.html"); 121 | Assert.Equal( 122 | ["XP SP2", "Server 2003 SP1"], 123 | pipeline.Select(resource).Select(r => r.Object)); 124 | } 125 | 126 | [Fact] 127 | public async Task NoMatchingSelectorReturnsEmptySequence() 128 | { 129 | var pipeline = SelectorFactory.CreatePipeline(new object?[] 130 | { 131 | "section.not-matching-class", 132 | Attr.TextContent 133 | }); 134 | var resource = await CreateElementResourceAsync("full.html"); 135 | 
Assert.Empty(pipeline.Select(resource)); 136 | } 137 | 138 | [Fact] 139 | public async Task SelectionOnEmptyDocumentReturnsEmptySequence() 140 | { 141 | var pipeline = SelectorFactory.CreatePipeline(new object?[] { "div" }); 142 | var resource = await CreateElementResourceAsync("empty.html"); 143 | Assert.Empty(pipeline.Select(resource)); 144 | } 145 | } -------------------------------------------------------------------------------- /AngleParse.Test/Selector/PropertySelectorTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | using AngleParse.Selector; 6 | using Xunit; 7 | using static AngleParse.Test.Helpers.ElementResourceFactory; 8 | 9 | namespace AngleParse.Test.Selector; 10 | 11 | public sealed class PropertySelectorTests 12 | { 13 | [Fact] 14 | public void Create() 15 | { 16 | var prop = new Prop("some-property"); 17 | var propertySelector = SelectorFactory.CreateSelector(prop); 18 | Assert.IsType(propertySelector); 19 | } 20 | 21 | [Fact] 22 | public async Task SelectByElementProp() 23 | { 24 | var prop = Prop.Element; 25 | PropertySelector selector = SelectorFactory.CreateSelector(prop); 26 | var resource = await CreateElementResourceAsync("full-attribute.html"); 27 | Assert.Single(selector.Select(resource).Select(r => r.Object), resource.Element); 28 | } 29 | 30 | [Fact] 31 | public async Task SelectByAttributesTableProp() 32 | { 33 | var prop = Prop.AttributesTable; 34 | PropertySelector selector = SelectorFactory.CreateSelector(prop); 35 | var resource = await CreateElementResourceAsync("full-attribute.html"); 36 | var selected = selector 37 | .Select(resource) 38 | .Select(r => (Dictionary)r.Object) 39 | .ToList(); 40 | var expected = new Dictionary 41 | { 42 | ["class"] = "some_class another_class", 43 | ["href"] = "https://some_url_in_japan.go.jp", 44 | ["id"] = "some_id", 45 | ["name"] = 
"some_name", 46 | ["some-attribute"] = "some_value", 47 | ["src"] = "https://some_url_in_japan.go.jp/some_pic.jpg", 48 | ["title"] = "Some title", 49 | ["valueless-attribute"] = "" 50 | }; 51 | Assert.Single(selected); 52 | Assert.Equal(selected.First(), expected); 53 | } 54 | 55 | [Fact] 56 | public async Task SelectByUndefinedProperty() 57 | { 58 | var prop = new Prop("Text"); 59 | PropertySelector selector = SelectorFactory.CreateSelector(prop); 60 | var resource = await CreateElementResourceAsync("full-attribute.html"); 61 | Assert.Single( 62 | selector.Select(resource).Select(r => r.Object as string), 63 | item => item is not null && item == "some link"); 64 | } 65 | 66 | [Fact] 67 | public async Task SelectingByNonExistingPropertyThrowsException() 68 | { 69 | var prop = new Prop("non-existing-property"); 70 | PropertySelector selector = SelectorFactory.CreateSelector(prop); 71 | var resource = await CreateElementResourceAsync("full-attribute.html"); 72 | Assert.Throws(() => { selector.Select(resource); }); 73 | } 74 | } -------------------------------------------------------------------------------- /AngleParse.Test/Selector/RegexSelectorTests.cs: -------------------------------------------------------------------------------- 1 | using System.Linq; 2 | using System.Text.RegularExpressions; 3 | using System.Threading.Tasks; 4 | using AngleParse.Selector; 5 | using Xunit; 6 | using static AngleParse.Test.Helpers.ElementResourceFactory; 7 | 8 | namespace AngleParse.Test.Selector; 9 | 10 | public sealed class RegexSelectorTests 11 | { 12 | [Fact] 13 | public void Create() 14 | { 15 | const string pattern = "tag: (.*)"; 16 | var regex = new Regex(pattern); 17 | var regexSelector = SelectorFactory.CreateSelector(regex); 18 | Assert.IsType(regexSelector); 19 | } 20 | 21 | [Fact] 22 | public async Task Select() 23 | { 24 | RegexSelector selector = SelectorFactory.CreateSelector(new Regex("""(\w+) (\d{4})""")); 25 | var resource = await 
CreateElementResourceAsync("full.html"); 26 | var result = selector.Select(resource).Select(r => r.String); 27 | var expected = new[] 28 | { 29 | "November", "2006", "Server", "2003", "Server", "2008", "August", "2016", "January", 30 | "2018" 31 | }; 32 | Assert.Equal(expected, result); 33 | } 34 | 35 | [Fact] 36 | public async Task NoCapturingSelectorReturnsEmptySequence() 37 | { 38 | RegexSelector selector = SelectorFactory.CreateSelector(new Regex("""\w+ \d{4}""")); 39 | var resource = await CreateElementResourceAsync("full.html"); 40 | var result = selector.Select(resource); 41 | Assert.Empty(result); 42 | } 43 | 44 | [Fact] 45 | public async Task UnmatchedSelectorReturnsEmptySequence() 46 | { 47 | RegexSelector selector = 48 | SelectorFactory.CreateSelector(new Regex("hey, how's your day? (.+)")); 49 | var resource = await CreateElementResourceAsync("full.html"); 50 | var result = selector.Select(resource); 51 | Assert.Empty(result); 52 | } 53 | 54 | [Fact] 55 | public async Task SelectOnEmptyDocumentReturnsEmptySequence() 56 | { 57 | RegexSelector selector = SelectorFactory.CreateSelector(new Regex("""(\w+) (\d{4})""")); 58 | var resource = await CreateElementResourceAsync("empty.html"); 59 | var result = selector.Select(resource); 60 | Assert.Empty(result); 61 | } 62 | } -------------------------------------------------------------------------------- /AngleParse.Test/Selector/ScriptBlockSelectorTests.cs: -------------------------------------------------------------------------------- 1 | using System.Management.Automation; 2 | using AngleParse.Selector; 3 | using Xunit; 4 | 5 | namespace AngleParse.Test.Selector; 6 | 7 | public sealed class ScriptBlockSelectorTests 8 | { 9 | [Fact] 10 | public void Create() 11 | { 12 | var scriptBlock = ScriptBlock.Create("{ $_.Name -eq 'test' }"); 13 | var scriptBlockSelector = SelectorFactory.CreateSelector(scriptBlock); 14 | Assert.IsType(scriptBlockSelector); 15 | } 16 | 17 | // ScriptBlock invocation from C# proved 
unreliable (e.g., runspace context issues). 18 | // Tests about selection are implemented using Pester instead. 19 | } -------------------------------------------------------------------------------- /AngleParse.Test/Selector/TableSelectorTests.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Linq; 4 | using System.Management.Automation; 5 | using System.Text.RegularExpressions; 6 | using System.Threading.Tasks; 7 | using AngleParse.Resource; 8 | using AngleParse.Selector; 9 | using Xunit; 10 | using static AngleParse.Test.Helpers.ElementResourceFactory; 11 | 12 | namespace AngleParse.Test.Selector; 13 | 14 | public sealed class CreationTests 15 | { 16 | private static string ValidCssSelectorExpr => "div > p"; 17 | private static string InvalidCssSelectorExpr => "div > p >"; 18 | private static Regex ValidRegex => new("tag: (.*)"); 19 | private static ScriptBlock ValidScriptBlock => ScriptBlock.Create("{ $_.Name -eq 'test' }"); 20 | 21 | [Fact] 22 | public void CreateWithOneElementSelector() 23 | { 24 | var tableSelector = SelectorFactory.CreateSelector(new Hashtable 25 | { 26 | { "cssSelector", ValidCssSelectorExpr } 27 | }); 28 | Assert.IsType>(tableSelector); 29 | } 30 | 31 | [Fact] 32 | public void CreateWithOneStringSelector() 33 | { 34 | var tableSelector = SelectorFactory.CreateSelector(new Hashtable 35 | { 36 | { "regex", ValidRegex } 37 | }); 38 | Assert.IsType>(tableSelector); 39 | } 40 | 41 | [Fact] 42 | public void CreateWithOneObjectSelector() 43 | { 44 | var tableSelector = SelectorFactory.CreateSelector(new Hashtable 45 | { 46 | { "scriptBlock", ValidScriptBlock } 47 | }); 48 | Assert.IsType>(tableSelector); 49 | } 50 | 51 | [Fact] 52 | public void CreateWithMultipleSelectors() 53 | { 54 | var tableSelector = SelectorFactory.CreateSelector(new Hashtable 55 | { 56 | { "regex", ValidRegex }, 57 | { "scriptBlock", ValidScriptBlock } 58 | }); 59 | 
Assert.IsType>(tableSelector); 60 | } 61 | 62 | [Fact] 63 | public void CreatingWithInvalidExpressionThrowsException() 64 | { 65 | Assert.Throws(() => 66 | { 67 | SelectorFactory.CreateSelector(new Hashtable 68 | { 69 | { "cssSelector", InvalidCssSelectorExpr }, 70 | { "invalidSelector", "invalid" }, 71 | { "scriptBlock", ValidScriptBlock }, 72 | { "regex", ValidRegex } 73 | }); 74 | }); 75 | } 76 | } 77 | 78 | public class SelectionTests 79 | { 80 | [Fact] 81 | public async Task EachCaseMatchesOneElement() 82 | { 83 | var pipeline = SelectorFactory.CreatePipeline(new object?[] 84 | { 85 | "div.some_class:nth-child(1)", 86 | "span.mw-headline", 87 | new Hashtable 88 | { 89 | { "class", Attr.ClassName }, 90 | { "text", Attr.TextContent } 91 | } 92 | }); 93 | var resource = await CreateElementResourceAsync("full.html"); 94 | var expected = new Hashtable 95 | { 96 | { "class", "mw-headline" }, 97 | { "text", "Windows PowerShell 1.0" } 98 | }; 99 | Assert.Single(pipeline.Select(resource), item => 100 | { 101 | var result = (PSObject)item.Object; 102 | Assert.Equal(expected, result.BaseObject); 103 | return true; 104 | }); 105 | } 106 | 107 | [Fact] 108 | public async Task EachCaseMatchesMultipleElements() 109 | { 110 | var pipeline = SelectorFactory.CreatePipeline(new object?[] 111 | { 112 | "div.some_class:nth-child(1)", 113 | new Hashtable 114 | { 115 | { "redirect", new object[] { "a.mw-redirect", Attr.TextContent } } 116 | } 117 | }); 118 | var expected = new Hashtable 119 | { 120 | { "redirect", new object?[] { "Windows XP SP2", "Windows Server 2003 SP1" } } 121 | }; 122 | var resource = await CreateElementResourceAsync("full.html"); 123 | Assert.Single(pipeline.Select(resource), item => 124 | { 125 | var result = (PSObject)item.Object; 126 | Assert.Equal(expected, result.BaseObject); 127 | return true; 128 | }); 129 | } 130 | 131 | [Fact] 132 | public async Task EachCaseMatchesNoElements() 133 | { 134 | var pipeline = SelectorFactory.CreatePipeline(new object?[] 
135 | { 136 | "div.some_class:nth-child(1)", 137 | new Hashtable 138 | { 139 | { 140 | "redirect", new object[] 141 | { 142 | "a.mw-redirect", 143 | Attr.TextContent, 144 | new Regex("not-matching-pattern: (.+)") 145 | } 146 | } 147 | } 148 | }); 149 | var expected = new Hashtable 150 | { 151 | { "redirect", null } 152 | }; 153 | var resource = await CreateElementResourceAsync("full.html"); 154 | Assert.Single(pipeline.Select(resource), item => 155 | { 156 | var result = (PSObject)item.Object; 157 | Assert.Equal(expected, result.BaseObject); 158 | return true; 159 | }); 160 | } 161 | 162 | [Fact] 163 | public async Task ComplexSelector() 164 | { 165 | var pipeline = SelectorFactory.CreatePipeline(new object?[] 166 | { 167 | "div.some_class", 168 | new Hashtable 169 | { 170 | { 171 | "windows", new object[] 172 | { 173 | "a.mw-redirect", 174 | Attr.TextContent, 175 | new Regex("Windows( .+)"), 176 | new Regex("""(\S+)""") 177 | } 178 | }, 179 | { "title", new object[] { "a.mw-redirect", Attr.Title } } 180 | } 181 | }); 182 | var resource = await CreateElementResourceAsync("full.html"); 183 | var expected = new[] 184 | { 185 | new Hashtable 186 | { 187 | { "windows", new object[] { "XP", "SP2", "Server", "2003", "SP1" } }, 188 | { "title", new object[] { "Windows XP SP2", "Windows Server 2003 SP1" } } 189 | }, 190 | new Hashtable 191 | { 192 | { "windows", null }, 193 | { "title", "General availability" } 194 | } 195 | }; 196 | var result = pipeline.Select(resource).Select(r => 197 | (Hashtable)((PSObject)r.Object).BaseObject); 198 | Assert.Equal(expected, result); 199 | } 200 | } -------------------------------------------------------------------------------- /AngleParse.Test/assets/empty.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamome283/AngleParse/64bd48e7e941f4e2d26de801b00caab185792203/AngleParse.Test/assets/empty.html 
-------------------------------------------------------------------------------- /AngleParse.Test/assets/full-attribute.html: -------------------------------------------------------------------------------- 1 | some link 11 | -------------------------------------------------------------------------------- /AngleParse.Test/assets/full.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Test Page | Full Document 7 | 8 | 9 | 10 | 11 |
12 |
13 |

My Test Site

14 | 22 |
23 |
24 | 25 | 26 |
27 |
28 |

Welcome to the Test Page

29 |

This page is a sample of a complete HTML document.

30 | Learn More 31 |
32 |
33 | 34 | 35 |
36 |
37 |

Features

38 |
39 |
40 |

Responsive

41 |

Looks great on any device.

42 |
43 |
44 |

Simple

45 |

Readable and easy-to-test HTML structure.

46 |
47 |
48 |

Extendable

49 |

Add CSS or scripts to enhance functionality.

50 |
51 |
52 |
53 |
54 | 55 | 56 |
57 |
58 |

Windows PowerShell 1.0

59 |

60 | PowerShell 1.0 was released in November 2006 for 61 | Windows XP SP2, 63 | Windows Server 2003 SP1 and 65 | Windows Vista. 66 | [58] 67 | It is an optional component of Windows Server 2008. 69 |

70 |
71 |
72 |

73 | PowerShell Core 6 74 | 75 | [ 76 | edit 78 | ] 79 | 80 |

81 |

82 | PowerShell Core 6.0 was first announced on 18 August 2016, when Microsoft unveiled 83 | PowerShell Core and its decision to make the product cross-platform, 84 | independent of Windows, free and open source. 85 | [5] 87 | It achieved general availability 89 | on 10 January 2018 for Windows, macOS and Linux. 90 | [89] 91 | It has its own support lifecycle and adheres to the Microsoft lifecycle policy that is 92 | introduced with Windows 10: 93 | Only the latest version of PowerShell Core is supported. 94 | Microsoft expects to release one minor version for PowerShell Core 6.0 every six months. 95 | [90] 96 |

97 |
98 |
99 | 100 | 101 |
102 |
103 |

Contact Us

104 |
105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 |
113 |
114 |
115 | 116 | 117 |
118 |
119 |

foo bar

120 |
121 |
122 | 123 | -------------------------------------------------------------------------------- /AngleParse.Test/assets/invalid.html: -------------------------------------------------------------------------------- 1 | !r3jmr23.30r u9f3.2$2394div><#/!"DFASh1>Title 2 | -------------------------------------------------------------------------------- /AngleParse.Test/assets/no-attribute.html: -------------------------------------------------------------------------------- 1 | some link -------------------------------------------------------------------------------- /AngleParse.Test/assets/sentence.txt: -------------------------------------------------------------------------------- 1 | Windows PowerShell 1.0 2 | PowerShell 1.0 was released in November 2006 for Windows XP SP2, Windows Server 2003 SP1 and Windows Vista.[58] 3 | It is an optional component of Windows Server 2008. 4 | 5 | Windows PowerShell 5.1 6 | It was released along with the Windows 10 Anniversary Update[84] on August 2, 2016, and in Windows Server 2016.[85] 7 | PackageManagement now supports proxies, PSReadLine now has ViMode support, and two new commands were added:Get-TimeZone and Set-TimeZone. 
8 | The LocalAccounts module allows for adding/removing local user accounts.[86] 9 | A preview for PowerShell 5.1 was released for Windows 7, Windows Server 2008, Windows Server 2008 R2, Windows Server 2012, 10 | and Windows Server 2012 R2 on July 16, 2016,[87] and was released on January 19, 2017.[88] -------------------------------------------------------------------------------- /AngleParse.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AngleParse", "AngleParse\AngleParse.csproj", "{8233CD4C-E353-49AE-9509-B6977388ABC9}" 4 | EndProject 5 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AngleParse.Test", "AngleParse.Test\AngleParse.Test.csproj", "{059000D1-C792-41F1-95DF-185187021F74}" 6 | EndProject 7 | Global 8 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 9 | Debug|Any CPU = Debug|Any CPU 10 | Release|Any CPU = Release|Any CPU 11 | EndGlobalSection 12 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 13 | {8233CD4C-E353-49AE-9509-B6977388ABC9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 14 | {8233CD4C-E353-49AE-9509-B6977388ABC9}.Debug|Any CPU.Build.0 = Debug|Any CPU 15 | {8233CD4C-E353-49AE-9509-B6977388ABC9}.Release|Any CPU.ActiveCfg = Release|Any CPU 16 | {8233CD4C-E353-49AE-9509-B6977388ABC9}.Release|Any CPU.Build.0 = Release|Any CPU 17 | {059000D1-C792-41F1-95DF-185187021F74}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 18 | {059000D1-C792-41F1-95DF-185187021F74}.Debug|Any CPU.Build.0 = Debug|Any CPU 19 | {059000D1-C792-41F1-95DF-185187021F74}.Release|Any CPU.ActiveCfg = Release|Any CPU 20 | {059000D1-C792-41F1-95DF-185187021F74}.Release|Any CPU.Build.0 = Release|Any CPU 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /AngleParse/AngleParse.csproj: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | default 6 | enable 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /AngleParse/AngleParse.psd1: -------------------------------------------------------------------------------- 1 | @{ 2 | RootModule = './AngleParse.dll'; 3 | ModuleVersion = '0.0.0.0'; 4 | CompatiblePSEditions = @('Core'); 5 | GUID = '9a56d36c-7cff-405a-9f2f-9410ac6ce0b5'; 6 | Author = 'kamome283'; 7 | CompanyName = ''; 8 | Copyright = '(c) kamome283. All rights reserved.'; 9 | Description = 'HTML parsing and processing module'; 10 | PowerShellVersion = '7.4'; 11 | NestedModules = @(); 12 | FunctionsToExport = '*'; 13 | CmdletsToExport = '*'; 14 | VariablesToExport = '*'; 15 | AliasesToExport = '*'; 16 | PrivateData = @{ 17 | PSData = @{ 18 | Tags = 'HTML', 'parse', 'AngleSharp'; 19 | LicenseUri = 'https://github.com/kamome283/AngleParse/blob/master/LICENSE'; 20 | ProjectUri = 'https://github.com/kamome283/AngleParse'; 21 | ReleaseNotes = ''; 22 | }; 23 | }; 24 | HelpInfoURI = 'https://github.com/kamome283/AngleParse'; 25 | } 26 | 27 | -------------------------------------------------------------------------------- /AngleParse/Attr.cs: -------------------------------------------------------------------------------- 1 | namespace AngleParse; 2 | 3 | public sealed record Attr(string Value) 4 | { 5 | public static Attr InnerHtml => new("'<>InnerHtml"); 6 | public static Attr OuterHtml => new("'<>OuterHtml"); 7 | public static Attr TextContent => new("'<>TextContent"); 8 | public static Attr Id => new("'<>Id"); 9 | public static Attr ClassName => new("'<>class"); 10 | public static Attr SplitClasses => new("'<>splitClasses"); 11 | public static Attr Href => new("href"); 12 | public static Attr Src => new("src"); 13 | public static Attr 
Title => new("title"); 14 | public static Attr Name => new("name"); 15 | } -------------------------------------------------------------------------------- /AngleParse/Prop.cs: -------------------------------------------------------------------------------- 1 | namespace AngleParse; 2 | 3 | public sealed record Prop(string Value) 4 | { 5 | public static Prop Element => new(string.Empty); 6 | public static Prop AttributesTable => new("!AttributesTable"); 7 | } -------------------------------------------------------------------------------- /AngleParse/Resource/ElementResource.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using System.Threading.Tasks; 4 | using AngleSharp; 5 | using AngleSharp.Dom; 6 | 7 | namespace AngleParse.Resource; 8 | 9 | internal sealed record ElementResource(IElement Element) : StringResource(Element.TextContent) 10 | { 11 | public static async Task CreateAsync(string content) 12 | { 13 | var config = Configuration.Default; 14 | var context = BrowsingContext.New(config); 15 | var doc = await context.OpenAsync(res => res.Content(content)); 16 | if (doc.Body is null) throw new ArgumentOutOfRangeException(nameof(content)); 17 | var element = doc.Body.ChildElementCount == 1 ? 
doc.Body.Children.First() : doc.Body; 18 | return new ElementResource(element); 19 | } 20 | } -------------------------------------------------------------------------------- /AngleParse/Resource/ObjectResource.cs: -------------------------------------------------------------------------------- 1 | namespace AngleParse.Resource; 2 | 3 | public record ObjectResource(object Object); -------------------------------------------------------------------------------- /AngleParse/Resource/StringResource.cs: -------------------------------------------------------------------------------- 1 | namespace AngleParse.Resource; 2 | 3 | public record StringResource(string String) : ObjectResource(String); -------------------------------------------------------------------------------- /AngleParse/SelectHtmlElement.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Linq; 3 | using System.Management.Automation; 4 | using AngleParse.Resource; 5 | using AngleParse.Selector; 6 | 7 | namespace AngleParse; 8 | 9 | [Cmdlet(VerbsCommon.Select, "HtmlContent", HelpUri = "https://github.com/kamome283/AngleParse")] 10 | [OutputType(typeof(PSObject))] 11 | public sealed class SelectHtmlElement : PSCmdlet 12 | { 13 | [Parameter( 14 | Mandatory = true, 15 | ValueFromRemainingArguments = true, 16 | HelpMessage = "Selector to select and process data in the content.")] 17 | public object[]? Selector { get; set; } 18 | 19 | [Parameter( 20 | ValueFromPipeline = true, 21 | ValueFromPipelineByPropertyName = true, 22 | HelpMessage = "HTML content.")] 23 | public string? 
Content { get; set; } 24 | 25 | private ISelector Pipeline { get; set; } = null!; 26 | 27 | protected override void BeginProcessing() 28 | { 29 | if (Selector is null || Selector.Length == 0) 30 | throw new ArgumentNullException(nameof(Selector), "Selector cannot be null or empty."); 31 | Pipeline = SelectorFactory.CreatePipeline(Selector); 32 | } 33 | 34 | protected override void ProcessRecord() 35 | { 36 | if (Content is null) 37 | throw new ArgumentNullException(nameof(Content), "Content cannot be null."); 38 | var elementResource = ElementResource 39 | .CreateAsync(Content) 40 | .GetAwaiter() 41 | .GetResult(); 42 | WriteObject( 43 | Pipeline 44 | .Select(elementResource) 45 | .Select(r => new PSObject(r.Object)) 46 | .ToArray(), 47 | true 48 | ); 49 | } 50 | } -------------------------------------------------------------------------------- /AngleParse/Selector/AttributeSelector.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using AngleParse.Resource; 4 | 5 | namespace AngleParse.Selector; 6 | 7 | internal sealed class AttributeSelector(Attr attr) : ISelector 8 | { 9 | public IEnumerable Select(ElementResource resource) => 10 | attr switch 11 | { 12 | _ when attr == Attr.InnerHtml => WrapToResources(resource.Element.InnerHtml), 13 | _ when attr == Attr.OuterHtml => WrapToResources(resource.Element.OuterHtml), 14 | _ when attr == Attr.TextContent => WrapToResources(resource.Element.TextContent), 15 | _ when attr == Attr.Id => WrapToResources(resource.Element.Id), 16 | _ when attr == Attr.ClassName => WrapToResources(resource.Element.ClassName), 17 | _ when attr == Attr.SplitClasses => resource.Element.ClassList.SelectMany( 18 | WrapToResources), 19 | _ => WrapToResources(resource.Element.Attributes[attr.Value]?.Value) 20 | }; 21 | 22 | private static StringResource[] WrapToResources(string? s) => 23 | s is null ? 
[] : [new StringResource(s)]; 24 | } -------------------------------------------------------------------------------- /AngleParse/Selector/CssSelector.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using AngleParse.Resource; 5 | using AngleSharp.Css.Parser; 6 | 7 | namespace AngleParse.Selector; 8 | 9 | internal sealed class CssSelector : ISelector 10 | { 11 | public CssSelector(string cssSelectorExpr) 12 | { 13 | if (Parser.ParseSelector(cssSelectorExpr) is null) 14 | throw new ArgumentOutOfRangeException(nameof(cssSelectorExpr)); 15 | CssSelectorExpr = cssSelectorExpr; 16 | } 17 | 18 | private static CssSelectorParser Parser { get; } = new(); 19 | 20 | private string CssSelectorExpr { get; } 21 | 22 | public IEnumerable Select(ElementResource resource) => 23 | resource 24 | .Element 25 | .QuerySelectorAll(CssSelectorExpr) 26 | .Select(e => new ElementResource(e)); 27 | } -------------------------------------------------------------------------------- /AngleParse/Selector/FuncSelector.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using AngleParse.Resource; 4 | 5 | namespace AngleParse.Selector; 6 | 7 | internal sealed class FuncSelector(Func> func) : ISelector 8 | where In : Out 9 | where Out : ObjectResource 10 | { 11 | public IEnumerable Select(In resource) => func(resource); 12 | } -------------------------------------------------------------------------------- /AngleParse/Selector/ISelector.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using AngleParse.Resource; 3 | 4 | namespace AngleParse.Selector; 5 | 6 | public interface ISelector where In : Out where Out : ObjectResource 7 | { 8 | public IEnumerable Select(In resource); 9 | } 
-------------------------------------------------------------------------------- /AngleParse/Selector/PropertySelector.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using AngleParse.Resource; 5 | using AngleSharp.Dom; 6 | 7 | namespace AngleParse.Selector; 8 | 9 | internal sealed class PropertySelector(Prop prop) : ISelector 10 | { 11 | public IEnumerable Select(ElementResource resource) 12 | { 13 | if (prop == Prop.Element) 14 | return [new ObjectResource(resource.Element)]; 15 | if (prop == Prop.AttributesTable) 16 | { 17 | var attributesTable = resource 18 | .Element 19 | .Attributes 20 | .ToDictionary(a => a.Name, a => a.Value); 21 | return [new ObjectResource(attributesTable)]; 22 | } 23 | 24 | var accessedValue = DynamicAccess(resource.Element, prop.Value); 25 | return accessedValue is null ? [] : [new ObjectResource(accessedValue)]; 26 | } 27 | 28 | private static object? DynamicAccess(IElement element, string propName) 29 | { 30 | var type = element.GetType(); 31 | var typeProp = type.GetProperty(propName) ?? 
32 | throw new ArgumentOutOfRangeException( 33 | nameof(propName), 34 | $"{type} does not have '{propName}' property."); 35 | return typeProp.GetValue(element); 36 | } 37 | } -------------------------------------------------------------------------------- /AngleParse/Selector/RegexSelector.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using System.Text.RegularExpressions; 4 | using AngleParse.Resource; 5 | 6 | namespace AngleParse.Selector; 7 | 8 | internal sealed class RegexSelector(Regex regex) : ISelector 9 | { 10 | public IEnumerable Select(StringResource resource) => 11 | regex.Matches(resource.String).SelectMany(GetGroupedValue); 12 | 13 | private static IEnumerable GetGroupedValue(Match match) => 14 | match 15 | .Groups 16 | .Cast() 17 | .Skip(1) // Skip the first group which is the whole match 18 | .Select(group => new StringResource(group.Value)); 19 | } -------------------------------------------------------------------------------- /AngleParse/Selector/ScriptBlockSelector.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using System.Management.Automation; 4 | using AngleParse.Resource; 5 | 6 | namespace AngleParse.Selector; 7 | 8 | internal sealed class ScriptBlockSelector(ScriptBlock scriptBlock) 9 | : ISelector 10 | { 11 | public IEnumerable Select(ObjectResource resource) 12 | { 13 | return scriptBlock 14 | // Set '$_' variable into the script block 15 | .InvokeWithContext([], [new PSVariable("_", resource.Object)]) 16 | .Select(pso => new ObjectResource(pso)); 17 | } 18 | } -------------------------------------------------------------------------------- /AngleParse/Selector/SelectorFactory.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using 
System.Collections.Generic; 4 | using System.Linq; 5 | using System.Management.Automation; 6 | using System.Text.RegularExpressions; 7 | using AngleParse.Resource; 8 | using Microsoft.CSharp.RuntimeBinder; 9 | 10 | namespace AngleParse.Selector; 11 | 12 | internal static class SelectorFactory 13 | { 14 | public static ISelector CreatePipeline(object? obj) => 15 | CreateSelector(obj); 16 | 17 | internal static dynamic CreateSelector(object? obj) => obj switch 18 | { 19 | string cssSelectorExpr => new CssSelector(cssSelectorExpr), 20 | Regex regex => new RegexSelector(regex), 21 | Attr attr => new AttributeSelector(attr), 22 | Prop prop => new PropertySelector(prop), 23 | ScriptBlock scriptBlock => new ScriptBlockSelector(scriptBlock), 24 | Hashtable hashtable => CreateTableSelector(hashtable), 25 | object[] objects => CreateFuncSelector(objects), 26 | null => throw new ArgumentOutOfRangeException(nameof(obj), "null cannot be a selector"), 27 | _ => throw new ArgumentOutOfRangeException(nameof(obj), obj, 28 | $"Invalid selector type: {obj.GetType()}") 29 | }; 30 | 31 | private static dynamic CreateTableSelector(Hashtable hashtable) 32 | { 33 | // Since Dictionary does not support variance, we need to make it from lists. 34 | List keys = []; 35 | List selectors = []; 36 | foreach (var entry in hashtable.Cast()) 37 | { 38 | keys.Add(entry.Key); 39 | selectors.Add(CreateSelector(entry.Value)); 40 | } 41 | 42 | List? CastList(List list) 43 | { 44 | var casted = list.OfType().ToList(); 45 | return casted.Count == list.Count ? casted : null; 46 | } 47 | 48 | Dictionary GetDictionary(List ks, List vs) where T : notnull 49 | { 50 | if (ks.Count != vs.Count) 51 | throw new InvalidOperationException("Keys and selectors count mismatch."); 52 | return ks.Zip(vs).ToDictionary(); 53 | } 54 | 55 | // Since In of ISelector is contravariant, 56 | // ObjectResource is the most specific type that can appear in this position. 
57 | var objectSelectors = CastList>(selectors); 58 | if (objectSelectors is not null) 59 | return new TableSelector(GetDictionary(keys, objectSelectors)); 60 | var stringSelectors = CastList>(selectors); 61 | if (stringSelectors is not null) 62 | return new TableSelector(GetDictionary(keys, stringSelectors)); 63 | var elementSelectors = CastList>(selectors); 64 | if (elementSelectors is not null) 65 | return new TableSelector(GetDictionary(keys, elementSelectors)); 66 | throw new ArgumentOutOfRangeException( 67 | nameof(hashtable), 68 | $"Cannot create table selector from {selectors.GetType()}"); 69 | } 70 | 71 | private static dynamic CreateFuncSelector(object[] objects) 72 | { 73 | if (objects.Length == 0) 74 | throw new ArgumentOutOfRangeException(nameof(objects), 75 | "Cannot create selector from empty array"); 76 | var (head, tail) = (objects[0], objects[1..]); 77 | var selector = CreateSelector(head); 78 | foreach (var obj in tail) 79 | { 80 | var nextSelector = CreateSelector(obj); 81 | try 82 | { 83 | selector = Connect(selector, nextSelector); 84 | } 85 | catch (RuntimeBinderException e) 86 | { 87 | throw new InvalidOperationException( 88 | $"Cannot connect {selector.GetType()} to {nextSelector.GetType()}", e); 89 | } 90 | } 91 | 92 | return selector; 93 | } 94 | 95 | private static FuncSelector Connect( 96 | ISelector left, ISelector right) 97 | where LIn : LOut 98 | where LOut : RIn 99 | where RIn : ROut 100 | where ROut : ObjectResource => 101 | new(x => left.Select(x).SelectMany(right.Select)); 102 | } -------------------------------------------------------------------------------- /AngleParse/Selector/TableSelector.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Management.Automation; 5 | using AngleParse.Resource; 6 | 7 | namespace AngleParse.Selector; 8 | 9 | internal sealed class 
TableSelector(IDictionary> table) 10 | : ISelector where In : ObjectResource 11 | { 12 | public IEnumerable Select(In resource) 13 | { 14 | Dictionary evaluated = []; 15 | foreach (var (key, selector) in table) 16 | evaluated[key] = Unify(selector.Select(resource).Select(r => r.Object).ToArray()); 17 | PSObject psObject = new Hashtable(evaluated); 18 | return [new ObjectResource(psObject)]; 19 | } 20 | 21 | private static object? Unify(object[] objects) => 22 | objects.Length switch 23 | { 24 | 0 => null, 25 | 1 => objects[0], 26 | _ => objects 27 | }; 28 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 kamome283 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AngleParse 2 | 3 | An easy‑to‑use HTML parsing and processing toolkit for PowerShell. 
4 | 5 | ```powershell 6 | # Popular modules in PowerShell Gallery 7 | 8 | Invoke-WebRequest 'https://www.powershellgallery.com' | 9 | Select-HtmlContent 'div.stats-table tr:not(:first-child)' @{ 10 | Module = 'td.text-left', { $_.Trim() } 11 | Downloaded = 'td.text-right', { [long]($_ -replace ',', '') } 12 | Page = 'td.text-left a', ([AngleParse.Attr]'href'), { "https://www.powershellgallery.com$_" } 13 | } { [pscustomobject]$_ } 14 | 15 | # Output: 16 | # Page Downloaded Module 17 | # ---- ---------- ------ 18 | # https://www.powershellgallery.com/packages/Az.Accounts/ 17969207 Az.Accounts 19 | # https://www.powershellgallery.com/packages/DellBIOSProvider/ 12883950 DellBIOSProvider 20 | # https://www.powershellgallery.com/packages/PackageManagement/ 5033505 PackageManagement 21 | # https://www.powershellgallery.com/packages/Az.Storage/ 4769681 Az.Storage 22 | # https://www.powershellgallery.com/packages/Az.Resources/ 4608517 Az.Resources 23 | ``` 24 | 25 | ## Table of Contents 26 | 27 | - [Overview](#overview) 28 | - [Breaking Changes before v0.4](#breaking-changes-before-v04) 29 | - [Installation](#installation) 30 | - [Usage](#usage) 31 | - [How selectors work](#how-selectors-work) 32 | - [Selector Reference](#selector-reference) 33 | - [CSS Selector](#css-selector) 34 | - [Attribute Selector](#attribute-selector) 35 | - [Property Selector](#property-selector) 36 | - [Regex Selector](#regex-selector) 37 | - [ScriptBlock Selector](#scriptblock-selector) 38 | - [Table Selector](#table-selector) 39 | - [FAQ](#faq) 40 | - [License](#license) 41 | 42 | ## Overview 43 | 44 | AngleParse simplifies HTML parsing and data extraction in PowerShell. 45 | Built on robust C# and the AngleSharp library, it provides an intuitive, PowerShell‑style 46 | processing toolkit.
47 | It integrates seamlessly with common cmdlets such as: 48 | - `Invoke-WebRequest` – retrieve web pages 49 | - `Get-Content` – read local HTML files 50 | - `Select-Object` – format and filter output 51 | - `ConvertTo-*` – transform output into other formats 52 | 53 | ## Breaking Changes before v0.4 54 | 55 | - The `-Selector` parameter now has the `ValueFromRemainingArguments` attribute. 56 | - The `-Selector` parameter is no longer a positional parameter. 57 | - The `-Content` parameter is no longer a positional parameter. 58 | - `[AngleParse.Attr]::Element` has been removed. Use `[AngleParse.Prop]::Element` instead. 59 | 60 | ```powershell 61 | ## This is how it used to be before v0.4 62 | 'Some HTML Document' | Select-HtmlContent 'div', ([AngleParse.Attr]::Element) 63 | 64 | ## This is how it is now 65 | 'Some HTML Document' | Select-HtmlContent 'div' ([AngleParse.Prop]::Element) 66 | ``` 67 | 68 | 69 | ## Installation 70 | 71 | ```powershell 72 | Install-Package AngleParse 73 | Import-Module AngleParse 74 | ``` 75 | 76 | ## Usage 77 | 78 | The `Select-HtmlContent` command reads a string from the pipeline or the `-Content` parameter, parses it as an HTML DOM, and applies the specified selectors, which are passed as variadic arguments. 79 | 80 | ```powershell 81 | Get-Content example.html -Raw | 82 | Select-HtmlContent 'div.foo a.bar' ([AngleParse.Attr]::Href) 83 | ``` 84 | 85 | ## How selectors work 86 | 87 | There are 6 types of selectors: 88 | CSS selector, Attribute selector, Property selector, Regex selector, ScriptBlock selector, and Table selector. 89 | Each selector takes one item as input and outputs zero or more items. 90 | By specifying multiple selectors, you can combine them to work like PowerShell's pipeline. 91 | 92 | ```selector_work.html 93 | 94 | 95 | Shop 96 | 97 |

Shop

98 |
99 |
100 |

Product 1

101 |

Price: $10

102 | 103 |
104 |
105 |

Product 2

106 |

Price: $20

107 | 108 |
109 |
110 | 111 | 112 | ``` 113 | 114 | ```powershell 115 | filter makeOutputInTableFormat { [pscustomobject]$_ } 116 | 117 | Get-Content selector_work.html -raw | 118 | Select-HtmlContent 'div#products > div.product' @{ 119 | Name = 'h2' 120 | Price = 'p', ([regex]'\$(\d+)') 121 | } | makeOutputInTableFormat 122 | # Output: 123 | # Name Price 124 | # ---- ----- 125 | # Product 1 10 126 | # Product 2 20 127 | ``` 128 | 129 | There are 3 kinds of input and output types: 130 | - **Element**: a DOM element and subtype of string 131 | - **String**: a string and subtype of object 132 | - **Object**: any object 133 | 134 | Each selector has its own input and output types. 135 | If a selector's input type does not match the previous selector's output type, 136 | the cmdlet throws an error when it tries to connect them, not during the actual 137 | processing stage. 138 | 139 | ```powershell 140 | # This code throws an error because the first selector outputs string, 141 | # but the second selector requires Element as its input type, which is a subtype of string. 142 | Get-Content selector_work.html -raw | 143 | Select-HtmlContent ([regex]'Price: \$(\d+)') 'p' 144 | ``` 145 | 146 | ## Selector Reference 147 | 148 | ### CSS Selector 149 | 150 | *Element -> Element* 151 | 152 | A CSS selector receives DOM elements and outputs the DOM elements that match the given CSS selector expression. 153 | Any bare string is interpreted as a CSS selector. 154 | 155 | ```css_selector.html 156 |
157 | text content here 158 |
159 | ``` 160 | 161 | ```powershell 162 | Get-Content css_selector.html -raw | 163 | Select-HtmlContent "div > span.foo" 164 | # Output: 'text content here' 165 | ``` 166 | 167 | ### Attribute Selector 168 | 169 | *Element -> String* 170 | 171 | An attribute selector receives DOM elements and outputs matched attributes as strings. 172 | Some attributes are predefined: 173 | - `Href` 174 | - `Src` 175 | - `Title` 176 | - `Name` 177 | 178 | If you want to access other attributes, you can make your own attribute selector by 179 | converting a string using the `[AngleParse.Attr]` class. 180 | (e.g. `([AngleParse.Attr]'some-attribute')`) 181 | 182 | ```attribute_selector.html 183 | some link 184 | ``` 185 | 186 | ```powershell 187 | Get-Content attribute_selector.html -raw | 188 | Select-HtmlContent ([AngleParse.Attr]::Href) 189 | # Output: https://example.com 190 | 191 | Get-Content attribute_selector.html -raw | 192 | Select-HtmlContent ([AngleParse.Attr]'some-attribute') 193 | # Output: hey 194 | ``` 195 | 196 | If you access a valueless attribute, it will return an empty string. 197 | And if you access an attribute that doesn't exist, it will return `$null`. 198 | 199 | There are also some special selectors in this category, which are actually not attributes but are useful for HTML processing.
200 | They are: 201 | - `InnerHtml` - the inner HTML of the element 202 | - `OuterHtml` - the outer HTML of the element 203 | - `TextContent` - the text content of the element 204 | - `Id` - the ID of the element 205 | - `ClassName` - the class name of the element 206 | - `SplitClasses` - array of class names split by space 207 | 208 | ```powershell 209 | Get-Content attribute_selector.html -raw | 210 | Select-HtmlContent ([AngleParse.Attr]::InnerHtml) 211 | 212 | # Output: some link 213 | ``` 214 | 215 | ### Property Selector 216 | 217 | *Element -> Object* 218 | 219 | A property selector receives DOM elements and outputs the property value of the inner 220 | `AngleSharp.Dom.IElement` by accessing it dynamically. 221 | This selector is useful when you want to access a property of the underlying `IElement` of the DOM element. 222 | You can make a property selector by converting a string using the `[AngleParse.Prop]` class. 223 | (e.g. `([AngleParse.Prop]'some-property')`) 224 | 225 | ```property_selector.html 226 |
text content here
227 | ``` 228 | 229 | ```powershell 230 | # Well, I know that you should use [AngleParse.Attr]::TextContent instead of this. 231 | # This is just an example. 232 | Get-Content property_selector.html -raw | 233 | Select-HtmlContent ([AngleParse.Prop]'TextContent') 234 | # Output: text content here 235 | ``` 236 | 237 | As with the attribute selector, there are some special properties in this category. 238 | They are: 239 | - `Element` - the inner `AngleSharp.Dom.IElement` of the DOM element 240 | - `AttributesTable` - the attributes of the element as a dictionary 241 | 242 | 243 | ### Regex Selector 244 | 245 | *String -> String* 246 | 247 | This selector receives a string and outputs the captured strings. 248 | When you pass a DOM element to this selector, it performs the capture on the element's inner text content. 249 | A regex value is interpreted as a regex selector. 250 | 251 | ```regex_selector.html 252 |
2020/07/22
253 | ``` 254 | 255 | ```powershell 256 | Get-Content regex_selector.html -raw | 257 | Select-HtmlContent ([regex]'(\d{4})/(\d{2})') 258 | # Output: 2020, 07 259 | ``` 260 | 261 | ### ScriptBlock Selector 262 | 263 | *Object -> Object* 264 | 265 | This selector receives any object and outputs the result of the script block. 266 | This selector is useful when you want to post-process the scraped data. 267 | In the script block, you can use `$_` to refer to the current object. 268 | When you pass a DOM element to this selector, it operates on the inner text content of the element. 269 | 270 | ```scriptblock_selector.html 271 | 2025/05/04 272 | ``` 273 | 274 | ```powershell 275 | Get-Content scriptblock_selector.html -raw | 276 | Select-HtmlContent { [DateTime]$_ } 277 | # Output: 2025/05/04 0:00:00 278 | ``` 279 | 280 | ### Table Selector 281 | 282 | *T -> Object where T is the strictest type required across the branches* 283 | 284 | A table selector outputs a hashtable composed of the given key-value pairs, whose values are 285 | processed by the given selectors in each branch. 286 | The input of this selector must conform to the strictest input type required by any of the selectors in the hashtable. 287 | Hashtables are interpreted as table selectors. 288 | 289 | ```table_selector.html 290 | 291 |
292 | 1a 293 |
294 |
295 | 2b 296 |
297 | 298 | ``` 299 | 300 | ```powershell 301 | Get-Content table_selector.html -raw | 302 | Select-HtmlContent @{ 303 | ClassName = ([AngleParse.Attr]::ClassName); 304 | NumPlus1 = ([regex]'(\d)\w'), { [int]$_ + 1 } 305 | } 306 | # Output: 307 | # ClassName NumPlus1 308 | # --------- -------- 309 | # a 2 310 | # b 3 311 | 312 | # This throws an error because the input type is string, 313 | # although the strictest type required across the branches is Element, which is a subtype of string. 314 | # This does not conform to the type constraint. 315 | Get-Content table_selector.html -raw | 316 | Select-HtmlContent ([regex]'.*') @{ 317 | ClassName = ([AngleParse.Attr]::ClassName); 318 | NumPlus1 = ([regex]'(\d)\w'), { [int]$_ + 1 } 319 | } 320 | ``` 321 | 322 | ## FAQ 323 | 324 | **Q: Why is the output not an array when it contains only a single item?** 325 | 326 | This is because of unification, which emulates PowerShell's default behavior for ease of use. 327 | 328 | 329 | ```unification.html 330 | 331 |
332 |

333 | Example 334 |

335 | 336 | 337 |
338 | 339 | ``` 340 | 341 | ```powershell 342 | Get-Content unification.html -raw | 343 | Select-HtmlContent "div.entrylist-contents" @{ 344 | Title = "h3.entrylist-contents-title > a" 345 | Tags = "a[rel=tag]" 346 | } { [pscustomobject]$_ } 347 | # Output: 348 | # Title : Example 349 | # Tags : {Tag1, Tag2} 350 | 351 | # Did you see that the Title contains only one string item, not a string array? 352 | # This is because the output array is unified. 353 | ``` 354 | 355 | 356 | ## License 357 | 358 | Apache License 2.0 359 | --------------------------------------------------------------------------------