├── .github ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ └── push.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── doc └── colspans-rowspans.png ├── example_test.go ├── go.mod ├── go.sum ├── log.go ├── log_test.go ├── page.go ├── page_test.go ├── slice.go ├── slice_test.go └── util_test.go /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | schedule: 9 | - cron: '21 13 * * 4' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'go' ] 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v3 28 | 29 | - name: Initialize CodeQL 30 | uses: github/codeql-action/init@v2 31 | with: 32 | languages: ${{ matrix.language }} 33 | 34 | - name: Autobuild 35 | uses: github/codeql-action/autobuild@v2 36 | 37 | - name: Perform CodeQL Analysis 38 | uses: github/codeql-action/analyze@v2 39 | -------------------------------------------------------------------------------- /.github/workflows/push.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize] 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | tests: 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | goVersion: [ '1.18.x', '1.19.x' ] 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | 
- run: git fetch --prune --unshallow 19 | - uses: actions/setup-go@v1 20 | with: 21 | go-version: ${{ matrix.goVersion }} 22 | - run: go mod vendor 23 | - run: make test 24 | - uses: codecov/codecov-action@v1 25 | if: always() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | vendor/ 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Serge Smertin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | default: vendor 2 | 3 | fmt: 4 | go fmt ./... 5 | 6 | vendor: 7 | go mod vendor 8 | 9 | test: 10 | go test -coverpkg=./... -coverprofile=coverage.out -timeout=10s ./... 11 | 12 | coverage: test 13 | go tool cover -html=coverage.out 14 | 15 | .PHONY: build fmt coverage test vendor 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTML table data extractor for Go 2 | 3 | [![GoDoc](https://img.shields.io/badge/go-documentation-blue.svg)](https://pkg.go.dev/mod/github.com/nfx/go-htmltable) 4 | [![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/nfx/go-htmltable/blob/main/LICENSE) 5 | [![codecov](https://codecov.io/gh/nfx/go-htmltable/branch/main/graph/badge.svg)](https://codecov.io/gh/nfx/go-htmltable) 6 | [![build](https://github.com/nfx/go-htmltable/workflows/build/badge.svg?branch=main)](https://github.com/nfx/go-htmltable/actions?query=workflow%3Abuild+branch%3Amain) 7 | 8 | 9 | `htmltable` enables structured data extraction from HTML tables and URLs and requires almost no external dependencies. Tested with Go 1.18.x and 1.19.x. 
10 | 11 | ## Installation 12 | 13 | ```bash 14 | go get github.com/nfx/go-htmltable 15 | ``` 16 | 17 | ## Usage 18 | 19 | You can retrieve a slice of `header`-annotated types using the `NewSlice*` constructors: 20 | 21 | ```go 22 | type Ticker struct { 23 | Symbol string `header:"Symbol"` 24 | Security string `header:"Security"` 25 | CIK string `header:"CIK"` 26 | } 27 | 28 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" 29 | out, _ := htmltable.NewSliceFromURL[Ticker](url) 30 | fmt.Println(out[0].Symbol) 31 | fmt.Println(out[0].Security) 32 | 33 | // Output: 34 | // MMM 35 | // 3M 36 | ``` 37 | 38 | An error is returned if there's no matching table with the specified columns: 39 | 40 | ```go 41 | page, _ := htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") 42 | _, err := page.FindWithColumns("invalid", "column", "names") 43 | fmt.Println(err) 44 | 45 | // Output: 46 | // cannot find table with columns: invalid, column, names 47 | ``` 48 | 49 | And you can use a more low-level API to work with extracted data: 50 | 51 | ```go 52 | page, _ := htmltable.NewFromString(` 53 |

foo

54 | 55 | 56 | 57 | 58 |
ab
1 2
3 4
59 |

bar

60 | 61 | 62 | 63 | 64 |
bcd
125
346
65 | `) 66 | 67 | fmt.Printf("found %d tables\n", page.Len()) 68 | _ = page.Each2("c", "d", func(c, d string) error { 69 | fmt.Printf("c:%s d:%s\n", c, d) 70 | return nil 71 | }) 72 | 73 | // Output: 74 | // found 2 tables 75 | // c:2 d:5 76 | // c:4 d:6 77 | ``` 78 | 79 | Complex [tables with row and col spans](https://en.wikipedia.org/wiki/List_of_AMD_chipsets#AM4_chipsets) are natively supported as well. You can annotate `string`, `int`, and `bool` fields. Any `bool` field value is `true` if it is equal in lowercase to one of `yes`, `y`, `true`, `t`. 80 | 81 | ![Wikipedia, AMD AM4 chipsets](doc/colspans-rowspans.png) 82 | 83 | ```go 84 | type AM4 struct { 85 | Model string `header:"Model"` 86 | ReleaseDate string `header:"Release date"` 87 | PCIeSupport string `header:"PCIesupport[a]"` 88 | MultiGpuCrossFire bool `header:"Multi-GPU CrossFire"` 89 | MultiGpuSLI bool `header:"Multi-GPU SLI"` 90 | USBSupport string `header:"USBsupport[b]"` 91 | SATAPorts int `header:"Storage features SATAports"` 92 | RAID string `header:"Storage features RAID"` 93 | AMDStoreMI bool `header:"Storage features AMD StoreMI"` 94 | Overclocking string `header:"Processoroverclocking"` 95 | TDP string `header:"TDP"` 96 | SupportExcavator string `header:"CPU support[14] Excavator"` 97 | SupportZen string `header:"CPU support[14] Zen"` 98 | SupportZenPlus string `header:"CPU support[14] Zen+"` 99 | SupportZen2 string `header:"CPU support[14] Zen 2"` 100 | SupportZen3 string `header:"CPU support[14] Zen 3"` 101 | Architecture string `header:"Architecture"` 102 | } 103 | am4Chipsets, _ := htmltable.NewSliceFromURL[AM4]("https://en.wikipedia.org/wiki/List_of_AMD_chipsets") 104 | fmt.Println(am4Chipsets[2].Model) 105 | fmt.Println(am4Chipsets[2].SupportZen2) 106 | 107 | // Output: 108 | // X370 109 | // Varies[c] 110 | ``` 111 | 112 | And the last note: you're encouraged to plug your own structured logger: 113 | 114 | ```go 115 | htmltable.Logger = func(_ context.Context, msg string, fields 
...any) { 116 | fmt.Printf("[INFO] %s %v\n", msg, fields) 117 | } 118 | htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") 119 | 120 | // Output: 121 | // [INFO] found table [columns [Symbol Security SEC filings GICSSector GICS Sub-Industry Headquarters Location Date first added CIK Founded] count 504] 122 | // [INFO] found table [columns [Date Added Ticker Added Security Removed Ticker Removed Security Reason] count 308] 123 | ``` 124 | 125 | ## Inspiration 126 | 127 | This library aims to be something like [pandas.read_html](https://pandas.pydata.org/docs/reference/api/pandas.read_html.html) or [table_extract](https://docs.rs/table-extract/latest/table_extract/) Rust crate, but more idiomatic for Go. -------------------------------------------------------------------------------- /doc/colspans-rowspans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nfx/go-htmltable/f3d02958624856309008a72f34fe07a7c412751f/doc/colspans-rowspans.png -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package htmltable_test 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "github.com/nfx/go-htmltable" 8 | ) 9 | 10 | func ExampleNewSliceFromUrl() { 11 | type Ticker struct { 12 | Symbol string `header:"Symbol"` 13 | Security string `header:"Security"` 14 | CIK string `header:"CIK"` 15 | } 16 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" 17 | out, _ := htmltable.NewSliceFromURL[Ticker](url) 18 | fmt.Println(out[0].Symbol) 19 | fmt.Println(out[0].Security) 20 | 21 | // Output: 22 | // MMM 23 | // 3M 24 | } 25 | 26 | func ExampleNewSliceFromURL_rowspansAndColspans() { 27 | type AM4 struct { 28 | Model string `header:"Model"` 29 | ReleaseDate string `header:"Release date"` 30 | PCIeSupport string `header:"PCIesupport[a]"` 31 | 
MultiGpuCrossFire bool `header:"Multi-GPU CrossFire"` 32 | MultiGpuSLI bool `header:"Multi-GPU SLI"` 33 | USBSupport string `header:"USBsupport[b]"` 34 | SATAPorts int `header:"Storage features SATAports"` 35 | RAID string `header:"Storage features RAID"` 36 | AMDStoreMI bool `header:"Storage features AMD StoreMI"` 37 | Overclocking string `header:"Processoroverclocking"` 38 | TDP string `header:"TDP"` 39 | SupportExcavator string `header:"CPU support Excavator"` 40 | SupportZen string `header:"CPU support Zen"` 41 | SupportZenPlus string `header:"CPU support Zen+"` 42 | SupportZen2 string `header:"CPU support Zen 2"` 43 | SupportZen3 string `header:"CPU support Zen 3"` 44 | Architecture string `header:"Architecture"` 45 | } 46 | am4Chipsets, _ := htmltable.NewSliceFromURL[AM4]("https://en.wikipedia.org/wiki/List_of_AMD_chipsets") 47 | fmt.Println(am4Chipsets[2].Model) 48 | fmt.Println(am4Chipsets[2].SupportZen2) 49 | 50 | // Output: 51 | // X370 52 | // Varies[c] 53 | } 54 | 55 | func ExampleNewFromString() { 56 | page, _ := htmltable.NewFromString(` 57 |

foo

58 | 59 | 60 | 61 | 62 |
ab
1 2
3 4
63 |

bar

64 | 65 | 66 | 67 | 68 |
bcd
125
346
69 | `) 70 | 71 | fmt.Printf("found %d tables\n", page.Len()) 72 | _ = page.Each2("c", "d", func(c, d string) error { 73 | fmt.Printf("c:%s d:%s\n", c, d) 74 | return nil 75 | }) 76 | 77 | // Output: 78 | // found 2 tables 79 | // c:2 d:5 80 | // c:4 d:6 81 | } 82 | 83 | func ExampleNewFromURL() { 84 | page, _ := htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") 85 | _, err := page.FindWithColumns("invalid", "column", "names") 86 | fmt.Println(err) 87 | 88 | // Output: 89 | // cannot find table with columns: invalid, column, names 90 | } 91 | 92 | func ExampleLogger() { 93 | htmltable.Logger = func(_ context.Context, msg string, fields ...any) { 94 | fmt.Printf("[INFO] %s %v\n", msg, fields) 95 | } 96 | _, _ = htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") 97 | 98 | // Output: 99 | // [INFO] found table [columns [Symbol Security SEC filings GICSSector GICS Sub-Industry Headquarters Location Date first added CIK Founded] count 503] 100 | // [INFO] found table [columns [Date Added Ticker Added Security Removed Ticker Removed Security Reason] count 316] 101 | } 102 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/nfx/go-htmltable 2 | 3 | go 1.18 4 | 5 | require golang.org/x/net v0.26.0 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= 2 | golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= 3 | -------------------------------------------------------------------------------- /log.go: -------------------------------------------------------------------------------- 1 | package htmltable 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 
7 | "strings" 8 | ) 9 | 10 | // Logger is a very simplistic structured logger, that should 11 | // be overridden by integrations. 12 | var Logger func(_ context.Context, msg string, fields ...any) 13 | 14 | func init() { 15 | Logger = defaultLogger 16 | } 17 | 18 | var defaultLogger = func(_ context.Context, msg string, fields ...any) { 19 | var sb strings.Builder 20 | sb.WriteString(msg) 21 | if len(fields)%2 != 0 { 22 | panic(fmt.Errorf("number of logged fields is not even")) 23 | } 24 | for i := 0; i < len(fields); i += 2 { 25 | sb.WriteRune(' ') 26 | sb.WriteString(fmt.Sprint(fields[i])) 27 | sb.WriteRune('=') 28 | sb.WriteString(fmt.Sprint(fields[i+1])) 29 | } 30 | log.Print(sb.String()) 31 | } 32 | -------------------------------------------------------------------------------- /log_test.go: -------------------------------------------------------------------------------- 1 | package htmltable 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | ) 7 | 8 | func TestLogger(t *testing.T) { 9 | Logger(context.Background(), "message", "foo", "bar", "x", 1) 10 | } 11 | 12 | func TestLoggerNoFields(t *testing.T) { 13 | Logger(context.Background(), "message") 14 | } 15 | 16 | func TestLoggerWrongFields(t *testing.T) { 17 | defer func() { 18 | p := recover() 19 | if p == nil { 20 | t.Fatalf("there must be panic") 21 | } 22 | }() 23 | Logger(context.Background(), "message", 1) 24 | } 25 | -------------------------------------------------------------------------------- /page.go: -------------------------------------------------------------------------------- 1 | // Package htmltable enables structured data extraction from HTML tables and URLs. 2 | package htmltable 3 | 4 | import ( 5 | "context" 6 | "fmt" 7 | "io" 8 | "net/http" 9 | "strconv" 10 | "strings" 11 | 12 | "golang.org/x/net/html" 13 | ) 14 | 15 | // mock for tests 16 | var htmlParse = html.Parse 17 | 18 | // Page is the container for all parseable tables 19 | type Page struct { 20 | Tables []*Table 21 | 22 | ctx
context.Context 23 | rowSpans []int 24 | colSpans []int 25 | row []string 26 | rows [][]string 27 | maxCols int 28 | 29 | // current row 30 | colSpan []int 31 | rowSpan []int 32 | // all 33 | cSpans [][]int 34 | rSpans [][]int 35 | } 36 | 37 | // New returns an instance of the page with possibly more than one table 38 | func New(ctx context.Context, r io.Reader) (*Page, error) { 39 | p := &Page{ctx: ctx} 40 | return p, p.init(r) 41 | } 42 | 43 | // NewFromString is same as New(context.Context, io.Reader), but from string 44 | func NewFromString(r string) (*Page, error) { 45 | return New(context.Background(), strings.NewReader(r)) 46 | } 47 | 48 | // NewFromResponse is same as New(context.Context, io.Reader), but from http.Response. 49 | // 50 | // In case of failure, the parsing error is returned as is. 51 | func NewFromResponse(resp *http.Response) (*Page, error) { 52 | p, err := New(resp.Request.Context(), resp.Body) 53 | if err != nil { 54 | return nil, err 55 | } 56 | return p, nil 57 | } 58 | 59 | // NewFromURL is same as New(context.Context, io.Reader), but from URL. 60 | // 61 | // In case of failure, the HTTP request error or the parsing error is returned as is.
62 | func NewFromURL(url string) (*Page, error) { 63 | resp, err := http.Get(url) 64 | if err != nil { 65 | return nil, err 66 | } 67 | if resp.Body != nil { 68 | defer resp.Body.Close() 69 | } 70 | return NewFromResponse(resp) 71 | } 72 | 73 | // Len returns number of tables found on the page 74 | func (p *Page) Len() int { 75 | return len(p.Tables) 76 | } 77 | 78 | // FindWithColumns performs fuzzy matching of tables by given header column names 79 | func (p *Page) FindWithColumns(columns ...string) (*Table, error) { 80 | // realistic p won't have this much 81 | found := 0xfffffff 82 | for idx, table := range p.Tables { 83 | matchedColumns := 0 84 | for _, col := range columns { 85 | for _, header := range table.Header { 86 | if col == header { 87 | // perform fuzzy matching of table headers 88 | matchedColumns++ 89 | } 90 | } 91 | } 92 | if matchedColumns != len(columns) { 93 | continue 94 | } 95 | if found < len(p.Tables) { 96 | // and do a best-effort error message, that is cleaner than pandas.read_html 97 | return nil, fmt.Errorf("more than one table matches columns `%s`: "+ 98 | "[%d] %s and [%d] %s", strings.Join(columns, ", "), 99 | found, p.Tables[found], idx, p.Tables[idx]) 100 | } 101 | found = idx 102 | } 103 | if found > len(p.Tables) { 104 | return nil, fmt.Errorf("cannot find table with columns: %s", 105 | strings.Join(columns, ", ")) 106 | } 107 | return p.Tables[found], nil 108 | } 109 | 110 | // Each row would call func with the value of the table cell from the column 111 | // specified in the first argument. 112 | // 113 | // Returns an error if table has no matching column name. 
114 | func (p *Page) Each(a string, f func(a string) error) error { 115 | table, err := p.FindWithColumns(a) 116 | if err != nil { 117 | return err 118 | } 119 | offsets := map[string]int{} 120 | for idx, header := range table.Header { 121 | offsets[header] = idx 122 | } 123 | for idx, row := range table.Rows { 124 | if len(row) < 1 { 125 | continue 126 | } 127 | err = f(row[offsets[a]]) 128 | if err != nil { 129 | return fmt.Errorf("row %d: %w", idx, err) 130 | } 131 | } 132 | return nil 133 | } 134 | 135 | // Each2 will get two columns specified in the first two arguments 136 | // and call the func with those values for every row in the table. 137 | // 138 | // Returns an error if table has no matching column names. 139 | func (p *Page) Each2(a, b string, f func(a, b string) error) error { 140 | table, err := p.FindWithColumns(a, b) 141 | if err != nil { 142 | return err 143 | } 144 | offsets := map[string]int{} 145 | for idx, header := range table.Header { 146 | offsets[header] = idx 147 | } 148 | _1, _2 := offsets[a], offsets[b] 149 | for idx, row := range table.Rows { 150 | if len(row) < 2 { 151 | continue 152 | } 153 | err = f(row[_1], row[_2]) 154 | if err != nil { 155 | return fmt.Errorf("row %d: %w", idx, err) 156 | } 157 | } 158 | return nil 159 | } 160 | 161 | // Each3 will get three columns specified in the first three arguments 162 | // and call the func with those values for every row in the table. 163 | // 164 | // Returns an error if table has no matching column names. 
165 | func (p *Page) Each3(a, b, c string, f func(a, b, c string) error) error { 166 | table, err := p.FindWithColumns(a, b, c) 167 | if err != nil { 168 | return err 169 | } 170 | offsets := map[string]int{} 171 | for idx, header := range table.Header { 172 | offsets[header] = idx 173 | } 174 | _1, _2, _3 := offsets[a], offsets[b], offsets[c] 175 | for idx, row := range table.Rows { 176 | if len(row) < 3 { 177 | continue 178 | } 179 | err = f(row[_1], row[_2], row[_3]) 180 | if err != nil { 181 | return fmt.Errorf("row %d: %w", idx, err) 182 | } 183 | } 184 | return nil 185 | } 186 | 187 | func (p *Page) init(r io.Reader) error { 188 | root, err := htmlParse(r) 189 | if err != nil { 190 | return err 191 | } 192 | p.parse(root) 193 | p.finishTable() 194 | return nil 195 | } 196 | 197 | func (p *Page) parse(n *html.Node) { 198 | if n == nil { 199 | return 200 | } 201 | switch n.Data { 202 | case "td", "th": 203 | p.colSpan = append(p.colSpan, p.intAttrOr(n, "colspan", 1)) 204 | p.rowSpan = append(p.rowSpan, p.intAttrOr(n, "rowspan", 1)) 205 | var sb strings.Builder 206 | p.innerText(n, &sb) 207 | p.row = append(p.row, sb.String()) 208 | return 209 | case "tr": 210 | p.finishRow() 211 | case "table": 212 | p.finishTable() 213 | } 214 | for c := n.FirstChild; c != nil; c = c.NextSibling { 215 | p.parse(c) 216 | } 217 | } 218 | 219 | func (p *Page) intAttrOr(n *html.Node, attr string, default_ int) int { 220 | for _, a := range n.Attr { 221 | if a.Key != attr { 222 | continue 223 | } 224 | val, err := strconv.Atoi(a.Val) 225 | if err != nil { 226 | return default_ 227 | } 228 | return val 229 | } 230 | return default_ 231 | } 232 | 233 | func (p *Page) finishRow() { 234 | if len(p.row) == 0 { 235 | return 236 | } 237 | if len(p.row) > p.maxCols { 238 | p.maxCols = len(p.row) 239 | } 240 | p.rows = append(p.rows, p.row) 241 | p.cSpans = append(p.cSpans, p.colSpan) 242 | p.rSpans = append(p.rSpans, p.rowSpan) 243 | p.row = []string{} 244 | p.colSpan = []int{} 245 | 
p.rowSpan = []int{} 246 | } 247 | 248 | type cellSpan struct { 249 | BeginX, EndX int 250 | BeginY, EndY int 251 | Value string 252 | } 253 | 254 | func (d *cellSpan) Match(x, y int) bool { 255 | if d.BeginX > x { 256 | return false 257 | } 258 | if d.EndX <= x { 259 | return false 260 | } 261 | if d.BeginY > y { 262 | return false 263 | } 264 | if d.EndY <= y { 265 | return false 266 | } 267 | return true 268 | } 269 | 270 | type spans []cellSpan 271 | 272 | func (s spans) Value(x, y int) (string, bool) { 273 | for _, v := range s { 274 | if !v.Match(x, y) { 275 | continue 276 | } 277 | return v.Value, true 278 | } 279 | return "", false 280 | } 281 | 282 | func (p *Page) finishTable() { 283 | defer func() { 284 | if r := recover(); r != nil { 285 | firstRow := []string{} 286 | if len(p.rows) > 0 { 287 | firstRow = p.rows[0][:] 288 | } 289 | Logger(p.ctx, "unparsable table", "panic", fmt.Sprintf("%v", r), "firstRow", firstRow) 290 | } 291 | p.rows = [][]string{} 292 | p.colSpans = []int{} 293 | p.rowSpans = []int{} 294 | p.cSpans = [][]int{} 295 | p.rSpans = [][]int{} 296 | p.maxCols = 0 297 | }() 298 | p.finishRow() 299 | if len(p.rows) == 0 { 300 | return 301 | } 302 | 303 | rows := [][]string{} 304 | allSpans := spans{} 305 | rowSkips := 0 306 | gotHeader := false 307 | 308 | ROWS: 309 | for y := 0; y < len(p.rows); y++ { // rows cols addressable by x 310 | currentRow := []string{} 311 | skipRow := false 312 | k := 0 // next row columns 313 | j := 0 // p.rows cols addressable by j 314 | for x := 0; x < p.maxCols; x++ { 315 | value, ok := allSpans.Value(x, y) 316 | if ok { 317 | currentRow = append(currentRow, value) 318 | continue 319 | } 320 | if gotHeader && len(p.rows[y]) == 1 && p.rows[y][0] == "" { 321 | // this are most likely empty rows or table dividers 322 | rowSkips++ 323 | continue ROWS 324 | } 325 | if len(p.rSpans[y]) == j { 326 | break 327 | } 328 | rowSpan := p.rSpans[y][j] 329 | colSpan := p.cSpans[y][j] 330 | value = p.rows[y][j] 331 | if 
gotHeader && (rowSpan > 1 || colSpan > 1) { 332 | allSpans = append(allSpans, cellSpan{ 333 | BeginX: x, 334 | EndX: x + colSpan, 335 | BeginY: y, 336 | EndY: y + rowSpan, 337 | Value: value, 338 | }) 339 | } 340 | if !gotHeader && colSpan > 1 { 341 | skipRow = true 342 | // in header: merge, in row - duplicate 343 | for q := 0; q < colSpan; q++ { 344 | nextValue := fmt.Sprintf("%s %s", value, p.rows[y+1][k]) 345 | currentRow = append(currentRow, nextValue) 346 | k++ 347 | } 348 | } else { 349 | currentRow = append(currentRow, value) 350 | } 351 | j++ 352 | } 353 | if skipRow { 354 | rowSkips++ 355 | y++ 356 | } 357 | gotHeader = true 358 | if len(currentRow) > p.maxCols { 359 | p.maxCols = len(currentRow) 360 | } 361 | rows = append(rows, currentRow) 362 | } 363 | header := rows[0] 364 | rows = rows[1:] 365 | Logger(p.ctx, "found table", "columns", header, "count", len(rows)) 366 | p.Tables = append(p.Tables, &Table{ 367 | Header: header, 368 | Rows: rows, 369 | }) 370 | } 371 | 372 | func (p *Page) innerText(n *html.Node, sb *strings.Builder) { 373 | if n.Type == html.TextNode { 374 | sb.WriteString(strings.TrimSpace(n.Data)) 375 | return 376 | } 377 | if n.FirstChild == nil { 378 | return 379 | } 380 | for c := n.FirstChild; c != nil; c = c.NextSibling { 381 | p.innerText(c, sb) 382 | } 383 | } 384 | 385 | // Table is the low-level representation of raw header and rows. 386 | // 387 | // Every cell string value is truncated of its whitespace. 
388 | type Table struct { 389 | // Header holds names of headers 390 | Header []string 391 | 392 | // Rows holds slice of string slices 393 | Rows [][]string 394 | } 395 | 396 | func (table *Table) String() string { 397 | return fmt.Sprintf("Table[%s] (%d rows)", strings.Join(table.Header, ", "), len(table.Rows)) 398 | } 399 | -------------------------------------------------------------------------------- /page_test.go: -------------------------------------------------------------------------------- 1 | package htmltable 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "strings" 9 | "testing" 10 | 11 | "golang.org/x/net/html" 12 | ) 13 | 14 | const fixture = ` 15 |

foo

16 | 17 | 18 | 19 | 20 |
ab
1 2
3 4
21 |

bar

22 | 23 | 24 | 25 | 26 |
bcd
125
346
27 | ` 28 | 29 | func TestFindsAllTables(t *testing.T) { 30 | p, err := NewFromString(fixture) 31 | assertNoError(t, err) 32 | assertEqual(t, p.Len(), 2) 33 | } 34 | 35 | // added public domain data from https://en.wikipedia.org/wiki/List_of_S&P_500_companies 36 | const fixtureColspans = ` 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 |
DateAddedRemovedReason
TickerSecurityTickerSecurity
June 21, 2022KDPKeurig Dr PepperUA/UAAUnder ArmourMarket capitalization change.[4]
June 21, 2022ONON SemiconductorIPGPIPG PhotonicsMarket capitalization change.[4]
` 70 | 71 | func TestFindsWithColspans(t *testing.T) { 72 | p, err := NewFromString(fixtureColspans) 73 | assertNoError(t, err) 74 | assertEqual(t, p.Len(), 1) 75 | assertEqual(t, "Added Ticker", p.Tables[0].Header[1]) 76 | assertEqual(t, "Market capitalization change.[4]", p.Tables[0].Rows[0][5]) 77 | } 78 | 79 | func TestInitFails(t *testing.T) { 80 | prev := htmlParse 81 | t.Cleanup(func() { 82 | htmlParse = prev 83 | }) 84 | htmlParse = func(r io.Reader) (*html.Node, error) { 85 | return nil, fmt.Errorf("nope") 86 | } 87 | _, err := New(context.Background(), strings.NewReader("..")) 88 | 89 | assertEqualError(t, err, "nope") 90 | } 91 | 92 | func TestNewFromHttpResponseError(t *testing.T) { 93 | prev := htmlParse 94 | t.Cleanup(func() { 95 | htmlParse = prev 96 | }) 97 | htmlParse = func(r io.Reader) (*html.Node, error) { 98 | return nil, fmt.Errorf("nope") 99 | } 100 | _, err := NewFromResponse(&http.Response{ 101 | Request: &http.Request{}, 102 | }) 103 | assertEqualError(t, err, "nope") 104 | } 105 | 106 | func TestRealPageFound(t *testing.T) { 107 | wiki, err := http.Get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") 108 | assertNoError(t, err) 109 | p, err := NewFromResponse(wiki) 110 | assertNoError(t, err) 111 | snp, err := p.FindWithColumns("Symbol", "Security", "CIK") 112 | assertNoError(t, err) 113 | assertGreaterOrEqual(t, len(snp.Rows), 500) 114 | } 115 | 116 | func TestRealPageFound_BasicRowColSpans(t *testing.T) { 117 | wiki, err := http.Get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") 118 | assertNoError(t, err) 119 | p, err := NewFromResponse(wiki) 120 | assertNoError(t, err) 121 | snp, err := p.FindWithColumns("Date", "Added Ticker", "Removed Ticker") 122 | assertNoError(t, err) 123 | assertGreaterOrEqual(t, len(snp.Rows), 250) 124 | } 125 | 126 | func TestFindsTableByColumnNames(t *testing.T) { 127 | p, err := NewFromString(fixture) 128 | assertNoError(t, err) 129 | 130 | cd, err := p.FindWithColumns("c", "d") 131 
| assertNoError(t, err) 132 | assertEqual(t, 2, len(cd.Rows)) 133 | } 134 | 135 | func TestEach(t *testing.T) { 136 | p, err := NewFromString(fixture) 137 | assertNoError(t, err) 138 | err = p.Each("a", func(a string) error { 139 | t.Logf("%s", a) 140 | return nil 141 | }) 142 | assertNoError(t, err) 143 | } 144 | 145 | func TestEachFails(t *testing.T) { 146 | p, err := NewFromString(fixture) 147 | assertNoError(t, err) 148 | err = p.Each("a", func(a string) error { 149 | return fmt.Errorf("nope") 150 | }) 151 | assertEqualError(t, err, "row 0: nope") 152 | } 153 | 154 | func TestEachFailsNoCols(t *testing.T) { 155 | p, err := NewFromString(fixture) 156 | assertNoError(t, err) 157 | err = p.Each("x", func(a string) error { 158 | return nil 159 | }) 160 | assertEqualError(t, err, "cannot find table with columns: x") 161 | } 162 | 163 | func TestEach2(t *testing.T) { 164 | p, err := NewFromString(fixture) 165 | assertNoError(t, err) 166 | err = p.Each2("b", "c", func(b, c string) error { 167 | t.Logf("%s %s", b, c) 168 | return nil 169 | }) 170 | assertNoError(t, err) 171 | } 172 | 173 | func TestEach2Fails(t *testing.T) { 174 | p, err := NewFromString(fixture) 175 | assertNoError(t, err) 176 | err = p.Each2("b", "c", func(b, c string) error { 177 | return fmt.Errorf("nope") 178 | }) 179 | assertEqualError(t, err, "row 0: nope") 180 | } 181 | 182 | func TestEach2FailsNoCols(t *testing.T) { 183 | p, err := NewFromString(fixture) 184 | assertNoError(t, err) 185 | err = p.Each2("x", "y", func(b, c string) error { 186 | return nil 187 | }) 188 | assertEqualError(t, err, "cannot find table with columns: x, y") 189 | } 190 | 191 | func TestEach3(t *testing.T) { 192 | p, err := NewFromString(fixture) 193 | assertNoError(t, err) 194 | err = p.Each3("b", "c", "d", func(b, c, d string) error { 195 | t.Logf("%s %s %s", b, c, d) 196 | return nil 197 | }) 198 | assertNoError(t, err) 199 | } 200 | 201 | func TestEach3Fails(t *testing.T) { 202 | p, err := NewFromString(fixture) 203 
| assertNoError(t, err) 204 | err = p.Each3("b", "c", "d", func(b, c, d string) error { 205 | return fmt.Errorf("nope") 206 | }) 207 | assertEqualError(t, err, "row 0: nope") 208 | } 209 | 210 | func TestEach3FailsNoCols(t *testing.T) { 211 | p, err := NewFromString(fixture) 212 | assertNoError(t, err) 213 | err = p.Each3("x", "y", "z", func(b, c, d string) error { 214 | return nil 215 | }) 216 | assertEqualError(t, err, "cannot find table with columns: x, y, z") 217 | } 218 | 219 | func TestMoreThanOneTableFoundErrors(t *testing.T) { 220 | p, err := NewFromString(fixture) 221 | assertNoError(t, err) 222 | 223 | _, err = p.FindWithColumns("b") 224 | assertError(t, err) 225 | } 226 | 227 | func TestNoTablesFoundErrors(t *testing.T) { 228 | p, err := NewFromString(fixture) 229 | assertNoError(t, err) 230 | 231 | _, err = p.FindWithColumns("z") 232 | assertError(t, err) 233 | } 234 | 235 | func TestNilNodeReturns(t *testing.T) { 236 | p := &Page{} 237 | p.parse(nil) 238 | } 239 | -------------------------------------------------------------------------------- /slice.go: -------------------------------------------------------------------------------- 1 | package htmltable 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "reflect" 9 | "strings" 10 | ) 11 | 12 | // NewSlice returns slice of annotated struct types from io.Reader 13 | func NewSlice[T any](ctx context.Context, r io.Reader) ([]T, error) { 14 | f := &feeder[T]{ 15 | Page: Page{ctx: ctx}, 16 | } 17 | f.init(r) 18 | return f.slice() 19 | } 20 | 21 | // NewSliceFromPage finds a table matching the slice and returns the slice 22 | func NewSliceFromPage[T any](p *Page) ([]T, error) { 23 | return (&feeder[T]{ 24 | Page: *p, 25 | }).slice() 26 | } 27 | 28 | // NewSliceFromString is same as NewSlice(context.Context, io.Reader), 29 | // but takes just a string. 
30 | func NewSliceFromString[T any](in string) ([]T, error) { 31 | return NewSlice[T](context.Background(), strings.NewReader(in)) 32 | } 33 | 34 | // NewSliceFromResponse is same as NewSlice(context.Context, io.Reader), 35 | // but takes just an http.Response. 36 | func NewSliceFromResponse[T any](resp *http.Response) ([]T, error) { 37 | return NewSlice[T](resp.Request.Context(), resp.Body) 38 | } 39 | 40 | // NewSliceFromURL is same as NewSlice(context.Context, io.Reader), 41 | // but takes just a URL. 42 | func NewSliceFromURL[T any](url string) ([]T, error) { 43 | resp, err := http.Get(url) 44 | if err != nil { 45 | return nil, err 46 | } 47 | if resp.Body != nil { 48 | defer resp.Body.Close() 49 | } 50 | return NewSliceFromResponse[T](resp) 51 | } 52 | 53 | type feeder[T any] struct { 54 | Page 55 | 56 | dummy T 57 | } 58 | 59 | func (f *feeder[T]) headers() ([]string, map[string]int, error) { 60 | dt := reflect.ValueOf(f.dummy) 61 | elem := dt.Type() 62 | headers := []string{} 63 | fields := map[string]int{} 64 | for i := 0; i < elem.NumField(); i++ { 65 | field := elem.Field(i) 66 | header := field.Tag.Get("header") 67 | if header == "" { 68 | continue 69 | } 70 | err := f.isTypeSupported(field) 71 | if err != nil { 72 | return nil, nil, err 73 | } 74 | fields[header] = i 75 | headers = append(headers, header) 76 | } 77 | return headers, fields, nil 78 | } 79 | 80 | func (f *feeder[T]) isTypeSupported(field reflect.StructField) error { 81 | k := field.Type.Kind() 82 | if k == reflect.String { 83 | return nil 84 | } 85 | if k == reflect.Int { 86 | return nil 87 | } 88 | if k == reflect.Bool { 89 | return nil 90 | } 91 | return fmt.Errorf("setting field is not supported, %s is %v", 92 | field.Name, field.Type.Name()) 93 | } 94 | 95 | func (f *feeder[T]) table() (*Table, map[int]int, error) { 96 | headers, fields, err := f.headers() 97 | if err != nil { 98 | return nil, nil, err 99 | } 100 | table, err := f.FindWithColumns(headers...)
101 | if err != nil { 102 | return nil, nil, err 103 | } 104 | mapping := map[int]int{} 105 | for idx, header := range table.Header { 106 | field, ok := fields[header] 107 | if !ok { 108 | continue 109 | } 110 | mapping[idx] = field 111 | } 112 | return table, mapping, nil 113 | } 114 | 115 | func (f *feeder[T]) slice() ([]T, error) { 116 | table, mapping, err := f.table() 117 | if err != nil { 118 | return nil, err 119 | } 120 | dummy := reflect.ValueOf(f.dummy) 121 | dt := dummy.Type() 122 | sliceValue := reflect.MakeSlice(reflect.SliceOf(dt), 123 | len(table.Rows), len(table.Rows)) 124 | for rowIdx, row := range table.Rows { 125 | item := sliceValue.Index(rowIdx) 126 | for idx, field := range mapping { 127 | if len(row) < len(mapping) && idx == len(row) { 128 | // row has fewer cells than mapped columns (e.g. malformed markup): skip the column right past its end 129 | continue 130 | } 131 | switch item.Field(field).Kind() { 132 | case reflect.String: 133 | item.Field(field).SetString(row[idx]) 134 | case reflect.Bool: 135 | var v bool 136 | lower := strings.ToLower(row[idx]) 137 | if lower == "yes" || 138 | lower == "y" || 139 | lower == "true" || 140 | lower == "t" { 141 | v = true 142 | } 143 | item.Field(field).SetBool(v) 144 | case reflect.Int: 145 | var v int64 146 | _, err := fmt.Sscan(row[idx], &v) 147 | if err != nil { 148 | column := table.Header[idx] 149 | return nil, fmt.Errorf("row %d: %s: %w", rowIdx, column, err) 150 | } 151 | item.Field(field).SetInt(v) 152 | default: // noop 153 | } 154 | } 155 | } 156 | return sliceValue.Interface().([]T), nil 157 | } 158 | -------------------------------------------------------------------------------- /slice_test.go: -------------------------------------------------------------------------------- 1 | package htmltable 2 | 3 | import ( 4 | "net/http" 5 | "net/http/httptest" 6 | "testing" 7 | ) 8 | 9 | type nice struct { 10 | C string `header:"c"` 11 | D string `header:"d"` 12 | } 13 | 14 | func TestNewSliceFromString(t *testing.T) { 15 | out, err :=
NewSliceFromString[nice](fixture) 16 | assertNoError(t, err) 17 | assertEqual(t, []nice{ 18 | {"2", "5"}, 19 | {"4", "6"}, 20 | }, out) 21 | } 22 | 23 | type Ticker struct { 24 | Symbol string `header:"Symbol"` 25 | Security string `header:"Security"` 26 | CIK string `header:"CIK"` 27 | } 28 | 29 | func TestNewSliceFromUrl(t *testing.T) { 30 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" 31 | out, err := NewSliceFromURL[Ticker](url) 32 | assertNoError(t, err) 33 | assertGreaterOrEqual(t, len(out), 500) 34 | } 35 | 36 | func TestNewSliceFromPage(t *testing.T) { 37 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" 38 | p, err := NewFromURL(url) 39 | assertNoError(t, err) 40 | out, err := NewSliceFromPage[Ticker](p) 41 | assertNoError(t, err) 42 | assertGreaterOrEqual(t, len(out), 500) 43 | } 44 | 45 | func TestNewSliceFromUrl_Fails(t *testing.T) { 46 | _, err := NewSliceFromURL[Ticker]("https://127.0.0.1") 47 | assertEqualError(t, err, "Get \"https://127.0.0.1\": dial tcp 127.0.0.1:443: connect: connection refused") 48 | } 49 | 50 | func TestNewSliceFromUrl_NoTables(t *testing.T) { 51 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 52 | w.WriteHeader(200) 53 | })) 54 | defer server.Close() 55 | _, err := NewSliceFromURL[Ticker](server.URL) 56 | assertEqualError(t, err, "cannot find table with columns: Symbol, Security, CIK") 57 | } 58 | 59 | func TestNewSliceInvalidTypes(t *testing.T) { 60 | type exotic struct { 61 | A string `header:""` 62 | C float32 `header:"c"` 63 | } 64 | _, err := NewSliceFromString[exotic](fixture) 65 | assertEqualError(t, err, "setting field is not supported, C is float32") 66 | } 67 | 68 | func TestVeryCreativeTableWithRowAndColspans(t *testing.T) { 69 | type AM4 struct { 70 | Model string `header:"Model"` 71 | ReleaseDate string `header:"Release date"` 72 | PCIeSupport string `header:"PCIesupport[a]"` 73 | MultiGpuCrossFire bool `header:"Multi-GPU 
CrossFire"` 74 | MultiGpuSLI bool `header:"Multi-GPU SLI"` 75 | USBSupport string `header:"USBsupport[b]"` 76 | SATAPorts int `header:"Storage features SATAports"` 77 | RAID string `header:"Storage features RAID"` 78 | AMDStoreMI bool `header:"Storage features AMD StoreMI"` 79 | Overclocking string `header:"Processoroverclocking"` 80 | TDP string `header:"TDP"` 81 | SupportExcavator string `header:"CPU support[14] Excavator"` 82 | SupportZen string `header:"CPU support[14] Zen"` 83 | SupportZenPlus string `header:"CPU support[14] Zen+"` 84 | SupportZen2 string `header:"CPU support[14] Zen 2"` 85 | SupportZen3 string `header:"CPU support[14] Zen 3"` 86 | Architecture string `header:"Architecture"` 87 | } 88 | chipsets, err := NewSliceFromString[AM4](am4info) 89 | assertNoError(t, err) 90 | expected := []AM4{ 91 | { // row 0 92 | Model: "A320", 93 | ReleaseDate: "February 2017[15]", 94 | PCIeSupport: "PCIe 2.0 ×4", 95 | MultiGpuCrossFire: false, 96 | MultiGpuSLI: false, 97 | USBSupport: "1, 2, 6", 98 | SATAPorts: 4, 99 | RAID: "0,1,10", 100 | AMDStoreMI: false, 101 | Overclocking: "Limited to pre-Zen CPUs, unless an unsupported third-party motherboard firmware applied", 102 | TDP: "~5 W[16]", 103 | SupportExcavator: "Yes", 104 | SupportZen: "Yes", 105 | SupportZenPlus: "Yes", 106 | SupportZen2: "Varies[c]", 107 | SupportZen3: "Varies[c]", 108 | Architecture: "Promontory", 109 | }, 110 | { // row 1 111 | Model: "B350", 112 | ReleaseDate: "February 2017[15]", 113 | PCIeSupport: "PCIe 2.0 ×6", 114 | MultiGpuCrossFire: true, 115 | MultiGpuSLI: false, 116 | USBSupport: "2, 2, 6", 117 | SATAPorts: 4, 118 | RAID: "0,1,10", 119 | AMDStoreMI: false, 120 | Overclocking: "Yes", 121 | TDP: "~5 W[16]", 122 | SupportExcavator: "Yes", 123 | SupportZen: "Yes", 124 | SupportZenPlus: "Yes", 125 | SupportZen2: "Varies[c]", 126 | SupportZen3: "Varies[c]", 127 | Architecture: "Promontory", 128 | }, 129 | { // row 2 130 | Model: "X370", 131 | ReleaseDate: "February 2017[15]", 132 | 
PCIeSupport: "PCIe 2.0 ×8", 133 | MultiGpuCrossFire: true, 134 | MultiGpuSLI: true, 135 | USBSupport: "2, 6, 6", 136 | SATAPorts: 8, 137 | RAID: "0,1,10", 138 | AMDStoreMI: false, 139 | Overclocking: "Yes", 140 | TDP: "~5 W[16]", 141 | SupportExcavator: "Yes", 142 | SupportZen: "Yes", 143 | SupportZenPlus: "Yes", 144 | SupportZen2: "Varies[c]", 145 | SupportZen3: "Varies[c]", 146 | Architecture: "Promontory", 147 | }, 148 | { // row 3 149 | Model: "B450", 150 | ReleaseDate: "March 2018[17]", 151 | PCIeSupport: "PCIe 2.0 ×6", 152 | MultiGpuCrossFire: true, 153 | MultiGpuSLI: false, 154 | USBSupport: "2, 2, 6", 155 | SATAPorts: 4, 156 | RAID: "0,1,10", 157 | AMDStoreMI: true, 158 | Overclocking: "Yes,withPBO", 159 | TDP: "~5 W[16]", 160 | SupportExcavator: "Varies[d]", 161 | SupportZen: "Yes", 162 | SupportZenPlus: "Yes", 163 | SupportZen2: "Yes", 164 | SupportZen3: "Varies[d][18]", 165 | Architecture: "Promontory", 166 | }, 167 | { // row 4 168 | Model: "X470", 169 | ReleaseDate: "March 2018[17]", 170 | PCIeSupport: "PCIe 2.0 ×8", 171 | MultiGpuCrossFire: true, 172 | MultiGpuSLI: true, 173 | USBSupport: "2, 6, 6", 174 | SATAPorts: 8, 175 | RAID: "0,1,10", 176 | AMDStoreMI: true, 177 | Overclocking: "Yes,withPBO", 178 | TDP: "~5 W[16]", 179 | SupportExcavator: "Varies[d]", 180 | SupportZen: "Yes", 181 | SupportZenPlus: "Yes", 182 | SupportZen2: "Yes", 183 | SupportZen3: "Varies[d][18]", 184 | Architecture: "Promontory", 185 | }, 186 | { // row 5 187 | Model: "A520", 188 | ReleaseDate: "August 2020[19]", 189 | PCIeSupport: "PCIe 3.0 ×6", 190 | MultiGpuCrossFire: false, 191 | MultiGpuSLI: false, 192 | USBSupport: "1, 2, 6", 193 | SATAPorts: 4, 194 | RAID: "0,1,10", 195 | AMDStoreMI: true, 196 | Overclocking: "No, unless an unsupported third-party motherboard firmware applied", 197 | TDP: "~5 W[16]", 198 | SupportExcavator: "Varies[d]", 199 | SupportZen: "Varies", 200 | SupportZenPlus: "Yes", 201 | SupportZen2: "Yes", 202 | SupportZen3: "Varies[d][18]", 203 | 
Architecture: "Promontory", 204 | }, 205 | { // row 6 206 | Model: "B550[e]", 207 | ReleaseDate: "June 2020[20]", 208 | PCIeSupport: "PCIe 3.0 ×10[21]", 209 | MultiGpuCrossFire: true, 210 | MultiGpuSLI: false, 211 | USBSupport: "2, 2, 6", 212 | SATAPorts: 6, 213 | RAID: "0,1,10", 214 | AMDStoreMI: true, 215 | Overclocking: "Yes,withPBO", 216 | TDP: "~5 W[16]", 217 | SupportExcavator: "Varies[d]", 218 | SupportZen: "Varies", 219 | SupportZenPlus: "Yes", 220 | SupportZen2: "Yes", 221 | SupportZen3: "Varies[d][18]", 222 | Architecture: "Promontory", 223 | }, 224 | { // row 7 225 | Model: "X570", 226 | ReleaseDate: "July 2019[22]", 227 | PCIeSupport: "PCIe 4.0 ×16", 228 | MultiGpuCrossFire: true, 229 | MultiGpuSLI: true, 230 | USBSupport: "8, 0, 4", 231 | SATAPorts: 12, 232 | RAID: "0,1,10", 233 | AMDStoreMI: true, 234 | Overclocking: "Yes,withPBO", 235 | TDP: "~15 W[23][24][f]", 236 | SupportExcavator: "No[g]", 237 | SupportZen: "Yes", 238 | SupportZenPlus: "Yes", 239 | SupportZen2: "Yes", 240 | SupportZen3: "Yes", 241 | Architecture: "Bixby", 242 | }, 243 | } 244 | var failed bool 245 | for i, v := range expected { 246 | if chipsets[i].Model != v.Model { 247 | failed = true 248 | t.Logf("expected chipsets[%d].Model (%s) to be %v but got %v", i, v.Model, v.Model, chipsets[i].Model) 249 | } 250 | if chipsets[i].ReleaseDate != v.ReleaseDate { 251 | failed = true 252 | t.Logf("expected chipsets[%d].ReleaseDate (%s) to be %v but got %v", i, v.Model, v.ReleaseDate, chipsets[i].ReleaseDate) 253 | } 254 | if chipsets[i].PCIeSupport != v.PCIeSupport { 255 | failed = true 256 | t.Logf("expected chipsets[%d].PCIeSupport (%s) to be %v but got %v", i, v.Model, v.PCIeSupport, chipsets[i].PCIeSupport) 257 | } 258 | if chipsets[i].MultiGpuCrossFire != v.MultiGpuCrossFire { 259 | failed = true 260 | t.Logf("expected chipsets[%d].MultiGpuCrossFire (%s) to be %v but got %v", i, v.Model, v.MultiGpuCrossFire, chipsets[i].MultiGpuCrossFire) 261 | } 262 | if chipsets[i].MultiGpuSLI != 
v.MultiGpuSLI { 263 | failed = true 264 | t.Logf("expected chipsets[%d].MultiGpuSLI (%s) to be %v but got %v", i, v.Model, v.MultiGpuSLI, chipsets[i].MultiGpuSLI) 265 | } 266 | if chipsets[i].USBSupport != v.USBSupport { 267 | failed = true 268 | t.Logf("expected chipsets[%d].USBSupport (%s) to be %v but got %v", i, v.Model, v.USBSupport, chipsets[i].USBSupport) 269 | } 270 | if chipsets[i].SATAPorts != v.SATAPorts { 271 | failed = true 272 | t.Logf("expected chipsets[%d].SATAPorts (%s) to be %v but got %v", i, v.Model, v.SATAPorts, chipsets[i].SATAPorts) 273 | } 274 | if chipsets[i].RAID != v.RAID { 275 | failed = true 276 | t.Logf("expected chipsets[%d].RAID (%s) to be %v but got %v", i, v.Model, v.RAID, chipsets[i].RAID) 277 | } 278 | if chipsets[i].AMDStoreMI != v.AMDStoreMI { 279 | failed = true 280 | t.Logf("expected chipsets[%d].AMDStoreMI (%s) to be %v but got %v", i, v.Model, v.AMDStoreMI, chipsets[i].AMDStoreMI) 281 | } 282 | if chipsets[i].Overclocking != v.Overclocking { 283 | failed = true 284 | t.Logf("expected chipsets[%d].Overclocking (%s) to be %v but got %v", i, v.Model, v.Overclocking, chipsets[i].Overclocking) 285 | } 286 | if chipsets[i].TDP != v.TDP { 287 | failed = true 288 | t.Logf("expected chipsets[%d].TDP (%s) to be %v but got %v", i, v.Model, v.TDP, chipsets[i].TDP) 289 | } 290 | if chipsets[i].SupportExcavator != v.SupportExcavator { 291 | failed = true 292 | t.Logf("expected chipsets[%d].SupportExcavator (%s) to be %v but got %v", i, v.Model, v.SupportExcavator, chipsets[i].SupportExcavator) 293 | } 294 | if chipsets[i].SupportZen != v.SupportZen { 295 | failed = true 296 | t.Logf("expected chipsets[%d].SupportZen (%s) to be %v but got %v", i, v.Model, v.SupportZen, chipsets[i].SupportZen) 297 | } 298 | if chipsets[i].SupportZenPlus != v.SupportZenPlus { 299 | failed = true 300 | t.Logf("expected chipsets[%d].SupportZenPlus (%s) to be %v but got %v", i, v.Model, v.SupportZenPlus, chipsets[i].SupportZenPlus) 301 | } 302 | if 
chipsets[i].SupportZen2 != v.SupportZen2 { 303 | failed = true 304 | t.Logf("expected chipsets[%d].SupportZen2 (%s) to be %v but got %v", i, v.Model, v.SupportZen2, chipsets[i].SupportZen2) 305 | } 306 | if chipsets[i].SupportZen3 != v.SupportZen3 { 307 | failed = true 308 | t.Logf("expected chipsets[%d].SupportZen3 (%s) to be %v but got %v", i, v.Model, v.SupportZen3, chipsets[i].SupportZen3) 309 | } 310 | if chipsets[i].Architecture != v.Architecture { 311 | failed = true 312 | t.Logf("expected chipsets[%d].Architecture (%s) to be %v but got %v", i, v.Model, v.Architecture, chipsets[i].Architecture) 313 | } 314 | } 315 | if failed { 316 | t.Fail() 317 | } 318 | } 319 | 320 | // taken from https://en.wikipedia.org/wiki/List_of_AMD_chipsets#AM4_chipsets 321 | const am4info = ` 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 |
ModelRelease datePCIe support[a]Multi-GPUUSB support[b]Storage featuresProcessor
overclocking
TDPCPU support[14]Architecture
CrossFireSLISATA portsRAIDAMD StoreMIExcavatorZenZen+Zen 2Zen 3
A320February 2017[15]PCIe 2.0 ×4NoNo1, 2, 640,
1,
10
NoLimited to pre-Zen CPUs, unless an unsupported third-party motherboard firmware applied~5 W[16]YesYesYesVaries[c]Promontory
B350February 2017[15]PCIe 2.0 ×6Yes2, 2, 6Yes
X370February 2017[15]PCIe 2.0 ×8Yes2, 6, 68
B450March 2018[17]PCIe 2.0 ×6No2, 2, 64YesYes,
with PBO
Varies[d]YesVaries[d][18]
X470March 2018[17]PCIe 2.0 ×8Yes2, 6, 68
A520August 2020[19]PCIe 3.0 ×6NoNo1, 2, 64No, unless an unsupported third-party motherboard firmware appliedVariesYes
B550[e]June 2020[20]PCIe 3.0 ×10[21]YesVaries2, 2, 66Yes,
with PBO
X570July 2019[22]PCIe 4.0 ×16Yes8, 0, 412~15 W[23][24] [f]No [g]YesYesBixby
` 439 | -------------------------------------------------------------------------------- /util_test.go: -------------------------------------------------------------------------------- 1 | package htmltable 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func assertError(t *testing.T, err error) { 9 | if err == nil { 10 | t.Errorf("expected error, got nil") 11 | } 12 | } 13 | 14 | func assertNoError(t *testing.T, err error) { 15 | if err != nil { 16 | t.Errorf("expected no error, got %s", err.Error()) 17 | } 18 | } 19 | 20 | func assertEqualError(t *testing.T, err error, msg string) { 21 | assertError(t, err) 22 | got := err.Error() 23 | if got != msg { 24 | t.Errorf("%#v (expected) != %#v (got)", msg, err.Error()) 25 | } 26 | } 27 | 28 | func assertEqual(t *testing.T, a, b any) { 29 | if !reflect.DeepEqual(a, b) { 30 | t.Errorf("%#v (expected) != %#v (got)", a, b) 31 | } 32 | } 33 | 34 | type comparable interface { 35 | int | string 36 | } 37 | 38 | func assertGreaterOrEqual[T comparable](t *testing.T, a, b T) { 39 | if !(a >= b) { 40 | t.Errorf("%#v (expected) >= %#v (got)", a, b) 41 | } 42 | } 43 | --------------------------------------------------------------------------------