├── .github
├── dependabot.yml
└── workflows
│ ├── codeql-analysis.yml
│ └── push.yml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── doc
└── colspans-rowspans.png
├── example_test.go
├── go.mod
├── go.sum
├── log.go
├── log_test.go
├── page.go
├── page_test.go
├── slice.go
├── slice_test.go
└── util_test.go
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "gomod"
4 | directory: "/"
5 | schedule:
6 | interval: "weekly"
7 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: "CodeQL"
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | pull_request:
7 | branches: [ main ]
8 | schedule:
9 | - cron: '21 13 * * 4'
10 |
11 | jobs:
12 | analyze:
13 | name: Analyze
14 | runs-on: ubuntu-latest
15 | permissions:
16 | actions: read
17 | contents: read
18 | security-events: write
19 |
20 | strategy:
21 | fail-fast: false
22 | matrix:
23 | language: [ 'go' ]
24 |
25 | steps:
26 | - name: Checkout repository
27 | uses: actions/checkout@v3
28 |
29 | - name: Initialize CodeQL
30 | uses: github/codeql-action/init@v2
31 | with:
32 | languages: ${{ matrix.language }}
33 |
34 | - name: Autobuild
35 | uses: github/codeql-action/autobuild@v2
36 |
37 | - name: Perform CodeQL Analysis
38 | uses: github/codeql-action/analyze@v2
39 |
--------------------------------------------------------------------------------
/.github/workflows/push.yml:
--------------------------------------------------------------------------------
1 | name: build
2 |
3 | on:
4 | pull_request:
5 | types: [opened, synchronize]
6 | push:
7 | branches: [main]
8 |
9 | jobs:
10 | tests:
11 | strategy:
12 | fail-fast: false
13 | matrix:
14 | goVersion: [ '1.18.x', '1.19.x' ]
15 | runs-on: ubuntu-latest
16 | steps:
17 | - uses: actions/checkout@v2
18 | - run: git fetch --prune --unshallow
19 | - uses: actions/setup-go@v1
20 | with:
21 | go-version: ${{ matrix.goVersion }}
22 | - run: go mod vendor
23 | - run: make test
24 | - uses: codecov/codecov-action@v1
25 | if: always()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 |
8 | # Test binary, built with `go test -c`
9 | *.test
10 |
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 |
14 | # Dependency directories (remove the comment below to include it)
15 | vendor/
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Serge Smertin
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | default: vendor
2 |
3 | fmt:
4 | go fmt ./...
5 |
6 | vendor:
7 | go mod vendor
8 |
9 | test:
10 | go test -coverpkg=./... -coverprofile=coverage.out -timeout=10s ./...
11 |
12 | coverage: test
13 | go tool cover -html=coverage.out
14 |
15 | .PHONY: build fmt coverage test vendor
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HTML table data extractor for Go
2 |
3 | [](https://pkg.go.dev/mod/github.com/nfx/go-htmltable)
4 | [](https://github.com/nfx/go-htmltable/blob/main/LICENSE)
5 | [](https://codecov.io/gh/nfx/go-htmltable)
6 | [](https://github.com/nfx/go-htmltable/actions?query=workflow%3Abuild+branch%3Amain)
7 |
8 |
9 | `htmltable` enables structured data extraction from HTML tables and URLs and requires almost no external dependencies. Tested with Go 1.18.x and 1.19.x.
10 |
11 | ## Installation
12 |
13 | ```bash
14 | go get github.com/nfx/go-htmltable
15 | ```
16 |
17 | ## Usage
18 |
19 | You can retrieve a slice of `header`-annotated types using the `NewSlice*` constructors:
20 |
21 | ```go
22 | type Ticker struct {
23 | Symbol string `header:"Symbol"`
24 | Security string `header:"Security"`
25 | CIK string `header:"CIK"`
26 | }
27 |
28 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
29 | out, _ := htmltable.NewSliceFromURL[Ticker](url)
30 | fmt.Println(out[0].Symbol)
31 | fmt.Println(out[0].Security)
32 |
33 | // Output:
34 | // MMM
35 | // 3M
36 | ```
37 |
38 | An error is returned if no table on the page has all of the specified columns:
39 |
40 | ```go
41 | page, _ := htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
42 | _, err := page.FindWithColumns("invalid", "column", "names")
43 | fmt.Println(err)
44 |
45 | // Output:
46 | // cannot find table with columns: invalid, column, names
47 | ```
48 |
49 | You can also use the lower-level API to work with the extracted data:
50 |
51 | ```go
52 | page, _ := htmltable.NewFromString(`
53 | foo
54 |
55 | a | b |
56 | 1 | 2 |
57 | 3 | 4 |
58 |
59 | bar
60 |
61 | b | c | d |
62 | 1 | 2 | 5 |
63 | 3 | 4 | 6 |
64 |
65 | `)
66 |
67 | fmt.Printf("found %d tables\n", page.Len())
68 | _ = page.Each2("c", "d", func(c, d string) error {
69 | fmt.Printf("c:%s d:%s\n", c, d)
70 | return nil
71 | })
72 |
73 | // Output:
74 | // found 2 tables
75 | // c:2 d:5
76 | // c:4 d:6
77 | ```
78 |
79 | Complex [tables with row and col spans](https://en.wikipedia.org/wiki/List_of_AMD_chipsets#AM4_chipsets) are natively supported as well. You can annotate `string`, `int`, and `bool` fields. Any `bool` field value is `true` if it is equal in lowercase to one of `yes`, `y`, `true`, `t`.
80 |
81 | 
82 |
83 | ```go
84 | type AM4 struct {
85 | Model string `header:"Model"`
86 | ReleaseDate string `header:"Release date"`
87 | PCIeSupport string `header:"PCIesupport[a]"`
88 | MultiGpuCrossFire bool `header:"Multi-GPU CrossFire"`
89 | MultiGpuSLI bool `header:"Multi-GPU SLI"`
90 | USBSupport string `header:"USBsupport[b]"`
91 | SATAPorts int `header:"Storage features SATAports"`
92 | RAID string `header:"Storage features RAID"`
93 | AMDStoreMI bool `header:"Storage features AMD StoreMI"`
94 | Overclocking string `header:"Processoroverclocking"`
95 | TDP string `header:"TDP"`
96 | SupportExcavator string `header:"CPU support[14] Excavator"`
97 | SupportZen string `header:"CPU support[14] Zen"`
98 | SupportZenPlus string `header:"CPU support[14] Zen+"`
99 | SupportZen2 string `header:"CPU support[14] Zen 2"`
100 | SupportZen3 string `header:"CPU support[14] Zen 3"`
101 | Architecture string `header:"Architecture"`
102 | }
103 | am4Chipsets, _ := htmltable.NewSliceFromURL[AM4]("https://en.wikipedia.org/wiki/List_of_AMD_chipsets")
104 | fmt.Println(am4Chipsets[2].Model)
105 | fmt.Println(am4Chipsets[2].SupportZen2)
106 |
107 | // Output:
108 | // X370
109 | // Varies[c]
110 | ```
111 |
112 | One last note: you're encouraged to plug in your own structured logger:
113 |
114 | ```go
115 | htmltable.Logger = func(_ context.Context, msg string, fields ...any) {
116 | fmt.Printf("[INFO] %s %v\n", msg, fields)
117 | }
118 | htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
119 |
120 | // Output:
121 | // [INFO] found table [columns [Symbol Security SEC filings GICSSector GICS Sub-Industry Headquarters Location Date first added CIK Founded] count 504]
122 | // [INFO] found table [columns [Date Added Ticker Added Security Removed Ticker Removed Security Reason] count 308]
123 | ```
124 |
125 | ## Inspiration
126 |
127 | This library aims to be something like [pandas.read_html](https://pandas.pydata.org/docs/reference/api/pandas.read_html.html) or [table_extract](https://docs.rs/table-extract/latest/table_extract/) Rust crate, but more idiomatic for Go.
--------------------------------------------------------------------------------
/doc/colspans-rowspans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nfx/go-htmltable/f3d02958624856309008a72f34fe07a7c412751f/doc/colspans-rowspans.png
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | package htmltable_test
2 |
3 | import (
4 | "context"
5 | "fmt"
6 |
7 | "github.com/nfx/go-htmltable"
8 | )
9 |
10 | func ExampleNewSliceFromUrl() {
11 | type Ticker struct {
12 | Symbol string `header:"Symbol"`
13 | Security string `header:"Security"`
14 | CIK string `header:"CIK"`
15 | }
16 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
17 | out, _ := htmltable.NewSliceFromURL[Ticker](url)
18 | fmt.Println(out[0].Symbol)
19 | fmt.Println(out[0].Security)
20 |
21 | // Output:
22 | // MMM
23 | // 3M
24 | }
25 |
26 | func ExampleNewSliceFromURL_rowspansAndColspans() {
27 | type AM4 struct {
28 | Model string `header:"Model"`
29 | ReleaseDate string `header:"Release date"`
30 | PCIeSupport string `header:"PCIesupport[a]"`
31 | MultiGpuCrossFire bool `header:"Multi-GPU CrossFire"`
32 | MultiGpuSLI bool `header:"Multi-GPU SLI"`
33 | USBSupport string `header:"USBsupport[b]"`
34 | SATAPorts int `header:"Storage features SATAports"`
35 | RAID string `header:"Storage features RAID"`
36 | AMDStoreMI bool `header:"Storage features AMD StoreMI"`
37 | Overclocking string `header:"Processoroverclocking"`
38 | TDP string `header:"TDP"`
39 | SupportExcavator string `header:"CPU support Excavator"`
40 | SupportZen string `header:"CPU support Zen"`
41 | SupportZenPlus string `header:"CPU support Zen+"`
42 | SupportZen2 string `header:"CPU support Zen 2"`
43 | SupportZen3 string `header:"CPU support Zen 3"`
44 | Architecture string `header:"Architecture"`
45 | }
46 | am4Chipsets, _ := htmltable.NewSliceFromURL[AM4]("https://en.wikipedia.org/wiki/List_of_AMD_chipsets")
47 | fmt.Println(am4Chipsets[2].Model)
48 | fmt.Println(am4Chipsets[2].SupportZen2)
49 |
50 | // Output:
51 | // X370
52 | // Varies[c]
53 | }
54 |
55 | func ExampleNewFromString() {
56 | page, _ := htmltable.NewFromString(`
57 | foo
58 |
59 | a | b |
60 | 1 | 2 |
61 | 3 | 4 |
62 |
63 | bar
64 |
65 | b | c | d |
66 | 1 | 2 | 5 |
67 | 3 | 4 | 6 |
68 |
69 | `)
70 |
71 | fmt.Printf("found %d tables\n", page.Len())
72 | _ = page.Each2("c", "d", func(c, d string) error {
73 | fmt.Printf("c:%s d:%s\n", c, d)
74 | return nil
75 | })
76 |
77 | // Output:
78 | // found 2 tables
79 | // c:2 d:5
80 | // c:4 d:6
81 | }
82 |
83 | func ExampleNewFromURL() {
84 | page, _ := htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
85 | _, err := page.FindWithColumns("invalid", "column", "names")
86 | fmt.Println(err)
87 |
88 | // Output:
89 | // cannot find table with columns: invalid, column, names
90 | }
91 |
92 | func ExampleLogger() {
93 | htmltable.Logger = func(_ context.Context, msg string, fields ...any) {
94 | fmt.Printf("[INFO] %s %v\n", msg, fields)
95 | }
96 | _, _ = htmltable.NewFromURL("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
97 |
98 | // Output:
99 | // [INFO] found table [columns [Symbol Security SEC filings GICSSector GICS Sub-Industry Headquarters Location Date first added CIK Founded] count 503]
100 | // [INFO] found table [columns [Date Added Ticker Added Security Removed Ticker Removed Security Reason] count 316]
101 | }
102 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/nfx/go-htmltable
2 |
3 | go 1.18
4 |
5 | require golang.org/x/net v0.26.0
6 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
2 | golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
3 |
--------------------------------------------------------------------------------
/log.go:
--------------------------------------------------------------------------------
1 | package htmltable
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "log"
7 | "strings"
8 | )
9 |
10 | // Logger is a very simplistic structured logger, than should
11 | // be overriden by integrations.
12 | var Logger func(_ context.Context, msg string, fields ...any)
13 |
14 | func init() {
15 | Logger = defaultLogger
16 | }
17 |
// defaultLogger renders msg followed by " key=value" for each consecutive
// pair in fields and writes the resulting line with the standard log package.
//
// It panics when fields holds an odd number of elements.
var defaultLogger = func(_ context.Context, msg string, fields ...any) {
	if len(fields)%2 == 1 {
		panic(fmt.Errorf("number of logged fields is not even"))
	}
	var line strings.Builder
	line.WriteString(msg)
	for pos := 1; pos < len(fields); pos += 2 {
		line.WriteRune(' ')
		line.WriteString(fmt.Sprint(fields[pos-1]))
		line.WriteRune('=')
		line.WriteString(fmt.Sprint(fields[pos]))
	}
	log.Print(line.String())
}
32 |
--------------------------------------------------------------------------------
/log_test.go:
--------------------------------------------------------------------------------
1 | package htmltable
2 |
3 | import (
4 | "context"
5 | "testing"
6 | )
7 |
8 | func TestLogger(t *testing.T) {
9 | Logger(context.Background(), "message", "foo", "bar", "x", 1)
10 | }
11 |
12 | func TestLoggerNoFields(t *testing.T) {
13 | Logger(context.Background(), "message")
14 | }
15 |
16 | func TestLoggerWrongFields(t *testing.T) {
17 | defer func() {
18 | p := recover()
19 | if p == nil {
20 | t.Fatalf("there must be panic")
21 | }
22 | }()
23 | Logger(context.Background(), "message", 1)
24 | }
25 |
--------------------------------------------------------------------------------
/page.go:
--------------------------------------------------------------------------------
1 | // Package htmltable enables structured data extraction from HTML tables and URLs.
2 | package htmltable
3 |
4 | import (
5 | "context"
6 | "fmt"
7 | "io"
8 | "net/http"
9 | "strconv"
10 | "strings"
11 |
12 | "golang.org/x/net/html"
13 | )
14 |
15 | // htmlParse wraps html.Parse so that tests can swap in a failing parser.
16 | var htmlParse = html.Parse
17 |
18 | // Page is the container for all tables parseable
19 | type Page struct {
20 | Tables []*Table
21 |
22 | ctx context.Context
23 | rowSpans []int
24 | colSpans []int
25 | row []string
26 | rows [][]string
27 | maxCols int
28 |
29 | // current row
30 | colSpan []int
31 | rowSpan []int
32 | // all
33 | cSpans [][]int
34 | rSpans [][]int
35 | }
36 |
37 | // New returns an instance of the page with possibly more than one table
38 | func New(ctx context.Context, r io.Reader) (*Page, error) {
39 | p := &Page{ctx: ctx}
40 | return p, p.init(r)
41 | }
42 |
43 | // NewFromString is same as New(ctx.Context, io.Reader), but from string
44 | func NewFromString(r string) (*Page, error) {
45 | return New(context.Background(), strings.NewReader(r))
46 | }
47 |
48 | // NewFromResponse is same as New(ctx.Context, io.Reader), but from http.Response.
49 | //
50 | // In case of failure, returns `ResponseError`, that could be further inspected.
51 | func NewFromResponse(resp *http.Response) (*Page, error) {
52 | p, err := New(resp.Request.Context(), resp.Body)
53 | if err != nil {
54 | return nil, err
55 | }
56 | return p, nil
57 | }
58 |
59 | // NewFromURL is same as New(ctx.Context, io.Reader), but from URL.
60 | //
61 | // In case of failure, returns `ResponseError`, that could be further inspected.
62 | func NewFromURL(url string) (*Page, error) {
63 | resp, err := http.Get(url)
64 | if err != nil {
65 | return nil, err
66 | }
67 | if resp.Body != nil {
68 | defer resp.Body.Close()
69 | }
70 | return NewFromResponse(resp)
71 | }
72 |
73 | // Len returns number of tables found on the page
74 | func (p *Page) Len() int {
75 | return len(p.Tables)
76 | }
77 |
78 | // FindWithColumns performs fuzzy matching of tables by given header column names
79 | func (p *Page) FindWithColumns(columns ...string) (*Table, error) {
80 | // realistic p won't have this much
81 | found := 0xfffffff
82 | for idx, table := range p.Tables {
83 | matchedColumns := 0
84 | for _, col := range columns {
85 | for _, header := range table.Header {
86 | if col == header {
87 | // perform fuzzy matching of table headers
88 | matchedColumns++
89 | }
90 | }
91 | }
92 | if matchedColumns != len(columns) {
93 | continue
94 | }
95 | if found < len(p.Tables) {
96 | // and do a best-effort error message, that is cleaner than pandas.read_html
97 | return nil, fmt.Errorf("more than one table matches columns `%s`: "+
98 | "[%d] %s and [%d] %s", strings.Join(columns, ", "),
99 | found, p.Tables[found], idx, p.Tables[idx])
100 | }
101 | found = idx
102 | }
103 | if found > len(p.Tables) {
104 | return nil, fmt.Errorf("cannot find table with columns: %s",
105 | strings.Join(columns, ", "))
106 | }
107 | return p.Tables[found], nil
108 | }
109 |
110 | // Each row would call func with the value of the table cell from the column
111 | // specified in the first argument.
112 | //
113 | // Returns an error if table has no matching column name.
114 | func (p *Page) Each(a string, f func(a string) error) error {
115 | table, err := p.FindWithColumns(a)
116 | if err != nil {
117 | return err
118 | }
119 | offsets := map[string]int{}
120 | for idx, header := range table.Header {
121 | offsets[header] = idx
122 | }
123 | for idx, row := range table.Rows {
124 | if len(row) < 1 {
125 | continue
126 | }
127 | err = f(row[offsets[a]])
128 | if err != nil {
129 | return fmt.Errorf("row %d: %w", idx, err)
130 | }
131 | }
132 | return nil
133 | }
134 |
135 | // Each2 will get two columns specified in the first two arguments
136 | // and call the func with those values for every row in the table.
137 | //
138 | // Returns an error if table has no matching column names.
139 | func (p *Page) Each2(a, b string, f func(a, b string) error) error {
140 | table, err := p.FindWithColumns(a, b)
141 | if err != nil {
142 | return err
143 | }
144 | offsets := map[string]int{}
145 | for idx, header := range table.Header {
146 | offsets[header] = idx
147 | }
148 | _1, _2 := offsets[a], offsets[b]
149 | for idx, row := range table.Rows {
150 | if len(row) < 2 {
151 | continue
152 | }
153 | err = f(row[_1], row[_2])
154 | if err != nil {
155 | return fmt.Errorf("row %d: %w", idx, err)
156 | }
157 | }
158 | return nil
159 | }
160 |
161 | // Each3 will get three columns specified in the first three arguments
162 | // and call the func with those values for every row in the table.
163 | //
164 | // Returns an error if table has no matching column names.
165 | func (p *Page) Each3(a, b, c string, f func(a, b, c string) error) error {
166 | table, err := p.FindWithColumns(a, b, c)
167 | if err != nil {
168 | return err
169 | }
170 | offsets := map[string]int{}
171 | for idx, header := range table.Header {
172 | offsets[header] = idx
173 | }
174 | _1, _2, _3 := offsets[a], offsets[b], offsets[c]
175 | for idx, row := range table.Rows {
176 | if len(row) < 3 {
177 | continue
178 | }
179 | err = f(row[_1], row[_2], row[_3])
180 | if err != nil {
181 | return fmt.Errorf("row %d: %w", idx, err)
182 | }
183 | }
184 | return nil
185 | }
186 |
187 | func (p *Page) init(r io.Reader) error {
188 | root, err := htmlParse(r)
189 | if err != nil {
190 | return err
191 | }
192 | p.parse(root)
193 | p.finishTable()
194 | return nil
195 | }
196 |
197 | func (p *Page) parse(n *html.Node) {
198 | if n == nil {
199 | return
200 | }
201 | switch n.Data {
202 | case "td", "th":
203 | p.colSpan = append(p.colSpan, p.intAttrOr(n, "colspan", 1))
204 | p.rowSpan = append(p.rowSpan, p.intAttrOr(n, "rowspan", 1))
205 | var sb strings.Builder
206 | p.innerText(n, &sb)
207 | p.row = append(p.row, sb.String())
208 | return
209 | case "tr":
210 | p.finishRow()
211 | case "table":
212 | p.finishTable()
213 | }
214 | for c := n.FirstChild; c != nil; c = c.NextSibling {
215 | p.parse(c)
216 | }
217 | }
218 |
219 | func (p *Page) intAttrOr(n *html.Node, attr string, default_ int) int {
220 | for _, a := range n.Attr {
221 | if a.Key != attr {
222 | continue
223 | }
224 | val, err := strconv.Atoi(a.Val)
225 | if err != nil {
226 | return default_
227 | }
228 | return val
229 | }
230 | return default_
231 | }
232 |
233 | func (p *Page) finishRow() {
234 | if len(p.row) == 0 {
235 | return
236 | }
237 | if len(p.row) > p.maxCols {
238 | p.maxCols = len(p.row)
239 | }
240 | p.rows = append(p.rows, p.row)
241 | p.cSpans = append(p.cSpans, p.colSpan)
242 | p.rSpans = append(p.rSpans, p.rowSpan)
243 | p.row = []string{}
244 | p.colSpan = []int{}
245 | p.rowSpan = []int{}
246 | }
247 |
// cellSpan describes the rectangle of grid positions covered by one spanning
// cell. Begin coordinates are inclusive, End coordinates are exclusive.
type cellSpan struct {
	BeginX, EndX int
	BeginY, EndY int
	Value        string
}

// Match reports whether the position (x, y) lies inside the span.
func (d *cellSpan) Match(x, y int) bool {
	insideX := d.BeginX <= x && x < d.EndX
	insideY := d.BeginY <= y && y < d.EndY
	return insideX && insideY
}
269 |
270 | type spans []cellSpan
271 |
272 | func (s spans) Value(x, y int) (string, bool) {
273 | for _, v := range s {
274 | if !v.Match(x, y) {
275 | continue
276 | }
277 | return v.Value, true
278 | }
279 | return "", false
280 | }
281 |
282 | func (p *Page) finishTable() {
283 | defer func() {
284 | if r := recover(); r != nil {
285 | firstRow := []string{}
286 | if len(p.rows) > 0 {
287 | firstRow = p.rows[0][:]
288 | }
289 | Logger(p.ctx, "unparsable table", "panic", fmt.Sprintf("%v", r), "firstRow", firstRow)
290 | }
291 | p.rows = [][]string{}
292 | p.colSpans = []int{}
293 | p.rowSpans = []int{}
294 | p.cSpans = [][]int{}
295 | p.rSpans = [][]int{}
296 | p.maxCols = 0
297 | }()
298 | p.finishRow()
299 | if len(p.rows) == 0 {
300 | return
301 | }
302 |
303 | rows := [][]string{}
304 | allSpans := spans{}
305 | rowSkips := 0
306 | gotHeader := false
307 |
308 | ROWS:
309 | for y := 0; y < len(p.rows); y++ { // rows cols addressable by x
310 | currentRow := []string{}
311 | skipRow := false
312 | k := 0 // next row columns
313 | j := 0 // p.rows cols addressable by j
314 | for x := 0; x < p.maxCols; x++ {
315 | value, ok := allSpans.Value(x, y)
316 | if ok {
317 | currentRow = append(currentRow, value)
318 | continue
319 | }
320 | if gotHeader && len(p.rows[y]) == 1 && p.rows[y][0] == "" {
321 | // this are most likely empty rows or table dividers
322 | rowSkips++
323 | continue ROWS
324 | }
325 | if len(p.rSpans[y]) == j {
326 | break
327 | }
328 | rowSpan := p.rSpans[y][j]
329 | colSpan := p.cSpans[y][j]
330 | value = p.rows[y][j]
331 | if gotHeader && (rowSpan > 1 || colSpan > 1) {
332 | allSpans = append(allSpans, cellSpan{
333 | BeginX: x,
334 | EndX: x + colSpan,
335 | BeginY: y,
336 | EndY: y + rowSpan,
337 | Value: value,
338 | })
339 | }
340 | if !gotHeader && colSpan > 1 {
341 | skipRow = true
342 | // in header: merge, in row - duplicate
343 | for q := 0; q < colSpan; q++ {
344 | nextValue := fmt.Sprintf("%s %s", value, p.rows[y+1][k])
345 | currentRow = append(currentRow, nextValue)
346 | k++
347 | }
348 | } else {
349 | currentRow = append(currentRow, value)
350 | }
351 | j++
352 | }
353 | if skipRow {
354 | rowSkips++
355 | y++
356 | }
357 | gotHeader = true
358 | if len(currentRow) > p.maxCols {
359 | p.maxCols = len(currentRow)
360 | }
361 | rows = append(rows, currentRow)
362 | }
363 | header := rows[0]
364 | rows = rows[1:]
365 | Logger(p.ctx, "found table", "columns", header, "count", len(rows))
366 | p.Tables = append(p.Tables, &Table{
367 | Header: header,
368 | Rows: rows,
369 | })
370 | }
371 |
372 | func (p *Page) innerText(n *html.Node, sb *strings.Builder) {
373 | if n.Type == html.TextNode {
374 | sb.WriteString(strings.TrimSpace(n.Data))
375 | return
376 | }
377 | if n.FirstChild == nil {
378 | return
379 | }
380 | for c := n.FirstChild; c != nil; c = c.NextSibling {
381 | p.innerText(c, sb)
382 | }
383 | }
384 |
// Table is the low-level representation of a parsed table: its raw header
// and rows.
//
// Cell values are built from text nodes that have had their surrounding
// whitespace trimmed.
type Table struct {
	// Header holds the column names.
	Header []string

	// Rows holds the data rows, one string slice per row.
	Rows [][]string
}

// String summarises the table as its header names and row count.
func (table *Table) String() string {
	columns := strings.Join(table.Header, ", ")
	rowCount := len(table.Rows)
	return fmt.Sprintf("Table[%s] (%d rows)", columns, rowCount)
}
399 |
--------------------------------------------------------------------------------
/page_test.go:
--------------------------------------------------------------------------------
1 | package htmltable
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "io"
7 | "net/http"
8 | "strings"
9 | "testing"
10 |
11 | "golang.org/x/net/html"
12 | )
13 |
14 | const fixture = `
15 | foo
16 |
17 | a | b |
18 | 1 | 2 |
19 | 3 | 4 |
20 |
21 | bar
22 |
23 | b | c | d |
24 | 1 | 2 | 5 |
25 | 3 | 4 | 6 |
26 |
27 | `
28 |
29 | func TestFindsAllTables(t *testing.T) {
30 | p, err := NewFromString(fixture)
31 | assertNoError(t, err)
32 | assertEqual(t, p.Len(), 2)
33 | }
34 |
35 | // added public domain data from https://en.wikipedia.org/wiki/List_of_S&P_500_companies
36 | const fixtureColspans = `
37 |
38 |
39 | Date |
40 | Added |
41 | Removed |
42 | Reason |
43 |
44 |
45 | Ticker |
46 | Security |
47 | Ticker |
48 | Security |
49 |
50 |
51 |
52 |
53 | June 21, 2022 |
54 | KDP |
55 | Keurig Dr Pepper |
56 | UA/UAA |
57 | Under Armour |
58 | Market capitalization change.[4] |
59 |
60 |
61 | June 21, 2022 |
62 | ON |
63 | ON Semiconductor |
64 | IPGP |
65 | IPG Photonics |
66 | Market capitalization change.[4] |
67 |
68 |
69 |
`
70 |
71 | func TestFindsWithColspans(t *testing.T) {
72 | p, err := NewFromString(fixtureColspans)
73 | assertNoError(t, err)
74 | assertEqual(t, p.Len(), 1)
75 | assertEqual(t, "Added Ticker", p.Tables[0].Header[1])
76 | assertEqual(t, "Market capitalization change.[4]", p.Tables[0].Rows[0][5])
77 | }
78 |
79 | func TestInitFails(t *testing.T) {
80 | prev := htmlParse
81 | t.Cleanup(func() {
82 | htmlParse = prev
83 | })
84 | htmlParse = func(r io.Reader) (*html.Node, error) {
85 | return nil, fmt.Errorf("nope")
86 | }
87 | _, err := New(context.Background(), strings.NewReader(".."))
88 |
89 | assertEqualError(t, err, "nope")
90 | }
91 |
92 | func TestNewFromHttpResponseError(t *testing.T) {
93 | prev := htmlParse
94 | t.Cleanup(func() {
95 | htmlParse = prev
96 | })
97 | htmlParse = func(r io.Reader) (*html.Node, error) {
98 | return nil, fmt.Errorf("nope")
99 | }
100 | _, err := NewFromResponse(&http.Response{
101 | Request: &http.Request{},
102 | })
103 | assertEqualError(t, err, "nope")
104 | }
105 |
106 | func TestRealPageFound(t *testing.T) {
107 | wiki, err := http.Get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
108 | assertNoError(t, err)
109 | p, err := NewFromResponse(wiki)
110 | assertNoError(t, err)
111 | snp, err := p.FindWithColumns("Symbol", "Security", "CIK")
112 | assertNoError(t, err)
113 | assertGreaterOrEqual(t, len(snp.Rows), 500)
114 | }
115 |
116 | func TestRealPageFound_BasicRowColSpans(t *testing.T) {
117 | wiki, err := http.Get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
118 | assertNoError(t, err)
119 | p, err := NewFromResponse(wiki)
120 | assertNoError(t, err)
121 | snp, err := p.FindWithColumns("Date", "Added Ticker", "Removed Ticker")
122 | assertNoError(t, err)
123 | assertGreaterOrEqual(t, len(snp.Rows), 250)
124 | }
125 |
126 | func TestFindsTableByColumnNames(t *testing.T) {
127 | p, err := NewFromString(fixture)
128 | assertNoError(t, err)
129 |
130 | cd, err := p.FindWithColumns("c", "d")
131 | assertNoError(t, err)
132 | assertEqual(t, 2, len(cd.Rows))
133 | }
134 |
135 | func TestEach(t *testing.T) {
136 | p, err := NewFromString(fixture)
137 | assertNoError(t, err)
138 | err = p.Each("a", func(a string) error {
139 | t.Logf("%s", a)
140 | return nil
141 | })
142 | assertNoError(t, err)
143 | }
144 |
145 | func TestEachFails(t *testing.T) {
146 | p, err := NewFromString(fixture)
147 | assertNoError(t, err)
148 | err = p.Each("a", func(a string) error {
149 | return fmt.Errorf("nope")
150 | })
151 | assertEqualError(t, err, "row 0: nope")
152 | }
153 |
154 | func TestEachFailsNoCols(t *testing.T) {
155 | p, err := NewFromString(fixture)
156 | assertNoError(t, err)
157 | err = p.Each("x", func(a string) error {
158 | return nil
159 | })
160 | assertEqualError(t, err, "cannot find table with columns: x")
161 | }
162 |
163 | func TestEach2(t *testing.T) {
164 | p, err := NewFromString(fixture)
165 | assertNoError(t, err)
166 | err = p.Each2("b", "c", func(b, c string) error {
167 | t.Logf("%s %s", b, c)
168 | return nil
169 | })
170 | assertNoError(t, err)
171 | }
172 |
173 | func TestEach2Fails(t *testing.T) {
174 | p, err := NewFromString(fixture)
175 | assertNoError(t, err)
176 | err = p.Each2("b", "c", func(b, c string) error {
177 | return fmt.Errorf("nope")
178 | })
179 | assertEqualError(t, err, "row 0: nope")
180 | }
181 |
182 | func TestEach2FailsNoCols(t *testing.T) {
183 | p, err := NewFromString(fixture)
184 | assertNoError(t, err)
185 | err = p.Each2("x", "y", func(b, c string) error {
186 | return nil
187 | })
188 | assertEqualError(t, err, "cannot find table with columns: x, y")
189 | }
190 |
191 | func TestEach3(t *testing.T) {
192 | p, err := NewFromString(fixture)
193 | assertNoError(t, err)
194 | err = p.Each3("b", "c", "d", func(b, c, d string) error {
195 | t.Logf("%s %s %s", b, c, d)
196 | return nil
197 | })
198 | assertNoError(t, err)
199 | }
200 |
201 | func TestEach3Fails(t *testing.T) {
202 | p, err := NewFromString(fixture)
203 | assertNoError(t, err)
204 | err = p.Each3("b", "c", "d", func(b, c, d string) error {
205 | return fmt.Errorf("nope")
206 | })
207 | assertEqualError(t, err, "row 0: nope")
208 | }
209 |
210 | func TestEach3FailsNoCols(t *testing.T) {
211 | p, err := NewFromString(fixture)
212 | assertNoError(t, err)
213 | err = p.Each3("x", "y", "z", func(b, c, d string) error {
214 | return nil
215 | })
216 | assertEqualError(t, err, "cannot find table with columns: x, y, z")
217 | }
218 |
219 | func TestMoreThanOneTableFoundErrors(t *testing.T) {
220 | p, err := NewFromString(fixture)
221 | assertNoError(t, err)
222 |
223 | _, err = p.FindWithColumns("b")
224 | assertError(t, err)
225 | }
226 |
227 | func TestNoTablesFoundErrors(t *testing.T) {
228 | p, err := NewFromString(fixture)
229 | assertNoError(t, err)
230 |
231 | _, err = p.FindWithColumns("z")
232 | assertError(t, err)
233 | }
234 |
235 | func TestNilNodeReturns(t *testing.T) {
236 | p := &Page{}
237 | p.parse(nil)
238 | }
239 |
--------------------------------------------------------------------------------
/slice.go:
--------------------------------------------------------------------------------
1 | package htmltable
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "io"
7 | "net/http"
8 | "reflect"
9 | "strings"
10 | )
11 |
12 | // NewSlice returns slice of annotated struct types from io.Reader
13 | func NewSlice[T any](ctx context.Context, r io.Reader) ([]T, error) {
14 | f := &feeder[T]{
15 | Page: Page{ctx: ctx},
16 | }
17 | f.init(r)
18 | return f.slice()
19 | }
20 |
21 | // NewSliceFromPage finds a table matching the slice and returns the slice
22 | func NewSliceFromPage[T any](p *Page) ([]T, error) {
23 | return (&feeder[T]{
24 | Page: *p,
25 | }).slice()
26 | }
27 |
28 | // NewSliceFromString is same as NewSlice(context.Context, io.Reader),
29 | // but takes just a string.
30 | func NewSliceFromString[T any](in string) ([]T, error) {
31 | return NewSlice[T](context.Background(), strings.NewReader(in))
32 | }
33 |
34 | // NewSliceFromString is same as NewSlice(context.Context, io.Reader),
35 | // but takes just an http.Response
36 | func NewSliceFromResponse[T any](resp *http.Response) ([]T, error) {
37 | return NewSlice[T](resp.Request.Context(), resp.Body)
38 | }
39 |
40 | // NewSliceFromString is same as NewSlice(context.Context, io.Reader),
41 | // but takes just an URL.
42 | func NewSliceFromURL[T any](url string) ([]T, error) {
43 | resp, err := http.Get(url)
44 | if err != nil {
45 | return nil, err
46 | }
47 | if resp.Body != nil {
48 | defer resp.Body.Close()
49 | }
50 | return NewSliceFromResponse[T](resp)
51 | }
52 |
// feeder couples a Page with the element type T, so that the rows of a
// table matching T's `header` struct tags can be converted into a []T.
type feeder[T any] struct {
	Page

	// dummy is a zero value of T that headers and slice inspect through
	// reflection; nothing in this file assigns to it.
	dummy T
}
58 |
59 | func (f *feeder[T]) headers() ([]string, map[string]int, error) {
60 | dt := reflect.ValueOf(f.dummy)
61 | elem := dt.Type()
62 | headers := []string{}
63 | fields := map[string]int{}
64 | for i := 0; i < elem.NumField(); i++ {
65 | field := elem.Field(i)
66 | header := field.Tag.Get("header")
67 | if header == "" {
68 | continue
69 | }
70 | err := f.isTypeSupported(field)
71 | if err != nil {
72 | return nil, nil, err
73 | }
74 | fields[header] = i
75 | headers = append(headers, header)
76 | }
77 | return headers, fields, nil
78 | }
79 |
80 | func (f *feeder[T]) isTypeSupported(field reflect.StructField) error {
81 | k := field.Type.Kind()
82 | if k == reflect.String {
83 | return nil
84 | }
85 | if k == reflect.Int {
86 | return nil
87 | }
88 | if k == reflect.Bool {
89 | return nil
90 | }
91 | return fmt.Errorf("setting field is not supported, %s is %v",
92 | field.Name, field.Type.Name())
93 | }
94 |
95 | func (f *feeder[T]) table() (*Table, map[int]int, error) {
96 | headers, fields, err := f.headers()
97 | if err != nil {
98 | return nil, nil, err
99 | }
100 | table, err := f.FindWithColumns(headers...)
101 | if err != nil {
102 | return nil, nil, err
103 | }
104 | mapping := map[int]int{}
105 | for idx, header := range table.Header {
106 | field, ok := fields[header]
107 | if !ok {
108 | continue
109 | }
110 | mapping[idx] = field
111 | }
112 | return table, mapping, nil
113 | }
114 |
115 | func (f *feeder[T]) slice() ([]T, error) {
116 | table, mapping, err := f.table()
117 | if err != nil {
118 | return nil, err
119 | }
120 | dummy := reflect.ValueOf(f.dummy)
121 | dt := dummy.Type()
122 | sliceValue := reflect.MakeSlice(reflect.SliceOf(dt),
123 | len(table.Rows), len(table.Rows))
124 | for rowIdx, row := range table.Rows {
125 | item := sliceValue.Index(rowIdx)
126 | for idx, field := range mapping {
127 | if len(row) < len(mapping) && idx == len(row) {
128 | // either corrupt row or something like that
129 | continue
130 | }
131 | switch item.Field(field).Kind() {
132 | case reflect.String:
133 | item.Field(field).SetString(row[idx])
134 | case reflect.Bool:
135 | var v bool
136 | lower := strings.ToLower(row[idx])
137 | if lower == "yes" ||
138 | lower == "y" ||
139 | lower == "true" ||
140 | lower == "t" {
141 | v = true
142 | }
143 | item.Field(field).SetBool(v)
144 | case reflect.Int:
145 | var v int64
146 | _, err := fmt.Sscan(row[idx], &v)
147 | if err != nil {
148 | column := table.Header[idx]
149 | return nil, fmt.Errorf("row %d: %s: %w", rowIdx, column, err)
150 | }
151 | item.Field(field).SetInt(v)
152 | default: // noop
153 | }
154 | }
155 | }
156 | return sliceValue.Interface().([]T), nil
157 | }
158 |
--------------------------------------------------------------------------------
/slice_test.go:
--------------------------------------------------------------------------------
1 | package htmltable
2 |
3 | import (
4 | "net/http"
5 | "net/http/httptest"
6 | "testing"
7 | )
8 |
// nice is a test row type that binds the table columns "c" and "d" to
// string fields through `header` struct tags.
type nice struct {
	C string `header:"c"`
	D string `header:"d"`
}
13 |
14 | func TestNewSliceFromString(t *testing.T) {
15 | out, err := NewSliceFromString[nice](fixture)
16 | assertNoError(t, err)
17 | assertEqual(t, []nice{
18 | {"2", "5"},
19 | {"4", "6"},
20 | }, out)
21 | }
22 |
// Ticker is a test row type that binds the "Symbol", "Security" and "CIK"
// table columns to string fields through `header` struct tags.
type Ticker struct {
	Symbol   string `header:"Symbol"`
	Security string `header:"Security"`
	CIK      string `header:"CIK"`
}
28 |
29 | func TestNewSliceFromUrl(t *testing.T) {
30 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
31 | out, err := NewSliceFromURL[Ticker](url)
32 | assertNoError(t, err)
33 | assertGreaterOrEqual(t, len(out), 500)
34 | }
35 |
36 | func TestNewSliceFromPage(t *testing.T) {
37 | url := "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
38 | p, err := NewFromURL(url)
39 | assertNoError(t, err)
40 | out, err := NewSliceFromPage[Ticker](p)
41 | assertNoError(t, err)
42 | assertGreaterOrEqual(t, len(out), 500)
43 | }
44 |
45 | func TestNewSliceFromUrl_Fails(t *testing.T) {
46 | _, err := NewSliceFromURL[Ticker]("https://127.0.0.1")
47 | assertEqualError(t, err, "Get \"https://127.0.0.1\": dial tcp 127.0.0.1:443: connect: connection refused")
48 | }
49 |
50 | func TestNewSliceFromUrl_NoTables(t *testing.T) {
51 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
52 | w.WriteHeader(200)
53 | }))
54 | defer server.Close()
55 | _, err := NewSliceFromURL[Ticker](server.URL)
56 | assertEqualError(t, err, "cannot find table with columns: Symbol, Security, CIK")
57 | }
58 |
59 | func TestNewSliceInvalidTypes(t *testing.T) {
60 | type exotic struct {
61 | A string `header:""`
62 | C float32 `header:"c"`
63 | }
64 | _, err := NewSliceFromString[exotic](fixture)
65 | assertEqualError(t, err, "setting field is not supported, C is float32")
66 | }
67 |
68 | func TestVeryCreativeTableWithRowAndColspans(t *testing.T) {
69 | type AM4 struct {
70 | Model string `header:"Model"`
71 | ReleaseDate string `header:"Release date"`
72 | PCIeSupport string `header:"PCIesupport[a]"`
73 | MultiGpuCrossFire bool `header:"Multi-GPU CrossFire"`
74 | MultiGpuSLI bool `header:"Multi-GPU SLI"`
75 | USBSupport string `header:"USBsupport[b]"`
76 | SATAPorts int `header:"Storage features SATAports"`
77 | RAID string `header:"Storage features RAID"`
78 | AMDStoreMI bool `header:"Storage features AMD StoreMI"`
79 | Overclocking string `header:"Processoroverclocking"`
80 | TDP string `header:"TDP"`
81 | SupportExcavator string `header:"CPU support[14] Excavator"`
82 | SupportZen string `header:"CPU support[14] Zen"`
83 | SupportZenPlus string `header:"CPU support[14] Zen+"`
84 | SupportZen2 string `header:"CPU support[14] Zen 2"`
85 | SupportZen3 string `header:"CPU support[14] Zen 3"`
86 | Architecture string `header:"Architecture"`
87 | }
88 | chipsets, err := NewSliceFromString[AM4](am4info)
89 | assertNoError(t, err)
90 | expected := []AM4{
91 | { // row 0
92 | Model: "A320",
93 | ReleaseDate: "February 2017[15]",
94 | PCIeSupport: "PCIe 2.0 ×4",
95 | MultiGpuCrossFire: false,
96 | MultiGpuSLI: false,
97 | USBSupport: "1, 2, 6",
98 | SATAPorts: 4,
99 | RAID: "0,1,10",
100 | AMDStoreMI: false,
101 | Overclocking: "Limited to pre-Zen CPUs, unless an unsupported third-party motherboard firmware applied",
102 | TDP: "~5 W[16]",
103 | SupportExcavator: "Yes",
104 | SupportZen: "Yes",
105 | SupportZenPlus: "Yes",
106 | SupportZen2: "Varies[c]",
107 | SupportZen3: "Varies[c]",
108 | Architecture: "Promontory",
109 | },
110 | { // row 1
111 | Model: "B350",
112 | ReleaseDate: "February 2017[15]",
113 | PCIeSupport: "PCIe 2.0 ×6",
114 | MultiGpuCrossFire: true,
115 | MultiGpuSLI: false,
116 | USBSupport: "2, 2, 6",
117 | SATAPorts: 4,
118 | RAID: "0,1,10",
119 | AMDStoreMI: false,
120 | Overclocking: "Yes",
121 | TDP: "~5 W[16]",
122 | SupportExcavator: "Yes",
123 | SupportZen: "Yes",
124 | SupportZenPlus: "Yes",
125 | SupportZen2: "Varies[c]",
126 | SupportZen3: "Varies[c]",
127 | Architecture: "Promontory",
128 | },
129 | { // row 2
130 | Model: "X370",
131 | ReleaseDate: "February 2017[15]",
132 | PCIeSupport: "PCIe 2.0 ×8",
133 | MultiGpuCrossFire: true,
134 | MultiGpuSLI: true,
135 | USBSupport: "2, 6, 6",
136 | SATAPorts: 8,
137 | RAID: "0,1,10",
138 | AMDStoreMI: false,
139 | Overclocking: "Yes",
140 | TDP: "~5 W[16]",
141 | SupportExcavator: "Yes",
142 | SupportZen: "Yes",
143 | SupportZenPlus: "Yes",
144 | SupportZen2: "Varies[c]",
145 | SupportZen3: "Varies[c]",
146 | Architecture: "Promontory",
147 | },
148 | { // row 3
149 | Model: "B450",
150 | ReleaseDate: "March 2018[17]",
151 | PCIeSupport: "PCIe 2.0 ×6",
152 | MultiGpuCrossFire: true,
153 | MultiGpuSLI: false,
154 | USBSupport: "2, 2, 6",
155 | SATAPorts: 4,
156 | RAID: "0,1,10",
157 | AMDStoreMI: true,
158 | Overclocking: "Yes,withPBO",
159 | TDP: "~5 W[16]",
160 | SupportExcavator: "Varies[d]",
161 | SupportZen: "Yes",
162 | SupportZenPlus: "Yes",
163 | SupportZen2: "Yes",
164 | SupportZen3: "Varies[d][18]",
165 | Architecture: "Promontory",
166 | },
167 | { // row 4
168 | Model: "X470",
169 | ReleaseDate: "March 2018[17]",
170 | PCIeSupport: "PCIe 2.0 ×8",
171 | MultiGpuCrossFire: true,
172 | MultiGpuSLI: true,
173 | USBSupport: "2, 6, 6",
174 | SATAPorts: 8,
175 | RAID: "0,1,10",
176 | AMDStoreMI: true,
177 | Overclocking: "Yes,withPBO",
178 | TDP: "~5 W[16]",
179 | SupportExcavator: "Varies[d]",
180 | SupportZen: "Yes",
181 | SupportZenPlus: "Yes",
182 | SupportZen2: "Yes",
183 | SupportZen3: "Varies[d][18]",
184 | Architecture: "Promontory",
185 | },
186 | { // row 5
187 | Model: "A520",
188 | ReleaseDate: "August 2020[19]",
189 | PCIeSupport: "PCIe 3.0 ×6",
190 | MultiGpuCrossFire: false,
191 | MultiGpuSLI: false,
192 | USBSupport: "1, 2, 6",
193 | SATAPorts: 4,
194 | RAID: "0,1,10",
195 | AMDStoreMI: true,
196 | Overclocking: "No, unless an unsupported third-party motherboard firmware applied",
197 | TDP: "~5 W[16]",
198 | SupportExcavator: "Varies[d]",
199 | SupportZen: "Varies",
200 | SupportZenPlus: "Yes",
201 | SupportZen2: "Yes",
202 | SupportZen3: "Varies[d][18]",
203 | Architecture: "Promontory",
204 | },
205 | { // row 6
206 | Model: "B550[e]",
207 | ReleaseDate: "June 2020[20]",
208 | PCIeSupport: "PCIe 3.0 ×10[21]",
209 | MultiGpuCrossFire: true,
210 | MultiGpuSLI: false,
211 | USBSupport: "2, 2, 6",
212 | SATAPorts: 6,
213 | RAID: "0,1,10",
214 | AMDStoreMI: true,
215 | Overclocking: "Yes,withPBO",
216 | TDP: "~5 W[16]",
217 | SupportExcavator: "Varies[d]",
218 | SupportZen: "Varies",
219 | SupportZenPlus: "Yes",
220 | SupportZen2: "Yes",
221 | SupportZen3: "Varies[d][18]",
222 | Architecture: "Promontory",
223 | },
224 | { // row 7
225 | Model: "X570",
226 | ReleaseDate: "July 2019[22]",
227 | PCIeSupport: "PCIe 4.0 ×16",
228 | MultiGpuCrossFire: true,
229 | MultiGpuSLI: true,
230 | USBSupport: "8, 0, 4",
231 | SATAPorts: 12,
232 | RAID: "0,1,10",
233 | AMDStoreMI: true,
234 | Overclocking: "Yes,withPBO",
235 | TDP: "~15 W[23][24][f]",
236 | SupportExcavator: "No[g]",
237 | SupportZen: "Yes",
238 | SupportZenPlus: "Yes",
239 | SupportZen2: "Yes",
240 | SupportZen3: "Yes",
241 | Architecture: "Bixby",
242 | },
243 | }
244 | var failed bool
245 | for i, v := range expected {
246 | if chipsets[i].Model != v.Model {
247 | failed = true
248 | t.Logf("expected chipsets[%d].Model (%s) to be %v but got %v", i, v.Model, v.Model, chipsets[i].Model)
249 | }
250 | if chipsets[i].ReleaseDate != v.ReleaseDate {
251 | failed = true
252 | t.Logf("expected chipsets[%d].ReleaseDate (%s) to be %v but got %v", i, v.Model, v.ReleaseDate, chipsets[i].ReleaseDate)
253 | }
254 | if chipsets[i].PCIeSupport != v.PCIeSupport {
255 | failed = true
256 | t.Logf("expected chipsets[%d].PCIeSupport (%s) to be %v but got %v", i, v.Model, v.PCIeSupport, chipsets[i].PCIeSupport)
257 | }
258 | if chipsets[i].MultiGpuCrossFire != v.MultiGpuCrossFire {
259 | failed = true
260 | t.Logf("expected chipsets[%d].MultiGpuCrossFire (%s) to be %v but got %v", i, v.Model, v.MultiGpuCrossFire, chipsets[i].MultiGpuCrossFire)
261 | }
262 | if chipsets[i].MultiGpuSLI != v.MultiGpuSLI {
263 | failed = true
264 | t.Logf("expected chipsets[%d].MultiGpuSLI (%s) to be %v but got %v", i, v.Model, v.MultiGpuSLI, chipsets[i].MultiGpuSLI)
265 | }
266 | if chipsets[i].USBSupport != v.USBSupport {
267 | failed = true
268 | t.Logf("expected chipsets[%d].USBSupport (%s) to be %v but got %v", i, v.Model, v.USBSupport, chipsets[i].USBSupport)
269 | }
270 | if chipsets[i].SATAPorts != v.SATAPorts {
271 | failed = true
272 | t.Logf("expected chipsets[%d].SATAPorts (%s) to be %v but got %v", i, v.Model, v.SATAPorts, chipsets[i].SATAPorts)
273 | }
274 | if chipsets[i].RAID != v.RAID {
275 | failed = true
276 | t.Logf("expected chipsets[%d].RAID (%s) to be %v but got %v", i, v.Model, v.RAID, chipsets[i].RAID)
277 | }
278 | if chipsets[i].AMDStoreMI != v.AMDStoreMI {
279 | failed = true
280 | t.Logf("expected chipsets[%d].AMDStoreMI (%s) to be %v but got %v", i, v.Model, v.AMDStoreMI, chipsets[i].AMDStoreMI)
281 | }
282 | if chipsets[i].Overclocking != v.Overclocking {
283 | failed = true
284 | t.Logf("expected chipsets[%d].Overclocking (%s) to be %v but got %v", i, v.Model, v.Overclocking, chipsets[i].Overclocking)
285 | }
286 | if chipsets[i].TDP != v.TDP {
287 | failed = true
288 | t.Logf("expected chipsets[%d].TDP (%s) to be %v but got %v", i, v.Model, v.TDP, chipsets[i].TDP)
289 | }
290 | if chipsets[i].SupportExcavator != v.SupportExcavator {
291 | failed = true
292 | t.Logf("expected chipsets[%d].SupportExcavator (%s) to be %v but got %v", i, v.Model, v.SupportExcavator, chipsets[i].SupportExcavator)
293 | }
294 | if chipsets[i].SupportZen != v.SupportZen {
295 | failed = true
296 | t.Logf("expected chipsets[%d].SupportZen (%s) to be %v but got %v", i, v.Model, v.SupportZen, chipsets[i].SupportZen)
297 | }
298 | if chipsets[i].SupportZenPlus != v.SupportZenPlus {
299 | failed = true
300 | t.Logf("expected chipsets[%d].SupportZenPlus (%s) to be %v but got %v", i, v.Model, v.SupportZenPlus, chipsets[i].SupportZenPlus)
301 | }
302 | if chipsets[i].SupportZen2 != v.SupportZen2 {
303 | failed = true
304 | t.Logf("expected chipsets[%d].SupportZen2 (%s) to be %v but got %v", i, v.Model, v.SupportZen2, chipsets[i].SupportZen2)
305 | }
306 | if chipsets[i].SupportZen3 != v.SupportZen3 {
307 | failed = true
308 | t.Logf("expected chipsets[%d].SupportZen3 (%s) to be %v but got %v", i, v.Model, v.SupportZen3, chipsets[i].SupportZen3)
309 | }
310 | if chipsets[i].Architecture != v.Architecture {
311 | failed = true
312 | t.Logf("expected chipsets[%d].Architecture (%s) to be %v but got %v", i, v.Model, v.Architecture, chipsets[i].Architecture)
313 | }
314 | }
315 | if failed {
316 | t.Fail()
317 | }
318 | }
319 |
320 | // taken from https://en.wikipedia.org/wiki/List_of_AMD_chipsets#AM4_chipsets
321 | const am4info = `
322 |
323 |
324 | Model |
325 | Release date |
326 | PCIe support[a] |
327 | Multi-GPU |
328 | USB support[b] |
329 | Storage features |
330 | Processor overclocking |
331 | TDP |
332 | CPU support[14] |
333 | Architecture |
334 |
335 |
336 | CrossFire |
337 | SLI |
338 | SATA ports |
339 | RAID |
340 | AMD StoreMI |
341 | Excavator |
342 | Zen |
343 | Zen+ |
344 | Zen 2 |
345 | Zen 3 |
346 |
347 |
348 | A320 |
349 | February 2017[15] |
350 | PCIe 2.0 ×4 |
351 | No |
352 | No |
353 | 1, 2, 6 |
354 | 4 |
355 | 0, 1, 10 |
356 | No |
357 | Limited to pre-Zen CPUs, unless an unsupported third-party motherboard firmware applied |
358 | ~5 W[16] |
359 | Yes |
360 | Yes |
361 | Yes |
362 | Varies[c] |
363 | Promontory |
364 |
365 |
366 | B350 |
367 | February 2017[15] |
368 | PCIe 2.0 ×6 |
369 | Yes |
370 | 2, 2, 6 |
371 | Yes |
372 |
373 |
374 | X370 |
375 | February 2017[15] |
376 | PCIe 2.0 ×8 |
377 | Yes |
378 | 2, 6, 6 |
379 | 8 |
380 |
381 |
382 | B450 |
383 | March 2018[17] |
384 | PCIe 2.0 ×6 |
385 | No |
386 | 2, 2, 6 |
387 | 4 |
388 | Yes |
389 | Yes, with PBO |
390 | Varies[d] |
391 | Yes |
392 | Varies[d][18] |
393 |
394 |
395 | X470 |
396 | March 2018[17] |
397 | PCIe 2.0 ×8 |
398 | Yes |
399 | 2, 6, 6 |
400 | 8 |
401 |
402 |
403 | A520 |
404 | August 2020[19] |
405 | PCIe 3.0 ×6 |
406 | No |
407 | No |
408 | 1, 2, 6 |
409 | 4 |
410 | No, unless an unsupported third-party motherboard firmware applied |
411 | Varies |
412 | Yes |
413 |
414 |
415 | B550[e] |
416 | June 2020[20] |
417 | PCIe 3.0 ×10[21] |
418 | Yes |
419 | Varies |
420 | 2, 2, 6 |
421 | 6 |
422 | Yes, with PBO |
423 |
424 |
425 | X570 |
426 | July 2019[22] |
427 | PCIe 4.0 ×16 |
428 | Yes |
429 | 8, 0, 4 |
430 | 12 |
431 | ~15 W[23][24] [f] |
432 | No [g] |
433 | Yes |
434 | Yes |
435 | Bixby |
436 |
437 |
438 |
`
439 |
--------------------------------------------------------------------------------
/util_test.go:
--------------------------------------------------------------------------------
1 | package htmltable
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 | )
7 |
// assertError marks the test as failed, without stopping it, when err is
// nil. Failures are attributed to the calling line.
func assertError(t *testing.T, err error) {
	t.Helper()
	if err == nil {
		t.Errorf("expected error, got nil")
	}
}
13 |
// assertNoError marks the test as failed, without stopping it, when err is
// not nil. Failures are attributed to the calling line.
func assertNoError(t *testing.T, err error) {
	t.Helper()
	if err != nil {
		t.Errorf("expected no error, got %s", err.Error())
	}
}
19 |
// assertEqualError marks the test as failed, without stopping it, when err
// is nil or when its message differs from msg. Failures are attributed to
// the calling line.
func assertEqualError(t *testing.T, err error, msg string) {
	t.Helper()
	if err == nil {
		// Report and return: calling Error on a nil error would panic.
		t.Errorf("expected error, got nil")
		return
	}
	if got := err.Error(); got != msg {
		t.Errorf("%#v (expected) != %#v (got)", msg, got)
	}
}
27 |
// assertEqual marks the test as failed, without stopping it, when a (the
// expected value) and b (the actual value) are not deeply equal. Failures
// are attributed to the calling line.
func assertEqual(t *testing.T, a, b any) {
	t.Helper()
	if !reflect.DeepEqual(a, b) {
		t.Errorf("%#v (expected) != %#v (got)", a, b)
	}
}
33 |
// comparable lists the types accepted by assertGreaterOrEqual; both
// support the >= operator.
//
// NOTE(review): within this package the name shadows Go's predeclared
// comparable constraint. It is kept to avoid breaking other users.
type comparable interface {
	int | string
}

// assertGreaterOrEqual marks the test as failed, without stopping it, when
// the actual value a is smaller than the lower bound b. Failures are
// attributed to the calling line.
func assertGreaterOrEqual[T comparable](t *testing.T, a, b T) {
	t.Helper()
	if a < b {
		t.Errorf("expected %#v to be >= %#v", a, b)
	}
}
43 |
--------------------------------------------------------------------------------