├── .gitignore ├── .travis.yml ├── ErrorMessages.go ├── LICENSE ├── README.md ├── benchmark_test.go ├── examples_test.go ├── go.mod ├── go.sum ├── html.go ├── html_test.go ├── testHTML ├── all-type.html └── courses.html └── utils.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | go.coverprofile -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go # 声明构建语言环境 2 | sudo: false # 开启基于容器的Travis CI任务,让编译效率更高。 3 | 4 | notifications: # 每次构建的时候是否通知,如果不想收到通知,那就设置false吧(email: false) 5 | email: 6 | recipients: 7 | - hexileee@gmail.com 8 | on_success: change 9 | on_failure: always 10 | 11 | go: 12 | - "1.11" 13 | 14 | install: 15 | - go build 16 | - go get github.com/mattn/goveralls 17 | 18 | 19 | script: # 集成脚本 20 | - go test -bench=. 21 | - go test -v -cover -coverprofile=go.coverprofile 22 | - goveralls -coverprofile=go.coverprofile -service=travis-ci -repotoken=$COVERALLS_TOKEN 23 | 24 | after_success: 25 | - bash <(curl -s https://codecov.io/bash) 26 | 27 | env: #env环境变量设置,travis提供的repo_token安全方式 28 | global: 29 | secure: "XgZkE9tb9of2lsnXepbJeQU2xj3RuJea1zjlWh4W8ukH8Nwp9CJ6wfs3YnuchV+x1npgY1qE+sdWpieB6brQZgqJNzhC9vXcdOissPUviS0JUctric2UEz+4b0C4pN0LsTYCCENdTdFi62S/0LzyXugumVdeQHHtE75EknnhRhaQh4Dz04c7O7SBygVHEn9DwpbD8LRYVXwrSQ8aeWqhRgI6t4LRYInAs0M8hV4AE1HyKSkIueFQcbAfsV1lMB+pfiVHin5g6veRtRYq+++LzyoQ+gsSKx7yfFFOafKiw6e47lAKQaBQzkkkwy9RaB5KcWhI3CQfYZL7fKwqNy7ytKg3l9oboFEqGc42U6ZiGa1DLv8y3f2yaNtkgSpw5QMRjEd+iYVb2EFISfelUFq4srJ4utv+bn47/tBURN2dsLVAgKY/Z8uPv2fvbIUbkbej91uRyRDGQs1z02MW+66DacfnGrZz3yNFYCffTLCDBFFhrG2IsP7tJuSz7s9RnRnQoGTk5mImPCMnYIhTO0sQj4083qmT6+drx0RPVqUMjm99JeTFfMZfpzXyNAe1zP2DU8IqtKdtk4dzodBPy8MvWKsYNJHwjZKJmQyrOWW74+OvchG2WHN3gkY7fXKqalNWK/yb8ARBsj0TKbGPn29okXp5ut8UHXpVDrN+Kg7BAyg=" 30 | 
-------------------------------------------------------------------------------- /ErrorMessages.go: -------------------------------------------------------------------------------- 1 | package unhtml 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | ) 7 | 8 | const ( 9 | UnmarshaledKindMustBePtr = "unmarshaled kind must be Ptr" 10 | UnmarshalerItemKind = "unmarshaled elem cannot be Ptr/Uintptr/Interface/Chan/Func/" 11 | DtoZero = "dto cannot be zero" 12 | SelectionNil = "selection cannot be nil" 13 | ConverterNotExist = "converter not exist" 14 | ConverterTypeWrong = "type of converter is wrong" 15 | ) 16 | 17 | type ( 18 | UnmarshaledKindMustBePtrError struct { 19 | dtoType reflect.Type 20 | } 21 | 22 | UnmarshalerItemKindError struct { 23 | dtoType reflect.Type 24 | } 25 | 26 | ConverterNotExistError struct { 27 | name string 28 | } 29 | 30 | ConverterTypeWrongError struct { 31 | name string 32 | methodType reflect.Type 33 | } 34 | ) 35 | 36 | func NewUnmarshaledKindMustBePtrError(dtoType reflect.Type) *UnmarshaledKindMustBePtrError { 37 | return &UnmarshaledKindMustBePtrError{dtoType} 38 | } 39 | 40 | func NewUnmarshalerItemKindError(dtoType reflect.Type) *UnmarshalerItemKindError { 41 | return &UnmarshalerItemKindError{dtoType} 42 | } 43 | 44 | func NewConverterNotExistError(name string) *ConverterNotExistError { 45 | return &ConverterNotExistError{name} 46 | } 47 | 48 | func NewConverterTypeWrongError(name string, methodType reflect.Type) *ConverterTypeWrongError { 49 | return &ConverterTypeWrongError{name, methodType} 50 | } 51 | 52 | func (err UnmarshaledKindMustBePtrError) Error() string { 53 | return UnmarshaledKindMustBePtr + ": " + err.dtoType.String() 54 | } 55 | 56 | func (err UnmarshalerItemKindError) Error() string { 57 | return UnmarshalerItemKind + ": " + err.dtoType.String() 58 | } 59 | 60 | func (err ConverterNotExistError) Error() string { 61 | return ConverterNotExist + ": " + err.name 62 | } 63 | 64 | func (err ConverterTypeWrongError) Error() 
string { 65 | return fmt.Sprintf(ConverterTypeWrong+"(%s): %s", err.name, err.methodType) 66 | } 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 ZJU QSC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Coverage Status](https://coveralls.io/repos/github/Hexilee/unhtml/badge.svg)](https://coveralls.io/github/Hexilee/unhtml) 2 | [![Go Report Card](https://goreportcard.com/badge/github.com/Hexilee/unhtml)](https://goreportcard.com/report/github.com/Hexilee/unhtml) 3 | [![Build Status](https://travis-ci.org/Hexilee/unhtml.svg?branch=master)](https://travis-ci.org/Hexilee/unhtml) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Hexilee/unhtml/blob/master/LICENSE) 5 | [![Documentation](https://godoc.org/github.com/Hexilee/unhtml?status.svg)](https://godoc.org/github.com/Hexilee/unhtml) 6 | 7 | Table of Contents 8 | ================= 9 | 10 | * [Example & Performance](#example--performance) 11 | * [Tips & Features](#tips--features) 12 | * [Types](#types) 13 | * [Root](#root) 14 | * [Selector](#selector) 15 | * [Struct](#struct) 16 | * [Slice](#slice) 17 | * [Tags](#tags) 18 | * [html](#html) 19 | * [attr](#attr) 20 | * [converter](#converter) 21 | 22 | 23 | ### Example & Performance 24 | 25 | A HTML file 26 | 27 | ```html 28 | 29 | 30 | 31 | 32 | Title 33 | 34 | 35 |
36 | 42 |
43 |

Hexilee

44 |

20

45 |

true

46 |
47 |

Hello World!

48 |

10

49 |

3.14

50 |

true

51 |
52 | 53 | 54 | ``` 55 | 56 | Read it 57 | 58 | ```go 59 | AllTypeHTML, _ := ioutil.ReadFile("testHTML/all-type.html") 60 | ``` 61 | 62 | If we want to parse it and get the values we want, like the following structs, how should we do it? 63 | 64 | 65 | ```go 66 | package example 67 | 68 | type ( 69 | PartTypesStruct struct { 70 | Slice []int 71 | Struct TestUser 72 | String string 73 | Int int 74 | Float64 float64 75 | Bool bool 76 | } 77 | 78 | TestUser struct { 79 | Name string 80 | Age uint 81 | LikeLemon bool 82 | } 83 | ) 84 | ``` 85 | 86 | In the traditional way, we should do it like this: 87 | 88 | ```go 89 | package example 90 | 91 | import ( 92 | "bytes" 93 | "github.com/PuerkitoBio/goquery" 94 | "strconv" 95 | ) 96 | 97 | func parsePartTypesLogically() (PartTypesStruct, error) { 98 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(AllTypeHTML)) 99 | partTypes := PartTypesStruct{} 100 | if err == nil { 101 | selection := doc.Find(partTypes.Root()) 102 | partTypes.Slice = make([]int, 0) 103 | selection.Find(`ul > li`).Each(func(i int, selection *goquery.Selection) { 104 | Int, parseErr := strconv.Atoi(selection.Text()) 105 | if parseErr != nil { 106 | err = parseErr 107 | } 108 | partTypes.Slice = append(partTypes.Slice, Int) 109 | }) 110 | if err == nil { 111 | partTypes.Struct.Name = selection.Find(`#test > div > p:nth-child(1)`).Text() 112 | Int, parseErr := strconv.Atoi(selection.Find(`#test > div > p:nth-child(2)`).Text()) 113 | if err = parseErr; err == nil { 114 | partTypes.Struct.Age = uint(Int) 115 | Bool, parseErr := strconv.ParseBool(selection.Find(`#test > div > p:nth-child(3)`).Text()) 116 | if err = parseErr; err == nil { 117 | partTypes.Struct.LikeLemon = Bool 118 | 119 | String := selection.Find(`#test > p:nth-child(3)`).Text() 120 | Int, parseErr := strconv.Atoi(selection.Find(`#test > p:nth-child(4)`).Text()) 121 | if err = parseErr; err != nil { 122 | return partTypes, err 123 | } 124 | 125 | Float64, parseErr := 
strconv.ParseFloat(selection.Find(`#test > p:nth-child(5)`).Text(), 0) 126 | if err = parseErr; err != nil { 127 | return partTypes, err 128 | } 129 | 130 | Bool, parseErr := strconv.ParseBool(selection.Find(`#test > p:nth-child(6)`).Text()) 131 | if err = parseErr; err != nil { 132 | return partTypes, err 133 | } 134 | partTypes.String = String 135 | partTypes.Int = Int 136 | partTypes.Float64 = Float64 137 | partTypes.Bool = Bool 138 | } 139 | } 140 | } 141 | } 142 | return partTypes, err 143 | } 144 | 145 | ``` 146 | 147 | It works pretty well, but writing it by hand is tedious. Now you can do it like this instead: 148 | 149 | ```go 150 | package main 151 | 152 | import ( 153 | "encoding/json" 154 | "fmt" 155 | "github.com/Hexilee/unhtml" 156 | "io/ioutil" 157 | ) 158 | 159 | type ( 160 | PartTypesStruct struct { 161 | Slice []int `html:"ul > li"` 162 | Struct TestUser `html:"#test > div"` 163 | String string `html:"#test > p:nth-child(3)"` 164 | Int int `html:"#test > p:nth-child(4)"` 165 | Float64 float64 `html:"#test > p:nth-child(5)"` 166 | Bool bool `html:"#test > p:nth-child(6)"` 167 | } 168 | 169 | TestUser struct { 170 | Name string `html:"p:nth-child(1)"` 171 | Age uint `html:"p:nth-child(2)"` 172 | LikeLemon bool `html:"p:nth-child(3)"` 173 | } 174 | ) 175 | 176 | func (PartTypesStruct) Root() string { 177 | return "#test" 178 | } 179 | 180 | func main() { 181 | allTypes := PartTypesStruct{} 182 | _ = unhtml.Unmarshal(AllTypeHTML, &allTypes) 183 | result, _ := json.Marshal(&allTypes) 184 | fmt.Println(string(result)) 185 | } 186 | ``` 187 | 188 | Result: 189 | 190 | ```json 191 | { 192 | "Slice": [ 193 | 0, 194 | 1, 195 | 2, 196 | 3 197 | ], 198 | "Struct": { 199 | "Name": "Hexilee", 200 | "Age": 20, 201 | "LikeLemon": true 202 | }, 203 | "String": "Hello World!", 204 | "Int": 10, 205 | "Float64": 3.14, 206 | "Bool": true 207 | } 208 | ``` 209 | 210 | I think it can really improve the efficiency of my development, but what about its performance? 
211 | 212 | There are two benchmarks: 213 | 214 | ```go 215 | func BenchmarkUnmarshalPartTypes(b *testing.B) { 216 | assert.NotNil(b, AllTypeHTML) 217 | for i := 0; i < b.N; i++ { 218 | partTypes := PartTypesStruct{} 219 | assert.Nil(b, Unmarshal(AllTypeHTML, &partTypes)) 220 | } 221 | } 222 | 223 | func BenchmarkParsePartTypesLogically(b *testing.B) { 224 | assert.NotNil(b, AllTypeHTML) 225 | for i := 0; i < b.N; i++ { 226 | _, err := parsePartTypesLogically() 227 | assert.Nil(b, err) 228 | } 229 | } 230 | ``` 231 | 232 | Test it: 233 | 234 | ```bash 235 | > go test -bench=. 236 | goos: darwin 237 | goarch: amd64 238 | pkg: github.com/Hexilee/unhtml 239 | BenchmarkUnmarshalPartTypes-4 30000 54096 ns/op 240 | BenchmarkParsePartTypesLogically-4 30000 45188 ns/op 241 | PASS 242 | ok github.com/Hexilee/unhtml 4.098s 243 | ``` 244 | 245 | The overhead is modest (roughly 20% in this run), considering how small the demo HTML is. In real-world development with more complicated HTML, the two approaches perform almost the same. 246 | 247 | ### Tips & Features 248 | 249 | The only API this package exposes is the function: 250 | 251 | ```go 252 | func Unmarshal(data []byte, v interface{}) error 253 | ``` 254 | 255 | whose signature matches `Unmarshal` in the standard library's `encoding/json` and `encoding/xml`. How the HTML is parsed is controlled by the data types you define in your code, as described below. 256 | 257 | #### Types 258 | 259 | This package supports all kinds of type in the `reflect` package except `Ptr/Uintptr/Interface/Chan/Func/Map`. 260 | 261 | The following fields are invalid and will cause `UnmarshalerItemKindError`. 262 | 263 | ```go 264 | type WrongFieldsStruct struct { 265 | Ptr *int 266 | Uintptr uintptr 267 | Interface io.Reader 268 | Chan chan int 269 | Func func() 270 | } 271 | ``` 272 | 273 | However, when you call the function `Unmarshal`, you **MUST** pass a pointer, otherwise you will get an `UnmarshaledKindMustBePtrError`. 
274 | 275 | ```go 276 | a := 1 277 | 278 | // Wrong 279 | Unmarshal([]byte(""), a) 280 | 281 | // Right 282 | Unmarshal([]byte(""), &a) 283 | ``` 284 | 285 | #### Root 286 | 287 | The `Root` method returns the root selector. 288 | 289 | Defining a `Root() string` method is only supported on the root type, like 290 | 291 | ```go 292 | func (PartTypesStruct) Root() string { 293 | return "#test" 294 | } 295 | ``` 296 | 297 | If you define it for a field type, such as `TestUser` 298 | 299 | ```go 300 | func (TestUser) Root() string { 301 | return "#test" 302 | } 303 | ``` 304 | 305 | In this case, in `PartTypesStruct`, the field's selector will be overridden by the value returned from `Root()`. 306 | 307 | ```go 308 | type ( 309 | PartTypesStruct struct { 310 | ... 311 | Struct TestUser `html:"#test > div"` 312 | ... 313 | } 314 | ) 315 | 316 | // what actually takes effect 317 | type ( 318 | PartTypesStruct struct { 319 | ... 320 | Struct TestUser `html:"#test"` 321 | ... 322 | } 323 | ) 324 | ``` 325 | 326 | 327 | 328 | #### Selector 329 | 330 | This package is based on `github.com/PuerkitoBio/goquery` and supports standard CSS selectors. 331 | 332 | You can define the selector of a field in its tag, like this: 333 | 334 | ```go 335 | type ( 336 | PartTypesStruct struct { 337 | ... 338 | Int int `html:"#test > p:nth-child(4)"` 339 | ... 340 | } 341 | ) 342 | ``` 343 | 344 | In most cases, this package will find the `#test > p:nth-child(4)` element and try to parse its `innerText` as int. 345 | 346 | However, when the field's kind is `Struct` or `Slice`, the process is more involved. 347 | 348 | ##### Struct 349 | 350 | ```go 351 | type ( 352 | PartTypesStruct struct { 353 | ... 354 | Struct TestUser `html:"#test > div"` 355 | ... 
356 | } 357 | 358 | TestUser struct { 359 | Name string `html:"p:nth-child(1)"` 360 | Age uint `html:"p:nth-child(2)"` 361 | LikeLemon bool `html:"p:nth-child(3)"` 362 | } 363 | ) 364 | 365 | func (PartTypesStruct) Root() string { 366 | return "#test" 367 | } 368 | ``` 369 | 370 | First, it will call `*goquery.Selection.Find("#test")`, we get: 371 | 372 | ```html 373 |
374 | 380 |
381 |

Hexilee

382 |

20

383 |

true

384 |
385 |

Hello World!

386 |

10

387 |

3.14

388 |

true

389 |
390 | ``` 391 | 392 | Then, it will call `*goquery.Selection.Find("#test > div")`, we get 393 | 394 | ```html 395 |
396 |

Hexilee

397 |

20

398 |

true

399 |
400 | ``` 401 | 402 | Then, in `TestUser`, it will call 403 | 404 | ```go 405 | *goquery.Selection.Find("p:nth-child(1)") // as Name 406 | *goquery.Selection.Find("p:nth-child(2)") // as Age 407 | *goquery.Selection.Find("p:nth-child(3)") // as LikeLemon 408 | ``` 409 | 410 | ##### Slice 411 | 412 | ```go 413 | type ( 414 | PartTypesStruct struct { 415 | Slice []int `html:"ul > li"` ... 416 | } 417 | ) 418 | 419 | func (PartTypesStruct) Root() string { 420 | return "#test" 421 | } 422 | ``` 423 | 424 | As above, we get 425 | 426 | ```html 427 |
428 | 434 |
435 |

Hexilee

436 |

20

437 |

true

438 |
439 |

Hello World!

440 |

10

441 |

3.14

442 |

true

443 |
444 | ``` 445 | 446 | Then it will call `*goquery.Selection.Find("ul > li")`, we get 447 | 448 | ```html 449 |
<li>0</li> 450 | <li>1</li> 451 | <li>2</li> 452 | <li>3</li>
453 | ``` 454 | 455 | Then, it will call `*goquery.Selection.Each(func(int, *goquery.Selection))` to iterate over the list and parse each item into an element of the slice. 456 | 457 | #### Tags 458 | 459 | This package supports three tags: `html`, `attr` and `converter`. 460 | 461 | ##### html 462 | 463 | Provides the CSS selector of this field. 464 | 465 | ##### attr 466 | 467 | By default, this package uses the `innerText` of an element as its value. 468 | 469 | ```html 470 | <a href="https://google.com">Google</a> 471 | ``` 472 | 473 | ```go 474 | type Link struct { 475 | Text string `html:"a"` 476 | } 477 | ``` 478 | 479 | You will get `link.Text == "Google"`. However, what should we do if we want to get `href`? 480 | 481 | ```go 482 | type Link struct { 483 | Href string `html:"a" attr:"href"` 484 | Text string `html:"a"` 485 | } 486 | ``` 487 | 488 | You will get `link.Href == "https://google.com"` 489 | 490 | ##### converter 491 | 492 | Sometimes you need to convert the raw text before it is stored in the field: 493 | 494 | ```html 495 |

<p>2018-10-01 00:00:01</p>

    496 | ``` 497 | 498 | You might try to unmarshal it like this: 499 | 500 | ```go 501 | type Birthday struct { 502 | Time time.Time `html:"p"` 503 | } 504 | 505 | func TestConverter(t *testing.T) { 506 | birthday := Birthday{} 507 | assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday)) 508 | assert.Equal(t, 2018, birthday.Time.Year()) 509 | assert.Equal(t, time.October, birthday.Time.Month()) 510 | assert.Equal(t, 1, birthday.Time.Day()) 511 | } 512 | ``` 513 | 514 | This test fails, because nothing tells `unhtml` how to convert a string to `time.Time`; it treats `time.Time` as an ordinary struct. 515 | 516 | Instead, use the `converter` tag: 517 | 518 | ```go 519 | type Birthday struct { 520 | Time time.Time `html:"p" converter:"StringToTime"` 521 | } 522 | 523 | const TimeStandard = `2006-01-02 15:04:05` 524 | 525 | func (Birthday) StringToTime(str string) (time.Time, error) { 526 | return time.Parse(TimeStandard, str) 527 | } 528 | 529 | func TestConverter(t *testing.T) { 530 | birthday := Birthday{} 531 | assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday)) 532 | assert.Equal(t, 2018, birthday.Time.Year()) 533 | assert.Equal(t, time.October, birthday.Time.Month()) 534 | assert.Equal(t, 1, birthday.Time.Day()) 535 | } 536 | ``` 537 | 538 | Now the test passes. 539 | 540 | The converter **MUST** be a method of the struct that declares the field, with the signature 541 | 542 | ```go 543 | func (inputType) (resultType, error) 544 | ``` 545 | 546 | `resultType` **MUST** be the same as the field type, which can be any type. 547 | 548 | `inputType` **MUST NOT** violate the requirements in [Types](#types). 
549 | 550 | 551 | 552 | -------------------------------------------------------------------------------- /benchmark_test.go: -------------------------------------------------------------------------------- 1 | package unhtml 2 | 3 | import ( 4 | "bytes" 5 | "github.com/PuerkitoBio/goquery" 6 | "github.com/stretchr/testify/assert" 7 | "strconv" 8 | "testing" 9 | ) 10 | 11 | type ( 12 | PartTypesStruct struct { 13 | Slice []int `html:"ul > li"` 14 | Struct TestUser `html:"#test > div"` 15 | String string `html:"#test > p:nth-child(3)"` 16 | Int int `html:"#test > p:nth-child(4)"` 17 | Float64 float64 `html:"#test > p:nth-child(5)"` 18 | Bool bool `html:"#test > p:nth-child(6)"` 19 | } 20 | ) 21 | 22 | func (PartTypesStruct) Root() string { 23 | return "#test" 24 | } 25 | 26 | func BenchmarkUnmarshalCourses(b *testing.B) { 27 | assert.NotNil(b, CourseHTML) 28 | for i := 0; i < b.N; i++ { 29 | courses := make(Courses, 0) 30 | assert.Nil(b, Unmarshal(CourseHTML, &courses)) 31 | } 32 | } 33 | 34 | func BenchmarkUnmarshalPartTypes(b *testing.B) { 35 | assert.NotNil(b, AllTypeHTML) 36 | for i := 0; i < b.N; i++ { 37 | partTypes := PartTypesStruct{} 38 | assert.Nil(b, Unmarshal(AllTypeHTML, &partTypes)) 39 | } 40 | } 41 | 42 | func BenchmarkParseCoursesLogically(b *testing.B) { 43 | assert.NotNil(b, CourseHTML) 44 | for i := 0; i < b.N; i++ { 45 | _, err := parseCoursesLogically() 46 | assert.Nil(b, err) 47 | } 48 | } 49 | 50 | func BenchmarkParsePartTypesLogically(b *testing.B) { 51 | assert.NotNil(b, AllTypeHTML) 52 | for i := 0; i < b.N; i++ { 53 | _, err := parsePartTypesLogically() 54 | assert.Nil(b, err) 55 | } 56 | } 57 | 58 | func getLink(selection *goquery.Selection) Link { 59 | link, _ := selection.Attr(AttrHref) 60 | return Link{Text: selection.Text(), Href: link} 61 | } 62 | 63 | func parseCoursesLogically() (Courses, error) { 64 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(CourseHTML)) 65 | courses := make(Courses, 0) 66 | if err == nil { 67 | 
doc.Find(courses.Root()).Each(func(i int, selection *goquery.Selection) { 68 | course := Course{} 69 | course.Code = getLink(selection.Find(`td:nth-child(1) > a`)) 70 | course.Name = getLink(selection.Find(`td:nth-child(2) > a`)) 71 | course.Teacher = getLink(selection.Find(`td:nth-child(3) > a`)) 72 | course.Semester = selection.Find(`td:nth-child(4)`).Text() 73 | course.Time = selection.Find(`td:nth-child(5)`).Text() 74 | course.Location = selection.Find(`td:nth-child(6)`).Text() 75 | courses = append(courses, course) 76 | }) 77 | } 78 | 79 | return courses, err 80 | } 81 | 82 | func parsePartTypesLogically() (PartTypesStruct, error) { 83 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(AllTypeHTML)) 84 | partTypes := PartTypesStruct{} 85 | if err == nil { 86 | selection := doc.Find(partTypes.Root()) 87 | partTypes.Slice = make([]int, 0) 88 | selection.Find(`ul > li`).Each(func(i int, selection *goquery.Selection) { 89 | Int, parseErr := strconv.Atoi(selection.Text()) 90 | if parseErr != nil { 91 | err = parseErr 92 | } 93 | partTypes.Slice = append(partTypes.Slice, Int) 94 | }) 95 | if err == nil { 96 | partTypes.Struct.Name = selection.Find(`#test > div > p:nth-child(1)`).Text() 97 | Int, parseErr := strconv.Atoi(selection.Find(`#test > div > p:nth-child(2)`).Text()) 98 | if err = parseErr; err == nil { 99 | partTypes.Struct.Age = uint(Int) 100 | Bool, parseErr := strconv.ParseBool(selection.Find(`#test > div > p:nth-child(3)`).Text()) 101 | if err = parseErr; err == nil { 102 | partTypes.Struct.LikeLemon = Bool 103 | 104 | String := selection.Find(`#test > p:nth-child(3)`).Text() 105 | Int, parseErr := strconv.Atoi(selection.Find(`#test > p:nth-child(4)`).Text()) 106 | if err = parseErr; err != nil { 107 | return partTypes, err 108 | } 109 | 110 | Float64, parseErr := strconv.ParseFloat(selection.Find(`#test > p:nth-child(5)`).Text(), 0) 111 | if err = parseErr; err != nil { 112 | return partTypes, err 113 | } 114 | 115 | Bool, parseErr := 
strconv.ParseBool(selection.Find(`#test > p:nth-child(6)`).Text()) 116 | if err = parseErr; err != nil { 117 | return partTypes, err 118 | } 119 | partTypes.String = String 120 | partTypes.Int = Int 121 | partTypes.Float64 = Float64 122 | partTypes.Bool = Bool 123 | } 124 | } 125 | } 126 | } 127 | 128 | return partTypes, err 129 | } 130 | -------------------------------------------------------------------------------- /examples_test.go: -------------------------------------------------------------------------------- 1 | package unhtml 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | ) 7 | 8 | const ( 9 | AllTypesHTML = ` 10 | 11 | 12 | 13 | 14 | Title 15 | 16 | 17 |
    18 | 24 |
    25 |

    Hexilee

    26 |

    20

    27 |

    true

    28 |
    29 |

    Hello World!

    30 |

    10

    31 |

    3.14

    32 |

    true

    33 |
    34 | 35 | 36 | ` 37 | ) 38 | 39 | func ExampleUnmarshal() { 40 | allTypes := AllTypeTest{} 41 | _ = Unmarshal(AllTypeHTML, &allTypes) 42 | result, _ := json.Marshal(&allTypes) 43 | fmt.Println(string(result)) 44 | // Output: 45 | // {"Slice":[0,1,2,3],"Struct":{"Name":"Hexilee","Age":20,"LikeLemon":true},"String":"Hello World!","Int":10,"Int8":10,"Int16":10,"Int32":10,"Int64":10,"Uint":10,"Uint8":10,"Uint16":10,"Uint32":10,"Uint64":10,"Float32":3.14,"Float64":3.14,"Bool":true} 46 | } 47 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Hexilee/unhtml 2 | 3 | require ( 4 | github.com/PuerkitoBio/goquery v1.4.1 5 | github.com/andybalholm/cascadia v1.0.0 // indirect 6 | github.com/davecgh/go-spew v1.1.1 // indirect 7 | github.com/pmezard/go-difflib v1.0.0 // indirect 8 | github.com/stretchr/testify v1.2.2 9 | golang.org/x/net v0.0.0-20180926154720-4dfa2610cdf3 // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.4.1 h1:smcIRGdYm/w7JSbcdeLHEMzxmsBQvl8lhf0dSw2nzMI= 2 | github.com/PuerkitoBio/goquery v1.4.1/go.mod h1:T9ezsOHcCrDCgA8aF1Cqr3sSYbO/xgdy8/R/XiIMAhA= 3 | github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= 4 | github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 5 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 6 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 7 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 9 | github.com/stretchr/testify v1.2.2 
h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= 10 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 11 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 12 | golang.org/x/net v0.0.0-20180926154720-4dfa2610cdf3 h1:dgd4x4kJt7G4k4m93AYLzM8Ni6h2qLTfh9n9vXJT3/0= 13 | golang.org/x/net v0.0.0-20180926154720-4dfa2610cdf3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 14 | -------------------------------------------------------------------------------- /html.go: -------------------------------------------------------------------------------- 1 | package unhtml 2 | 3 | import ( 4 | "bytes" 5 | "github.com/PuerkitoBio/goquery" 6 | "reflect" 7 | "strconv" 8 | ) 9 | 10 | type ( 11 | // HTMLUnmarshalerBuilder: inner hidden 12 | HTMLUnmarshalerBuilder struct { 13 | dto reflect.Value 14 | kind reflect.Kind 15 | dtoElemType reflect.Type 16 | selection *goquery.Selection 17 | selector string 18 | attrKey string 19 | } 20 | 21 | // HTMLUnmarshaler: inner hidden 22 | HTMLUnmarshaler struct { 23 | dto reflect.Value 24 | kind reflect.Kind 25 | dtoElemType reflect.Type 26 | selection goquery.Selection 27 | selector string 28 | attrKey string 29 | } 30 | 31 | // HTMLModel: HTML model with root selector 32 | HTMLModel interface { 33 | // Root return root selector 34 | Root() string 35 | } 36 | ) 37 | 38 | const ( 39 | SelectorKey = "html" 40 | AttrKey = "attr" 41 | ConverterKey = "converter" 42 | ZeroStr = "" 43 | ) 44 | 45 | const ( 46 | AttrHref = "href" 47 | ) 48 | 49 | func Unmarshal(data []byte, v interface{}) error { 50 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data)) 51 | if err == nil { 52 | err = unmarshal(reflect.ValueOf(v), *doc.Selection, "") 53 | } 54 | return err 55 | } 56 | 57 | func unmarshal(ptr reflect.Value, selection goquery.Selection, tag reflect.StructTag) (err error) { 58 | realUnmarshal, buildErr := new(HTMLUnmarshalerBuilder). 
59 | setDto(ptr). 60 | setSelection(&selection). 61 | setSelector(tag.Get(SelectorKey)). 62 | setAttrKey(tag.Get(AttrKey)). 63 | build() 64 | 65 | if err = buildErr; err == nil { 66 | err = realUnmarshal.unmarshal() 67 | } 68 | return err 69 | } 70 | 71 | func (builder *HTMLUnmarshalerBuilder) build() (unmarshaler *HTMLUnmarshaler, err error) { 72 | if err = builder.initRoot(); err == nil { 73 | if err = builder.parseType(); err == nil { 74 | if err = builder.checkBeforeReturn(); err == nil { 75 | unmarshaler = &HTMLUnmarshaler{ 76 | dto: builder.dto, 77 | kind: builder.kind, 78 | dtoElemType: builder.dtoElemType, 79 | selection: *builder.selection, 80 | selector: builder.selector, 81 | attrKey: builder.attrKey, 82 | } 83 | } 84 | } 85 | } 86 | return 87 | } 88 | 89 | func (builder *HTMLUnmarshalerBuilder) setDto(v reflect.Value) *HTMLUnmarshalerBuilder { 90 | builder.dto = v 91 | return builder 92 | } 93 | 94 | func (builder *HTMLUnmarshalerBuilder) setSelector(selector string) *HTMLUnmarshalerBuilder { 95 | builder.selector = selector 96 | return builder 97 | } 98 | 99 | func (builder *HTMLUnmarshalerBuilder) setAttrKey(attrKey string) *HTMLUnmarshalerBuilder { 100 | builder.attrKey = attrKey 101 | return builder 102 | } 103 | 104 | func (builder *HTMLUnmarshalerBuilder) setSelection(selection *goquery.Selection) *HTMLUnmarshalerBuilder { 105 | builder.selection = selection 106 | return builder 107 | } 108 | 109 | func (builder *HTMLUnmarshalerBuilder) initRoot() (err error) { 110 | //if err = builder.checkDtoZero(); err == nil { 111 | if value, ok := builder.dto.Interface().(HTMLModel); ok { 112 | builder.selector = value.Root() 113 | } 114 | //} 115 | return 116 | } 117 | 118 | func (builder *HTMLUnmarshalerBuilder) parseType() (err error) { 119 | //if err = builder.checkDtoZero(); err == nil { 120 | dtoType := builder.dto.Type() 121 | switch dtoType.Kind() { 122 | case reflect.Ptr: 123 | builder.dtoElemType = dtoType.Elem() 124 | builder.kind = 
builder.dtoElemType.Kind() 125 | default: 126 | err = NewUnmarshaledKindMustBePtrError(dtoType) 127 | } 128 | //} 129 | 130 | return 131 | } 132 | 133 | func (builder *HTMLUnmarshalerBuilder) checkItemKind() (err error) { 134 | switch builder.kind { 135 | case reflect.Ptr: 136 | fallthrough 137 | case reflect.Uintptr: 138 | fallthrough 139 | case reflect.Interface: 140 | fallthrough 141 | case reflect.Chan: 142 | fallthrough 143 | case reflect.Func: 144 | fallthrough 145 | case reflect.Map: 146 | err = NewUnmarshalerItemKindError(builder.dtoElemType) 147 | default: 148 | } 149 | return 150 | } 151 | 152 | func (builder *HTMLUnmarshalerBuilder) checkBeforeReturn() (err error) { 153 | //if err = builder.checkDtoZero(); err == nil { 154 | // if err = builder.checkSelectionNil(); err == nil { 155 | err = builder.checkItemKind() 156 | //} 157 | //} 158 | return 159 | } 160 | 161 | // never return err in production env 162 | //func (builder *HTMLUnmarshalerBuilder) checkDtoZero() (err error) { 163 | // // Zero reflect.Value 164 | // if isZero(builder.dto) { 165 | // err = errors.New(DtoZero) 166 | // } 167 | // return 168 | //} 169 | 170 | // never return err in production env 171 | //func (builder *HTMLUnmarshalerBuilder) checkSelectionNil() (err error) { 172 | // if builder.selection == nil { 173 | // err = errors.New(SelectionNil) 174 | // } 175 | // return 176 | //} 177 | 178 | func (unmarshaler HTMLUnmarshaler) getSelection() goquery.Selection { 179 | return unmarshaler.selection 180 | } 181 | 182 | func (unmarshaler HTMLUnmarshaler) getSelector() string { 183 | return unmarshaler.selector 184 | } 185 | 186 | func (unmarshaler HTMLUnmarshaler) getAttrKey() string { 187 | return unmarshaler.attrKey 188 | } 189 | 190 | func (unmarshaler HTMLUnmarshaler) getDto() reflect.Value { 191 | return unmarshaler.dto 192 | } 193 | 194 | func (unmarshaler HTMLUnmarshaler) getKind() reflect.Kind { 195 | return unmarshaler.kind 196 | } 197 | 198 | func (unmarshaler HTMLUnmarshaler) 
getDtoElemType() reflect.Type { 199 | return unmarshaler.dtoElemType 200 | } 201 | 202 | func (unmarshaler HTMLUnmarshaler) unmarshalSlice(preSelection goquery.Selection) (err error) { 203 | itemType := unmarshaler.getDtoElemType().Elem() 204 | sliceValue := reflect.MakeSlice(reflect.SliceOf(itemType), 0, 0) 205 | preSelection.Each(func(i int, selection *goquery.Selection) { 206 | newItem := reflect.New(itemType) 207 | if err = unmarshal(newItem, *selection, ""); err == nil { 208 | sliceValue = reflect.Append(sliceValue, newItem.Elem()) 209 | } 210 | }) 211 | unmarshaler.getDto().Elem().Set(sliceValue) 212 | return err 213 | } 214 | 215 | func (unmarshaler HTMLUnmarshaler) callConverter(converter string, fieldIndex int, preSelection goquery.Selection) (result reflect.Value, err error) { 216 | motherValue := unmarshaler.getDto().Elem() 217 | motherType := unmarshaler.getDtoElemType() 218 | tag := motherType.Field(fieldIndex).Tag 219 | resultType := motherType.Field(fieldIndex).Type 220 | method, exist := motherType.MethodByName(converter) 221 | if !exist { 222 | err = NewConverterNotExistError(converter) 223 | } 224 | if err == nil { 225 | methodValue := motherValue.MethodByName(converter) 226 | inputValuePtr, converterTypeErr := checkConverter(method.Name, methodValue.Type(), resultType) 227 | if err = converterTypeErr; err == nil { 228 | if err = unmarshal(inputValuePtr, preSelection, tag); err == nil { 229 | results := methodValue.Call([]reflect.Value{inputValuePtr.Elem()}) 230 | if errInterface := results[1].Interface(); errInterface != nil { 231 | err = errInterface.(error) 232 | } 233 | if err == nil { 234 | result = results[0] 235 | } 236 | } 237 | } 238 | } 239 | return 240 | } 241 | 242 | func (unmarshaler HTMLUnmarshaler) unmarshalStruct(preSelection goquery.Selection) (err error) { 243 | motherValue := unmarshaler.getDto().Elem() 244 | motherType := unmarshaler.getDtoElemType() 245 | for i := 0; i < motherValue.NumField(); i++ { 246 | field := 
motherValue.Field(i) 247 | if field.CanSet() { 248 | fieldPtr := field.Addr() 249 | tag := motherType.Field(i).Tag 250 | if converter := tag.Get(ConverterKey); converter != ZeroStr { 251 | result, callConverterErr := unmarshaler.callConverter(converter, i, preSelection) 252 | if err = callConverterErr; err == nil { 253 | fieldPtr.Elem().Set(result) 254 | } 255 | } else { 256 | err = unmarshal(fieldPtr, preSelection, tag) 257 | } 258 | 259 | if err != nil { 260 | break 261 | } 262 | } 263 | } 264 | return 265 | } 266 | 267 | func (unmarshaler HTMLUnmarshaler) unmarshal() (err error) { 268 | preSelection := unmarshaler.getSelection() 269 | if unmarshaler.getSelector() != ZeroStr { 270 | preSelection = *preSelection.Find(unmarshaler.getSelector()) 271 | } 272 | switch unmarshaler.getKind() { 273 | case reflect.Slice: 274 | err = unmarshaler.unmarshalSlice(preSelection) 275 | case reflect.Struct: 276 | err = unmarshaler.unmarshalStruct(preSelection) 277 | case reflect.String: 278 | unmarshaler.getDto().Elem().SetString(unmarshaler.getAttrValue(preSelection)) 279 | case reflect.Int: 280 | fallthrough 281 | case reflect.Int8: 282 | fallthrough 283 | case reflect.Int16: 284 | fallthrough 285 | case reflect.Int32: 286 | fallthrough 287 | case reflect.Int64: 288 | valueStr := unmarshaler.getAttrValue(preSelection) 289 | value, err := strconv.Atoi(valueStr) 290 | if err == nil { 291 | unmarshaler.getDto().Elem().SetInt(int64(value)) 292 | } 293 | case reflect.Uint: 294 | fallthrough 295 | case reflect.Uint8: 296 | fallthrough 297 | case reflect.Uint16: 298 | fallthrough 299 | case reflect.Uint32: 300 | fallthrough 301 | case reflect.Uint64: 302 | valueStr := unmarshaler.getAttrValue(preSelection) 303 | value, err := strconv.ParseUint(valueStr, 0, 0) 304 | if err == nil { 305 | unmarshaler.getDto().Elem().SetUint(value) 306 | } 307 | case reflect.Float32: 308 | fallthrough 309 | case reflect.Float64: 310 | valueStr := unmarshaler.getAttrValue(preSelection) 311 | value, err := 
strconv.ParseFloat(valueStr, 0) 312 | if err == nil { 313 | unmarshaler.getDto().Elem().SetFloat(value) 314 | } 315 | case reflect.Bool: 316 | valueStr := unmarshaler.getAttrValue(preSelection) 317 | value, err := strconv.ParseBool(valueStr) 318 | if err == nil { 319 | unmarshaler.getDto().Elem().SetBool(value) 320 | } 321 | } 322 | 323 | return err 324 | } 325 | 326 | func (unmarshaler HTMLUnmarshaler) getAttrValue(selection goquery.Selection) (valueStr string) { 327 | if unmarshaler.getAttrKey() == ZeroStr { 328 | valueStr = selection.Text() 329 | } else { 330 | if str, exist := selection.Attr(unmarshaler.getAttrKey()); exist { 331 | valueStr = str 332 | } 333 | } 334 | return 335 | } 336 | -------------------------------------------------------------------------------- /html_test.go: -------------------------------------------------------------------------------- 1 | package unhtml 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "github.com/stretchr/testify/assert" 7 | "io/ioutil" 8 | "reflect" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | const ( 14 | CoursesJSON = 
`[{"code":{"text":"061B0020","href":"#"},"name":{"text":"复变函数与积分变换","href":"#"},"teacher":{"text":"王伟","href":"#"},"semester":"秋","time":"周一第1,2节周四第1,2节","location":"紫金港西2-205(多)紫金港西2-205(多)"},{"code":{"text":"101C0350","href":"#"},"name":{"text":"电路与模拟电子技术","href":"#"},"teacher":{"text":"孙盾","href":"#"},"semester":"秋冬","time":"周二第6,7节周二第8节{单周}周五第3,4,5节","location":"紫金港西1-417(多)紫金港西1-417(多)紫金港西1-417(多)"},{"code":{"text":"101C0360","href":"#"},"name":{"text":"电路与模拟电子技术实验","href":"#"},"teacher":{"text":"干于","href":"#"},"semester":"秋冬","time":"周四第3,4,5节","location":"紫金港东3-202"},{"code":{"text":"241L0020","href":"#"},"name":{"text":"博弈论基础","href":"#"},"teacher":{"text":"蒋文华","href":"#"},"semester":"冬","time":"周三第6,7,8节","location":"紫金港西1-316(多)*"},{"code":{"text":"261C0070","href":"#"},"name":{"text":"工程力学","href":"#"},"teacher":{"text":"吴禹季葆华","href":"#"},"semester":"秋冬","time":"周二第1,2节{单周}周四第6,7节周四第8节{双周}","location":"紫金港西1-404(多)紫金港西1-404(多)紫金港西1-404(多)"},{"code":{"text":"74188020","href":"#"},"name":{"text":"专业实习","href":"#"},"teacher":{"text":"陈家旺黄豪彩","href":"#"},"semester":"短","time":" ","location":" "},{"code":{"text":"761T0010","href":"#"},"name":{"text":"大学物理(甲)Ⅰ","href":"#"},"teacher":{"text":"潘国卫","href":"#"},"semester":"秋冬","time":"周六第6,7,8,9节","location":"紫金港西2-101(多)"},{"code":{"text":"761T0020","href":"#"},"name":{"text":"大学物理(甲)Ⅱ","href":"#"},"teacher":{"text":"郑大方","href":"#"},"semester":"秋冬","time":"周一第3,4节周三第1,2节","location":"紫金港西2-202(多)#"},{"code":{"text":"821T0020","href":"#"},"name":{"text":"微积分(甲)Ⅱ","href":"#"},"teacher":{"text":"薛儒英","href":"#"},"semester":"秋冬","time":"周六第1,2,3,4节{单周}周六第1,2,3,4,5节{双周}","location":"紫金港西2-105(多)"}]` 15 | AllTypesJSON = `{"Slice":[0,1,2,3],"Struct":{"Name":"Hexilee","Age":20,"LikeLemon":true},"String":"Hello World!","Int":10,"Int8":10,"Int16":10,"Int32":10,"Int64":10,"Uint":10,"Uint8":10,"Uint16":10,"Uint32":10,"Uint64":10,"Float32":3.14,"Float64":3.14,"Bool":true}` 16 | TestError = "test error" 17 | BirthdayHTML 
= `

    2018-10-01 00:00:01

    `
	// TimeStandard is the reference layout passed to time.Parse by
	// Birthday.StringToTime.
	TimeStandard = `2006-01-02 15:04:05`
)

// HTML fixtures loaded once at package initialisation. The read errors are
// discarded here; each test asserts the slice is non-nil before using it.
var (
	CourseHTML, _  = ioutil.ReadFile("testHTML/courses.html")
	AllTypeHTML, _ = ioutil.ReadFile("testHTML/all-type.html")
)

type (
	// Link is an anchor: its text plus the value of its href attribute.
	Link struct {
		Text string `json:"text"`
		Href string `attr:"href" json:"href"`
	}

	// Course is one table row of the courses fixture; every field is
	// selected by column position.
	Course struct {
		Code     Link   `html:"td:nth-child(1) > a" json:"code"`
		Name     Link   `html:"td:nth-child(2) > a" json:"name"`
		Teacher  Link   `html:"td:nth-child(3) > a" json:"teacher"`
		Semester string `html:"td:nth-child(4)" json:"semester"`
		Time     string `html:"td:nth-child(5)" json:"time"`
		Location string `html:"td:nth-child(6)" json:"location"`
	}

	// Courses is the slice destination used by TestUnmarshal.
	Courses []Course

	// AllTypeTest has one field per kind handled by the unmarshaler.
	AllTypeTest struct {
		Slice   []int    `html:"ul > li"`
		Struct  TestUser `html:"#test > div"`
		String  string   `html:"#test > p:nth-child(3)"`
		Int     int      `html:"#test > p:nth-child(4)"`
		Int8    int8     `html:"#test > p:nth-child(4)"`
		Int16   int16    `html:"#test > p:nth-child(4)"`
		Int32   int32    `html:"#test > p:nth-child(4)"`
		Int64   int64    `html:"#test > p:nth-child(4)"`
		Uint    uint     `html:"#test > p:nth-child(4)"`
		Uint8   uint8    `html:"#test > p:nth-child(4)"`
		Uint16  uint16   `html:"#test > p:nth-child(4)"`
		Uint32  uint32   `html:"#test > p:nth-child(4)"`
		Uint64  uint64   `html:"#test > p:nth-child(4)"`
		Float32 float32  `html:"#test > p:nth-child(5)"`
		Float64 float64  `html:"#test > p:nth-child(5)"`
		Bool    bool     `html:"#test > p:nth-child(6)"`
	}

	// TestUser is the nested struct used by AllTypeTest and as the input
	// type of the converter methods below.
	TestUser struct {
		Name      string `html:"p:nth-child(1)"`
		Age       uint   `html:"p:nth-child(2)"`
		LikeLemon bool   `html:"p:nth-child(3)"`
	}

	// WrongTypes holds a pointer field, which TestBuilderErr expects to be
	// rejected with an UnmarshalerItemKindError.
	WrongTypes struct {
		WrongStruct *TestUser `html:"div"`
	}

	// ConverterTest fills a map field through the TestUserToMap converter.
	ConverterTest struct {
		ConvertedStruct map[string]interface{} `html:"div" converter:"TestUserToMap"`
	}

	// ConverterNotExistTest names a converter method that is not defined.
	ConverterNotExistTest struct {
		Foo int `html:"div" converter:"NotExistMethod"`
	}

	// ConverterTypeWrongTest names a converter whose result type (int) does
	// not match the field type (string).
	ConverterTypeWrongTest struct {
		Foo string `html:"div" converter:"WrongResultTypeMethod"`
	}

	// ConverterReturnErrTest names a converter that always returns an error.
	ConverterReturnErrTest struct {
		Foo []string `html:"#test > p:nth-child(3)" converter:"ReturnErrorMethod"`
	}

	// Birthday parses the text of a <p> element into a time.Time through
	// the StringToTime converter.
	Birthday struct {
		Time time.Time `html:"p" converter:"StringToTime"`
	}
)

// Root returns the selector of the course rows (every row from the second on).
// NOTE(review): presumably consumed by Unmarshal as the root selector; the
// code that calls Root is outside this chunk — confirm.
func (Courses) Root() string {
	return "#xsgrid > tbody > tr:nth-child(1n+2)"
}

// Root returns the selector of the fixture's #test element.
func (AllTypeTest) Root() string {
	return "#test"
}

// Root returns the selector of the fixture's #test element.
func (WrongTypes) Root() string {
	return "#test"
}

// Root returns the selector of the fixture's #test element.
func (ConverterTest) Root() string {
	return "#test"
}

// TestUserToMap converts user into a map keyed by "name", "age" and
// "like_lemon"; it never returns an error.
func (ConverterTest) TestUserToMap(user TestUser) (map[string]interface{}, error) {
	return map[string]interface{}{
		"name":       user.Name,
		"age":        user.Age,
		"like_lemon": user.LikeLemon,
	}, nil
}

// WrongResultTypeMethod returns zero values; its int result deliberately
// mismatches the string field it is attached to.
func (ConverterTypeWrongTest) WrongResultTypeMethod(user TestUser) (Int int, err error) {
	return
}

// ReturnErrorMethod always returns an error carrying TestError, together
// with a one-element slice holding input.
func (ConverterReturnErrTest) ReturnErrorMethod(input string) (result []string, err error) {
	return []string{input}, errors.New(TestError)
}

// StringToTime parses str with the TimeStandard layout.
func (Birthday) StringToTime(str string) (time.Time, error) {
	return time.Parse(TimeStandard, str)
}

// TestUnmarshal unmarshals both fixtures and compares their JSON encoding
// with the expected CoursesJSON / AllTypesJSON constants.
func TestUnmarshal(t *testing.T) {
	assert.NotNil(t, CourseHTML)
	courses := make(Courses, 0)
	assert.Nil(t, Unmarshal(CourseHTML, &courses))
	result, err := json.Marshal(courses)
	assert.Nil(t, err)
	assert.Equal(t, CoursesJSON, string(result))

	assert.NotNil(t, AllTypeHTML)
	allTypes := AllTypeTest{}
	assert.Nil(t, Unmarshal(AllTypeHTML, &allTypes))
	result, err = json.Marshal(&allTypes)
	assert.Nil(t, err)
	assert.Equal(t, AllTypesJSON, string(result))
}

// TestBuilderErr checks the two validation errors: a non-pointer destination
// and a struct containing a pointer field.
func TestBuilderErr(t *testing.T) {
	assert.NotNil(t, CourseHTML)
	courses := make(Courses, 0)
	// courses is passed by value on purpose to trigger the must-be-Ptr error.
	err := Unmarshal(CourseHTML, courses)
	assert.NotNil(t, err)
	assert.Equal(t, NewUnmarshaledKindMustBePtrError(reflect.TypeOf(courses)).Error(), err.Error())

	assert.NotNil(t, AllTypeHTML)
	wrongTypes := WrongTypes{}
	err = Unmarshal(AllTypeHTML, &wrongTypes)
	assert.NotNil(t, err)
	assert.Equal(t, NewUnmarshalerItemKindError(reflect.TypeOf(new(TestUser))).Error(), err.Error())

}

// TestConverter covers the converter paths: a successful conversion, a
// missing method, a wrong result type, a converter returning an error, and
// a string-to-time conversion.
func TestConverter(t *testing.T) {
	assert.NotNil(t, AllTypeHTML)
	convertedStruct := ConverterTest{}
	assert.Nil(t, Unmarshal(AllTypeHTML, &convertedStruct))
	assert.Equal(t, "Hexilee", convertedStruct.ConvertedStruct["name"])

	assert.NotNil(t, AllTypeHTML)
	converterNotExistTest := ConverterNotExistTest{}
	err := Unmarshal(AllTypeHTML, &converterNotExistTest)
	assert.NotNil(t, err)
	assert.Equal(t, NewConverterNotExistError("NotExistMethod").Error(), err.Error())

	assert.NotNil(t, AllTypeHTML)
	converterTypeWrongTest := ConverterTypeWrongTest{}
	err = Unmarshal(AllTypeHTML, &converterTypeWrongTest)
	assert.NotNil(t, err)
	assert.Equal(t, NewConverterTypeWrongError("WrongResultTypeMethod", reflect.ValueOf(converterTypeWrongTest).MethodByName("WrongResultTypeMethod").Type()).Error(), err.Error())

	assert.NotNil(t, AllTypeHTML)
	converterReturnErrTest := ConverterReturnErrTest{}
	err = Unmarshal(AllTypeHTML, &converterReturnErrTest)
	assert.NotNil(t, err)
	assert.Equal(t, TestError, err.Error())

	birthday := Birthday{}
	assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday))
	assert.Equal(t, 2018, birthday.Time.Year())
	assert.Equal(t, time.October, birthday.Time.Month())
	assert.Equal(t, 1, birthday.Time.Day())
}
--------------------------------------------------------------------------------
/testHTML/all-type.html:
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 |
    9 | 15 |
    16 |

    Hexilee

    17 |

    20

    18 |

    true

    19 |
    20 |

    Hello World!

    21 |

    10

    22 |

    3.14

    23 |

    true

    24 |
    25 | 26 | -------------------------------------------------------------------------------- /testHTML/courses.html: -------------------------------------------------------------------------------- 1 | !DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> 2 | 3 | 4 | Title 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 21 | 23 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 34 | 36 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 47 | 49 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 60 | 62 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 73 | 75 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 86 | 88 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 99 | 101 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 112 | 114 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 125 | 127 | 129 | 130 | 131 | 132 | 133 | 134 | 135 |
    课程代码课程名称教师姓名学期上课时间上课地点选课时间选课志愿
    061B0020 20 | 复变函数与积分变换 22 | 王伟 24 | 周一第1,2节
    周四第1,2节
    紫金港西2-205(多)
    紫金港西2-205(多)
    2018-06-13 14:48:521
    101C0350 33 | 电路与模拟电子技术 35 | 孙盾 37 | 秋冬周二第6,7节
    周二第8节{单周}
    周五第3,4,5节
    紫金港西1-417(多)
    紫金港西1-417(多)
    紫金港西1-417(多)
    2018-06-13 14:49:351
    101C0360 46 | 电路与模拟电子技术实验 48 | 干于 50 | 秋冬周四第3,4,5节紫金港东3-2022018-06-13 14:52:001
    241L0020 59 | 博弈论基础 61 | 蒋文华 63 | 周三第6,7,8节紫金港西1-316(多)*2018-06-13 15:19:081
    261C0070 72 | 工程力学 74 | 吴禹
    季葆华
    76 |
    秋冬周二第1,2节{单周}
    周四第6,7节
    周四第8节{双周}
    紫金港西1-404(多)
    紫金港西1-404(多)
    紫金港西1-404(多)
    2018-06-13 14:55:191
    74188020 85 | 专业实习 87 | 陈家旺
    黄豪彩
    89 |
      2018-06-13 20:07:341
    761T0010 98 | 大学物理(甲)Ⅰ 100 | 潘国卫 102 | 秋冬周六第6,7,8,9节紫金港西2-101(多)2018-09-14 13:03:151
    761T0020 111 | 大学物理(甲)Ⅱ 113 | 郑大方 115 | 秋冬周一第3,4节
    周三第1,2节
    紫金港西2-202(多)#2018-06-13 14:43:031
    821T0020 124 | 微积分(甲)Ⅱ 126 | 薛儒英 128 | 秋冬周六第1,2,3,4节{单周}
    周六第1,2,3,4,5节{双周}
    紫金港西2-105(多)2018-09-14 13:01:471
    136 | 137 | 138 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package unhtml 2 | 3 | import ( 4 | "reflect" 5 | ) 6 | 7 | const ( 8 | ErrorMethodName = "Error" 9 | ) 10 | 11 | // cannot use it for reference kind (Ptr, Interface, Func, Map, Slice) 12 | //func isZero(v interface{}) bool { 13 | // return v == reflect.Zero(reflect.TypeOf(v)).Interface() 14 | //} 15 | 16 | // Converter: Func (inputType) -> (resultType, error) 17 | func checkConverter(methodName string, methodType reflect.Type, expectResultType reflect.Type) (inputValuePtr reflect.Value, err error) { 18 | err = NewConverterTypeWrongError(methodName, methodType) 19 | if methodType.NumIn() == 1 && 20 | methodType.NumOut() == 2 && 21 | methodType.Out(0) == expectResultType { 22 | if _, exist := methodType.Out(1).MethodByName(ErrorMethodName); exist { 23 | inputValuePtr = reflect.New(methodType.In(0)) 24 | err = nil 25 | } 26 | } 27 | return 28 | } 29 | --------------------------------------------------------------------------------