├── .gitignore
├── .travis.yml
├── ErrorMessages.go
├── LICENSE
├── README.md
├── benchmark_test.go
├── examples_test.go
├── go.mod
├── go.sum
├── html.go
├── html_test.go
├── testHTML
├── all-type.html
└── courses.html
└── utils.go
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | go.coverprofile
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go # 声明构建语言环境
2 | sudo: false # 开启基于容器的Travis CI任务,让编译效率更高。
3 |
4 | notifications: # 每次构建的时候是否通知,如果不想收到通知,那就设置false吧(email: false)
5 | email:
6 | recipients:
7 | - hexileee@gmail.com
8 | on_success: change
9 | on_failure: always
10 |
11 | go:
12 | - "1.11"
13 |
14 | install:
15 | - go build
16 | - go get github.com/mattn/goveralls
17 |
18 |
19 | script: # 集成脚本
20 | - go test -bench=.
21 | - go test -v -cover -coverprofile=go.coverprofile
22 | - goveralls -coverprofile=go.coverprofile -service=travis-ci -repotoken=$COVERALLS_TOKEN
23 |
24 | after_success:
25 | - bash <(curl -s https://codecov.io/bash)
26 |
27 | env: #env环境变量设置,travis提供的repo_token安全方式
28 | global:
29 | secure: "XgZkE9tb9of2lsnXepbJeQU2xj3RuJea1zjlWh4W8ukH8Nwp9CJ6wfs3YnuchV+x1npgY1qE+sdWpieB6brQZgqJNzhC9vXcdOissPUviS0JUctric2UEz+4b0C4pN0LsTYCCENdTdFi62S/0LzyXugumVdeQHHtE75EknnhRhaQh4Dz04c7O7SBygVHEn9DwpbD8LRYVXwrSQ8aeWqhRgI6t4LRYInAs0M8hV4AE1HyKSkIueFQcbAfsV1lMB+pfiVHin5g6veRtRYq+++LzyoQ+gsSKx7yfFFOafKiw6e47lAKQaBQzkkkwy9RaB5KcWhI3CQfYZL7fKwqNy7ytKg3l9oboFEqGc42U6ZiGa1DLv8y3f2yaNtkgSpw5QMRjEd+iYVb2EFISfelUFq4srJ4utv+bn47/tBURN2dsLVAgKY/Z8uPv2fvbIUbkbej91uRyRDGQs1z02MW+66DacfnGrZz3yNFYCffTLCDBFFhrG2IsP7tJuSz7s9RnRnQoGTk5mImPCMnYIhTO0sQj4083qmT6+drx0RPVqUMjm99JeTFfMZfpzXyNAe1zP2DU8IqtKdtk4dzodBPy8MvWKsYNJHwjZKJmQyrOWW74+OvchG2WHN3gkY7fXKqalNWK/yb8ARBsj0TKbGPn29okXp5ut8UHXpVDrN+Kg7BAyg="
30 |
--------------------------------------------------------------------------------
/ErrorMessages.go:
--------------------------------------------------------------------------------
1 | package unhtml
2 |
3 | import (
4 | "fmt"
5 | "reflect"
6 | )
7 |
8 | const (
9 | UnmarshaledKindMustBePtr = "unmarshaled kind must be Ptr"
10 | UnmarshalerItemKind = "unmarshaled elem cannot be Ptr/Uintptr/Interface/Chan/Func/"
11 | DtoZero = "dto cannot be zero"
12 | SelectionNil = "selection cannot be nil"
13 | ConverterNotExist = "converter not exist"
14 | ConverterTypeWrong = "type of converter is wrong"
15 | )
16 |
17 | type (
18 | UnmarshaledKindMustBePtrError struct {
19 | dtoType reflect.Type
20 | }
21 |
22 | UnmarshalerItemKindError struct {
23 | dtoType reflect.Type
24 | }
25 |
26 | ConverterNotExistError struct {
27 | name string
28 | }
29 |
30 | ConverterTypeWrongError struct {
31 | name string
32 | methodType reflect.Type
33 | }
34 | )
35 |
36 | func NewUnmarshaledKindMustBePtrError(dtoType reflect.Type) *UnmarshaledKindMustBePtrError {
37 | return &UnmarshaledKindMustBePtrError{dtoType}
38 | }
39 |
40 | func NewUnmarshalerItemKindError(dtoType reflect.Type) *UnmarshalerItemKindError {
41 | return &UnmarshalerItemKindError{dtoType}
42 | }
43 |
44 | func NewConverterNotExistError(name string) *ConverterNotExistError {
45 | return &ConverterNotExistError{name}
46 | }
47 |
48 | func NewConverterTypeWrongError(name string, methodType reflect.Type) *ConverterTypeWrongError {
49 | return &ConverterTypeWrongError{name, methodType}
50 | }
51 |
52 | func (err UnmarshaledKindMustBePtrError) Error() string {
53 | return UnmarshaledKindMustBePtr + ": " + err.dtoType.String()
54 | }
55 |
56 | func (err UnmarshalerItemKindError) Error() string {
57 | return UnmarshalerItemKind + ": " + err.dtoType.String()
58 | }
59 |
60 | func (err ConverterNotExistError) Error() string {
61 | return ConverterNotExist + ": " + err.name
62 | }
63 |
64 | func (err ConverterTypeWrongError) Error() string {
65 | return fmt.Sprintf(ConverterTypeWrong+"(%s): %s", err.name, err.methodType)
66 | }
67 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 ZJU QSC
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://coveralls.io/github/Hexilee/unhtml)
2 | [](https://goreportcard.com/report/github.com/Hexilee/unhtml)
3 | [](https://travis-ci.org/Hexilee/unhtml)
4 | [](https://github.com/Hexilee/unhtml/blob/master/LICENSE)
5 | [](https://godoc.org/github.com/Hexilee/unhtml)
6 |
7 | Table of Contents
8 | =================
9 |
10 | * [Example & Performance](#example--performance)
11 | * [Tips & Features](#tips--features)
12 | * [Types](#types)
13 | * [Root](#root)
14 | * [Selector](#selector)
15 | * [Struct](#struct)
16 | * [Slice](#slice)
17 | * [Tags](#tags)
18 | * [html](#html)
19 | * [attr](#attr)
20 | * [converter](#converter)
21 |
22 |
23 | ### Example & Performance
24 |
25 | A HTML file
26 |
27 | ```html
28 |
29 |
30 |
31 |
32 | Title
33 |
34 |
35 |
36 |
37 | - 0
38 | - 1
39 | - 2
40 | - 3
41 |
42 |
43 |
Hexilee
44 |
20
45 |
true
46 |
47 |
Hello World!
48 |
10
49 |
3.14
50 |
true
51 |
52 |
53 |
54 | ```
55 |
56 | Read it
57 |
58 | ```go
59 | AllTypeHTML, _ := ioutil.ReadFile("testHTML/all-type.html")
60 | ```
61 |
62 | If we want to parse it and get the values we want, like the following structs, how should we do it?
63 |
64 |
65 | ```go
66 | package example
67 |
68 | type (
69 | PartTypesStruct struct {
70 | Slice []int
71 | Struct TestUser
72 | String string
73 | Int int
74 | Float64 float64
75 | Bool bool
76 | }
77 |
78 | TestUser struct {
79 | Name string
80 | Age uint
81 | LikeLemon bool
82 | }
83 | )
84 | ```
85 |
86 | In the traditional way, we should do it like this:
87 |
88 | ```go
89 | package example
90 |
91 | import (
92 | "bytes"
93 | "github.com/PuerkitoBio/goquery"
94 | "strconv"
95 | )
96 |
97 | func parsePartTypesLogically() (PartTypesStruct, error) {
98 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(AllTypeHTML))
99 | partTypes := PartTypesStruct{}
100 | if err == nil {
101 | selection := doc.Find(partTypes.Root())
102 | partTypes.Slice = make([]int, 0)
103 | selection.Find(`ul > li`).Each(func(i int, selection *goquery.Selection) {
104 | Int, parseErr := strconv.Atoi(selection.Text())
105 | if parseErr != nil {
106 | err = parseErr
107 | }
108 | partTypes.Slice = append(partTypes.Slice, Int)
109 | })
110 | if err == nil {
111 | partTypes.Struct.Name = selection.Find(`#test > div > p:nth-child(1)`).Text()
112 | Int, parseErr := strconv.Atoi(selection.Find(`#test > div > p:nth-child(2)`).Text())
113 | if err = parseErr; err == nil {
114 | partTypes.Struct.Age = uint(Int)
115 | Bool, parseErr := strconv.ParseBool(selection.Find(`#test > div > p:nth-child(3)`).Text())
116 | if err = parseErr; err == nil {
117 | partTypes.Struct.LikeLemon = Bool
118 |
119 | String := selection.Find(`#test > p:nth-child(3)`).Text()
120 | Int, parseErr := strconv.Atoi(selection.Find(`#test > p:nth-child(4)`).Text())
121 | if err = parseErr; err != nil {
122 | return partTypes, err
123 | }
124 |
125 | Float64, parseErr := strconv.ParseFloat(selection.Find(`#test > p:nth-child(5)`).Text(), 0)
126 | if err = parseErr; err != nil {
127 | return partTypes, err
128 | }
129 |
130 | Bool, parseErr := strconv.ParseBool(selection.Find(`#test > p:nth-child(6)`).Text())
131 | if err = parseErr; err != nil {
132 | return partTypes, err
133 | }
134 | partTypes.String = String
135 | partTypes.Int = Int
136 | partTypes.Float64 = Float64
137 | partTypes.Bool = Bool
138 | }
139 | }
140 | }
141 | }
142 | return partTypes, err
143 | }
144 |
145 | ```
146 |
147 | It works pretty well, but is boring. And now, you can do it like this:
148 |
149 | ```go
150 | package main
151 |
152 | import (
153 | "encoding/json"
154 | "fmt"
155 | "github.com/Hexilee/unhtml"
156 | "io/ioutil"
157 | )
158 |
159 | type (
160 | PartTypesStruct struct {
161 | Slice []int `html:"ul > li"`
162 | Struct TestUser `html:"#test > div"`
163 | String string `html:"#test > p:nth-child(3)"`
164 | Int int `html:"#test > p:nth-child(4)"`
165 | Float64 float64 `html:"#test > p:nth-child(5)"`
166 | Bool bool `html:"#test > p:nth-child(6)"`
167 | }
168 |
169 | TestUser struct {
170 | Name string `html:"p:nth-child(1)"`
171 | Age uint `html:"p:nth-child(2)"`
172 | LikeLemon bool `html:"p:nth-child(3)"`
173 | }
174 | )
175 |
176 | func (PartTypesStruct) Root() string {
177 | return "#test"
178 | }
179 |
180 | func main() {
181 | allTypes := PartTypesStruct{}
182 | _ := unhtml.Unmarshal(AllTypeHTML, &allTypes)
183 | result, _ := json.Marshal(&allTypes)
184 | fmt.Println(string(result))
185 | }
186 | ```
187 |
188 | Result:
189 |
190 | ```json
191 | {
192 | "Slice": [
193 | 0,
194 | 1,
195 | 2,
196 | 3
197 | ],
198 | "Struct": {
199 | "Name": "Hexilee",
200 | "Age": 20,
201 | "LikeLemon": true
202 | },
203 | "String": "Hello World!",
204 | "Int": 10,
205 | "Float64": 3.14,
206 | "Bool": true
207 | }
208 | ```
209 |
210 | I think it can really improve the efficiency of my development, but what about its performance?
211 |
212 | There are two benchmarks:
213 |
214 | ```go
215 | func BenchmarkUnmarshalPartTypes(b *testing.B) {
216 | assert.NotNil(b, AllTypeHTML)
217 | for i := 0; i < b.N; i++ {
218 | partTypes := PartTypesStruct{}
219 | assert.Nil(b, Unmarshal(AllTypeHTML, &partTypes))
220 | }
221 | }
222 |
223 | func BenchmarkParsePartTypesLogically(b *testing.B) {
224 | assert.NotNil(b, AllTypeHTML)
225 | for i := 0; i < b.N; i++ {
226 | _, err := parsePartTypesLogically()
227 | assert.Nil(b, err)
228 | }
229 | }
230 | ```
231 |
232 | Test it:
233 |
234 | ```bash
235 | > go test -bench=.
236 | goos: darwin
237 | goarch: amd64
238 | pkg: github.com/Hexilee/unhtml
239 | BenchmarkUnmarshalPartTypes-4 30000 54096 ns/op
240 | BenchmarkParsePartTypesLogically-4 30000 45188 ns/op
241 | PASS
242 | ok github.com/Hexilee/unhtml 4.098s
243 | ```
244 |
245 | Not very bad, in consideration of the small size of the demo HTML. In true development with more complicated HTML, their efficiency is almost the same.
246 |
247 | ### Tips & Features
248 |
249 | The only API this package exposes is the function:
250 |
251 | ```go
252 | func Unmarshal(data []byte, v interface{}) error
253 | ```
254 |
255 | which is compatible with the standard library's `json` and `xml`. However, you can do some jobs with the data types in your code.
256 |
257 | #### Types
258 |
259 | This package supports all kinds of type in the `reflect` package except `Ptr/Uintptr/Interface/Chan/Func`.
260 |
261 | The following fields are invalid and will cause `UnmarshalerItemKindError`.
262 |
263 | ```go
264 | type WrongFieldsStruct struct {
265 | Ptr *int
266 | Uintptr uintptr
267 | Interface io.Reader
268 | Chan chan int
269 | Func func()
270 | }
271 | ```
272 |
273 | However, when you call the function `Unmarshal`, you **MUST** pass a pointer, otherwise you will get an `UnmarshaledKindMustBePtrError`.
274 |
275 | ```go
276 | a := 1
277 |
278 | // Wrong
279 | Unmarshal([]byte(""), a)
280 |
281 | // Right
282 | Unmarshal([]byte(""), &a)
283 | ```
284 |
285 | #### Root
286 |
287 | Return the root selector.
288 |
289 | You are only supported to define a `Root() string` method for the root type, like
290 |
291 | ```go
292 | func (PartTypesStruct) Root() string {
293 | return "#test"
294 | }
295 | ```
296 |
297 | If you define it for a field type, such as `TestUser`
298 |
299 | ```go
300 | func (TestUser) Root() string {
301 | return "#test"
302 | }
303 | ```
304 |
305 | In this case, in `PartTypesStruct`, the field selector will be covered.
306 |
307 | ```go
308 | type (
309 | PartTypesStruct struct {
310 | ...
311 | Struct TestUser `html:"#test > div"`
312 | ...
313 | }
314 | )
315 |
316 | // real
317 | type (
318 | PartTypesStruct struct {
319 | ...
320 | Struct TestUser `html:"#test"`
321 | ...
322 | }
323 | )
324 | ```
325 |
326 |
327 |
328 | #### Selector
329 |
330 | This package is based on `github.com/PuerkitoBio/goquery` and supports standard css selectors.
331 |
332 | You can define selectors of a field in tags, like this:
333 |
334 | ```go
335 | type (
336 | PartTypesStruct struct {
337 | ...
338 | Int int `html:"#test > p:nth-child(4)"`
339 | ...
340 | }
341 | )
342 | ```
343 |
344 | In most cases, this package will find the `#test > p:nth-child(4)` element and try to parse its `innerText` as int.
345 |
346 | However, when the field type is `Struct` or `Slice`, it will be more complex.
347 |
348 | ##### Struct
349 |
350 | ```go
351 | type (
352 | PartTypesStruct struct {
353 | ...
354 | Struct TestUser `html:"#test > div"`
355 | ...
356 | }
357 |
358 | TestUser struct {
359 | Name string `html:"p:nth-child(1)"`
360 | Age uint `html:"p:nth-child(2)"`
361 | LikeLemon bool `html:"p:nth-child(3)"`
362 | }
363 | )
364 |
365 | func (PartTypesStruct) Root() string {
366 | return "#test"
367 | }
368 | ```
369 |
370 | First, it will call `*goquery.Selection.Find("#test")`, we get:
371 |
372 | ```html
373 |
374 |
375 | - 0
376 | - 1
377 | - 2
378 | - 3
379 |
380 |
381 |
Hexilee
382 |
20
383 |
true
384 |
385 |
Hello World!
386 |
10
387 |
3.14
388 |
true
389 |
390 | ```
391 |
392 | Then, it will call `*goquery.Selection.Find("#test > div")`, we get
393 |
394 | ```html
395 |
396 |
Hexilee
397 |
20
398 |
true
399 |
400 | ```
401 |
402 | Then, in `TestUser`, it will call
403 |
404 | ```go
405 | *goquery.Selection.Find("p:nth-child(1)") // as Name
406 | *goquery.Selection.Find("p:nth-child(2)") // as Age
407 | *goquery.Selection.Find("p:nth-child(3)") // as LikeLemon
408 | ```
409 |
410 | ##### Slice
411 |
412 | ```go
413 | type (
414 | PartTypesStruct struct {
415 | Slice []int `html:"ul > li"` ...
416 | }
417 | )
418 |
419 | func (PartTypesStruct) Root() string {
420 | return "#test"
421 | }
422 | ```
423 |
424 | As above, we get
425 |
426 | ```html
427 |
428 |
429 | - 0
430 | - 1
431 | - 2
432 | - 3
433 |
434 |
435 |
Hexilee
436 |
20
437 |
true
438 |
439 |
Hello World!
440 |
10
441 |
3.14
442 |
true
443 |
444 | ```
445 |
446 | Then it will call `*goquery.Selection.Find("ul > li")`, we get
447 |
448 | ```html
449 | 0
450 | 1
451 | 2
452 | 3
453 | ```
454 |
455 | Then, it will call `*goquery.Selection.Each(func(int, *goquery.Selection))`, iterate the list and parse values for slice.
456 |
457 | #### Tags
458 |
459 | This package supports three tags, `html`, `attr` and `converter`
460 |
461 | ##### html
462 |
463 | Provide the `css selector` of this field.
464 |
465 | ##### attr
466 |
467 | By default, this package regards the `innerText` of a element as its `value`
468 |
469 | ```html
470 | Google
471 | ```
472 |
473 | ```go
474 | type Link struct {
475 | Text string `html:"a"`
476 | }
477 | ```
478 |
479 | You will get `Text = Google`. However, what should we do if we want to get `href`?
480 |
481 | ```go
482 | type Link struct {
483 | Href string `html:"a" attr:"href"`
484 | Text string `html:"a"`
485 | }
486 | ```
487 |
488 | You will get `link.Href == "https://google.com"`
489 |
490 | ##### converter
491 |
492 | Sometimes, you want to process the original data
493 |
494 | ```html
495 | 2018-10-01 00:00:01
496 | ```
497 |
498 | You may unmarshal it like this
499 |
500 | ```go
501 | type Birthday struct {
502 | Time time.Time `html:"p"`
503 | }
504 |
505 | func TestConverter(t *testing.T) {
506 | birthday := Birthday{}
507 | assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday))
508 | assert.Equal(t, 2018, birthday.Time.Year())
509 | assert.Equal(t, time.October, birthday.Time.Month())
510 | assert.Equal(t, 1, birthday.Time.Day())
511 | }
512 | ```
513 |
514 | Absolutely, you will fail, because you don't define the way it converts a string to time.Time. `unhtml` will regard it as a struct.
515 |
516 | However, you can use `converter`
517 |
518 | ```go
519 | type Birthday struct {
520 | Time time.Time `html:"p" converter:"StringToTime"`
521 | }
522 |
523 | const TimeStandard = `2006-01-02 15:04:05`
524 |
525 | func (Birthday) StringToTime(str string) (time.Time, error) {
526 | return time.Parse(TimeStandard, str)
527 | }
528 |
529 | func TestConverter(t *testing.T) {
530 | birthday := Birthday{}
531 | assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday))
532 | assert.Equal(t, 2018, birthday.Time.Year())
533 | assert.Equal(t, time.October, birthday.Time.Month())
534 | assert.Equal(t, 1, birthday.Time.Day())
535 | }
536 | ```
537 |
538 | Make it.
539 |
540 | The type of converter **MUST** be
541 |
542 | ```go
543 | func (inputType) (resultType, error)
544 | ```
545 |
546 | `resultType` **MUST** be the same as the field type, and they can be any type.
547 |
548 | `inputType` **MUST NOT** violate the requirements in [Types](#types).
549 |
550 |
551 |
552 |
--------------------------------------------------------------------------------
/benchmark_test.go:
--------------------------------------------------------------------------------
1 | package unhtml
2 |
3 | import (
4 | "bytes"
5 | "github.com/PuerkitoBio/goquery"
6 | "github.com/stretchr/testify/assert"
7 | "strconv"
8 | "testing"
9 | )
10 |
11 | type (
12 | PartTypesStruct struct {
13 | Slice []int `html:"ul > li"`
14 | Struct TestUser `html:"#test > div"`
15 | String string `html:"#test > p:nth-child(3)"`
16 | Int int `html:"#test > p:nth-child(4)"`
17 | Float64 float64 `html:"#test > p:nth-child(5)"`
18 | Bool bool `html:"#test > p:nth-child(6)"`
19 | }
20 | )
21 |
22 | func (PartTypesStruct) Root() string {
23 | return "#test"
24 | }
25 |
26 | func BenchmarkUnmarshalCourses(b *testing.B) {
27 | assert.NotNil(b, CourseHTML)
28 | for i := 0; i < b.N; i++ {
29 | courses := make(Courses, 0)
30 | assert.Nil(b, Unmarshal(CourseHTML, &courses))
31 | }
32 | }
33 |
34 | func BenchmarkUnmarshalPartTypes(b *testing.B) {
35 | assert.NotNil(b, AllTypeHTML)
36 | for i := 0; i < b.N; i++ {
37 | partTypes := PartTypesStruct{}
38 | assert.Nil(b, Unmarshal(AllTypeHTML, &partTypes))
39 | }
40 | }
41 |
42 | func BenchmarkParseCoursesLogically(b *testing.B) {
43 | assert.NotNil(b, CourseHTML)
44 | for i := 0; i < b.N; i++ {
45 | _, err := parseCoursesLogically()
46 | assert.Nil(b, err)
47 | }
48 | }
49 |
50 | func BenchmarkParsePartTypesLogically(b *testing.B) {
51 | assert.NotNil(b, AllTypeHTML)
52 | for i := 0; i < b.N; i++ {
53 | _, err := parsePartTypesLogically()
54 | assert.Nil(b, err)
55 | }
56 | }
57 |
58 | func getLink(selection *goquery.Selection) Link {
59 | link, _ := selection.Attr(AttrHref)
60 | return Link{Text: selection.Text(), Href: link}
61 | }
62 |
63 | func parseCoursesLogically() (Courses, error) {
64 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(CourseHTML))
65 | courses := make(Courses, 0)
66 | if err == nil {
67 | doc.Find(courses.Root()).Each(func(i int, selection *goquery.Selection) {
68 | course := Course{}
69 | course.Code = getLink(selection.Find(`td:nth-child(1) > a`))
70 | course.Name = getLink(selection.Find(`td:nth-child(2) > a`))
71 | course.Teacher = getLink(selection.Find(`td:nth-child(3) > a`))
72 | course.Semester = selection.Find(`td:nth-child(4)`).Text()
73 | course.Time = selection.Find(`td:nth-child(5)`).Text()
74 | course.Location = selection.Find(`td:nth-child(6)`).Text()
75 | courses = append(courses, course)
76 | })
77 | }
78 |
79 | return courses, err
80 | }
81 |
82 | func parsePartTypesLogically() (PartTypesStruct, error) {
83 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(AllTypeHTML))
84 | partTypes := PartTypesStruct{}
85 | if err == nil {
86 | selection := doc.Find(partTypes.Root())
87 | partTypes.Slice = make([]int, 0)
88 | selection.Find(`ul > li`).Each(func(i int, selection *goquery.Selection) {
89 | Int, parseErr := strconv.Atoi(selection.Text())
90 | if parseErr != nil {
91 | err = parseErr
92 | }
93 | partTypes.Slice = append(partTypes.Slice, Int)
94 | })
95 | if err == nil {
96 | partTypes.Struct.Name = selection.Find(`#test > div > p:nth-child(1)`).Text()
97 | Int, parseErr := strconv.Atoi(selection.Find(`#test > div > p:nth-child(2)`).Text())
98 | if err = parseErr; err == nil {
99 | partTypes.Struct.Age = uint(Int)
100 | Bool, parseErr := strconv.ParseBool(selection.Find(`#test > div > p:nth-child(3)`).Text())
101 | if err = parseErr; err == nil {
102 | partTypes.Struct.LikeLemon = Bool
103 |
104 | String := selection.Find(`#test > p:nth-child(3)`).Text()
105 | Int, parseErr := strconv.Atoi(selection.Find(`#test > p:nth-child(4)`).Text())
106 | if err = parseErr; err != nil {
107 | return partTypes, err
108 | }
109 |
110 | Float64, parseErr := strconv.ParseFloat(selection.Find(`#test > p:nth-child(5)`).Text(), 0)
111 | if err = parseErr; err != nil {
112 | return partTypes, err
113 | }
114 |
115 | Bool, parseErr := strconv.ParseBool(selection.Find(`#test > p:nth-child(6)`).Text())
116 | if err = parseErr; err != nil {
117 | return partTypes, err
118 | }
119 | partTypes.String = String
120 | partTypes.Int = Int
121 | partTypes.Float64 = Float64
122 | partTypes.Bool = Bool
123 | }
124 | }
125 | }
126 | }
127 |
128 | return partTypes, err
129 | }
130 |
--------------------------------------------------------------------------------
/examples_test.go:
--------------------------------------------------------------------------------
1 | package unhtml
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | )
7 |
8 | const (
9 | AllTypesHTML = `
10 |
11 |
12 |
13 |
14 | Title
15 |
16 |
17 |
18 |
19 | - 0
20 | - 1
21 | - 2
22 | - 3
23 |
24 |
25 |
Hexilee
26 |
20
27 |
true
28 |
29 |
Hello World!
30 |
10
31 |
3.14
32 |
true
33 |
34 |
35 |
36 | `
37 | )
38 |
39 | func ExampleUnmarshal() {
40 | allTypes := AllTypeTest{}
41 | _ = Unmarshal(AllTypeHTML, &allTypes)
42 | result, _ := json.Marshal(&allTypes)
43 | fmt.Println(string(result))
44 | // Output:
45 | // {"Slice":[0,1,2,3],"Struct":{"Name":"Hexilee","Age":20,"LikeLemon":true},"String":"Hello World!","Int":10,"Int8":10,"Int16":10,"Int32":10,"Int64":10,"Uint":10,"Uint8":10,"Uint16":10,"Uint32":10,"Uint64":10,"Float32":3.14,"Float64":3.14,"Bool":true}
46 | }
47 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/Hexilee/unhtml
2 |
3 | require (
4 | github.com/PuerkitoBio/goquery v1.4.1
5 | github.com/andybalholm/cascadia v1.0.0 // indirect
6 | github.com/davecgh/go-spew v1.1.1 // indirect
7 | github.com/pmezard/go-difflib v1.0.0 // indirect
8 | github.com/stretchr/testify v1.2.2
9 | golang.org/x/net v0.0.0-20180926154720-4dfa2610cdf3 // indirect
10 | )
11 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.4.1 h1:smcIRGdYm/w7JSbcdeLHEMzxmsBQvl8lhf0dSw2nzMI=
2 | github.com/PuerkitoBio/goquery v1.4.1/go.mod h1:T9ezsOHcCrDCgA8aF1Cqr3sSYbO/xgdy8/R/XiIMAhA=
3 | github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
4 | github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
5 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
6 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
7 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
9 | github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
10 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
11 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
12 | golang.org/x/net v0.0.0-20180926154720-4dfa2610cdf3 h1:dgd4x4kJt7G4k4m93AYLzM8Ni6h2qLTfh9n9vXJT3/0=
13 | golang.org/x/net v0.0.0-20180926154720-4dfa2610cdf3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
14 |
--------------------------------------------------------------------------------
/html.go:
--------------------------------------------------------------------------------
1 | package unhtml
2 |
3 | import (
4 | "bytes"
5 | "github.com/PuerkitoBio/goquery"
6 | "reflect"
7 | "strconv"
8 | )
9 |
10 | type (
11 | // HTMLUnmarshalerBuilder: inner hidden
12 | HTMLUnmarshalerBuilder struct {
13 | dto reflect.Value
14 | kind reflect.Kind
15 | dtoElemType reflect.Type
16 | selection *goquery.Selection
17 | selector string
18 | attrKey string
19 | }
20 |
21 | // HTMLUnmarshaler: inner hidden
22 | HTMLUnmarshaler struct {
23 | dto reflect.Value
24 | kind reflect.Kind
25 | dtoElemType reflect.Type
26 | selection goquery.Selection
27 | selector string
28 | attrKey string
29 | }
30 |
31 | // HTMLModel: HTML model with root selector
32 | HTMLModel interface {
33 | // Root return root selector
34 | Root() string
35 | }
36 | )
37 |
38 | const (
39 | SelectorKey = "html"
40 | AttrKey = "attr"
41 | ConverterKey = "converter"
42 | ZeroStr = ""
43 | )
44 |
45 | const (
46 | AttrHref = "href"
47 | )
48 |
49 | func Unmarshal(data []byte, v interface{}) error {
50 | doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
51 | if err == nil {
52 | err = unmarshal(reflect.ValueOf(v), *doc.Selection, "")
53 | }
54 | return err
55 | }
56 |
57 | func unmarshal(ptr reflect.Value, selection goquery.Selection, tag reflect.StructTag) (err error) {
58 | realUnmarshal, buildErr := new(HTMLUnmarshalerBuilder).
59 | setDto(ptr).
60 | setSelection(&selection).
61 | setSelector(tag.Get(SelectorKey)).
62 | setAttrKey(tag.Get(AttrKey)).
63 | build()
64 |
65 | if err = buildErr; err == nil {
66 | err = realUnmarshal.unmarshal()
67 | }
68 | return err
69 | }
70 |
71 | func (builder *HTMLUnmarshalerBuilder) build() (unmarshaler *HTMLUnmarshaler, err error) {
72 | if err = builder.initRoot(); err == nil {
73 | if err = builder.parseType(); err == nil {
74 | if err = builder.checkBeforeReturn(); err == nil {
75 | unmarshaler = &HTMLUnmarshaler{
76 | dto: builder.dto,
77 | kind: builder.kind,
78 | dtoElemType: builder.dtoElemType,
79 | selection: *builder.selection,
80 | selector: builder.selector,
81 | attrKey: builder.attrKey,
82 | }
83 | }
84 | }
85 | }
86 | return
87 | }
88 |
89 | func (builder *HTMLUnmarshalerBuilder) setDto(v reflect.Value) *HTMLUnmarshalerBuilder {
90 | builder.dto = v
91 | return builder
92 | }
93 |
94 | func (builder *HTMLUnmarshalerBuilder) setSelector(selector string) *HTMLUnmarshalerBuilder {
95 | builder.selector = selector
96 | return builder
97 | }
98 |
99 | func (builder *HTMLUnmarshalerBuilder) setAttrKey(attrKey string) *HTMLUnmarshalerBuilder {
100 | builder.attrKey = attrKey
101 | return builder
102 | }
103 |
104 | func (builder *HTMLUnmarshalerBuilder) setSelection(selection *goquery.Selection) *HTMLUnmarshalerBuilder {
105 | builder.selection = selection
106 | return builder
107 | }
108 |
109 | func (builder *HTMLUnmarshalerBuilder) initRoot() (err error) {
110 | //if err = builder.checkDtoZero(); err == nil {
111 | if value, ok := builder.dto.Interface().(HTMLModel); ok {
112 | builder.selector = value.Root()
113 | }
114 | //}
115 | return
116 | }
117 |
118 | func (builder *HTMLUnmarshalerBuilder) parseType() (err error) {
119 | //if err = builder.checkDtoZero(); err == nil {
120 | dtoType := builder.dto.Type()
121 | switch dtoType.Kind() {
122 | case reflect.Ptr:
123 | builder.dtoElemType = dtoType.Elem()
124 | builder.kind = builder.dtoElemType.Kind()
125 | default:
126 | err = NewUnmarshaledKindMustBePtrError(dtoType)
127 | }
128 | //}
129 |
130 | return
131 | }
132 |
133 | func (builder *HTMLUnmarshalerBuilder) checkItemKind() (err error) {
134 | switch builder.kind {
135 | case reflect.Ptr:
136 | fallthrough
137 | case reflect.Uintptr:
138 | fallthrough
139 | case reflect.Interface:
140 | fallthrough
141 | case reflect.Chan:
142 | fallthrough
143 | case reflect.Func:
144 | fallthrough
145 | case reflect.Map:
146 | err = NewUnmarshalerItemKindError(builder.dtoElemType)
147 | default:
148 | }
149 | return
150 | }
151 |
152 | func (builder *HTMLUnmarshalerBuilder) checkBeforeReturn() (err error) {
153 | //if err = builder.checkDtoZero(); err == nil {
154 | // if err = builder.checkSelectionNil(); err == nil {
155 | err = builder.checkItemKind()
156 | //}
157 | //}
158 | return
159 | }
160 |
161 | // never return err in production env
162 | //func (builder *HTMLUnmarshalerBuilder) checkDtoZero() (err error) {
163 | // // Zero reflect.Value
164 | // if isZero(builder.dto) {
165 | // err = errors.New(DtoZero)
166 | // }
167 | // return
168 | //}
169 |
170 | // never return err in production env
171 | //func (builder *HTMLUnmarshalerBuilder) checkSelectionNil() (err error) {
172 | // if builder.selection == nil {
173 | // err = errors.New(SelectionNil)
174 | // }
175 | // return
176 | //}
177 |
178 | func (unmarshaler HTMLUnmarshaler) getSelection() goquery.Selection {
179 | return unmarshaler.selection
180 | }
181 |
182 | func (unmarshaler HTMLUnmarshaler) getSelector() string {
183 | return unmarshaler.selector
184 | }
185 |
186 | func (unmarshaler HTMLUnmarshaler) getAttrKey() string {
187 | return unmarshaler.attrKey
188 | }
189 |
190 | func (unmarshaler HTMLUnmarshaler) getDto() reflect.Value {
191 | return unmarshaler.dto
192 | }
193 |
194 | func (unmarshaler HTMLUnmarshaler) getKind() reflect.Kind {
195 | return unmarshaler.kind
196 | }
197 |
198 | func (unmarshaler HTMLUnmarshaler) getDtoElemType() reflect.Type {
199 | return unmarshaler.dtoElemType
200 | }
201 |
202 | func (unmarshaler HTMLUnmarshaler) unmarshalSlice(preSelection goquery.Selection) (err error) {
203 | itemType := unmarshaler.getDtoElemType().Elem()
204 | sliceValue := reflect.MakeSlice(reflect.SliceOf(itemType), 0, 0)
205 | preSelection.Each(func(i int, selection *goquery.Selection) {
206 | newItem := reflect.New(itemType)
207 | if err = unmarshal(newItem, *selection, ""); err == nil {
208 | sliceValue = reflect.Append(sliceValue, newItem.Elem())
209 | }
210 | })
211 | unmarshaler.getDto().Elem().Set(sliceValue)
212 | return err
213 | }
214 |
215 | func (unmarshaler HTMLUnmarshaler) callConverter(converter string, fieldIndex int, preSelection goquery.Selection) (result reflect.Value, err error) {
216 | motherValue := unmarshaler.getDto().Elem()
217 | motherType := unmarshaler.getDtoElemType()
218 | tag := motherType.Field(fieldIndex).Tag
219 | resultType := motherType.Field(fieldIndex).Type
220 | method, exist := motherType.MethodByName(converter)
221 | if !exist {
222 | err = NewConverterNotExistError(converter)
223 | }
224 | if err == nil {
225 | methodValue := motherValue.MethodByName(converter)
226 | inputValuePtr, converterTypeErr := checkConverter(method.Name, methodValue.Type(), resultType)
227 | if err = converterTypeErr; err == nil {
228 | if err = unmarshal(inputValuePtr, preSelection, tag); err == nil {
229 | results := methodValue.Call([]reflect.Value{inputValuePtr.Elem()})
230 | if errInterface := results[1].Interface(); errInterface != nil {
231 | err = errInterface.(error)
232 | }
233 | if err == nil {
234 | result = results[0]
235 | }
236 | }
237 | }
238 | }
239 | return
240 | }
241 |
242 | func (unmarshaler HTMLUnmarshaler) unmarshalStruct(preSelection goquery.Selection) (err error) {
243 | motherValue := unmarshaler.getDto().Elem()
244 | motherType := unmarshaler.getDtoElemType()
245 | for i := 0; i < motherValue.NumField(); i++ {
246 | field := motherValue.Field(i)
247 | if field.CanSet() {
248 | fieldPtr := field.Addr()
249 | tag := motherType.Field(i).Tag
250 | if converter := tag.Get(ConverterKey); converter != ZeroStr {
251 | result, callConverterErr := unmarshaler.callConverter(converter, i, preSelection)
252 | if err = callConverterErr; err == nil {
253 | fieldPtr.Elem().Set(result)
254 | }
255 | } else {
256 | err = unmarshal(fieldPtr, preSelection, tag)
257 | }
258 |
259 | if err != nil {
260 | break
261 | }
262 | }
263 | }
264 | return
265 | }
266 |
267 | func (unmarshaler HTMLUnmarshaler) unmarshal() (err error) {
268 | preSelection := unmarshaler.getSelection()
269 | if unmarshaler.getSelector() != ZeroStr {
270 | preSelection = *preSelection.Find(unmarshaler.getSelector())
271 | }
272 | switch unmarshaler.getKind() {
273 | case reflect.Slice:
274 | err = unmarshaler.unmarshalSlice(preSelection)
275 | case reflect.Struct:
276 | err = unmarshaler.unmarshalStruct(preSelection)
277 | case reflect.String:
278 | unmarshaler.getDto().Elem().SetString(unmarshaler.getAttrValue(preSelection))
279 | case reflect.Int:
280 | fallthrough
281 | case reflect.Int8:
282 | fallthrough
283 | case reflect.Int16:
284 | fallthrough
285 | case reflect.Int32:
286 | fallthrough
287 | case reflect.Int64:
288 | valueStr := unmarshaler.getAttrValue(preSelection)
289 | value, err := strconv.Atoi(valueStr)
290 | if err == nil {
291 | unmarshaler.getDto().Elem().SetInt(int64(value))
292 | }
293 | case reflect.Uint:
294 | fallthrough
295 | case reflect.Uint8:
296 | fallthrough
297 | case reflect.Uint16:
298 | fallthrough
299 | case reflect.Uint32:
300 | fallthrough
301 | case reflect.Uint64:
302 | valueStr := unmarshaler.getAttrValue(preSelection)
303 | value, err := strconv.ParseUint(valueStr, 0, 0)
304 | if err == nil {
305 | unmarshaler.getDto().Elem().SetUint(value)
306 | }
307 | case reflect.Float32:
308 | fallthrough
309 | case reflect.Float64:
310 | valueStr := unmarshaler.getAttrValue(preSelection)
311 | value, err := strconv.ParseFloat(valueStr, 0)
312 | if err == nil {
313 | unmarshaler.getDto().Elem().SetFloat(value)
314 | }
315 | case reflect.Bool:
316 | valueStr := unmarshaler.getAttrValue(preSelection)
317 | value, err := strconv.ParseBool(valueStr)
318 | if err == nil {
319 | unmarshaler.getDto().Elem().SetBool(value)
320 | }
321 | }
322 |
323 | return err
324 | }
325 |
326 | func (unmarshaler HTMLUnmarshaler) getAttrValue(selection goquery.Selection) (valueStr string) {
327 | if unmarshaler.getAttrKey() == ZeroStr {
328 | valueStr = selection.Text()
329 | } else {
330 | if str, exist := selection.Attr(unmarshaler.getAttrKey()); exist {
331 | valueStr = str
332 | }
333 | }
334 | return
335 | }
336 |
--------------------------------------------------------------------------------
/html_test.go:
--------------------------------------------------------------------------------
1 | package unhtml
2 |
3 | import (
4 | "encoding/json"
5 | "errors"
6 | "github.com/stretchr/testify/assert"
7 | "io/ioutil"
8 | "reflect"
9 | "testing"
10 | "time"
11 | )
12 |
13 | const (
14 | CoursesJSON = `[{"code":{"text":"061B0020","href":"#"},"name":{"text":"复变函数与积分变换","href":"#"},"teacher":{"text":"王伟","href":"#"},"semester":"秋","time":"周一第1,2节周四第1,2节","location":"紫金港西2-205(多)紫金港西2-205(多)"},{"code":{"text":"101C0350","href":"#"},"name":{"text":"电路与模拟电子技术","href":"#"},"teacher":{"text":"孙盾","href":"#"},"semester":"秋冬","time":"周二第6,7节周二第8节{单周}周五第3,4,5节","location":"紫金港西1-417(多)紫金港西1-417(多)紫金港西1-417(多)"},{"code":{"text":"101C0360","href":"#"},"name":{"text":"电路与模拟电子技术实验","href":"#"},"teacher":{"text":"干于","href":"#"},"semester":"秋冬","time":"周四第3,4,5节","location":"紫金港东3-202"},{"code":{"text":"241L0020","href":"#"},"name":{"text":"博弈论基础","href":"#"},"teacher":{"text":"蒋文华","href":"#"},"semester":"冬","time":"周三第6,7,8节","location":"紫金港西1-316(多)*"},{"code":{"text":"261C0070","href":"#"},"name":{"text":"工程力学","href":"#"},"teacher":{"text":"吴禹季葆华","href":"#"},"semester":"秋冬","time":"周二第1,2节{单周}周四第6,7节周四第8节{双周}","location":"紫金港西1-404(多)紫金港西1-404(多)紫金港西1-404(多)"},{"code":{"text":"74188020","href":"#"},"name":{"text":"专业实习","href":"#"},"teacher":{"text":"陈家旺黄豪彩","href":"#"},"semester":"短","time":" ","location":" "},{"code":{"text":"761T0010","href":"#"},"name":{"text":"大学物理(甲)Ⅰ","href":"#"},"teacher":{"text":"潘国卫","href":"#"},"semester":"秋冬","time":"周六第6,7,8,9节","location":"紫金港西2-101(多)"},{"code":{"text":"761T0020","href":"#"},"name":{"text":"大学物理(甲)Ⅱ","href":"#"},"teacher":{"text":"郑大方","href":"#"},"semester":"秋冬","time":"周一第3,4节周三第1,2节","location":"紫金港西2-202(多)#"},{"code":{"text":"821T0020","href":"#"},"name":{"text":"微积分(甲)Ⅱ","href":"#"},"teacher":{"text":"薛儒英","href":"#"},"semester":"秋冬","time":"周六第1,2,3,4节{单周}周六第1,2,3,4,5节{双周}","location":"紫金港西2-105(多)"}]`
15 | AllTypesJSON = `{"Slice":[0,1,2,3],"Struct":{"Name":"Hexilee","Age":20,"LikeLemon":true},"String":"Hello World!","Int":10,"Int8":10,"Int16":10,"Int32":10,"Int64":10,"Uint":10,"Uint8":10,"Uint16":10,"Uint32":10,"Uint64":10,"Float32":3.14,"Float64":3.14,"Bool":true}`
16 | TestError = "test error"
17 | BirthdayHTML = `2018-10-01 00:00:01
`
18 | TimeStandard = `2006-01-02 15:04:05`
19 | )
20 |
21 | var (
22 | CourseHTML, _ = ioutil.ReadFile("testHTML/courses.html")
23 | AllTypeHTML, _ = ioutil.ReadFile("testHTML/all-type.html")
24 | )
25 |
26 | type (
27 | Link struct {
28 | Text string `json:"text"`
29 | Href string `attr:"href" json:"href"`
30 | }
31 |
32 | Course struct {
33 | Code Link `html:"td:nth-child(1) > a" json:"code"`
34 | Name Link `html:"td:nth-child(2) > a" json:"name"`
35 | Teacher Link `html:"td:nth-child(3) > a" json:"teacher"`
36 | Semester string `html:"td:nth-child(4)" json:"semester"`
37 | Time string `html:"td:nth-child(5)" json:"time"`
38 | Location string `html:"td:nth-child(6)" json:"location"`
39 | }
40 |
41 | Courses []Course
42 |
43 | AllTypeTest struct {
44 | Slice []int `html:"ul > li"`
45 | Struct TestUser `html:"#test > div"`
46 | String string `html:"#test > p:nth-child(3)"`
47 | Int int `html:"#test > p:nth-child(4)"`
48 | Int8 int8 `html:"#test > p:nth-child(4)"`
49 | Int16 int16 `html:"#test > p:nth-child(4)"`
50 | Int32 int32 `html:"#test > p:nth-child(4)"`
51 | Int64 int64 `html:"#test > p:nth-child(4)"`
52 | Uint uint `html:"#test > p:nth-child(4)"`
53 | Uint8 uint8 `html:"#test > p:nth-child(4)"`
54 | Uint16 uint16 `html:"#test > p:nth-child(4)"`
55 | Uint32 uint32 `html:"#test > p:nth-child(4)"`
56 | Uint64 uint64 `html:"#test > p:nth-child(4)"`
57 | Float32 float32 `html:"#test > p:nth-child(5)"`
58 | Float64 float64 `html:"#test > p:nth-child(5)"`
59 | Bool bool `html:"#test > p:nth-child(6)"`
60 | }
61 |
62 | TestUser struct {
63 | Name string `html:"p:nth-child(1)"`
64 | Age uint `html:"p:nth-child(2)"`
65 | LikeLemon bool `html:"p:nth-child(3)"`
66 | }
67 |
68 | WrongTypes struct {
69 | WrongStruct *TestUser `html:"div"`
70 | }
71 |
72 | ConverterTest struct {
73 | ConvertedStruct map[string]interface{} `html:"div" converter:"TestUserToMap"`
74 | }
75 |
76 | ConverterNotExistTest struct {
77 | Foo int `html:"div" converter:"NotExistMethod"`
78 | }
79 |
80 | ConverterTypeWrongTest struct {
81 | Foo string `html:"div" converter:"WrongResultTypeMethod"`
82 | }
83 |
84 | ConverterReturnErrTest struct {
85 | Foo []string `html:"#test > p:nth-child(3)" converter:"ReturnErrorMethod"`
86 | }
87 |
88 | Birthday struct {
89 | Time time.Time `html:"p" converter:"StringToTime"`
90 | }
91 | )
92 |
93 | func (Courses) Root() string {
94 | return "#xsgrid > tbody > tr:nth-child(1n+2)"
95 | }
96 |
97 | func (AllTypeTest) Root() string {
98 | return "#test"
99 | }
100 |
101 | func (WrongTypes) Root() string {
102 | return "#test"
103 | }
104 |
105 | func (ConverterTest) Root() string {
106 | return "#test"
107 | }
108 |
109 | func (ConverterTest) TestUserToMap(user TestUser) (map[string]interface{}, error) {
110 | return map[string]interface{}{
111 | "name": user.Name,
112 | "age": user.Age,
113 | "like_lemon": user.LikeLemon,
114 | }, nil
115 | }
116 |
117 | func (ConverterTypeWrongTest) WrongResultTypeMethod(user TestUser) (Int int, err error) {
118 | return
119 | }
120 |
121 | func (ConverterReturnErrTest) ReturnErrorMethod(input string) (result []string, err error) {
122 | return []string{input}, errors.New(TestError)
123 | }
124 |
125 | func (Birthday) StringToTime(str string) (time.Time, error) {
126 | return time.Parse(TimeStandard, str)
127 | }
128 |
129 | func TestUnmarshal(t *testing.T) {
130 | assert.NotNil(t, CourseHTML)
131 | courses := make(Courses, 0)
132 | assert.Nil(t, Unmarshal(CourseHTML, &courses))
133 | result, err := json.Marshal(courses)
134 | assert.Nil(t, err)
135 | assert.Equal(t, CoursesJSON, string(result))
136 |
137 | assert.NotNil(t, AllTypeHTML)
138 | allTypes := AllTypeTest{}
139 | assert.Nil(t, Unmarshal(AllTypeHTML, &allTypes))
140 | result, err = json.Marshal(&allTypes)
141 | assert.Nil(t, err)
142 | assert.Equal(t, AllTypesJSON, string(result))
143 | }
144 |
145 | func TestBuilderErr(t *testing.T) {
146 | assert.NotNil(t, CourseHTML)
147 | courses := make(Courses, 0)
148 | err := Unmarshal(CourseHTML, courses)
149 | assert.NotNil(t, err)
150 | assert.Equal(t, NewUnmarshaledKindMustBePtrError(reflect.TypeOf(courses)).Error(), err.Error())
151 |
152 | assert.NotNil(t, AllTypeHTML)
153 | wrongTypes := WrongTypes{}
154 | err = Unmarshal(AllTypeHTML, &wrongTypes)
155 | assert.NotNil(t, err)
156 | assert.Equal(t, NewUnmarshalerItemKindError(reflect.TypeOf(new(TestUser))).Error(), err.Error())
157 |
158 | }
159 |
160 | func TestConverter(t *testing.T) {
161 | assert.NotNil(t, AllTypeHTML)
162 | convertedStruct := ConverterTest{}
163 | assert.Nil(t, Unmarshal(AllTypeHTML, &convertedStruct))
164 | assert.Equal(t, "Hexilee", convertedStruct.ConvertedStruct["name"])
165 |
166 | assert.NotNil(t, AllTypeHTML)
167 | converterNotExistTest := ConverterNotExistTest{}
168 | err := Unmarshal(AllTypeHTML, &converterNotExistTest)
169 | assert.NotNil(t, err)
170 | assert.Equal(t, NewConverterNotExistError("NotExistMethod").Error(), err.Error())
171 |
172 | assert.NotNil(t, AllTypeHTML)
173 | converterTypeWrongTest := ConverterTypeWrongTest{}
174 | err = Unmarshal(AllTypeHTML, &converterTypeWrongTest)
175 | assert.NotNil(t, err)
176 | assert.Equal(t, NewConverterTypeWrongError("WrongResultTypeMethod", reflect.ValueOf(converterTypeWrongTest).MethodByName("WrongResultTypeMethod").Type()).Error(), err.Error())
177 |
178 | assert.NotNil(t, AllTypeHTML)
179 | converterReturnErrTest := ConverterReturnErrTest{}
180 | err = Unmarshal(AllTypeHTML, &converterReturnErrTest)
181 | assert.NotNil(t, err)
182 | assert.Equal(t, TestError, err.Error())
183 |
184 | birthday := Birthday{}
185 | assert.Nil(t, Unmarshal([]byte(BirthdayHTML), &birthday))
186 | assert.Equal(t, 2018, birthday.Time.Year())
187 | assert.Equal(t, time.October, birthday.Time.Month())
188 | assert.Equal(t, 1, birthday.Time.Day())
189 | }
190 |
--------------------------------------------------------------------------------
/testHTML/all-type.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Title
6 |
7 |
8 |
9 |
10 | - 0
11 | - 1
12 | - 2
13 | - 3
14 |
15 |
16 |
Hexilee
17 |
20
18 |
true
19 |
20 |
Hello World!
21 |
10
22 |
3.14
23 |
true
24 |
25 |
26 |
--------------------------------------------------------------------------------
/testHTML/courses.html:
--------------------------------------------------------------------------------
1 | !DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
2 |
3 |
4 | Title
5 |
6 |
7 |
8 |
9 | 课程代码 |
10 | 课程名称 |
11 | 教师姓名 |
12 | 学期 |
13 | 上课时间 |
14 | 上课地点 |
15 | 选课时间 |
16 | 选课志愿 |
17 |
18 |
19 | 061B0020
20 | |
21 | 复变函数与积分变换
22 | |
23 | 王伟
24 | |
25 | 秋 |
26 | 周一第1,2节 周四第1,2节 |
27 | 紫金港西2-205(多) 紫金港西2-205(多) |
28 | 2018-06-13 14:48:52 |
29 | 1 |
30 |
31 |
32 | 101C0350
33 | |
34 | 电路与模拟电子技术
35 | |
36 | 孙盾
37 | |
38 | 秋冬 |
39 | 周二第6,7节 周二第8节{单周} 周五第3,4,5节 |
40 | 紫金港西1-417(多) 紫金港西1-417(多) 紫金港西1-417(多) |
41 | 2018-06-13 14:49:35 |
42 | 1 |
43 |
44 |
45 | 101C0360
46 | |
47 | 电路与模拟电子技术实验
48 | |
49 | 干于
50 | |
51 | 秋冬 |
52 | 周四第3,4,5节 |
53 | 紫金港东3-202 |
54 | 2018-06-13 14:52:00 |
55 | 1 |
56 |
57 |
58 | 241L0020
59 | |
60 | 博弈论基础
61 | |
62 | 蒋文华
63 | |
64 | 冬 |
65 | 周三第6,7,8节 |
66 | 紫金港西1-316(多)* |
67 | 2018-06-13 15:19:08 |
68 | 1 |
69 |
70 |
71 | 261C0070
72 | |
73 | 工程力学
74 | |
75 | 吴禹 季葆华
76 | |
77 | 秋冬 |
78 | 周二第1,2节{单周} 周四第6,7节 周四第8节{双周} |
79 | 紫金港西1-404(多) 紫金港西1-404(多) 紫金港西1-404(多) |
80 | 2018-06-13 14:55:19 |
81 | 1 |
82 |
83 |
84 | 74188020
85 | |
86 | 专业实习
87 | |
88 | 陈家旺 黄豪彩
89 | |
90 | 短 |
91 | |
92 | |
93 | 2018-06-13 20:07:34 |
94 | 1 |
95 |
96 |
97 | 761T0010
98 | |
99 | 大学物理(甲)Ⅰ
100 | |
101 | 潘国卫
102 | |
103 | 秋冬 |
104 | 周六第6,7,8,9节 |
105 | 紫金港西2-101(多) |
106 | 2018-09-14 13:03:15 |
107 | 1 |
108 |
109 |
110 | 761T0020
111 | |
112 | 大学物理(甲)Ⅱ
113 | |
114 | 郑大方
115 | |
116 | 秋冬 |
117 | 周一第3,4节 周三第1,2节 |
118 | 紫金港西2-202(多)# |
119 | 2018-06-13 14:43:03 |
120 | 1 |
121 |
122 |
123 | 821T0020
124 | |
125 | 微积分(甲)Ⅱ
126 | |
127 | 薛儒英
128 | |
129 | 秋冬 |
130 | 周六第1,2,3,4节{单周} 周六第1,2,3,4,5节{双周} |
131 | 紫金港西2-105(多) |
132 | 2018-09-14 13:01:47 |
133 | 1 |
134 |
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
/utils.go:
--------------------------------------------------------------------------------
1 | package unhtml
2 |
3 | import (
4 | "reflect"
5 | )
6 |
7 | const (
8 | ErrorMethodName = "Error"
9 | )
10 |
11 | // cannot use it for reference kind (Ptr, Interface, Func, Map, Slice)
12 | //func isZero(v interface{}) bool {
13 | // return v == reflect.Zero(reflect.TypeOf(v)).Interface()
14 | //}
15 |
16 | // Converter: Func (inputType) -> (resultType, error)
17 | func checkConverter(methodName string, methodType reflect.Type, expectResultType reflect.Type) (inputValuePtr reflect.Value, err error) {
18 | err = NewConverterTypeWrongError(methodName, methodType)
19 | if methodType.NumIn() == 1 &&
20 | methodType.NumOut() == 2 &&
21 | methodType.Out(0) == expectResultType {
22 | if _, exist := methodType.Out(1).MethodByName(ErrorMethodName); exist {
23 | inputValuePtr = reflect.New(methodType.In(0))
24 | err = nil
25 | }
26 | }
27 | return
28 | }
29 |
--------------------------------------------------------------------------------