├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.md ├── _examples ├── advance │ └── main.go ├── basic │ └── main.go ├── config │ └── main.go ├── http │ └── main.go └── pages │ └── demo.html ├── builtin.go ├── builtin_functions.go ├── builtin_functions_test.go ├── builtin_selections.go ├── builtin_selections_test.go ├── config.go ├── doc.go ├── doc_test.go ├── extensions ├── markdown │ ├── markdown.go │ └── markdown_test.go └── ugchtml │ └── ugchtml.go ├── go.mod ├── go.sum ├── grammar.png ├── pagser.go ├── pagser_test.go ├── parse.go ├── parse_test.go ├── tokenizer.go ├── tokenizer_test.go ├── utils.go └── utils_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.11.x 5 | - tip 6 | 7 | before_install: 8 | - go get -t -v ./... 
9 | 10 | script: 11 | - go test -race -coverprofile=coverage.txt -covermode=atomic 12 | 13 | after_success: 14 | - bash <(curl -s https://codecov.io/bash) 15 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Foolin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pagser 2 | 3 | [![go-doc-img]][go-doc] [![travis-img]][travis] [![go-report-card-img]][go-report-card] [![Coverage Status][cov-img]][cov] 4 | 5 | **Pagser** inspired by **pag**e par**ser**。 6 | 7 | **Pagser** is a simple, extensible, configurable parse and deserialize html page to struct based on [goquery](https://github.com/PuerkitoBio/goquery) and struct tags for golang crawler. 8 | 9 | ## Contents 10 | 11 | - [Install](#install) 12 | - [Features](#features) 13 | - [Docs](#docs) 14 | - [Usage](#usage) 15 | - [Configuration](#configuration) 16 | - [Struct Tag Grammar](#struct-tag-grammar) 17 | - [Functions](#functions) 18 | - [Builtin functions](#builtin-functions) 19 | - [Extension functions](#extension-functions) 20 | - [Custom function](#custom-function) 21 | - [Function interface](#function-interface) 22 | - [Call Syntax](#call-syntax) 23 | - [Priority Order](#priority-order) 24 | - [More Examples](#more-examples) 25 | - [Examples](#examples) 26 | - [Dependencies](#dependencies) 27 | 28 | 29 | ## Install 30 | 31 | ```bash 32 | go get -u github.com/foolin/pagser 33 | ``` 34 | 35 | Or get the specified version: 36 | ```bash 37 | go get github.com/foolin/pagser@{version} 38 | ``` 39 | The {version} release list: 40 | 41 | 42 | ## Features 43 | 44 | * **Simple** - Use golang struct tag syntax. 45 | * **Easy** - Easy use for your spider/crawler/colly application. 46 | * **Extensible** - Support for extension functions. 47 | * **Struct tag grammar** - Grammar is simple, like \`pagser:"a->attr(href)"\`. 48 | * **Nested Structure** - Support Nested Structure for node. 49 | * **Configurable** - Support configuration. 50 | * **Implicit type conversion** - Automatic implicit type conversion, Output result string convert to int, int64, float64... 
51 | * **GoQuery/Colly** - Support all [goquery](https://github.com/PuerkitoBio/goquery) project, such as [go-colly](https://github.com/gocolly/colly). 52 | 53 | ## Docs 54 | 55 | See [Pagser](https://pkg.go.dev/github.com/foolin/pagser) 56 | 57 | 58 | ## Usage 59 | 60 | ```golang 61 | 62 | package main 63 | 64 | import ( 65 | "encoding/json" 66 | "github.com/foolin/pagser" 67 | "log" 68 | ) 69 | 70 | const rawPageHtml = ` 71 | 72 | 73 | 74 | 75 | Pagser Title 76 | 77 | 78 | 79 | 80 |

H1 Pagser Example

81 | 91 | 92 | 93 | ` 94 | 95 | type PageData struct { 96 | Title string `pagser:"title"` 97 | Keywords []string `pagser:"meta[name='keywords']->attrSplit(content)"` 98 | H1 string `pagser:"h1"` 99 | Navs []struct { 100 | ID int `pagser:"->attrEmpty(id, -1)"` 101 | Name string `pagser:"a->text()"` 102 | Url string `pagser:"a->attr(href)"` 103 | } `pagser:".navlink li"` 104 | } 105 | 106 | func main() { 107 | //New default config 108 | p := pagser.New() 109 | 110 | //data parser model 111 | var data PageData 112 | //parse html data 113 | err := p.Parse(&data, rawPageHtml) 114 | //check error 115 | if err != nil { 116 | log.Fatal(err) 117 | } 118 | 119 | //print data 120 | log.Printf("Page data json: \n-------------\n%v\n-------------\n", toJson(data)) 121 | } 122 | 123 | func toJson(v interface{}) string { 124 | data, _ := json.MarshalIndent(v, "", "\t") 125 | return string(data) 126 | } 127 | 128 | ``` 129 | 130 | Run output: 131 | ``` 132 | 133 | Page data json: 134 | ------------- 135 | { 136 | "Title": "Pagser Title", 137 | "Keywords": [ 138 | "golang", 139 | "pagser", 140 | "goquery", 141 | "html", 142 | "page", 143 | "parser", 144 | "colly" 145 | ], 146 | "H1": "H1 Pagser Example", 147 | "Navs": [ 148 | { 149 | "ID": -1, 150 | "Name": "Index", 151 | "Url": "/" 152 | }, 153 | { 154 | "ID": 2, 155 | "Name": "Web page", 156 | "Url": "/list/web" 157 | }, 158 | { 159 | "ID": 3, 160 | "Name": "Pc Page", 161 | "Url": "/list/pc" 162 | }, 163 | { 164 | "ID": 4, 165 | "Name": "Mobile Page", 166 | "Url": "/list/mobile" 167 | } 168 | ] 169 | } 170 | ------------- 171 | 172 | ``` 173 | 174 | ## Configuration 175 | 176 | ```golang 177 | 178 | type Config struct { 179 | TagName string //struct tag name, default is `pagser` 180 | FuncSymbol string //Function symbol, default is `->` 181 | Debug bool //Debug mode, debug will print some log, default is `false` 182 | } 183 | 184 | ``` 185 | 186 | 187 | 188 | ## Struct Tag Grammar 189 | 190 | ``` 191 | [goquery 
selector]->[function] 192 | ``` 193 | Example: 194 | ```golang 195 | 196 | type ExamData struct { 197 | Href string `pagser:".navLink li a->attr(href)"` 198 | } 199 | ``` 200 | 201 | > 1.Struct tag name: `pagser` 202 | > 2.[goquery](https://github.com/PuerkitoBio/goquery) selector: `.navLink li a` 203 | > 3.Function symbol: `->` 204 | > 4.Function name: `attr` 205 | > 5.Function arguments: `href` 206 | 207 | ![grammar](grammar.png) 208 | 209 | ## Functions 210 | 211 | ### Builtin functions 212 | 213 | > - text() get element text, return string; this is the default function when no function is defined in the struct tag. 214 | 215 | > - eachText() get each element text, return []string. 216 | 217 | > - html() get element inner html, return string. 218 | 219 | > - eachHtml() get each element inner html, return []string. 220 | 221 | > - outerHtml() get element outer html, return string. 222 | 223 | > - eachOutHtml() get each element outer html, return []string. 224 | 225 | > - attr(name) get element attribute value, return string. 226 | 227 | > - eachAttr(name) get each element attribute value, return []string. 228 | 229 | > - attrSplit(name, sep) get attribute value and split it by separator into a string array, return []string. 230 | 231 | > - attr('value') get the value of the attribute named `value`, return string, eg: `<input value="xxx" />` will return "xxx". 232 | 233 | > - textSplit(sep) get element text and split it by separator into a string array, return []string. 234 | 235 | > - eachTextJoin(sep) get each element text and join to string, return string. 236 | 237 | > - eq(index) reduces the set of matched elements to the one at the specified index, return Selection for nested struct. 238 | 239 | > - ... 240 | 241 | For more builtin functions see the docs: https://pkg.go.dev/github.com/foolin/pagser 242 | 243 | ### Extension functions 244 | 245 | >- Markdown() //convert html to markdown format. 
246 | 247 | >- UgcHtml() //sanitize html 248 | 249 | Extensions function need register, like: 250 | ```golang 251 | import "github.com/foolin/pagser/extensions/markdown" 252 | 253 | p := pagser.New() 254 | 255 | //Register Markdown 256 | markdown.Register(p) 257 | 258 | ``` 259 | 260 | ### Custom function 261 | 262 | #### Function interface 263 | ```golang 264 | 265 | type CallFunc func(node *goquery.Selection, args ...string) (out interface{}, err error) 266 | 267 | ``` 268 | 269 | #### Define global function 270 | ```golang 271 | 272 | //global function need call pagser.RegisterFunc("MyGlob", MyGlobalFunc) before use it. 273 | // this global method must call pagser.RegisterFunc("MyGlob", MyGlobalFunc). 274 | func MyGlobalFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { 275 | return "Global-" + node.Text(), nil 276 | } 277 | 278 | type PageData struct{ 279 | MyGlobalValue string `pagser:"->MyGlob()"` 280 | } 281 | 282 | func main(){ 283 | 284 | p := pagser.New() 285 | 286 | //Register global function `MyGlob` 287 | p.RegisterFunc("MyGlob", MyGlobalFunc) 288 | 289 | //Todo 290 | 291 | //data parser model 292 | var data PageData 293 | //parse html data 294 | err := p.Parse(&data, rawPageHtml) 295 | 296 | //... 297 | } 298 | 299 | ``` 300 | 301 | 302 | #### Define struct function 303 | ```golang 304 | 305 | type PageData struct{ 306 | MyFuncValue int `pagser:"->MyFunc()"` 307 | } 308 | 309 | // this method will auto call, not need register. 310 | func (d PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { 311 | return "Struct-" + node.Text(), nil 312 | } 313 | 314 | 315 | func main(){ 316 | 317 | p := pagser.New() 318 | 319 | //Todo 320 | 321 | //data parser model 322 | var data PageData 323 | //parse html data 324 | err := p.Parse(&data, rawPageHtml) 325 | 326 | //... 
327 | } 328 | 329 | ``` 330 | 331 | #### Call Syntax 332 | 333 | > **Note**: all function arguments are string, single quotes are optional. 334 | 335 | 1. Function call with no arguments 336 | > ->fn() 337 | 338 | 2. Function calls with one argument, and single quotes are optional 339 | 340 | > ->fn(one) 341 | > 342 | > ->fn('one') 343 | 344 | 3. Function calls with many arguments 345 | 346 | > ->fn(one, two, three, ...) 347 | > 348 | > ->fn('one', 'two', 'three', ...) 349 | 350 | 351 | 5. Function calls with single quotes and escape character 352 | 353 | > ->fn('it\\'s ok', 'two,xxx', 'three', ...) 354 | 355 | 356 | ### Priority Order 357 | 358 | Lookup function priority order: 359 | 360 | > struct method -> parent method -> ... -> global 361 | 362 | 363 | ### More Examples 364 | See advance example: 365 | 366 | ## Implicit type conversion 367 | Automatic implicit type conversion, Output result string convert to int, int64, float64... 368 | 369 | **Support type:** 370 | 371 | - bool 372 | - float32 373 | - float64 374 | - int 375 | - int32 376 | - int64 377 | - string 378 | - []bool 379 | - []float32 380 | - []float64 381 | - []int 382 | - []int32 383 | - []int64 384 | - []string 385 | 386 | 387 | 388 | ## Examples 389 | 390 | ### Crawl page example 391 | 392 | ```golang 393 | 394 | package main 395 | 396 | import ( 397 | "encoding/json" 398 | "github.com/foolin/pagser" 399 | "log" 400 | "net/http" 401 | ) 402 | 403 | type PageData struct { 404 | Title string `pagser:"title"` 405 | RepoList []struct { 406 | Names []string `pagser:"h1->textSplit('/', true)"` 407 | Description string `pagser:"h1 + p"` 408 | Stars string `pagser:"a.muted-link->eqAndText(0)"` 409 | Repo string `pagser:"h1 a->attrConcat('href', 'https://github.com', $value, '?from=pagser')"` 410 | } `pagser:"article.Box-row"` 411 | } 412 | 413 | func main() { 414 | resp, err := http.Get("https://github.com/trending") 415 | if err != nil { 416 | log.Fatal(err) 417 | } 418 | defer resp.Body.Close() 419 | 
420 | //New default config 421 | p := pagser.New() 422 | 423 | //data parser model 424 | var data PageData 425 | //parse html data 426 | err = p.ParseReader(&data, resp.Body) 427 | //check error 428 | if err != nil { 429 | log.Fatal(err) 430 | } 431 | 432 | //print data 433 | log.Printf("Page data json: \n-------------\n%v\n-------------\n", toJson(data)) 434 | } 435 | 436 | func toJson(v interface{}) string { 437 | data, _ := json.MarshalIndent(v, "", "\t") 438 | return string(data) 439 | } 440 | 441 | 442 | ``` 443 | 444 | Run output: 445 | ``` 446 | 447 | 2020/04/25 12:26:04 Page data json: 448 | ------------- 449 | { 450 | "Title": "Trending repositories on GitHub today · GitHub", 451 | "RepoList": [ 452 | { 453 | "Names": [ 454 | "pcottle", 455 | "learnGitBranching" 456 | ], 457 | "Description": "An interactive git visualization to challenge and educate!", 458 | "Stars": "16,010", 459 | "Repo": "https://github.com/pcottle/learnGitBranching?from=pagser" 460 | }, 461 | { 462 | "Names": [ 463 | "jackfrued", 464 | "Python-100-Days" 465 | ], 466 | "Description": "Python - 100天从新手到大师", 467 | "Stars": "83,484", 468 | "Repo": "https://github.com/jackfrued/Python-100-Days?from=pagser" 469 | }, 470 | { 471 | "Names": [ 472 | "brave", 473 | "brave-browser" 474 | ], 475 | "Description": "Next generation Brave browser for macOS, Windows, Linux, Android.", 476 | "Stars": "5,963", 477 | "Repo": "https://github.com/brave/brave-browser?from=pagser" 478 | }, 479 | { 480 | "Names": [ 481 | "MicrosoftDocs", 482 | "azure-docs" 483 | ], 484 | "Description": "Open source documentation of Microsoft Azure", 485 | "Stars": "3,798", 486 | "Repo": "https://github.com/MicrosoftDocs/azure-docs?from=pagser" 487 | }, 488 | { 489 | "Names": [ 490 | "ahmetb", 491 | "kubectx" 492 | ], 493 | "Description": "Faster way to switch between clusters and namespaces in kubectl", 494 | "Stars": "6,979", 495 | "Repo": "https://github.com/ahmetb/kubectx?from=pagser" 496 | }, 497 | 498 | //... 
499 | 500 | { 501 | "Names": [ 502 | "serverless", 503 | "serverless" 504 | ], 505 | "Description": "Serverless Framework – Build web, mobile and IoT applications with serverless architectures using AWS Lambda, Azure Functions, Google CloudFunctions \u0026 more! –", 506 | "Stars": "35,502", 507 | "Repo": "https://github.com/serverless/serverless?from=pagser" 508 | }, 509 | { 510 | "Names": [ 511 | "vuejs", 512 | "vite" 513 | ], 514 | "Description": "Experimental no-bundle dev server for Vue SFCs", 515 | "Stars": "1,573", 516 | "Repo": "https://github.com/vuejs/vite?from=pagser" 517 | } 518 | ] 519 | } 520 | ------------- 521 | ``` 522 | 523 | ### Colly Example 524 | 525 | Work with colly: 526 | ```golang 527 | 528 | p := pagser.New() 529 | 530 | 531 | // On every a element which has href attribute call callback 532 | collector.OnHTML("body", func(e *colly.HTMLElement) { 533 | //data parser model 534 | var data PageData 535 | //parse html data 536 | err := p.ParseSelection(&data, e.Dom) 537 | 538 | }) 539 | 540 | ``` 541 | 542 | - [See Examples](https://github.com/foolin/pagser/tree/master/_examples) 543 | - [See Tests](https://github.com/foolin/pagser/blob/master/parse_test.go) 544 | 545 | ## Dependencies 546 | 547 | - github.com/PuerkitoBio/goquery 548 | 549 | - github.com/spf13/cast 550 | 551 | **Extensions:** 552 | 553 | - github.com/mattn/godown 554 | 555 | - github.com/microcosm-cc/bluemonday 556 | 557 | 558 | 559 | [go-doc]: https://pkg.go.dev/github.com/foolin/pagser 560 | [go-doc-img]: https://godoc.org/github.com/foolin/pagser?status.svg 561 | [travis]: https://travis-ci.org/foolin/pagser 562 | [travis-img]: https://travis-ci.org/foolin/pagser.svg?branch=master 563 | [go-report-card]: https://goreportcard.com/report/github.com/foolin/pagser 564 | [go-report-card-img]: https://goreportcard.com/badge/github.com/foolin/pagser 565 | [cov-img]: https://codecov.io/gh/foolin/pagser/branch/master/graph/badge.svg 566 | [cov]: https://codecov.io/gh/foolin/pagser 567 
| -------------------------------------------------------------------------------- /_examples/advance/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/PuerkitoBio/goquery" 6 | "github.com/foolin/pagser" 7 | "github.com/foolin/pagser/extensions/markdown" 8 | "log" 9 | ) 10 | 11 | const rawPageHtml = ` 12 | 13 | 14 | 15 | 16 | Pagser Title 17 | 18 | 19 | 20 |

H1 Pagser Example

21 | 31 | 32 | 33 | ` 34 | 35 | type PageData struct { 36 | Title string `pagser:"title"` 37 | H1 string `pagser:"h1"` 38 | Navs []struct { 39 | ID int `pagser:"->attrEmpty(id, -1)"` 40 | Name string `pagser:"a"` 41 | Url string `pagser:"a->attr(href)"` 42 | AbsUrl string `pagser:"a->absHref('https://github.com/foolin/pagser')"` 43 | } `pagser:".navlink li"` 44 | NavFirst struct { 45 | ID int `pagser:"->attrEmpty(id, -1)"` 46 | Name string `pagser:"a"` 47 | Url string `pagser:"a->attr(href)"` 48 | } `pagser:".navlink li->eq(0)"` 49 | NavLast *struct { 50 | ID int `pagser:"->attrEmpty(id, -1)"` 51 | Name string `pagser:"a"` 52 | Url string `pagser:"a->attr(href)"` 53 | } `pagser:".navlink li->eq(-1)"` 54 | NavIds []int `pagser:".navlink li->eachAttrEmpty(id, -1)"` 55 | NavTexts []string `pagser:".navlink li"` 56 | NavEachTexts []string `pagser:".navlink li->eachText()"` 57 | MyFuncValue string `pagser:"h1->MyFunc()"` 58 | MyGlobalValue string `pagser:"h1->MyGlob()"` 59 | Markdown string `pagser:"->Markdown()"` 60 | } 61 | 62 | // this method will auto call, not need register. 63 | func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { 64 | return "Struct-" + node.Text(), nil 65 | } 66 | 67 | // this global method must call pagser.RegisterFunc("MyGlobFunc", MyGlobalFunc). 
68 | func MyGlobalFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { 69 | return "Global-" + node.Text(), nil 70 | } 71 | 72 | func main() { 73 | //New with config 74 | cfg := pagser.Config{ 75 | TagName: "pagser", 76 | FuncSymbol: "->", 77 | CastError: true, 78 | Debug: true, 79 | } 80 | p, err := pagser.NewWithConfig(cfg) 81 | if err != nil { 82 | log.Fatal(err) 83 | } 84 | 85 | //Register Markdown 86 | markdown.Register(p) 87 | 88 | //Register global function 89 | p.RegisterFunc("MyGlob", MyGlobalFunc) 90 | 91 | var data PageData 92 | err = p.Parse(&data, rawPageHtml) 93 | if err != nil { 94 | panic(err) 95 | } 96 | 97 | log.Printf("Page data json: \n-------------\n%v\n-------------\n", toJson(data)) 98 | } 99 | 100 | func toJson(v interface{}) string { 101 | data, _ := json.MarshalIndent(v, "", "\t") 102 | return string(data) 103 | } 104 | -------------------------------------------------------------------------------- /_examples/basic/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/foolin/pagser" 6 | "log" 7 | ) 8 | 9 | const rawPageHtml = ` 10 | 11 | 12 | 13 | 14 | Pagser Title 15 | 16 | 17 | 18 | 19 |

H1 Pagser Example

20 | 30 | 31 | 32 | ` 33 | 34 | type PageData struct { 35 | Title string `pagser:"title"` 36 | Keywords []string `pagser:"meta[name='keywords']->attrSplit(content)"` 37 | H1 string `pagser:"h1"` 38 | Navs []struct { 39 | ID int `pagser:"->attrEmpty(id, -1)"` 40 | Name string `pagser:"a->text()"` 41 | Url string `pagser:"a->attr(href)"` 42 | } `pagser:".navlink li"` 43 | } 44 | 45 | func main() { 46 | //New default config 47 | p := pagser.New() 48 | 49 | //data parser model 50 | var data PageData 51 | //parse html data 52 | err := p.Parse(&data, rawPageHtml) 53 | //check error 54 | if err != nil { 55 | log.Fatal(err) 56 | } 57 | 58 | //print data 59 | log.Printf("Page data json: \n-------------\n%v\n-------------\n", toJson(data)) 60 | } 61 | 62 | func toJson(v interface{}) string { 63 | data, _ := json.MarshalIndent(v, "", "\t") 64 | return string(data) 65 | } 66 | -------------------------------------------------------------------------------- /_examples/config/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/foolin/pagser" 6 | "log" 7 | ) 8 | 9 | type ExampleData struct { 10 | Title string `query:"title"` 11 | Keywords []string `query:"meta[name='keywords']@attrSplit(content)"` 12 | Navs []struct { 13 | ID int `query:"@attrEmpty(id, -1)"` 14 | Name string `query:"a@text()"` 15 | Url string `query:"a@attr(href)"` 16 | } `query:".navlink li"` 17 | } 18 | 19 | const rawExampleHtml = ` 20 | 21 | 22 | 23 | 24 | Pagser Example 25 | 26 | 27 | 28 | 29 | 39 | 40 | 41 | ` 42 | 43 | func main() { 44 | cfg := pagser.Config{ 45 | TagName: "query", 46 | FuncSymbol: "@", 47 | CastError: true, 48 | Debug: true, 49 | } 50 | p, err := pagser.NewWithConfig(cfg) 51 | if err != nil { 52 | log.Fatal(err) 53 | } 54 | 55 | var data ExampleData 56 | err = p.Parse(&data, rawExampleHtml) 57 | if err != nil { 58 | log.Fatal(err) 59 | } 60 | log.Printf("json: %v\n", toJson(data)) 61 
| } 62 | 63 | func toJson(v interface{}) string { 64 | data, _ := json.MarshalIndent(v, "", "\t") 65 | return string(data) 66 | } 67 | -------------------------------------------------------------------------------- /_examples/http/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/foolin/pagser" 6 | "log" 7 | "net/http" 8 | ) 9 | 10 | type PageData struct { 11 | Title string `pagser:"title"` 12 | RepoList []struct { 13 | Names []string `pagser:"h1->textSplit('/', true)"` 14 | Description string `pagser:"h1 + p"` 15 | Stars string `pagser:"a.muted-link->eqAndText(0)"` 16 | Repo string `pagser:"h1 a->attrConcat('href', 'https://github.com', $value, '?from=pagser')"` 17 | } `pagser:"article.Box-row"` 18 | } 19 | 20 | func main() { 21 | resp, err := http.Get("https://github.com/trending") 22 | if err != nil { 23 | log.Fatal(err) 24 | } 25 | defer resp.Body.Close() 26 | 27 | //New default config 28 | p := pagser.New() 29 | 30 | //data parser model 31 | var data PageData 32 | //parse html data 33 | err = p.ParseReader(&data, resp.Body) 34 | //check error 35 | if err != nil { 36 | log.Fatal(err) 37 | } 38 | 39 | //print data 40 | log.Printf("Page data json: \n-------------\n%v\n-------------\n", toJson(data)) 41 | } 42 | 43 | func toJson(v interface{}) string { 44 | data, _ := json.MarshalIndent(v, "", "\t") 45 | return string(data) 46 | } 47 | -------------------------------------------------------------------------------- /_examples/pages/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Pagser Title 6 | 7 | 8 | 9 |

Pagser H1 Example

10 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /builtin.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | ) 6 | 7 | // CallFunc write function interface 8 | // 9 | // # Define Global Function 10 | // 11 | // func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { 12 | // //Todo 13 | // return "Hello", nil 14 | // } 15 | // 16 | // //Register function 17 | // pagser.RegisterFunc("MyFunc", MyFunc) 18 | // 19 | // //Use function 20 | // type PageData struct{ 21 | // Text string `pagser:"h1->MyFunc()"` 22 | // } 23 | // 24 | // 25 | // # Define Struct Function 26 | // //Use function 27 | // type PageData struct{ 28 | // Text string `pagser:"h1->MyFunc()"` 29 | // } 30 | // 31 | // func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { 32 | // //Todo 33 | // return "Hello", nil 34 | // } 35 | // 36 | // # Lookup function priority order 37 | // 38 | // struct method -> parent method -> ... -> global 39 | // 40 | // # Implicit convert type 41 | // 42 | // Automatic type conversion, Output result string convert to int, int64, float64... 
43 | // 44 | // CallFunc is a define function interface 45 | type CallFunc func(node *goquery.Selection, args ...string) (out interface{}, err error) 46 | 47 | //BuiltinFunctions instance 48 | var builtinFun BuiltinFunctions 49 | 50 | //BuiltinSelections instance 51 | var builtinSel BuiltinSelections 52 | 53 | //builtin functions 54 | var builtinFuncs = map[string]CallFunc{ 55 | "absHref": builtinFun.AbsHref, 56 | "attr": builtinFun.Attr, 57 | "attrConcat": builtinFun.AttrConcat, 58 | "attrEmpty": builtinFun.AttrEmpty, 59 | "attrSplit": builtinFun.AttrSplit, 60 | "eachAttr": builtinFun.EachAttr, 61 | "eachAttrEmpty": builtinFun.EachAttrEmpty, 62 | "eachHtml": builtinFun.EachHtml, 63 | "eachOutHtml": builtinFun.EachOutHtml, 64 | "eachText": builtinFun.EachText, 65 | "eachTextEmpty": builtinFun.EachTextEmpty, 66 | "eachTextJoin": builtinFun.EachTextJoin, 67 | "eqAndAttr": builtinFun.EqAndAttr, 68 | "eqAndHtml": builtinFun.EqAndHtml, 69 | "eqAndOutHtml": builtinFun.EqAndOutHtml, 70 | "eqAndText": builtinFun.EqAndText, 71 | "html": builtinFun.Html, 72 | "outerHtml": builtinFun.OutHtml, 73 | "size": builtinFun.Size, 74 | "text": builtinFun.Text, 75 | "textConcat": builtinFun.TextConcat, 76 | "textEmpty": builtinFun.TextEmpty, 77 | "textSplit": builtinFun.TextSplit, 78 | // selector 79 | "child": builtinSel.Child, 80 | "eq": builtinSel.Eq, 81 | "first": builtinSel.First, 82 | "last": builtinSel.Last, 83 | "next": builtinSel.Next, 84 | "parent": builtinSel.Parent, 85 | "parents": builtinSel.Parents, 86 | "parentsUntil": builtinSel.ParentsUntil, 87 | "prev": builtinSel.Prev, 88 | "siblings": builtinSel.Siblings, 89 | } 90 | 91 | // RegisterFunc register function for parse result 92 | // pagser.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) { 93 | // //Todo 94 | // return "Hello", nil 95 | // }) 96 | func (p *Pagser) RegisterFunc(name string, fn CallFunc) { 97 | p.mapFuncs.Store(name, fn) 98 | } 99 | 
-------------------------------------------------------------------------------- /builtin_functions.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "fmt" 5 | "github.com/PuerkitoBio/goquery" 6 | "github.com/spf13/cast" 7 | "net/url" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | // BuiltinFunctions builtin functions are registered with a lowercase initial, eg: Text -> text() 13 | type BuiltinFunctions struct { 14 | } 15 | 16 | // AbsHref absHref(baseUrl) get element attribute name `href`, and convert to absolute url, return *URL. 17 | // `baseUrl` is the base url like `https://example.com/`. 18 | // //Pagser 19 | // struct { 20 | // Example string `pagser:".selector->absHref('https://github.com/')"` 21 | // } 22 | func (builtin BuiltinFunctions) AbsHref(selection *goquery.Selection, args ...string) (out interface{}, err error) { 23 | if len(args) < 1 { 24 | return "", fmt.Errorf("args must has baseUrl") 25 | } 26 | baseUrl, err := url.Parse(args[0]) 27 | if err != nil { 28 | return "", fmt.Errorf("invalid base url: %v error: %v", baseUrl, err) 29 | } 30 | hrefUrl, err := url.Parse(selection.AttrOr("href", "")) 31 | if err != nil { 32 | return "", err 33 | } 34 | return baseUrl.ResolveReference(hrefUrl), nil 35 | } 36 | 37 | // Attr attr(name, defaultValue='') get element attribute value, return string. 38 | // outerHtml() get element outer html, return string. 
39 | // //Pagser 40 | // struct { 41 | // Example string `pagser:".selector->attr(href)"` 42 | // } 43 | func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error) { 44 | if len(args) < 1 { 45 | return "", fmt.Errorf("attr(name) must has name") 46 | } 47 | name := args[0] 48 | defaultValue := "" 49 | if len(args) > 1 { 50 | defaultValue = args[1] 51 | } 52 | val := node.AttrOr(name, defaultValue) 53 | return val, nil 54 | } 55 | 56 | // AttrConcat attrConcat(name, text1, $value, [ text2, ... text_n ]) 57 | // `name` get element attribute value by name, 58 | // `text1, text2, ... text_n` The strings that you wish to join together, 59 | // `$value` is placeholder for get element text 60 | // return string. 61 | // struct { 62 | // Example string `pagser:".selector->attrConcat('Result:', '<', $value, '>')"` 63 | // } 64 | func (builtin BuiltinFunctions) AttrConcat(node *goquery.Selection, args ...string) (out interface{}, err error) { 65 | if len(args) < 3 { 66 | return "", fmt.Errorf("attrConcat(name, text1, $value, [ text2, ... text_n ]) must be more than two arguments") 67 | } 68 | name := args[0] 69 | value := strings.TrimSpace(node.AttrOr(name, "")) 70 | builder := strings.Builder{} 71 | for i, v := range args { 72 | if i == 0 { 73 | continue 74 | } 75 | if v == "$value" { 76 | builder.WriteString(value) 77 | } else { 78 | builder.WriteString(v) 79 | } 80 | } 81 | return builder.String(), nil 82 | } 83 | 84 | // AttrEmpty attrEmpty(name, defaultValue) get element attribute value, return string. 
85 | // //Pagser 86 | // struct { 87 | // Example string `pagser:".selector->AttrEmpty(href, '#')"` 88 | // } 89 | func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error) { 90 | if len(args) < 2 { 91 | return "", fmt.Errorf("attr(name, defaultValue) must has name and default value") 92 | } 93 | name := args[0] 94 | defaultValue := args[1] 95 | value := strings.TrimSpace(node.AttrOr(name, defaultValue)) 96 | if value == "" { 97 | value = defaultValue 98 | } 99 | return value, nil 100 | } 101 | 102 | // AttrSplit attrSplit(name, sep=',', trim='true') get attribute value and split by separator to array string, return []string. 103 | // struct { 104 | // Examples []string `pagser:".selector->attrSplit('keywords', ',')"` 105 | // } 106 | func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error) { 107 | if len(args) < 1 { 108 | return "", fmt.Errorf("attr(name) must has `name`") 109 | } 110 | name := args[0] 111 | sep := "," 112 | trim := true 113 | if len(args) > 1 { 114 | sep = args[1] 115 | } 116 | if len(args) > 2 { 117 | var err error 118 | trim, err = cast.ToBoolE(args[2]) 119 | if err != nil { 120 | return nil, fmt.Errorf("`trim` must bool type value: true/false") 121 | } 122 | } 123 | 124 | list := strings.Split(node.AttrOr(name, ""), sep) 125 | if trim { 126 | for i, v := range list { 127 | list[i] = strings.TrimSpace(v) 128 | } 129 | } 130 | return list, nil 131 | } 132 | 133 | // EachAttr eachAttr(name) get each element attribute value, return []string. 
134 | // //Pagser 135 | // struct { 136 | // Examples []string `pagser:".selector->eachAttr(href)"` 137 | // } 138 | func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error) { 139 | if len(args) < 1 { 140 | return "", fmt.Errorf("attr(name) must has name") 141 | } 142 | name := args[0] 143 | list := make([]string, 0) 144 | node.Each(func(i int, selection *goquery.Selection) { 145 | list = append(list, strings.TrimSpace(selection.AttrOr(name, ""))) 146 | }) 147 | return list, nil 148 | } 149 | 150 | // EachAttrEmpty eachAttrEmpty(name, defaultValue) get each element attribute value, return []string. 151 | // //Pagser 152 | // struct { 153 | // Examples []string `pagser:".selector->eachAttrEmpty(href, '#')"` 154 | // } 155 | func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error) { 156 | if len(args) < 2 { 157 | return "", fmt.Errorf("eachAttrEmpty(name) must has name") 158 | } 159 | name := args[0] 160 | defaultValue := args[1] 161 | list := make([]string, 0) 162 | node.Each(func(i int, selection *goquery.Selection) { 163 | value := strings.TrimSpace(selection.AttrOr(name, "")) 164 | if value == "" { 165 | value = defaultValue 166 | } 167 | list = append(list, value) 168 | }) 169 | return list, nil 170 | } 171 | 172 | // EachHtml eachHtml() get each element inner html, return []string. 173 | // eachTextEmpty(defaultValue) get each element text, return []string. 
174 | // struct { 175 | // Examples []string `pagser:".selector->eachHtml()"` 176 | // } 177 | func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error) { 178 | list := make([]string, 0) 179 | node.EachWithBreak(func(i int, selection *goquery.Selection) bool { 180 | var html string 181 | html, err = node.Html() 182 | if err != nil { 183 | return false 184 | } 185 | list = append(list, html) 186 | return true 187 | }) 188 | if err != nil { 189 | return nil, err 190 | } 191 | return list, nil 192 | } 193 | 194 | // EachOutHtml eachOutHtml() get each element outer html, return []string. 195 | // struct { 196 | // Examples []string `pagser:".selector->eachOutHtml()"` 197 | // } 198 | func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error) { 199 | list := make([]string, 0) 200 | node.EachWithBreak(func(i int, selection *goquery.Selection) bool { 201 | var html string 202 | html, err = goquery.OuterHtml(node) 203 | if err != nil { 204 | return false 205 | } 206 | list = append(list, html) 207 | return true 208 | }) 209 | if err != nil { 210 | return nil, err 211 | } 212 | return list, nil 213 | } 214 | 215 | // EachText eachText() get each element text, return []string. 216 | // struct { 217 | // Examples []string `pagser:".selector->eachText('')"` 218 | // } 219 | func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error) { 220 | list := make([]string, 0) 221 | node.Each(func(i int, selection *goquery.Selection) { 222 | list = append(list, strings.TrimSpace(selection.Text())) 223 | }) 224 | return list, nil 225 | } 226 | 227 | // EachTextEmpty eachTextEmpty(defaultValue) get each element text, return []string. 
228 | // struct { 229 | // Examples []string `pagser:".selector->eachTextEmpty('')"` 230 | // } 231 | func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error) { 232 | if len(args) < 1 { 233 | return "", fmt.Errorf("eachTextEmpty(defaultValue) must has defaultValue") 234 | } 235 | defaultValue := args[0] 236 | list := make([]string, 0) 237 | node.Each(func(i int, selection *goquery.Selection) { 238 | value := strings.TrimSpace(selection.Text()) 239 | if value == "" { 240 | value = defaultValue 241 | } 242 | list = append(list, value) 243 | }) 244 | return list, nil 245 | } 246 | 247 | // EachTextJoin eachTextJoin(sep) get each element text and join to string, return string. 248 | // struct { 249 | // Example string `pagser:".selector->eachTextJoin(',')"` 250 | // } 251 | func (builtin BuiltinFunctions) EachTextJoin(node *goquery.Selection, args ...string) (out interface{}, err error) { 252 | sep := "," 253 | if len(args) > 0 { 254 | sep = args[0] 255 | } 256 | list := make([]string, 0) 257 | node.Each(func(i int, selection *goquery.Selection) { 258 | list = append(list, strings.TrimSpace(selection.Text())) 259 | }) 260 | return strings.Join(list, sep), nil 261 | } 262 | 263 | // EqAndAttr eqAndAttr(index, name) reduces the set of matched elements to the one at the specified index, and attr() return string. 
264 | // struct { 265 | // Example string `pagser:".selector->eqAndAttr(0, href)"` 266 | // } 267 | func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error) { 268 | if len(args) < 2 { 269 | return "", fmt.Errorf("eqAndAttr(index) must has index and attr name") 270 | } 271 | indexValue := strings.TrimSpace(args[0]) 272 | idx, err := strconv.Atoi(indexValue) 273 | if err != nil { 274 | return "", fmt.Errorf("index=`" + indexValue + "` is not number: " + err.Error()) 275 | } 276 | name := strings.TrimSpace(args[1]) 277 | return node.Eq(idx).AttrOr(name, ""), nil 278 | } 279 | 280 | // EqAndHtml eqAndHtml(index) reduces the set of matched elements to the one at the specified index, and html() return string. 281 | // struct { 282 | // Example string `pagser:".selector->eqAndHtml(0)"` 283 | // } 284 | func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error) { 285 | if len(args) < 1 { 286 | return "", fmt.Errorf("eqAndHtml(index) must has index") 287 | } 288 | indexValue := strings.TrimSpace(args[0]) 289 | idx, err := strconv.Atoi(indexValue) 290 | if err != nil { 291 | return "", fmt.Errorf("index=`" + indexValue + "` is not number: " + err.Error()) 292 | } 293 | return node.Eq(idx).Html() 294 | } 295 | 296 | // EqAndOutHtml eqAndOutHtml(index) reduces the set of matched elements to the one at the specified index, and outHtml() return string. 
297 | // struct { 298 | // Example string `pagser:".selector->eqAndOutHtml(0)"` 299 | // } 300 | func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error) { 301 | if len(args) < 1 { 302 | return "", fmt.Errorf("eqAndOutHtml(index) must has index") 303 | } 304 | indexValue := strings.TrimSpace(args[0]) 305 | idx, err := strconv.Atoi(indexValue) 306 | if err != nil { 307 | return "", fmt.Errorf("index=`" + indexValue + "` is not number: " + err.Error()) 308 | } 309 | return goquery.OuterHtml(node.Eq(idx)) 310 | } 311 | 312 | // EqAndText eqAndText(index) reduces the set of matched elements to the one at the specified index, return string. 313 | // struct { 314 | // Example string `pagser:".selector->eqAndText(0)"` 315 | // } 316 | func (builtin BuiltinFunctions) EqAndText(node *goquery.Selection, args ...string) (out interface{}, err error) { 317 | if len(args) < 1 { 318 | return "", fmt.Errorf("eqAndText(index) must has index") 319 | } 320 | indexValue := strings.TrimSpace(args[0]) 321 | idx, err := strconv.Atoi(indexValue) 322 | if err != nil { 323 | return "", fmt.Errorf("index=`" + indexValue + "` is not number: " + err.Error()) 324 | } 325 | return strings.TrimSpace(node.Eq(idx).Text()), nil 326 | } 327 | 328 | // Html html() get element inner html, return string. 329 | // struct { 330 | // Example string `pagser:".selector->html()"` 331 | // } 332 | func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error) { 333 | return node.Html() 334 | } 335 | 336 | // OutHtml outerHtml() get element outer html, return string. 
337 | // struct { 338 | // Example string `pagser:".selector->outerHtml()"` 339 | // } 340 | func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error) { 341 | html, err := goquery.OuterHtml(node) 342 | if err != nil { 343 | return "", err 344 | } 345 | return html, nil 346 | } 347 | 348 | // Size size() returns the number of elements in the Selection object, return int. 349 | // struct { 350 | // Size int `pagser:".selector->size()"` 351 | // } 352 | func (builtin BuiltinFunctions) Size(node *goquery.Selection, args ...string) (out interface{}, err error) { 353 | return node.Size(), nil 354 | } 355 | 356 | // Text text() get element text, return string, this is default function, if not define function in struct tag. 357 | // struct { 358 | // Example string `pagser:".selector->text()"` 359 | // } 360 | func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error) { 361 | return strings.TrimSpace(node.Text()), nil 362 | } 363 | 364 | // TextConcat textConcat(text1, $value, [ text2, ... text_n ]) 365 | // The `text1, text2, ... text_n` strings that you wish to join together, 366 | // `$value` is placeholder for get element text, return string. 367 | // struct { 368 | // Example string `pagser:".selector->textConcat('Result:', '<', $value, '>')"` 369 | // } 370 | func (builtin BuiltinFunctions) TextConcat(node *goquery.Selection, args ...string) (out interface{}, err error) { 371 | if len(args) < 2 { 372 | return "", fmt.Errorf("textConcat(text1, $value, [ text2, ... 
text_n ]) must be more than two arguments") 373 | } 374 | value := strings.TrimSpace(node.Text()) 375 | builder := strings.Builder{} 376 | for _, v := range args { 377 | if v == "$value" { 378 | builder.WriteString(value) 379 | } else { 380 | builder.WriteString(v) 381 | } 382 | } 383 | return builder.String(), nil 384 | } 385 | 386 | // TextEmpty textEmpty(defaultValue) get element text, if empty will return defaultValue, return string. 387 | // struct { 388 | // Example string `pagser:".selector->textEmpty('')"` 389 | // } 390 | func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error) { 391 | if len(args) < 1 { 392 | return "", fmt.Errorf("textEmpty(defaultValue) must has defaultValue") 393 | } 394 | defaultValue := args[0] 395 | value := strings.TrimSpace(node.Text()) 396 | if value == "" { 397 | value = defaultValue 398 | } 399 | return value, nil 400 | } 401 | 402 | // TextSplit textSplit(sep=',', trim='true') get element text and split by separator to array string, return []string. 
403 | // struct { 404 | // Examples []string `pagser:".selector->textSplit('|')"` 405 | // } 406 | func (builtin BuiltinFunctions) TextSplit(node *goquery.Selection, args ...string) (out interface{}, err error) { 407 | sep := "," 408 | trim := true 409 | if len(args) > 0 { 410 | sep = args[0] 411 | } 412 | if len(args) > 1 { 413 | var err error 414 | trim, err = cast.ToBoolE(args[1]) 415 | if err != nil { 416 | return nil, fmt.Errorf("`trim` must bool type value: true/false") 417 | } 418 | } 419 | list := strings.Split(node.Text(), sep) 420 | if trim { 421 | for i, v := range list { 422 | list[i] = strings.TrimSpace(v) 423 | } 424 | } 425 | return list, nil 426 | } 427 | -------------------------------------------------------------------------------- /builtin_functions_test.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "fmt" 5 | "github.com/PuerkitoBio/goquery" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | type funcWantError struct { 11 | want bool 12 | fun string 13 | args []string 14 | data string 15 | } 16 | 17 | func (fwe funcWantError) String() string { 18 | return fmt.Sprintf("%v(%v) call `%v`", fwe.fun, strings.Join(fwe.args, ","), fwe.data) 19 | } 20 | 21 | func newTewSelection(content string) *goquery.Selection { 22 | doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content)) 23 | return doc.Selection.Find("body").Children() 24 | } 25 | 26 | func TestBuiltinFunctionsErrors(t *testing.T) { 27 | tests := []funcWantError{ 28 | //not baseUrl 29 | {true, "absHref", []string{}, ``}, 30 | //baseUrl invalid 31 | {true, "absHref", []string{"http://a b.com/"}, `a`}, 32 | //href value not invalid url 33 | {true, "absHref", []string{"http://github.com/"}, `a`}, 34 | //not attr name 35 | {true, "attr", []string{}, `a`}, 36 | //not attr name 37 | {true, "attrConcat", []string{}, `a`}, 38 | //not attr $value 39 | {true, "attrConcat", []string{"href"}, `a`}, 40 | //not concat second 
string 41 | {true, "attrConcat", []string{"href", "$value"}, `a`}, 42 | //not default value 43 | {true, "attrEmpty", []string{"href"}, `a`}, 44 | //not attr name 45 | {true, "attrSplit", []string{}, `a`}, 46 | //trim value '1234' is not bool type 47 | {true, "attrSplit", []string{"href", "|", "1234"}, `a`}, 48 | //not attr name 49 | {true, "eachAttr", []string{}, `a`}, 50 | //not default value 51 | {true, "eachAttrEmpty", []string{"href"}, `a`}, 52 | //html error 53 | //{true, "eachOutHtml", []string{}, `abc`}, 54 | //not default value 55 | {true, "eachTextEmpty", []string{}, `a`}, 56 | //not index value 57 | {true, "eqAndAttr", []string{}, `a`}, 58 | //not name value 59 | {true, "eqAndAttr", []string{"0"}, `a`}, 60 | //index not number 61 | {true, "eqAndAttr", []string{"a", "href"}, `a`}, 62 | //not index value 63 | {true, "eqAndHtml", []string{}, `a`}, 64 | //index not number 65 | {true, "eqAndHtml", []string{"a"}, `a`}, 66 | //not index value 67 | {true, "eqAndOutHtml", []string{}, `a`}, 68 | //not name value 69 | {true, "eqAndOutHtml", []string{"a"}, `a`}, 70 | //not index value 71 | {true, "eqAndText", []string{}, `a`}, 72 | //not name value 73 | {true, "eqAndText", []string{"a"}, `a`}, 74 | //html error 75 | //{true, "outerHtml", []string{"a"}, ``}, 76 | //not args 77 | {true, "textConcat", []string{"$value"}, `a`}, 78 | //not args 79 | {true, "textEmpty", []string{}, `a`}, 80 | //not bool 81 | {true, "textSplit", []string{",", "1.2"}, `a`}, 82 | } 83 | 84 | for _, tt := range tests { 85 | var sel = newTewSelection(tt.data) 86 | _, err := builtinFuncs[tt.fun](sel, tt.args...) 
87 | if tt.want { 88 | if err == nil { 89 | t.Errorf("%v want an error", tt.String()) 90 | } 91 | continue 92 | } 93 | if err != nil { 94 | t.Errorf("%v want no error, but error is %v", tt.String(), err) 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /builtin_selections.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "fmt" 5 | "github.com/PuerkitoBio/goquery" 6 | "strconv" 7 | "strings" 8 | ) 9 | 10 | // BuiltinSelections builtin selection functions are registered with a lowercase initial, eg: Text -> text() 11 | type BuiltinSelections struct { 12 | } 13 | 14 | // Child child(selector='') gets the child elements of each element in the Selection, 15 | // Filtered by the specified selector if selector not empty, 16 | // It returns Selection object containing these elements for nested struct.. 17 | // struct { 18 | // SubStruct struct { 19 | // Example string `pagser:".selector->text()"` 20 | // } `pagser:".selector->child()"` 21 | // } 22 | func (builtin BuiltinSelections) Child(node *goquery.Selection, args ...string) (out interface{}, err error) { 23 | selector := "" 24 | if len(args) > 0 { 25 | selector = strings.TrimSpace(args[0]) 26 | } 27 | if selector != "" { 28 | return node.ChildrenFiltered(selector), nil 29 | } 30 | return node.Children(), nil 31 | } 32 | 33 | // Eq eq(index) reduces the set of matched elements to the one at the specified index. 34 | // If a negative index is given, it counts backwards starting at the end of the set. 35 | // It returns a Selection object for nested struct, and an empty Selection object if the 36 | // index is invalid. 
37 | // struct { 38 | // SubStruct struct { 39 | // Example string `pagser:".selector->text()"` 40 | // } `pagser:".selector->eq(0)"` 41 | // } 42 | func (builtin BuiltinSelections) Eq(node *goquery.Selection, args ...string) (out interface{}, err error) { 43 | if len(args) < 1 { 44 | return "", fmt.Errorf("nodeEq(index) must has `index` value") 45 | } 46 | indexValue := strings.TrimSpace(args[0]) 47 | idx, err := strconv.Atoi(indexValue) 48 | if err != nil { 49 | return "", fmt.Errorf("index=`" + indexValue + "` is not number: " + err.Error()) 50 | } 51 | return node.Eq(idx), nil 52 | } 53 | 54 | // First first() First reduces the set of matched elements to the first in the set. 55 | // It returns a new Selection object, and an empty Selection object if the 56 | // the selection is empty. 57 | // It returns Selection object containing these elements for nested struct. 58 | // struct { 59 | // SubStruct struct { 60 | // Example string `pagser:".selector->text()"` 61 | // } `pagser:".selector->first()"` 62 | // } 63 | func (builtin BuiltinSelections) First(node *goquery.Selection, args ...string) (out interface{}, err error) { 64 | return node.First(), nil 65 | } 66 | 67 | // Last last(selector='') reduces the set of matched elements to the last in the set. 68 | // It returns a new Selection object, and an empty Selection object if 69 | // the selection is empty. 70 | // struct { 71 | // SubStruct struct { 72 | // Example string `pagser:".selector->text()"` 73 | // } `pagser:".selector->last()"` 74 | // } 75 | func (builtin BuiltinSelections) Last(node *goquery.Selection, args ...string) (out interface{}, err error) { 76 | return node.Last(), nil 77 | } 78 | 79 | // Next next(selector='') gets the immediately following sibling of each element in the Selection. 80 | // Filtered by the specified selector if selector not empty, 81 | // It returns Selection object containing these elements for nested struct. 
82 | // struct { 83 | // SubStruct struct { 84 | // Example string `pagser:".selector->text()"` 85 | // } `pagser:".selector->next()"` 86 | // } 87 | func (builtin BuiltinSelections) Next(node *goquery.Selection, args ...string) (out interface{}, err error) { 88 | selector := "" 89 | if len(args) > 0 { 90 | selector = strings.TrimSpace(args[0]) 91 | } 92 | if selector != "" { 93 | return node.NextFiltered(selector), nil 94 | } 95 | return node.Next(), nil 96 | } 97 | 98 | // Parent parent(selector='') gets the parent elements of each element in the Selection. 99 | // Filtered by the specified selector if selector not empty, 100 | // It returns Selection object containing these elements for nested struct. 101 | // struct { 102 | // SubStruct struct { 103 | // Example string `pagser:".selector->text()"` 104 | // } `pagser:".selector->parent()"` 105 | // } 106 | func (builtin BuiltinSelections) Parent(node *goquery.Selection, args ...string) (out interface{}, err error) { 107 | selector := "" 108 | if len(args) > 0 { 109 | selector = strings.TrimSpace(args[0]) 110 | } 111 | if selector != "" { 112 | return node.ParentFiltered(selector), nil 113 | } 114 | return node.Parent(), nil 115 | } 116 | 117 | // Parents parents(selector='') gets the parent elements of each element in the Selection. 118 | // Filtered by the specified selector if selector not empty, 119 | // It returns Selection object containing these elements for nested struct. 
120 | // struct { 121 | // SubStruct struct { 122 | // Example string `pagser:".selector->text()"` 123 | // } `pagser:".selector->parents()"` 124 | // } 125 | func (builtin BuiltinSelections) Parents(node *goquery.Selection, args ...string) (out interface{}, err error) { 126 | selector := "" 127 | if len(args) > 0 { 128 | selector = strings.TrimSpace(args[0]) 129 | } 130 | if selector != "" { 131 | return node.ParentsFiltered(selector), nil 132 | } 133 | return node.Parents(), nil 134 | } 135 | 136 | // ParentsUntil parentsUntil(selector) gets the ancestors of each element in the Selection, up to but 137 | // not including the element matched by the selector. It returns a new Selection 138 | // object containing the matched elements. 139 | // It returns Selection object containing these elements for nested struct. 140 | // struct { 141 | // SubStruct struct { 142 | // Example string `pagser:".selector->text()"` 143 | // } `pagser:".selector->parentsUntil('.wrap')"` 144 | // } 145 | func (builtin BuiltinSelections) ParentsUntil(node *goquery.Selection, args ...string) (out interface{}, err error) { 146 | if len(args) < 1 { 147 | return nil, fmt.Errorf("parentsUntil must has selector") 148 | } 149 | selector := strings.TrimSpace(args[0]) 150 | return node.ParentsUntil(selector), nil 151 | } 152 | 153 | // Prev prev() gets the immediately preceding sibling of each element in the Selection. 154 | // Filtered by the specified selector if selector not empty, 155 | // It returns Selection object containing these elements for nested struct. 
156 | // struct { 157 | // SubStruct struct { 158 | // Example string `pagser:".selector->text()"` 159 | // } `pagser:".selector->prev()"` 160 | // } 161 | func (builtin BuiltinSelections) Prev(node *goquery.Selection, args ...string) (out interface{}, err error) { 162 | selector := "" 163 | if len(args) > 0 { 164 | selector = strings.TrimSpace(args[0]) 165 | } 166 | if selector != "" { 167 | return node.PrevFiltered(selector), nil 168 | } 169 | return node.Prev(), nil 170 | } 171 | 172 | // Siblings siblings() gets the siblings of each element in the Selection. 173 | // Filtered by the specified selector if selector not empty, 174 | // It returns Selection object containing these elements for nested struct. 175 | // struct { 176 | // SubStruct struct { 177 | // Example string `pagser:".selector->text()"` 178 | // } `pagser:".selector->siblings()"` 179 | // } 180 | func (builtin BuiltinSelections) Siblings(node *goquery.Selection, args ...string) (out interface{}, err error) { 181 | selector := "" 182 | if len(args) > 0 { 183 | selector = strings.TrimSpace(args[0]) 184 | } 185 | if selector != "" { 186 | return node.SiblingsFiltered(selector), nil 187 | } 188 | return node.Siblings(), nil 189 | } 190 | -------------------------------------------------------------------------------- /builtin_selections_test.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | //test errors 8 | func TestBuiltinSelectionsErrors(t *testing.T) { 9 | tests := []funcWantError{ 10 | //not args 11 | {true, "eq", []string{}, ``}, 12 | //index not number 13 | {true, "eq", []string{"a"}, ``}, 14 | //not args 15 | {true, "parentsUntil", []string{}, ``}, 16 | } 17 | 18 | for _, tt := range tests { 19 | var sel = newTewSelection(tt.data) 20 | _, err := builtinFuncs[tt.fun](sel, tt.args...) 
21 | if tt.want { 22 | if err == nil { 23 | t.Errorf("%v want an error", tt.String()) 24 | } 25 | continue 26 | } 27 | if err != nil { 28 | t.Errorf("%v want no error, but error is %v", tt.String(), err) 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | const ignoreSymbol = "-" 4 | 5 | // Config configuration 6 | type Config struct { 7 | TagName string //struct tag name, default is `pagser` 8 | FuncSymbol string //Function symbol, default is `->` 9 | CastError bool //Returns an error when the type cannot be converted, default is `false` 10 | Debug bool //Debug mode, debug will print some log, default is `false` 11 | } 12 | 13 | var defaultCfg = Config{ 14 | TagName: "pagser", 15 | FuncSymbol: "->", 16 | CastError: false, 17 | Debug: false, 18 | } 19 | 20 | // DefaultConfig the default Config 21 | // Config{ 22 | // TagName: "pagser", 23 | // FuncSymbol: "->", 24 | // CastError: false, 25 | // Debug: false, 26 | // } 27 | func DefaultConfig() Config { 28 | return defaultCfg 29 | } 30 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | //Package pagser is a simple, easy, extensible, configurable HTML parser to struct based on goquery and struct tags, 2 | //It's parser library from scrago. 3 | // 4 | // The project source code: https://github.com/foolin/pagser 5 | // 6 | // Features 7 | // 8 | // * Simple - Use golang struct tag syntax. 9 | // 10 | // * Easy - Easy use for your spider/crawler/colly application. 11 | // 12 | // * Extensible - Support for extension functions. 13 | // 14 | // * Struct tag grammar - Grammar is simple, like \`pagser:"a->attr(href)"\`. 15 | // 16 | // * Nested Structure - Support Nested Structure for node. 
17 | // 18 | // * Configurable - Support configuration. 19 | // 20 | // * GoQuery/Colly - Support all goquery project, such as go-colly. 21 | // 22 | // More info: https://github.com/foolin/pagser 23 | package pagser 24 | -------------------------------------------------------------------------------- /doc_test.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "log" 6 | "net/http" 7 | "strings" 8 | ) 9 | 10 | const rawExampleHtml = ` 11 | 12 | 13 | 14 | 15 | Pagser Example 16 | 17 | 18 | 19 | 20 |

Pagser H1 Title

21 | 31 |
A|B|C|D
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | ` 44 | 45 | type ExamplePage struct { 46 | Title string `pagser:"title"` 47 | H1 string `pagser:"h1"` 48 | Navs []struct { 49 | ID int `pagser:"->attrEmpty(id, -1)"` 50 | Name string `pagser:"a"` 51 | Url string `pagser:"a->attr(href)"` 52 | } `pagser:".navlink li"` 53 | } 54 | 55 | func ExampleNewWithConfig() { 56 | cfg := Config{ 57 | TagName: "pagser", 58 | FuncSymbol: "->", 59 | CastError: false, 60 | Debug: false, 61 | } 62 | p, err := NewWithConfig(cfg) 63 | if err != nil { 64 | log.Fatal(err) 65 | } 66 | 67 | //data parser model 68 | var page ExamplePage 69 | //parse html data 70 | err = p.Parse(&page, rawExampleHtml) 71 | //check error 72 | if err != nil { 73 | log.Fatal(err) 74 | } 75 | 76 | } 77 | 78 | func ExamplePagser_Parse() { 79 | //New default Config 80 | p := New() 81 | 82 | //data parser model 83 | var page ExamplePage 84 | //parse html data 85 | err := p.Parse(&page, rawExampleHtml) 86 | //check error 87 | if err != nil { 88 | log.Fatal(err) 89 | } 90 | 91 | //print result 92 | log.Printf("%v", page) 93 | } 94 | 95 | func ExamplePagser_ParseDocument() { 96 | //New default Config 97 | p := New() 98 | 99 | //data parser model 100 | var data ExamplePage 101 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml)) 102 | if err != nil { 103 | log.Fatal(err) 104 | } 105 | 106 | //parse document 107 | err = p.ParseDocument(&data, doc) 108 | //check error 109 | if err != nil { 110 | log.Fatal(err) 111 | } 112 | 113 | //print result 114 | log.Printf("%v", data) 115 | } 116 | 117 | func ExamplePagser_ParseSelection() { 118 | //New default Config 119 | p := New() 120 | 121 | //data parser model 122 | var data ExamplePage 123 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml)) 124 | if err != nil { 125 | log.Fatal(err) 126 | } 127 | 128 | //parse document 129 | err = p.ParseSelection(&data, doc.Selection) 130 | //check error 131 | if err != nil { 132 
| log.Fatal(err) 133 | } 134 | 135 | //print result 136 | log.Printf("%v", data) 137 | } 138 | 139 | func ExamplePagser_ParseReader() { 140 | resp, err := http.Get("https://raw.githubusercontent.com/foolin/pagser/master/_examples/pages/demo.html") 141 | if err != nil { 142 | log.Fatal(err) 143 | } 144 | defer resp.Body.Close() 145 | 146 | //New default Config 147 | p := New() 148 | //data parser model 149 | var page ExamplePage 150 | //parse html data 151 | err = p.ParseReader(&page, resp.Body) 152 | //check error 153 | if err != nil { 154 | panic(err) 155 | } 156 | 157 | log.Printf("%v", page) 158 | } 159 | 160 | func ExamplePagser_RegisterFunc() { 161 | p := New() 162 | 163 | p.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) { 164 | //Todo 165 | return "Hello", nil 166 | }) 167 | } 168 | -------------------------------------------------------------------------------- /extensions/markdown/markdown.go: -------------------------------------------------------------------------------- 1 | package markdown 2 | 3 | import ( 4 | "bytes" 5 | "github.com/PuerkitoBio/goquery" 6 | "github.com/foolin/pagser" 7 | "github.com/mattn/godown" 8 | "regexp" 9 | "strings" 10 | ) 11 | 12 | //var regMutilSpaceLine = regexp.MustCompile("(\\r?\\n\\s*){2,}") 13 | var regMutilSpaceLine = regexp.MustCompile("(\\r?\\n+\\s*){2,}") 14 | 15 | // Markdown convert html to markdown function 16 | func Markdown(node *goquery.Selection, args ...string) (interface{}, error) { 17 | var buf bytes.Buffer 18 | html, err := node.Html() 19 | if err != nil { 20 | return "", err 21 | } 22 | err = godown.Convert(&buf, strings.NewReader(html), &godown.Option{ 23 | Style: true, 24 | Script: false, 25 | }) 26 | md := buf.String() 27 | if err != nil { 28 | return md, err 29 | } 30 | md = regMutilSpaceLine.ReplaceAllString(md, "\n\n") 31 | return md, err 32 | } 33 | 34 | // Register register function name as `Markdown` 35 | func Register(p *pagser.Pagser) { 36 | 
p.RegisterFunc("Markdown", Markdown) 37 | } 38 | -------------------------------------------------------------------------------- /extensions/markdown/markdown_test.go: -------------------------------------------------------------------------------- 1 | package markdown 2 | -------------------------------------------------------------------------------- /extensions/ugchtml/ugchtml.go: -------------------------------------------------------------------------------- 1 | package ugchtml 2 | 3 | import ( 4 | "github.com/PuerkitoBio/goquery" 5 | "github.com/foolin/pagser" 6 | "github.com/microcosm-cc/bluemonday" 7 | ) 8 | 9 | // UgcHtml sanitise HTML5 documents safely function 10 | func UgcHtml(node *goquery.Selection, args ...string) (interface{}, error) { 11 | html, err := goquery.OuterHtml(node) 12 | if err != nil { 13 | return html, err 14 | } 15 | p := bluemonday.UGCPolicy() 16 | // The policy can then be used to sanitize lots of input and it is safe to use the policy in multiple goroutines 17 | return p.Sanitize(html), nil 18 | } 19 | 20 | // Register register function name as `UgcHtml` 21 | func Register(p *pagser.Pagser) { 22 | p.RegisterFunc("UgcHtml", UgcHtml) 23 | } 24 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/foolin/pagser 2 | 3 | go 1.21 4 | 5 | toolchain go1.21.1 6 | 7 | require ( 8 | github.com/PuerkitoBio/goquery v1.8.1 9 | github.com/mattn/godown v0.0.1 10 | github.com/microcosm-cc/bluemonday v1.0.26 11 | github.com/spf13/cast v1.5.1 12 | ) 13 | 14 | require ( 15 | github.com/andybalholm/cascadia v1.3.2 // indirect 16 | github.com/aymerick/douceur v0.2.0 // indirect 17 | github.com/creack/pty v1.1.9 // indirect 18 | github.com/davecgh/go-spew v1.1.1 // indirect 19 | github.com/frankban/quicktest v1.14.4 // indirect 20 | github.com/google/go-cmp v0.5.9 // indirect 21 | github.com/gorilla/css v1.0.0 // 
indirect 22 | github.com/kr/pretty v0.3.1 // indirect 23 | github.com/kr/text v0.2.0 // indirect 24 | github.com/mattn/go-runewidth v0.0.8 // indirect 25 | github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e // indirect 26 | github.com/pmezard/go-difflib v1.0.0 // indirect 27 | github.com/rogpeppe/go-internal v1.9.0 // indirect 28 | github.com/stretchr/testify v1.2.2 // indirect 29 | github.com/yuin/goldmark v1.4.13 // indirect 30 | golang.org/x/crypto v0.14.0 // indirect 31 | golang.org/x/mod v0.8.0 // indirect 32 | golang.org/x/net v0.17.0 // indirect 33 | golang.org/x/sync v0.1.0 // indirect 34 | golang.org/x/sys v0.13.0 // indirect 35 | golang.org/x/term v0.13.0 // indirect 36 | golang.org/x/text v0.13.0 // indirect 37 | golang.org/x/tools v0.6.0 // indirect 38 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7 // indirect 39 | ) 40 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= 2 | github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= 3 | github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= 4 | github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= 5 | github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= 6 | github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= 7 | github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= 8 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 9 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 10 | github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= 11 | 
github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= 12 | github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 13 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 14 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 15 | github.com/frankban/quicktest v1.14.4/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= 16 | github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 17 | github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY= 18 | github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c= 19 | github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= 20 | github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= 21 | github.com/mattn/go-runewidth v0.0.8 h1:3tS41NlGYSmhhe/8fhGRzc+z3AYCw1Fe1WAyLuujKs0= 22 | github.com/mattn/go-runewidth v0.0.8/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= 23 | github.com/mattn/godown v0.0.0-20200217152941-afc959f6a561 h1:0YGo77enc6tJpXQxUeQWs9bPIQPTH1lbOmc5tgRuq4o= 24 | github.com/mattn/godown v0.0.0-20200217152941-afc959f6a561/go.mod h1:/ivCKurgV/bx6yqtP/Jtc2Xmrv3beCYBvlfAUl4X5g4= 25 | github.com/mattn/godown v0.0.1 h1:39uk50ufLVQFs0eapIJVX5fCS74a1Fs2g5f1MVqIHdE= 26 | github.com/mattn/godown v0.0.1/go.mod h1:/ivCKurgV/bx6yqtP/Jtc2Xmrv3beCYBvlfAUl4X5g4= 27 | github.com/microcosm-cc/bluemonday v1.0.16 h1:kHmAq2t7WPWLjiGvzKa5o3HzSfahUKiOq7fAPUiMNIc= 28 | github.com/microcosm-cc/bluemonday v1.0.16/go.mod h1:Z0r70sCuXHig8YpBzCc5eGHAap2K7e/u082ZUpDRRqM= 29 | github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58= 30 | github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs= 31 | github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod 
h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= 32 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 33 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 34 | github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= 35 | github.com/spf13/cast v1.3.1 h1:nFm6S0SMdyzrzcmThSipiEubIDy8WEXKNZ0UOgiRpng= 36 | github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= 37 | github.com/spf13/cast v1.5.1 h1:R+kOtfhWQE6TVQzY+4D7wJLBgkdVasCEFxSUBYBYIlA= 38 | github.com/spf13/cast v1.5.1/go.mod h1:b9PdjNptOpzXr7Rq1q9gJML/2cdGQAo69NKzQ10KN48= 39 | github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= 40 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 41 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 42 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 43 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 44 | golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= 45 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 46 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 47 | golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 48 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 49 | golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 50 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 51 | golang.org/x/net v0.0.0-20210614182718-04defd469f4e 
h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q= 52 | golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 53 | golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 54 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 55 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 56 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 57 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 58 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= 59 | golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= 60 | golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= 61 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 62 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 63 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 64 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 65 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 66 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 67 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 68 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 69 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 70 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 71 | golang.org/x/sys v0.7.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 72 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 73 | golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 74 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 75 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 76 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 77 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 78 | golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= 79 | golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= 80 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 81 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 82 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 83 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 84 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 85 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 86 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 87 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 88 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 89 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 90 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 91 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 92 | -------------------------------------------------------------------------------- /grammar.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/foolin/pagser/69804390226d2a2931b2d9a7cbf5b3d3be5d76dd/grammar.png -------------------------------------------------------------------------------- /pagser.go: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Foolin 2 | 3 | // Unless required by applicable law or agreed to in writing, software 4 | // distributed under the License is distributed on an "AS IS" BASIS, 5 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 6 | // See the License for the specific language governing permissions and 7 | // limitations under the License. 8 | 9 | package pagser 10 | 11 | import ( 12 | "errors" 13 | "sync" 14 | ) 15 | 16 | // Pagser the page parser 17 | type Pagser struct { 18 | Config Config 19 | //mapTags map[string]*tagTokenizer // tag value => tagTokenizer 20 | mapTags sync.Map //map[string]*tagTokenizer 21 | //mapFuncs map[string]CallFunc // name => func 22 | mapFuncs sync.Map //map[string]CallFunc 23 | } 24 | 25 | // New create pagser client 26 | func New() *Pagser { 27 | p, _ := NewWithConfig(DefaultConfig()) 28 | return p 29 | } 30 | 31 | // NewWithConfig create pagser client with Config and error 32 | func NewWithConfig(cfg Config) (*Pagser, error) { 33 | if cfg.TagName == "" { 34 | return nil, errors.New("tag name must not empty") 35 | } 36 | if cfg.FuncSymbol == "" { 37 | return nil, errors.New("FuncSymbol must not empty") 38 | } 39 | p := Pagser{ 40 | Config: cfg, 41 | //mapTags: make(map[string]*tagTokenizer, 0), 42 | //mapFuncs: builtinFuncs, 43 | } 44 | for k, v := range builtinFuncs { 45 | p.mapFuncs.Store(k, v) 46 | } 47 | return &p, nil 48 | } 49 | -------------------------------------------------------------------------------- /pagser_test.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | 
"fmt" 5 | "testing" 6 | ) 7 | 8 | const rawPagserHtml = ` 9 | 10 | 11 | 12 | 13 | Pagser Example 14 | 15 | 16 | 17 | 18 | 28 | 29 | 30 | ` 31 | 32 | type PagserData struct { 33 | Title string `pagser:"title"` 34 | Keywords []string `pagser:"meta[name='keywords']->attrSplit(content)"` 35 | Navs []struct { 36 | ID int `pagser:"->attrEmpty(id, -1)"` 37 | Name string `pagser:"a->text()"` 38 | Url string `pagser:"a->attr(href)"` 39 | } `pagser:".navlink li"` 40 | } 41 | 42 | type ConfigData struct { 43 | Title string `query:"title"` 44 | Keywords []string `query:"meta[name='keywords']@attrSplit(content)"` 45 | Navs []struct { 46 | ID int `query:"@attrEmpty(id, -1)"` 47 | Name string `query:"a@text()"` 48 | Url string `query:"a@attr(href)"` 49 | } `query:".navlink li"` 50 | } 51 | 52 | func TestNew(t *testing.T) { 53 | p := New() 54 | 55 | var data PagserData 56 | err := p.Parse(&data, rawPagserHtml) 57 | if err != nil { 58 | t.Fatal(err) 59 | } 60 | fmt.Printf("json: %v\n", prettyJson(data)) 61 | } 62 | 63 | func TestNewWithConfig(t *testing.T) { 64 | cfg := Config{ 65 | TagName: "query", 66 | FuncSymbol: "@", 67 | CastError: true, 68 | Debug: true, 69 | } 70 | p, err := NewWithConfig(cfg) 71 | if err != nil { 72 | t.Fatal(err) 73 | } 74 | 75 | var data ConfigData 76 | err = p.Parse(&data, rawPagserHtml) 77 | if err != nil { 78 | t.Fatal(err) 79 | } 80 | fmt.Printf("json: %v\n", prettyJson(data)) 81 | } 82 | 83 | func TestNewWithConfigTagNameError(t *testing.T) { 84 | cfg := Config{ 85 | TagName: "", 86 | FuncSymbol: "->", 87 | CastError: true, 88 | Debug: true, 89 | } 90 | _, err := NewWithConfig(cfg) 91 | if err != nil { 92 | t.Log(err) 93 | } else { 94 | t.Fatal("Result must return error") 95 | } 96 | } 97 | 98 | func TestNewWithConfigFuncSymbolError(t *testing.T) { 99 | cfg := Config{ 100 | TagName: "pagser", 101 | FuncSymbol: "", 102 | CastError: true, 103 | Debug: true, 104 | } 105 | _, err := NewWithConfig(cfg) 106 | if err != nil { 107 | t.Log(err) 108 | } else 
{ 109 | t.Fatal("Result must return error") 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /parse.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "reflect" 7 | "strings" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | "github.com/spf13/cast" 11 | ) 12 | 13 | // Parse parse html to struct 14 | func (p *Pagser) Parse(v interface{}, document string) (err error) { 15 | reader, err := goquery.NewDocumentFromReader(strings.NewReader(document)) 16 | if err != nil { 17 | return err 18 | } 19 | return p.ParseDocument(v, reader) 20 | } 21 | 22 | // ParseReader parse html to struct 23 | func (p *Pagser) ParseReader(v interface{}, reader io.Reader) (err error) { 24 | doc, err := goquery.NewDocumentFromReader(reader) 25 | if err != nil { 26 | return err 27 | } 28 | return p.ParseDocument(v, doc) 29 | } 30 | 31 | // ParseDocument parse document to struct 32 | func (p *Pagser) ParseDocument(v interface{}, document *goquery.Document) (err error) { 33 | return p.ParseSelection(v, document.Selection) 34 | } 35 | 36 | // ParseSelection parse selection to struct 37 | func (p *Pagser) ParseSelection(v interface{}, selection *goquery.Selection) (err error) { 38 | return p.doParse(v, nil, selection) 39 | } 40 | 41 | // ParseSelection parse selection to struct 42 | func (p *Pagser) doParse(v interface{}, stackRefValues []reflect.Value, selection *goquery.Selection) (err error) { 43 | objRefType := reflect.TypeOf(v) 44 | objRefValue := reflect.ValueOf(v) 45 | 46 | //log.Printf("%#v kind is %v | %v", v, objRefValue.Kind(), reflect.Ptr) 47 | if objRefValue.Kind() != reflect.Ptr { 48 | return fmt.Errorf("%v is non-pointer", objRefType) 49 | } 50 | 51 | if objRefValue.IsNil() { 52 | return fmt.Errorf("%v is nil", objRefType) 53 | } 54 | 55 | objRefTypeElem := objRefType.Elem() 56 | objRefValueElem := objRefValue.Elem() 57 | 58 | for i := 0; i < 
objRefValueElem.NumField(); i++ { 59 | fieldType := objRefTypeElem.Field(i) 60 | fieldValue := objRefValueElem.Field(i) 61 | kind := fieldType.Type.Kind() 62 | 63 | //tagValue := fieldType.Tag.Get(parserTagName) 64 | tagValue, tagOk := fieldType.Tag.Lookup(p.Config.TagName) 65 | if !tagOk { 66 | if p.Config.Debug { 67 | fmt.Printf("[INFO] not found tag name=[%v] in field: %v, eg: `%v:\".navlink a->attr(href)\"`\n", 68 | p.Config.TagName, fieldType.Name, p.Config.TagName) 69 | } 70 | continue 71 | } 72 | if tagValue == ignoreSymbol { 73 | continue 74 | } 75 | 76 | cacheTag, ok := p.mapTags.Load(tagValue) 77 | var tag *tagTokenizer 78 | if !ok || cacheTag == nil { 79 | tag, err = p.newTag(tagValue) 80 | if err != nil { 81 | return err 82 | } 83 | p.mapTags.Store(tagValue, tag) 84 | } else { 85 | tag = cacheTag.(*tagTokenizer) 86 | } 87 | 88 | node := selection 89 | if tag.Selector != "" { 90 | node = selection.Find(tag.Selector) 91 | } 92 | 93 | var callOutValue interface{} 94 | var callErr error 95 | if tag.FuncName != "" { 96 | callOutValue, callErr = p.findAndExecFunc(objRefValue, stackRefValues, tag, node) 97 | if callErr != nil { 98 | return fmt.Errorf("tag=`%v` parse func error: %v", tagValue, callErr) 99 | } 100 | if subNode, ok := callOutValue.(*goquery.Selection); ok { 101 | //set sub node to current node 102 | node = subNode 103 | } else { 104 | svErr := p.setRefectValue(fieldType.Type.Kind(), fieldValue, callOutValue) 105 | if svErr != nil { 106 | return fmt.Errorf("tag=`%v` set value error: %v", tagValue, svErr) 107 | } 108 | //goto parse next field 109 | continue 110 | } 111 | } 112 | 113 | if stackRefValues == nil { 114 | stackRefValues = make([]reflect.Value, 0) 115 | } 116 | stackRefValues = append(stackRefValues, objRefValue) 117 | 118 | //set value 119 | switch { 120 | case kind == reflect.Ptr: 121 | subModel := reflect.New(fieldType.Type.Elem()) 122 | fieldValue.Set(subModel) 123 | err = p.doParse(subModel.Interface(), stackRefValues, node) 124 | 
if err != nil { 125 | return fmt.Errorf("tag=`%v` %#v parser error: %v", tagValue, subModel, err) 126 | } 127 | //Slice 128 | case kind == reflect.Slice: 129 | sliceType := fieldValue.Type() 130 | itemType := sliceType.Elem() 131 | itemKind := itemType.Kind() 132 | slice := reflect.MakeSlice(sliceType, node.Size(), node.Size()) 133 | node.EachWithBreak(func(i int, subNode *goquery.Selection) bool { 134 | //outhtml, _ := goquery.OuterHtml(subNode) 135 | //log.Printf("%v => %v", i, outhtml) 136 | itemValue := reflect.New(itemType).Elem() 137 | switch { 138 | case itemKind == reflect.Struct: 139 | err = p.doParse(itemValue.Addr().Interface(), stackRefValues, subNode) 140 | if err != nil { 141 | err = fmt.Errorf("tag=`%v` %#v parser error: %v", tagValue, itemValue, err) 142 | return false 143 | } 144 | case itemKind == reflect.Ptr && itemValue.Type().Elem().Kind() == reflect.Struct: 145 | itemValue = reflect.New(itemType.Elem()) 146 | err = p.doParse(itemValue.Interface(), stackRefValues, subNode) 147 | if err != nil { 148 | err = fmt.Errorf("tag=`%v` %#v parser error: %v", tagValue, itemValue, err) 149 | return false 150 | } 151 | default: 152 | itemValue.SetString(strings.TrimSpace(subNode.Text())) 153 | } 154 | slice.Index(i).Set(itemValue) 155 | return true 156 | }) 157 | if err != nil { 158 | return err 159 | } 160 | fieldValue.Set(slice) 161 | case kind == reflect.Struct: 162 | subModel := reflect.New(fieldType.Type) 163 | err = p.doParse(subModel.Interface(), stackRefValues, node) 164 | if err != nil { 165 | return fmt.Errorf("tag=`%v` %#v parser error: %v", tagValue, subModel, err) 166 | } 167 | fieldValue.Set(subModel.Elem()) 168 | //UnsafePointer 169 | //Complex64 170 | //Complex128 171 | //Array 172 | //Chan 173 | //Func 174 | default: 175 | fieldValue.SetString(strings.TrimSpace(node.Text())) 176 | } 177 | } 178 | return nil 179 | } 180 | 181 | /** 182 | fieldType := refTypeElem.Field(i) 183 | fieldValue := refValueElem.Field(i) 184 | */ 185 | func (p 
*Pagser) findAndExecFunc(objRefValue reflect.Value, stackRefValues []reflect.Value, selTag *tagTokenizer, node *goquery.Selection) (interface{}, error) { 186 | if selTag.FuncName != "" { 187 | 188 | //call object method 189 | callMethod := findMethod(objRefValue, selTag.FuncName) 190 | if callMethod.IsValid() { 191 | //execute method 192 | return execMethod(callMethod, selTag, node) 193 | } 194 | 195 | //call root method 196 | size := len(stackRefValues) 197 | if size > 0 { 198 | for i := size - 1; i >= 0; i-- { 199 | callMethod = findMethod(stackRefValues[i], selTag.FuncName) 200 | if callMethod.IsValid() { 201 | //execute method 202 | return execMethod(callMethod, selTag, node) 203 | } 204 | } 205 | } 206 | 207 | //global function 208 | if fn, ok := p.mapFuncs.Load(selTag.FuncName); ok { 209 | cfn := fn.(CallFunc) 210 | outValue, err := cfn(node, selTag.FuncParams...) 211 | if err != nil { 212 | return nil, fmt.Errorf("call registered func %v error: %v", selTag.FuncName, err) 213 | } 214 | return outValue, nil 215 | } 216 | 217 | //not found method 218 | return nil, fmt.Errorf("not found method %v", selTag.FuncName) 219 | } 220 | return strings.TrimSpace(node.Text()), nil 221 | } 222 | 223 | func findMethod(objRefValue reflect.Value, funcName string) reflect.Value { 224 | callMethod := objRefValue.MethodByName(funcName) 225 | if callMethod.IsValid() { 226 | return callMethod 227 | } 228 | //call element method 229 | return objRefValue.Elem().MethodByName(funcName) 230 | } 231 | 232 | func execMethod(callMethod reflect.Value, selTag *tagTokenizer, node *goquery.Selection) (interface{}, error) { 233 | callParams := make([]reflect.Value, 0) 234 | callParams = append(callParams, reflect.ValueOf(node)) 235 | 236 | callReturns := callMethod.Call(callParams) 237 | if len(callReturns) <= 0 { 238 | return nil, fmt.Errorf("method %v not return any value", selTag.FuncName) 239 | } 240 | if len(callReturns) > 1 { 241 | if err, ok := 
callReturns[len(callReturns)-1].Interface().(error); ok { 242 | if err != nil { 243 | return nil, fmt.Errorf("method %v return error: %v", selTag.FuncName, err) 244 | } 245 | } 246 | } 247 | return callReturns[0].Interface(), nil 248 | } 249 | 250 | func (p Pagser) setRefectValue(kind reflect.Kind, fieldValue reflect.Value, v interface{}) (err error) { 251 | //set value 252 | switch { 253 | //Bool 254 | case kind == reflect.Bool: 255 | if p.Config.CastError { 256 | kv, err := cast.ToBoolE(v) 257 | if err != nil { 258 | return err 259 | } 260 | fieldValue.SetBool(kv) 261 | } else { 262 | fieldValue.SetBool(cast.ToBool(v)) 263 | } 264 | case kind >= reflect.Int && kind <= reflect.Int64: 265 | if p.Config.CastError { 266 | kv, err := cast.ToInt64E(v) 267 | if err != nil { 268 | return err 269 | } 270 | fieldValue.SetInt(kv) 271 | } else { 272 | fieldValue.SetInt(cast.ToInt64(v)) 273 | } 274 | case kind >= reflect.Uint && kind <= reflect.Uintptr: 275 | if p.Config.CastError { 276 | kv, err := cast.ToUint64E(v) 277 | if err != nil { 278 | return err 279 | } 280 | fieldValue.SetUint(kv) 281 | } else { 282 | fieldValue.SetUint(cast.ToUint64(v)) 283 | } 284 | case kind == reflect.Float32 || kind == reflect.Float64: 285 | if p.Config.CastError { 286 | value, err := cast.ToFloat64E(v) 287 | if err != nil { 288 | return err 289 | } 290 | fieldValue.SetFloat(value) 291 | } else { 292 | fieldValue.SetFloat(cast.ToFloat64(v)) 293 | } 294 | case kind == reflect.String: 295 | if p.Config.CastError { 296 | kv, err := cast.ToStringE(v) 297 | if err != nil { 298 | return err 299 | } 300 | fieldValue.SetString(kv) 301 | } else { 302 | fieldValue.SetString(cast.ToString(v)) 303 | } 304 | case kind == reflect.Slice || kind == reflect.Array: 305 | sliceType := fieldValue.Type().Elem() 306 | itemKind := sliceType.Kind() 307 | if p.Config.CastError { 308 | switch itemKind { 309 | case reflect.Bool: 310 | kv, err := cast.ToBoolSliceE(v) 311 | if err != nil { 312 | return err 313 | } 314 | 
fieldValue.Set(reflect.ValueOf(kv)) 315 | case reflect.Int: 316 | kv, err := cast.ToIntSliceE(v) 317 | if err != nil { 318 | return err 319 | } 320 | fieldValue.Set(reflect.ValueOf(kv)) 321 | case reflect.Int32: 322 | kv, err := toInt32SliceE(v) 323 | if err != nil { 324 | return err 325 | } 326 | fieldValue.Set(reflect.ValueOf(kv)) 327 | case reflect.Int64: 328 | kv, err := toInt64SliceE(v) 329 | if err != nil { 330 | return err 331 | } 332 | fieldValue.Set(reflect.ValueOf(kv)) 333 | case reflect.Float32: 334 | kv, err := toFloat32SliceE(v) 335 | if err != nil { 336 | return err 337 | } 338 | fieldValue.Set(reflect.ValueOf(kv)) 339 | case reflect.Float64: 340 | kv, err := toFloat64SliceE(v) 341 | if err != nil { 342 | return err 343 | } 344 | fieldValue.Set(reflect.ValueOf(kv)) 345 | case reflect.String: 346 | kv, err := cast.ToStringSliceE(v) 347 | if err != nil { 348 | return err 349 | } 350 | fieldValue.Set(reflect.ValueOf(kv)) 351 | default: 352 | fieldValue.Set(reflect.ValueOf(v)) 353 | } 354 | } else { 355 | switch itemKind { 356 | case reflect.Bool: 357 | kv := cast.ToBoolSlice(v) 358 | fieldValue.Set(reflect.ValueOf(kv)) 359 | case reflect.Int: 360 | kv := cast.ToIntSlice(v) 361 | fieldValue.Set(reflect.ValueOf(kv)) 362 | case reflect.Int32: 363 | kv := toInt32Slice(v) 364 | fieldValue.Set(reflect.ValueOf(kv)) 365 | case reflect.Int64: 366 | kv := toInt64Slice(v) 367 | fieldValue.Set(reflect.ValueOf(kv)) 368 | case reflect.Float32: 369 | kv := toFloat32Slice(v) 370 | fieldValue.Set(reflect.ValueOf(kv)) 371 | case reflect.Float64: 372 | kv := toFloat64Slice(v) 373 | fieldValue.Set(reflect.ValueOf(kv)) 374 | case reflect.String: 375 | kv := cast.ToStringSlice(v) 376 | fieldValue.Set(reflect.ValueOf(kv)) 377 | default: 378 | fieldValue.Set(reflect.ValueOf(v)) 379 | } 380 | } 381 | //case kind == reflect.Interface: 382 | // fieldValue.Set(reflect.ValueOf(v)) 383 | default: 384 | fieldValue.Set(reflect.ValueOf(v)) 385 | //return fmt.Errorf("not support type 
%v", kind) 386 | } 387 | return nil 388 | } 389 | -------------------------------------------------------------------------------- /parse_test.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "strings" 7 | "sync" 8 | "testing" 9 | 10 | "github.com/PuerkitoBio/goquery" 11 | ) 12 | 13 | const rawParseHtml = ` 14 | 15 | 16 | 17 | 18 | Pagser Example 19 | 20 | 21 | 22 | 23 |

Pagser H1 Title

24 | 34 |
A|B|C|D
35 | 36 |
37 |

Email

38 |
    39 |
  • pagser@foolin.github
  • 40 |
  • hello@pagser.foolin
  • 41 |
42 |
43 |
44 |

Bool

45 |
    46 |
  • true
  • 47 |
  • false
  • 48 |
49 |
50 |
51 |

Number

52 |
    53 |
  • 12345
  • 54 |
  • 67890
  • 55 |
56 |
57 |
58 |

Float

59 |
    60 |
  • 123.45
  • 61 |
  • 678.90
  • 62 |
63 |
64 |
65 | 66 |
67 | 68 | 69 | ` 70 | 71 | type ParseData struct { 72 | Title string `pagser:"title"` 73 | Keywords []string `pagser:"meta[name='keywords']->attrSplit(content)"` 74 | H1 string `pagser:"h1"` 75 | H1Text string `pagser:"h1->text()"` 76 | H1TextEmpty string `pagser:"h1->textEmpty('')"` 77 | TextEmptyNoData string `pagser:".empty->textEmpty('nodata')"` 78 | H1Html string `pagser:"h1->html()"` 79 | H1OutHtml string `pagser:"h1->outerHtml()"` 80 | SameFuncValue string `pagser:"h1->SameFunc()"` 81 | MyGlobalFuncValue string `pagser:"h1->MyGlobFunc()"` 82 | MyStructFuncValue string `pagser:"h1->MyStructFunc()"` 83 | FillFieldFuncValue string `pagser:"h1->FillFieldFunc()"` 84 | FillFieldOtherValue string //Set value by FillFieldFunc() 85 | NavList []struct { 86 | ID int `pagser:"->attrEmpty(id, -1)"` 87 | Link struct { 88 | Name string `pagser:"->text()"` 89 | Url string `pagser:"->attr(href)"` 90 | AbsUrl string `pagser:"->absHref('https://thisvar.com')"` 91 | } `pagser:"a"` 92 | LinkHtml string `pagser:"a->html()"` 93 | ParentFuncName string `pagser:"a->ParentFunc()"` 94 | } `pagser:".navlink li"` 95 | NavFirst struct { 96 | ID int `pagser:"->attrEmpty(id, 0)"` 97 | Name string `pagser:"a->text()"` 98 | Url string `pagser:"a->attr(href)"` 99 | } `pagser:".navlink li->first()"` 100 | NavLast struct { 101 | ID int `pagser:"->attrEmpty(id, 0)"` 102 | Name string `pagser:"a->text()"` 103 | Url string `pagser:"a->attr(href)"` 104 | } `pagser:".navlink li->last()"` 105 | SubStruct struct { 106 | Label string `pagser:"label"` 107 | Values []string `pagser:".item->eachAttr(value)"` 108 | } `pagser:".group->eq(0)"` 109 | SubPtrStruct *struct { 110 | Label string `pagser:"label"` 111 | Values []string `pagser:".item->eachAttr(value)"` 112 | } `pagser:".group:last-child"` 113 | NavFirstID int `pagser:".navlink li:first-child->attrEmpty(id, 0)"` 114 | NavLastID uint `pagser:".navlink li:last-child->attr(id)"` 115 | NavLastData string `pagser:".navlink li:last-child->attr(data, 
'nodata')"` 116 | NavFirstIDDefaultValue int `pagser:".navlink li:first-child->attrEmpty(id, -999)"` 117 | NavTextList []string `pagser:".navlink li"` 118 | NavEachText []string `pagser:".navlink li->eachText()"` 119 | NavEachTextEmpty []string `pagser:".navlink li->eachTextEmpty('')"` 120 | NavEachTextEmptyNoData []string `pagser:".empty span->eachTextEmpty('nodata')"` 121 | NavEachAttrID []string `pagser:".navlink li->eachAttr(id)"` 122 | NavEachAttrEmptyID []string `pagser:".navlink li->eachAttrEmpty(id, -1)"` 123 | NavEachHtml []string `pagser:".navlink li->eachHtml()"` 124 | NavEachOutHtml []string `pagser:".navlink li->eachOutHtml()"` 125 | NavJoinString string `pagser:".navlink li->eachTextJoin(|)"` 126 | NavEqText string `pagser:".navlink li->eqAndText(1)"` 127 | NavEqAttr string `pagser:".navlink li->eqAndAttr(1, id)"` 128 | NavEqHtml string `pagser:".navlink li->eqAndHtml(1)"` 129 | NavEqOutHtml string `pagser:".navlink li->eqAndOutHtml(1)"` 130 | NavSize int `pagser:".navlink li->size()"` 131 | SubPageData *SubPageData `pagser:".navlink li:last-child"` 132 | SubPageDataList []*SubPageData `pagser:".navlink li"` 133 | WordsSplitArray []string `pagser:".words->textSplit(|)"` 134 | WordsSplitArrayNoTrim []string `pagser:".words->textSplit('|', false)"` 135 | WordsShow bool `pagser:".words->attrEmpty(show, false)"` 136 | WordsConcatText string `pagser:".words->textConcat('this is words:', [, $value, ])"` 137 | WordsConcatAttr string `pagser:".words->attrConcat(show, 'isShow = [', $value, ])"` 138 | Email string `pagser:".item[name='email']->attr('value')"` 139 | Emails []string `pagser:".item[name='email']->eachAttrEmpty(value, '')"` 140 | CastBoolValue bool `pagser:".item[name='bool']->attrEmpty(value, false)"` 141 | CastBoolNoExist bool `pagser:".item[name='bool']->attrEmpty(value2, false)"` 142 | CastBoolArray []bool `pagser:".item[name='bool']->eachAttrEmpty(value, false)"` 143 | CastIntValue int `pagser:".item[name='number']->attrEmpty(value, 0)"` 144 | 
CastIntNoExist int `pagser:".item[name='number']->attrEmpty(value2, -1)"` 145 | CastIntArray []int `pagser:".item[name='number']->eachAttrEmpty(value, 0)"` 146 | CastInt32Value int32 `pagser:".item[name='number']->attrEmpty(value, 0)"` 147 | CastInt32NoExist int32 `pagser:".item[name='number']->attrEmpty(value2, -1)"` 148 | CastInt32Array []int32 `pagser:".item[name='number']->eachAttrEmpty(value, 0)"` 149 | CastInt64Value int64 `pagser:".item[name='number']->attrEmpty(value, 0)"` 150 | CastInt64NoExist int64 `pagser:".item[name='number']->attrEmpty(value2, -1)"` 151 | CastInt64Array []int64 `pagser:".item[name='number']->eachAttrEmpty(value, 0)"` 152 | CastFloat32Value float32 `pagser:".item[name='float']->attrEmpty(value, 0)"` 153 | CastFloat32NoExist float32 `pagser:".item[name='float']->attrEmpty(value2, 0.0)"` 154 | CastFloat32Array []float32 `pagser:".item[name='float']->eachAttrEmpty(value, 0)"` 155 | CastFloat64Value float64 `pagser:".item[name='float']->attrEmpty(value, 0)"` 156 | CastFloat64NoExist float64 `pagser:".item[name='float']->attrEmpty(value2, 0.0)"` 157 | CastFloat64Array []float64 `pagser:".item[name='float']->eachAttrEmpty(value, 0)"` 158 | NodeChild []struct { 159 | Value string `pagser:"->text()"` 160 | } `pagser:".group->child()"` 161 | NodeChildSelector []struct { 162 | Value string `pagser:"->text()"` 163 | } `pagser:".group->child('h2')"` 164 | NodeEqFirst struct { 165 | Value string `pagser:"h2->text()"` 166 | } `pagser:".group->eq(0)"` 167 | NodeEqLast struct { 168 | Value string `pagser:"h2->text()"` 169 | } `pagser:".group->eq(-1)"` 170 | NodeEqPrev []struct { 171 | Value string `pagser:"->text()"` 172 | } `pagser:".item:last-child->prev()"` 173 | NodeEqPrevSelector struct { 174 | Value string `pagser:"->text()"` 175 | } `pagser:".item:last-child->prev('[id=\"1\"]')"` 176 | NodeEqNext []struct { 177 | Value string `pagser:"->text()"` 178 | } `pagser:".item:first-child->next()"` 179 | NodeEqNextSelector struct { 180 | Value string 
`pagser:"->text()"` 181 | } `pagser:".item:first-child->next('[id=\"2\"]')"` 182 | NodeParent []struct { 183 | Value string `pagser:"h2->text()"` 184 | } `pagser:"h2:first-child->parent()"` 185 | NodeParents []struct { 186 | Value string `pagser:"h2->text()"` 187 | } `pagser:"h2:first-child->parents()"` 188 | NodeParentsSelector []struct { 189 | Value string `pagser:"h2->text()"` 190 | } `pagser:"h2:first-child->parents('[id=\"b\"]')"` 191 | NodeParentsUntil []struct { 192 | Value string `pagser:"h2->text()"` 193 | } `pagser:"h2:first-child->parentsUntil('[id=\"b\"]')"` 194 | NodeParentSelector []struct { 195 | Value string `pagser:"h2->text()"` 196 | } `pagser:"h2:first-child->parent('[id=\"a\"]')"` 197 | NodeEqSiblings []struct { 198 | Value string `pagser:"->text()"` 199 | } `pagser:".item:first-child->siblings()"` 200 | NodeEqSiblingsSelector []struct { 201 | Value string `pagser:"->text()"` 202 | } `pagser:".item:first-child->siblings('[id=\"2\"]')"` 203 | } 204 | 205 | // this method will auto call, not need register. 206 | func MyGlobalFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 207 | return "Global-" + selection.Text(), nil 208 | } 209 | 210 | // this method will auto call, not need register. 211 | func SameFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 212 | return "Global-Same-Func-" + selection.Text(), nil 213 | } 214 | 215 | // this method will auto call, not need register. 216 | func (pd ParseData) MyStructFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 217 | return "Struct-" + selection.Text(), nil 218 | } 219 | 220 | func (pd ParseData) ParentFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 221 | return "ParentFunc-" + selection.Text(), nil 222 | } 223 | 224 | // this method will auto call, not need register. 
225 | func (pd *ParseData) FillFieldFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 226 | text := selection.Text() 227 | pd.FillFieldOtherValue = "This value is set by the FillFieldFunc() function -" + text 228 | return "FillFieldFunc-" + text, nil 229 | } 230 | 231 | func (pd *ParseData) SameFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 232 | return "Struct-Same-Func-" + selection.Text(), nil 233 | } 234 | 235 | type SubPageData struct { 236 | Text string `pagser:"->text()"` 237 | SubFuncValue string `pagser:"->SubFunc()"` 238 | ParentFuncValue string `pagser:"->ParentFunc()"` 239 | SameFuncValue string `pagser:"->SameFunc()"` 240 | } 241 | 242 | func (spd SubPageData) SubFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 243 | return "SubFunc-" + selection.Text(), nil 244 | } 245 | 246 | func (spd SubPageData) SameFunc(selection *goquery.Selection, args ...string) (out interface{}, err error) { 247 | return "Sub-Struct-Same-Func-" + selection.Text(), nil 248 | } 249 | 250 | // Page parse from https://httpbin.org 251 | type HttpBinData struct { 252 | Title string `pagser:"title"` 253 | Version string `pagser:".version->text()"` 254 | Description string `pagser:".description->text()"` 255 | } 256 | 257 | func TestParse(t *testing.T) { 258 | p := New() 259 | //register global function 260 | p.RegisterFunc("MyGlobFunc", MyGlobalFunc) 261 | p.RegisterFunc("SameFunc", SameFunc) 262 | 263 | var data ParseData 264 | err := p.Parse(&data, rawParseHtml) 265 | if err != nil { 266 | t.Fatal(err) 267 | } 268 | fmt.Printf("json: %v\n", prettyJson(data)) 269 | } 270 | 271 | func TestPagser_ParseDocument(t *testing.T) { 272 | cfg := Config{ 273 | TagName: "pagser", 274 | FuncSymbol: "->", 275 | CastError: true, 276 | Debug: true, 277 | } 278 | p, err := NewWithConfig(cfg) 279 | if err != nil { 280 | t.Fatal(err) 281 | } 282 | 283 | //register global function 284 | 
p.RegisterFunc("MyGlobFunc", MyGlobalFunc) 285 | p.RegisterFunc("SameFunc", SameFunc) 286 | 287 | var data ParseData 288 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawParseHtml)) 289 | if err != nil { 290 | t.Fatal(err) 291 | } 292 | err = p.ParseDocument(&data, doc) 293 | //err = p.ParseSelection(&data, doc.Selection) 294 | if err != nil { 295 | t.Fatal(err) 296 | } 297 | fmt.Printf("json: %v\n", prettyJson(data)) 298 | } 299 | 300 | func TestPagser_ParseReader(t *testing.T) { 301 | res, err := http.Get("https://httpbin.org") 302 | if err != nil { 303 | t.Fatal(err) 304 | } 305 | defer res.Body.Close() 306 | 307 | p := New() 308 | 309 | var data HttpBinData 310 | err = p.ParseReader(&data, res.Body) 311 | if err != nil { 312 | t.Fatal(err) 313 | } 314 | fmt.Printf("json: %v\n", prettyJson(data)) 315 | } 316 | 317 | func TestPagser_RegisterFunc(t *testing.T) { 318 | threads := 1000 319 | p := New() 320 | var wg sync.WaitGroup 321 | 322 | for i := 0; i < threads; i++ { 323 | wg.Add(1) 324 | go func() { 325 | for j := 0; j < 10; j++ { 326 | for k, v := range builtinFuncs { 327 | p.RegisterFunc(k, v) 328 | } 329 | } 330 | wg.Done() 331 | }() 332 | } 333 | 334 | wg.Wait() 335 | } 336 | -------------------------------------------------------------------------------- /tokenizer.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "regexp" 7 | "strings" 8 | ) 9 | 10 | type tokenState int 11 | 12 | const ( 13 | tokenInput tokenState = iota 14 | tokenStartQuote 15 | tokenEndQoute 16 | tokenComma 17 | ) 18 | 19 | //->fn() 20 | //->fn(xxx) 21 | //->fn('xxx') 22 | //->fn('xxx\'xxx', 'xxx,xxx') 23 | var rxFunc = regexp.MustCompile("^\\s*([a-zA-Z]+)\\s*(\\(([^\\)]*)\\))?\\s*$") 24 | 25 | // tagTokenizer struct tag info 26 | type tagTokenizer struct { 27 | Selector string 28 | FuncName string 29 | FuncParams []string 30 | } 31 | 32 | func (p *Pagser) newTag(tagValue 
string) (*tagTokenizer, error) { 33 | //fmt.Println("tag value: ", tagValue) 34 | tag := &tagTokenizer{} 35 | if tagValue == "" { 36 | return tag, nil 37 | } 38 | selectors := strings.Split(tagValue, p.Config.FuncSymbol) 39 | funcValue := "" 40 | for i := 0; i < len(selectors); i++ { 41 | switch i { 42 | case 0: 43 | tag.Selector = strings.TrimSpace(selectors[i]) 44 | case 1: 45 | funcValue = selectors[i] 46 | } 47 | } 48 | matches := rxFunc.FindStringSubmatch(funcValue) 49 | if len(matches) < 3 { 50 | return tag, nil 51 | } 52 | tag.FuncName = strings.TrimSpace(matches[1]) 53 | //tag.FuncParams = strings.Split(matches[2], ",") 54 | params, err := parseFuncParamTokens(matches[3]) 55 | if err != nil { 56 | return nil, fmt.Errorf("tag=`%v` is invalid: %v", tagValue, err) 57 | } 58 | tag.FuncParams = params 59 | if p.Config.Debug { 60 | fmt.Printf("----- debug -----\n`%v`\n%v\n", tagValue, prettyJson(tag)) 61 | } 62 | return tag, nil 63 | } 64 | 65 | func parseFuncParamTokens(text string) ([]string, error) { 66 | tokens := make([]string, 0) 67 | textLen := len(text) 68 | token := strings.Builder{} 69 | 70 | var currentState tokenState 71 | for pos := 0; pos < textLen; pos++ { 72 | ch := rune(text[pos]) 73 | switch ch { 74 | case '\'': 75 | if currentState == tokenStartQuote { 76 | tokens = append(tokens, token.String()) 77 | token.Reset() 78 | currentState = tokenEndQoute 79 | continue 80 | } 81 | if token.Len() <= 0 { 82 | currentState = tokenStartQuote 83 | continue 84 | } 85 | case ',': 86 | if currentState == tokenStartQuote { 87 | token.WriteRune(ch) 88 | continue 89 | } 90 | if currentState == tokenComma || token.Len() > 0 { 91 | tokens = append(tokens, token.String()) 92 | token.Reset() 93 | currentState = tokenComma 94 | } 95 | continue 96 | case '\\': 97 | if currentState == tokenStartQuote && pos+1 < textLen { 98 | //escape character -> "\'" 99 | nextCh := rune(text[pos+1]) 100 | if nextCh == '\'' { 101 | token.WriteRune(nextCh) 102 | pos += 1 103 | continue 
104 | } 105 | } 106 | case ' ': 107 | if (currentState != tokenStartQuote) && token.Len() <= 0 { 108 | continue 109 | } 110 | } 111 | 112 | token.WriteRune(ch) 113 | } 114 | if currentState == tokenStartQuote { 115 | return []string{}, errors.New("syntax error, single quote not closed") 116 | } 117 | //end 118 | if token.Len() > 0 { 119 | tokens = append(tokens, token.String()) 120 | token.Reset() 121 | } 122 | return tokens, nil 123 | } 124 | -------------------------------------------------------------------------------- /tokenizer_test.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import "testing" 4 | 5 | func TestFuncParamTokens(t *testing.T) { 6 | inputs := []string{ 7 | `''`, 8 | `','`, 9 | `',', ',' `, 10 | `'\''`, 11 | `'\'' , '\''`, 12 | `'', ''`, 13 | `'',,''`, 14 | `',', '\''`, 15 | `href`, 16 | `'href'`, 17 | `'href', abc`, 18 | `'href', 'abc'`, 19 | `'a,b', abc`, 20 | `'a,b\'', abc`, 21 | `'it\'s', abc`, 22 | `'it\'s', 'abc'`, 23 | `href, ,' abc', 123.123`, 24 | `'isShow = [', $value, ]`, 25 | `' this is words: ', [, $value, ]`, 26 | `it's ok?, hello , world!'`, 27 | `'hello, i\'m ok!'`, 28 | `The name is 'ABC'`, 29 | `'hello, i\'m error!`, 30 | } 31 | for _, v := range inputs { 32 | args, err := parseFuncParamTokens(v) 33 | if err != nil { 34 | t.Logf("X [FAIL] (%v) => %v", v, err) 35 | } else { 36 | t.Logf("√ [PASS] (%v) => %v", v, prettyJson(args)) 37 | } 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "reflect" 7 | 8 | "github.com/spf13/cast" 9 | ) 10 | 11 | // toInt32Slice casts an interface to a []int type. 12 | func toInt32Slice(i interface{}) []int32 { 13 | v, _ := toInt32SliceE(i) 14 | return v 15 | } 16 | 17 | // toInt32SliceE casts an interface to a []int type. 
18 | func toInt32SliceE(i interface{}) ([]int32, error) { 19 | if i == nil { 20 | return []int32{}, fmt.Errorf("unable to cast %#v of type %T to []int32", i, i) 21 | } 22 | 23 | switch v := i.(type) { 24 | case []int32: 25 | return v, nil 26 | } 27 | 28 | kind := reflect.TypeOf(i).Kind() 29 | switch kind { 30 | case reflect.Slice, reflect.Array: 31 | s := reflect.ValueOf(i) 32 | a := make([]int32, s.Len()) 33 | for j := 0; j < s.Len(); j++ { 34 | val, err := cast.ToInt32E(s.Index(j).Interface()) 35 | if err != nil { 36 | return []int32{}, fmt.Errorf("unable to cast %#v of type %T to []int32", i, i) 37 | } 38 | a[j] = val 39 | } 40 | return a, nil 41 | default: 42 | return []int32{}, fmt.Errorf("unable to cast %#v of type %T to []int32", i, i) 43 | } 44 | } 45 | 46 | // toInt64Slice casts an interface to a []int type. 47 | func toInt64Slice(i interface{}) []int64 { 48 | v, _ := toInt64SliceE(i) 49 | return v 50 | } 51 | 52 | // toInt64SliceE casts an interface to a []int type. 53 | func toInt64SliceE(i interface{}) ([]int64, error) { 54 | if i == nil { 55 | return []int64{}, fmt.Errorf("unable to cast %#v of type %T to []int64", i, i) 56 | } 57 | 58 | switch v := i.(type) { 59 | case []int64: 60 | return v, nil 61 | } 62 | 63 | kind := reflect.TypeOf(i).Kind() 64 | switch kind { 65 | case reflect.Slice, reflect.Array: 66 | s := reflect.ValueOf(i) 67 | a := make([]int64, s.Len()) 68 | for j := 0; j < s.Len(); j++ { 69 | val, err := cast.ToInt64E(s.Index(j).Interface()) 70 | if err != nil { 71 | return []int64{}, fmt.Errorf("unable to cast %#v of type %T to []int64", i, i) 72 | } 73 | a[j] = val 74 | } 75 | return a, nil 76 | default: 77 | return []int64{}, fmt.Errorf("unable to cast %#v of type %T to []int64", i, i) 78 | } 79 | } 80 | 81 | // toFloat32Slice casts an interface to a []int type. 82 | func toFloat32Slice(i interface{}) []float32 { 83 | v, _ := toFloat32SliceE(i) 84 | return v 85 | } 86 | 87 | // toFloat32SliceE casts an interface to a []int type. 
88 | func toFloat32SliceE(i interface{}) ([]float32, error) { 89 | if i == nil { 90 | return []float32{}, fmt.Errorf("unable to cast %#v of type %T to []float32", i, i) 91 | } 92 | 93 | switch v := i.(type) { 94 | case []float32: 95 | return v, nil 96 | } 97 | 98 | kind := reflect.TypeOf(i).Kind() 99 | switch kind { 100 | case reflect.Slice, reflect.Array: 101 | s := reflect.ValueOf(i) 102 | a := make([]float32, s.Len()) 103 | for j := 0; j < s.Len(); j++ { 104 | val, err := cast.ToFloat32E(s.Index(j).Interface()) 105 | if err != nil { 106 | return []float32{}, fmt.Errorf("unable to cast %#v of type %T to []float32", i, i) 107 | } 108 | a[j] = val 109 | } 110 | return a, nil 111 | default: 112 | return []float32{}, fmt.Errorf("unable to cast %#v of type %T to []float32", i, i) 113 | } 114 | } 115 | 116 | // toFloat64Slice casts an interface to a []int type. 117 | func toFloat64Slice(i interface{}) []float64 { 118 | v, _ := toFloat64SliceE(i) 119 | return v 120 | } 121 | 122 | // toFloat64SliceE casts an interface to a []int type. 
123 | func toFloat64SliceE(i interface{}) ([]float64, error) { 124 | if i == nil { 125 | return []float64{}, fmt.Errorf("unable to cast %#v of type %T to []float64", i, i) 126 | } 127 | 128 | switch v := i.(type) { 129 | case []float64: 130 | return v, nil 131 | } 132 | 133 | kind := reflect.TypeOf(i).Kind() 134 | switch kind { 135 | case reflect.Slice, reflect.Array: 136 | s := reflect.ValueOf(i) 137 | a := make([]float64, s.Len()) 138 | for j := 0; j < s.Len(); j++ { 139 | val, err := cast.ToFloat64E(s.Index(j).Interface()) 140 | if err != nil { 141 | return []float64{}, fmt.Errorf("unable to cast %#v of type %T to []float64", i, i) 142 | } 143 | a[j] = val 144 | } 145 | return a, nil 146 | default: 147 | return []float64{}, fmt.Errorf("unable to cast %#v of type %T to []float64", i, i) 148 | } 149 | } 150 | 151 | func prettyJson(v interface{}) string { 152 | bytes, _ := json.MarshalIndent(v, "", "\t") 153 | return string(bytes) 154 | } 155 | -------------------------------------------------------------------------------- /utils_test.go: -------------------------------------------------------------------------------- 1 | package pagser 2 | 3 | import "testing" 4 | 5 | func TestToInt32SliceE(t *testing.T) { 6 | _, err := toInt32SliceE(nil) 7 | if err == nil { 8 | t.Fatal("nil not return error") 9 | } 10 | 11 | _, err = toInt32SliceE(1) 12 | if err == nil { 13 | t.Fatal("1 not return error") 14 | } 15 | 16 | list := []int32{1, 2, 3} 17 | out, err := toInt32SliceE(list) 18 | t.Logf("out: %v, error: %v", out, err) 19 | } 20 | 21 | func TestToInt64SliceE(t *testing.T) { 22 | _, err := toInt64SliceE(nil) 23 | if err == nil { 24 | t.Fatal("nil not return error") 25 | } 26 | 27 | _, err = toInt64SliceE(1) 28 | if err == nil { 29 | t.Fatal("1 not return error") 30 | } 31 | 32 | list := []int64{1, 2, 3} 33 | out, err := toInt64SliceE(list) 34 | t.Logf("out: %v, error: %v", out, err) 35 | } 36 | 37 | func TestToFloat32SliceE(t *testing.T) { 38 | _, err := 
toFloat32SliceE(nil) 39 | if err == nil { 40 | t.Fatal("nil not return error") 41 | } 42 | 43 | _, err = toFloat32SliceE(1) 44 | if err == nil { 45 | t.Fatal("1 not return error") 46 | } 47 | 48 | list := []float32{1, 2, 3} 49 | out, err := toFloat32SliceE(list) 50 | t.Logf("out: %v, error: %v", out, err) 51 | } 52 | 53 | func TestToFloat64SliceE(t *testing.T) { 54 | _, err := toFloat64SliceE(nil) 55 | if err == nil { 56 | t.Fatal("nil not return error") 57 | } 58 | 59 | _, err = toFloat64SliceE(1) 60 | if err == nil { 61 | t.Fatal("1 not return error") 62 | } 63 | 64 | list := []float64{1, 2, 3} 65 | out, err := toFloat64SliceE(list) 66 | t.Logf("out: %v, error: %v", out, err) 67 | } 68 | --------------------------------------------------------------------------------