├── .gitignore
├── .travis.yml
├── README.md
├── _examples
│   ├── scrape_hn.go
│   ├── scrape_reddit.go
│   └── scrape_wired_latest.go
├── doc.go
├── extract
│   ├── extractors.go
│   └── extractors_test.go
├── fetcher.go
├── helpers.go
├── options.go
├── package_test.go
├── paginate
│   ├── delay.go
│   ├── paginate.go
│   └── paginate_test.go
├── phantomjs.go
├── results_test.go
├── scrape.go
└── util.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Coverage profile
2 | cover.out
3 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | sudo: false
3 |
4 | matrix:
5 | include:
6 | - go: 1.1
7 | install:
8 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
9 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
10 | env: COVER_FLAG=
11 |
12 | - go: 1.2
13 | install:
14 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
15 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
16 | env: COVER_FLAG=
17 |
18 | - go: 1.3
19 | install:
20 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
21 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
22 | env: COVER_FLAG=
23 |
24 | - go: 1.4
25 | install:
26 | - go get golang.org/x/tools/cmd/cover
27 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
28 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
29 | env: COVER_FLAG=-cover
30 |
31 | - go: 1.5
32 | install:
33 | - go get golang.org/x/tools/cmd/cover
34 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
35 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
36 | env: COVER_FLAG=-cover
37 |
38 | - go: tip
39 | install:
40 | - go get golang.org/x/tools/cmd/cover
41 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
42 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
43 | env: COVER_FLAG=-cover
44 |
45 | script:
46 | - go test -v $COVER_FLAG ./...
47 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # goscrape
2 |
3 | [![GoDoc](https://godoc.org/github.com/andrew-d/goscrape?status.svg)](https://godoc.org/github.com/andrew-d/goscrape) [![Build Status](https://travis-ci.org/andrew-d/goscrape.svg?branch=master)](https://travis-ci.org/andrew-d/goscrape)
4 |
5 | goscrape is an extensible structured scraper for Go. What does "structured
6 | scraper" mean? In this case, it means that you define what you want to extract
7 | from a page in a structured, hierarchical manner, and then goscrape takes care
8 | of pagination, splitting the input page, and calling the code to extract chunks
9 | of data. However, goscrape is *extensible*, allowing you to customize nearly
10 | every step of this process.
11 |
12 | The architecture of goscrape is roughly as follows:
13 |
14 | - A single request to start scraping (from a given URL) is called a *scrape*.
15 | - Each scrape consists of some number of *pages*.
16 | - Inside each page, there's 1 or more *blocks* - some logical method of splitting
17 | up a page into subcomponents. By default, there's a single block that consists
18 | of the page's `<body>` element, but you can change this fairly easily.
19 | - Inside each block, you define some number of *pieces* of data that you wish
20 | to extract. Each piece consists of a name, a selector, and what data to
21 | extract from the current block.
22 |
23 | This all sounds rather complicated, but in practice it's quite simple. Here's
24 | a short example of how to get a list of all the latest news articles from Wired
25 | and dump them as JSON to the screen:
26 |
27 | ```go
28 | package main
29 |
30 | import (
31 | "encoding/json"
32 | "fmt"
33 | "os"
34 |
35 | "github.com/andrew-d/goscrape"
36 | "github.com/andrew-d/goscrape/extract"
37 | )
38 |
39 | func main() {
40 | config := &scrape.ScrapeConfig{
41 | DividePage: scrape.DividePageBySelector("#latest-news li"),
42 |
43 | Pieces: []scrape.Piece{
44 | {Name: "title", Selector: "h5.exchange-sm", Extractor: extract.Text{}},
45 | {Name: "byline", Selector: "span.byline", Extractor: extract.Text{}},
46 | {Name: "link", Selector: "a", Extractor: extract.Attr{Attr: "href"}},
47 | },
48 | }
49 |
50 | scraper, err := scrape.New(config)
51 | if err != nil {
52 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
53 | os.Exit(1)
54 | }
55 |
56 | results, err := scraper.Scrape("http://www.wired.com")
57 | if err != nil {
58 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
59 | os.Exit(1)
60 | }
61 |
62 | json.NewEncoder(os.Stdout).Encode(results)
63 | }
64 | ```
65 |
66 | As you can see, the entire example, including proper error handling, only takes
67 | 36 lines of code - short and sweet.
68 |
69 | For more usage examples, see the
70 | [examples directory](https://github.com/andrew-d/goscrape/tree/master/_examples).
71 |
72 | ## Roadmap
73 |
74 | Here's the rough roadmap of things that I'd like to add. If you have a feature
75 | request, please let me know by [opening an issue](https://github.com/andrew-d/goscrape/issues/new)!
76 |
77 | - [ ] Allow deduplication of Pieces (a custom callback?)
78 | - [ ] Improve parallelization (scrape multiple pages at a time, but maintain order)
79 |
80 | ## License
81 |
82 | MIT
83 |
--------------------------------------------------------------------------------
/_examples/scrape_hn.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | "regexp"
8 |
9 | "github.com/andrew-d/goscrape"
10 | "github.com/andrew-d/goscrape/extract"
11 | "github.com/andrew-d/goscrape/paginate"
12 | )
13 |
14 | func main() {
15 | config := &scrape.ScrapeConfig{
16 | DividePage: scrape.DividePageBySelector("tr:nth-child(3) tr:nth-child(3n-2):not([style='height:10px'])"),
17 |
18 | Pieces: []scrape.Piece{
19 | {Name: "title", Selector: "td.title > a", Extractor: extract.Text{}},
20 | {Name: "link", Selector: "td.title > a", Extractor: extract.Attr{Attr: "href"}},
21 | {Name: "rank", Selector: "td.title[align='right']",
22 | Extractor: extract.Regex{Regex: regexp.MustCompile(`(\d+)`)}},
23 | },
24 |
25 | Paginator: paginate.BySelector("a[rel='nofollow']:last-child", "href"),
26 | }
27 |
28 | scraper, err := scrape.New(config)
29 | if err != nil {
30 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
31 | os.Exit(1)
32 | }
33 |
34 | results, err := scraper.ScrapeWithOpts(
35 | "https://news.ycombinator.com",
36 | scrape.ScrapeOptions{MaxPages: 3},
37 | )
38 | if err != nil {
39 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
40 | os.Exit(1)
41 | }
42 |
43 | json.NewEncoder(os.Stdout).Encode(results)
44 | }
45 |
--------------------------------------------------------------------------------
/_examples/scrape_reddit.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/andrew-d/goscrape"
9 | "github.com/andrew-d/goscrape/extract"
10 | )
11 |
12 | func main() {
13 | fetcher, err := scrape.NewPhantomJSFetcher()
14 | if err != nil {
15 | fmt.Fprintf(os.Stderr, "Error creating fetcher: %s\n", err)
16 | os.Exit(1)
17 | }
18 |
19 | config := &scrape.ScrapeConfig{
20 | Fetcher: fetcher,
21 |
22 | DividePage: scrape.DividePageBySelector(".linklisting > div.thing"),
23 |
24 | Pieces: []scrape.Piece{
25 | {Name: "title", Selector: "p.title > a", Extractor: extract.Text{}},
26 | {Name: "link", Selector: "p.title > a", Extractor: extract.Attr{Attr: "href"}},
27 | {Name: "score", Selector: "div.score.unvoted", Extractor: extract.Text{}},
28 | {Name: "rank", Selector: "span.rank", Extractor: extract.Text{}},
29 | {Name: "author", Selector: "a.author", Extractor: extract.Text{}},
30 | {Name: "subreddit", Selector: "a.subreddit", Extractor: extract.Text{}},
31 |
32 | // Note: if a self post is edited, then this will be an array with two elements.
33 | {Name: "date", Selector: "time", Extractor: extract.Attr{Attr: "datetime"}},
34 | },
35 | }
36 |
37 | scraper, err := scrape.New(config)
38 | if err != nil {
39 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
40 | os.Exit(1)
41 | }
42 |
43 | results, err := scraper.Scrape("https://www.reddit.com")
44 | if err != nil {
45 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
46 | os.Exit(1)
47 | }
48 |
49 | json.NewEncoder(os.Stdout).Encode(results)
50 | }
51 |
--------------------------------------------------------------------------------
/_examples/scrape_wired_latest.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/andrew-d/goscrape"
9 | "github.com/andrew-d/goscrape/extract"
10 | )
11 |
12 | func main() {
13 | config := &scrape.ScrapeConfig{
14 | DividePage: scrape.DividePageBySelector("#latest-news li"),
15 |
16 | Pieces: []scrape.Piece{
17 | {Name: "title", Selector: "h5.exchange-sm", Extractor: extract.Text{}},
18 | {Name: "byline", Selector: "span.byline", Extractor: extract.Text{}},
19 | {Name: "link", Selector: "a", Extractor: extract.Attr{Attr: "href"}},
20 | },
21 | }
22 |
23 | scraper, err := scrape.New(config)
24 | if err != nil {
25 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
26 | os.Exit(1)
27 | }
28 |
29 | results, err := scraper.Scrape("http://www.wired.com")
30 | if err != nil {
31 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
32 | os.Exit(1)
33 | }
34 |
35 | json.NewEncoder(os.Stdout).Encode(results)
36 | }
37 |
--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
1 | // goscrape is a simple, extensible scraping library for Go. For more
2 | // information, please read the README and examples on GitHub, and the
3 | // documentation for the ScrapeConfig and Scraper types.
4 | package scrape
5 |
--------------------------------------------------------------------------------
/extract/extractors.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "bytes"
5 | "errors"
6 | "fmt"
7 | "regexp"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/andrew-d/goscrape"
11 | "golang.org/x/net/html"
12 | )
13 |
14 | // Const is a PieceExtractor that returns a constant value.
15 | type Const struct {
16 | // The value to return when the Extract() function is called.
17 | Val interface{}
18 | }
19 |
20 | func (e Const) Extract(sel *goquery.Selection) (interface{}, error) {
21 | return e.Val, nil
22 | }
23 |
24 | var _ scrape.PieceExtractor = Const{}
25 |
26 | // Text is a PieceExtractor that returns the combined text contents of
27 | // the given selection.
28 | type Text struct{}
29 |
30 | func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
31 | return sel.Text(), nil
32 | }
33 |
34 | var _ scrape.PieceExtractor = Text{}
35 |
36 | // MultipleText is a PieceExtractor that extracts the text from each element
37 | // in the given selection and returns the texts as an array.
38 | type MultipleText struct {
39 | // If there are no items in the selection, then return 'nil' from Extract,
40 | // instead of the empty list. This signals that the result of this Piece
41 | // should be omitted entirely from the results, as opposed to including the
42 | // empty list.
43 | OmitIfEmpty bool
44 | }
45 |
46 | func (e MultipleText) Extract(sel *goquery.Selection) (interface{}, error) {
47 | results := []string{}
48 |
49 | sel.Each(func(i int, s *goquery.Selection) {
50 | results = append(results, s.Text())
51 | })
52 |
53 | if len(results) == 0 && e.OmitIfEmpty {
54 | return nil, nil
55 | }
56 |
57 | return results, nil
58 | }
59 |
60 | // Html extracts and returns the HTML from inside each element of the
61 | // given selection, as a string.
62 | //
63 | // Note that this results in what is effectively the innerHTML of the element -
64 | // i.e. if our selection consists of ["<p><b>ONE</b></p>", "<p><i>TWO</i></p>"]
65 | // then the output will be: "<b>ONE</b><i>TWO</i>".
66 | //
67 | // The return type is a string of all the inner HTML joined together.
68 | type Html struct{}
69 |
70 | func (e Html) Extract(sel *goquery.Selection) (interface{}, error) {
71 | var ret, h string
72 | var err error
73 |
74 | sel.EachWithBreak(func(i int, s *goquery.Selection) bool {
75 | h, err = s.Html()
76 | if err != nil {
77 | return false
78 | }
79 |
80 | ret += h
81 | return true
82 | })
83 |
84 | if err != nil {
85 | return nil, err
86 | }
87 | return ret, nil
88 | }
89 |
90 | var _ scrape.PieceExtractor = Html{}
91 |
92 | // OuterHtml extracts and returns the HTML of each element of the
93 | // given selection, as a string.
94 | //
95 | // To illustrate, if our selection consists of
96 | // ["ONE
", "TWO
"] then the output will be:
97 | // "ONE
TWO
".
98 | //
99 | // The return type is a string of all the outer HTML joined together.
100 | type OuterHtml struct{}
101 |
102 | func (e OuterHtml) Extract(sel *goquery.Selection) (interface{}, error) {
103 | output := bytes.NewBufferString("")
104 | for _, node := range sel.Nodes {
105 | if err := html.Render(output, node); err != nil {
106 | return nil, err
107 | }
108 | }
109 |
110 | return output.String(), nil
111 | }
112 |
113 | var _ scrape.PieceExtractor = OuterHtml{}
114 |
115 | // Regex runs the given regex over the contents of each element in the
116 | // given selection, and, for each match, extracts the given subexpression.
117 | // The return type of the extractor is a list of string matches (i.e. []string).
118 | type Regex struct {
119 | // The regular expression to match. This regular expression must define
120 | // at least one parenthesized subexpression (sometimes known as a "capturing
121 | // group"); the chosen subexpression is what gets extracted.
122 | Regex *regexp.Regexp
123 |
124 | // The index of the subexpression to extract. If this value is not set, and
125 | // the given regex has more than one subexpression, an error will be returned.
126 | Subexpression int
127 |
128 | // When OnlyText is true, only run the given regex over the text contents of
129 | // each element in the selection, as opposed to the HTML contents.
130 | OnlyText bool
131 |
132 | // By default, if there is only a single match, Regex will return
133 | // the match itself (as opposed to an array containing the single match).
134 | // Set AlwaysReturnList to true to disable this behaviour, ensuring that the
135 | // Extract function always returns an array.
136 | AlwaysReturnList bool
137 |
138 | // If no matches of the provided regex could be extracted, then return 'nil'
139 | // from Extract, instead of the empty list. This signals that the result of
140 | // this Piece should be omitted entirely from the results, as opposed to
141 | // including the empty list.
142 | OmitIfEmpty bool
143 | }
144 |
145 | func (e Regex) Extract(sel *goquery.Selection) (interface{}, error) {
146 | if e.Regex == nil {
147 | return nil, errors.New("no regex given")
148 | }
149 | if e.Regex.NumSubexp() == 0 {
150 | return nil, errors.New("regex has no subexpressions")
151 | }
152 |
153 | var subexp int
154 | if e.Subexpression == 0 {
155 | if e.Regex.NumSubexp() != 1 {
156 | e := fmt.Errorf(
157 | "regex has more than one subexpression (%d), but which to "+
158 | "extract was not specified",
159 | e.Regex.NumSubexp())
160 | return nil, e
161 | }
162 |
163 | subexp = 1
164 | } else {
165 | subexp = e.Subexpression
166 | }
167 |
168 | results := []string{}
169 |
170 | // For each element in the selector...
171 | var err error
172 | sel.EachWithBreak(func(i int, s *goquery.Selection) bool {
173 | var contents string
174 | if e.OnlyText {
175 | contents = s.Text()
176 | } else {
177 | contents, err = s.Html()
178 | if err != nil {
179 | return false
180 | }
181 | }
182 |
183 | ret := e.Regex.FindAllStringSubmatch(contents, -1)
184 |
185 | // A return value of nil == no match
186 | if ret == nil {
187 | return true
188 | }
189 |
190 | // For each regex match...
191 | for _, submatches := range ret {
192 | // The 0th entry is the match of the entire expression; subsequent
193 | // entries are the capturing groups. We extract the group selected
194 | // by 'subexp' (the first one by default).
195 | if len(submatches) > 1 {
196 | results = append(results, submatches[subexp])
197 | }
198 | }
199 |
200 | return true
201 | })
202 |
203 | if err != nil {
204 | return nil, err
205 | }
206 | if len(results) == 0 && e.OmitIfEmpty {
207 | return nil, nil
208 | }
209 | if len(results) == 1 && !e.AlwaysReturnList {
210 | return results[0], nil
211 | }
212 |
213 | return results, nil
214 | }
215 |
216 | var _ scrape.PieceExtractor = Regex{}
217 |
218 | // Attr extracts the value of a given HTML attribute from each element
219 | // in the selection, and returns them as a list.
220 | // The return type of the extractor is a list of attribute values (i.e. []string).
221 | type Attr struct {
222 | // The HTML attribute to extract from each element.
223 | Attr string
224 |
225 | // By default, if there is only a single attribute extracted, Attr
226 | // will return the value itself (as opposed to an array containing the single
227 | // value). Set AlwaysReturnList to true to disable this behaviour, ensuring
228 | // that the Extract function always returns an array.
229 | AlwaysReturnList bool
230 |
231 | // If no elements with this attribute are found, then return 'nil' from
232 | // Extract, instead of the empty list. This signals that the result of this
233 | // Piece should be omitted entirely from the results, as opposed to including
234 | // the empty list.
235 | OmitIfEmpty bool
236 | }
237 |
238 | func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
239 | if len(e.Attr) == 0 {
240 | return nil, errors.New("no attribute provided")
241 | }
242 |
243 | results := []string{}
244 |
245 | sel.Each(func(i int, s *goquery.Selection) {
246 | if val, found := s.Attr(e.Attr); found {
247 | results = append(results, val)
248 | }
249 | })
250 |
251 | if len(results) == 0 && e.OmitIfEmpty {
252 | return nil, nil
253 | }
254 | if len(results) == 1 && !e.AlwaysReturnList {
255 | return results[0], nil
256 | }
257 |
258 | return results, nil
259 | }
260 |
261 | var _ scrape.PieceExtractor = Attr{}
262 |
263 | // Count extracts the count of elements that are matched and returns it.
264 | type Count struct {
265 | // If no elements are matched, then return 'nil' from Extract, instead of
266 | // the number 0. This signals that the result of this Piece should be
267 | // omitted entirely from the results, as opposed to including a count of
268 | // zero.
269 | OmitIfEmpty bool
270 | }
271 |
272 | func (e Count) Extract(sel *goquery.Selection) (interface{}, error) {
273 | l := sel.Length()
274 | if l == 0 && e.OmitIfEmpty {
275 | return nil, nil
276 | }
277 |
278 | return l, nil
279 | }
280 |
--------------------------------------------------------------------------------
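To make the extractors defined above in extract/extractors.go concrete, here is a minimal, self-contained sketch that runs Text, Attr, and Regex over an in-memory document. The HTML fixture and printed values are illustrative only and are not part of the package.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/andrew-d/goscrape/extract"
)

func main() {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(
		`<div class="item"><a href="/post/42">Answer: 42</a></div>`))
	if err != nil {
		panic(err)
	}
	sel := doc.Find(".item")

	// Text returns the combined text contents of the selection.
	text, _ := extract.Text{}.Extract(sel)
	fmt.Println(text) // Answer: 42

	// Attr pulls an attribute value; with a single match it returns a string.
	href, _ := extract.Attr{Attr: "href"}.Extract(sel.Find("a"))
	fmt.Println(href) // /post/42

	// Regex extracts the first capturing group from each match of the text.
	num, _ := extract.Regex{Regex: regexp.MustCompile(`(\d+)`), OnlyText: true}.Extract(sel)
	fmt.Println(num) // 42
}
```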
/extract/extractors_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 | "testing"
7 |
8 | "github.com/PuerkitoBio/goquery"
9 | "github.com/stretchr/testify/assert"
10 | )
11 |
12 | func selFrom(s string) *goquery.Selection {
13 | r := strings.NewReader(s)
14 | doc, err := goquery.NewDocumentFromReader(r)
15 | if err != nil {
16 | panic(err)
17 | }
18 |
19 | return doc.Selection
20 | }
21 |
22 | func TestText(t *testing.T) {
23 | sel := selFrom(`<p>Test 123</p>`)
24 | ret, err := Text{}.Extract(sel)
25 | assert.NoError(t, err)
26 | assert.Equal(t, ret, "Test 123")
27 |
28 | sel = selFrom(`<div><p>First</p><p>Second</p></div>`)
29 | ret, err = Text{}.Extract(sel)
30 | assert.NoError(t, err)
31 | assert.Equal(t, ret, "FirstSecond")
32 | }
33 |
34 | func TestMultipleText(t *testing.T) {
35 | sel := selFrom(`<p>Test 123</p>`)
36 | ret, err := MultipleText{}.Extract(sel.Find("p"))
37 | assert.NoError(t, err)
38 | assert.Equal(t, ret, []string{"Test 123"})
39 |
40 | sel = selFrom(`<div><p>First</p><p>Second</p></div>`)
41 | ret, err = MultipleText{}.Extract(sel.Find("p"))
42 | assert.NoError(t, err)
43 | assert.Equal(t, ret, []string{"First", "Second"})
44 | }
45 |
46 | func TestHtml(t *testing.T) {
47 | sel := selFrom(
48 | `<div class="one">` +
49 | `<div class="two">Bar</div>` +
50 | `<div class="two">Baz</div>` +
51 | `<div class="three">Asdf</div>` +
52 | `</div>`)
53 | ret, err := Html{}.Extract(sel.Find(".one"))
54 | assert.NoError(t, err)
55 | assert.Equal(t, ret, `<div class="two">Bar</div><div class="two">Baz</div><div class="three">Asdf</div>`)
56 |
57 | ret, err = Html{}.Extract(sel.Find(".two"))
58 | assert.NoError(t, err)
59 | assert.Equal(t, ret, `BarBaz`)
60 | }
61 |
62 | func TestOuterHtml(t *testing.T) {
63 | // Simple version
64 | sel := selFrom(`<div><p>Test 123</p></div>`)
65 | ret, err := OuterHtml{}.Extract(sel.Find("p"))
66 | assert.NoError(t, err)
67 | assert.Equal(t, ret, `<p>Test 123</p>`)
68 |
69 | // Should only get the outer HTML of the element, not siblings
70 | sel = selFrom(`<div><p>Test 123</p><b>foo</b></div>`)
71 | ret, err = OuterHtml{}.Extract(sel.Find("p"))
72 | assert.NoError(t, err)
73 | assert.Equal(t, ret, `<p>Test 123</p>`)
74 | }
75 |
76 | func TestRegexInvalid(t *testing.T) {
77 | var err error
78 |
79 | _, err = Regex{}.Extract(selFrom(`foo`))
80 | assert.Error(t, err, "no regex given")
81 |
82 | _, err = Regex{Regex: regexp.MustCompile(`foo`)}.Extract(selFrom(`bar`))
83 | assert.Error(t, err, "regex has no subexpressions")
84 |
85 | _, err = Regex{Regex: regexp.MustCompile(`(a)(b)`)}.Extract(selFrom(`bar`))
86 | assert.Error(t, err, "regex has more than one subexpression (2), but which to extract was not specified")
87 | }
88 |
89 | func TestRegex(t *testing.T) {
90 | sel := selFrom(`<p>foo</p><p class="fooobar">bar</p>`)
91 | ret, err := Regex{Regex: regexp.MustCompile("f(o+)o")}.Extract(sel)
92 | assert.NoError(t, err)
93 | assert.Equal(t, ret, []string{"o", "oo"})
94 |
95 | ret, err = Regex{
96 | Regex: regexp.MustCompile("f(o)?(oo)bar"),
97 | Subexpression: 2,
98 | }.Extract(sel)
99 | assert.NoError(t, err)
100 | assert.Equal(t, ret, "oo")
101 |
102 | ret, err = Regex{
103 | Regex: regexp.MustCompile("f(o+)o"),
104 | OnlyText: true,
105 | }.Extract(sel)
106 | assert.NoError(t, err)
107 | assert.Equal(t, ret, "o")
108 |
109 | ret, err = Regex{
110 | Regex: regexp.MustCompile("f(o+)o"),
111 | OnlyText: true,
112 | AlwaysReturnList: true,
113 | }.Extract(sel)
114 | assert.NoError(t, err)
115 | assert.Equal(t, ret, []string{"o"})
116 |
117 | ret, err = Regex{
118 | Regex: regexp.MustCompile("a(sd)f"),
119 | OmitIfEmpty: true,
120 | }.Extract(sel)
121 | assert.NoError(t, err)
122 | assert.Nil(t, ret)
123 | }
124 |
125 | func TestAttrInvalid(t *testing.T) {
126 | var err error
127 |
128 | _, err = Attr{}.Extract(selFrom(`foo`))
129 | assert.Error(t, err, "no attribute provided")
130 | }
131 |
132 | func TestAttr(t *testing.T) {
133 | sel := selFrom(`
134 | <a href="http://www.google.com">google</a>
135 | <a href="http://www.yahoo.com">yahoo</a>
136 | <a href="http://www.microsoft.com" class="notsearch">microsoft</a>
137 | `)
138 | ret, err := Attr{Attr: "href"}.Extract(sel.Find("a"))
139 | assert.NoError(t, err)
140 | assert.Equal(t, ret, []string{
141 | "http://www.google.com",
142 | "http://www.yahoo.com",
143 | "http://www.microsoft.com",
144 | })
145 |
146 | ret, err = Attr{Attr: "href"}.Extract(sel.Find(".notsearch"))
147 | assert.NoError(t, err)
148 | assert.Equal(t, ret, "http://www.microsoft.com")
149 |
150 | ret, err = Attr{Attr: "href", AlwaysReturnList: true}.Extract(sel.Find(".notsearch"))
151 | assert.NoError(t, err)
152 | assert.Equal(t, ret, []string{"http://www.microsoft.com"})
153 |
154 | ret, err = Attr{
155 | Attr: "href",
156 | AlwaysReturnList: true,
157 | }.Extract(sel.Find(".abc"))
158 | assert.NoError(t, err)
159 | assert.Equal(t, ret, []string{})
160 |
161 | ret, err = Attr{
162 | Attr: "href",
163 | OmitIfEmpty: true,
164 | }.Extract(sel.Find(".abc"))
165 | assert.NoError(t, err)
166 | assert.Nil(t, ret)
167 | }
168 |
169 | func TestCount(t *testing.T) {
170 | sel := selFrom(`
171 | <div>One</div>
172 | <div>Two</div>
173 | <div class="foo">Three</div>
174 | `)
175 |
176 | ret, err := Count{}.Extract(sel.Find("div"))
177 | assert.NoError(t, err)
178 | assert.Equal(t, ret, 3)
179 |
180 | ret, err = Count{}.Extract(sel.Find(".foo"))
181 | assert.NoError(t, err)
182 | assert.Equal(t, ret, 1)
183 |
184 | ret, err = Count{}.Extract(sel.Find(".bad"))
185 | assert.NoError(t, err)
186 | assert.Equal(t, ret, 0)
187 |
188 | ret, err = Count{OmitIfEmpty: true}.Extract(sel.Find(".bad"))
189 | assert.NoError(t, err)
190 | assert.Nil(t, ret)
191 | }
192 |
--------------------------------------------------------------------------------
/fetcher.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "io"
5 | "net/http"
6 | "net/http/cookiejar"
7 |
8 | "golang.org/x/net/publicsuffix"
9 | )
10 |
11 | // Fetcher is the interface that must be satisfied by things that can fetch
12 | // remote URLs and return their contents.
13 | //
14 | // Note: Fetchers may or may not be safe to use concurrently. Please read the
15 | // documentation for each fetcher for more details.
16 | type Fetcher interface {
17 | // Prepare is called once at the beginning of the scrape.
18 | Prepare() error
19 |
20 | // Fetch is called to retrieve each document from the remote server.
21 | Fetch(method, url string) (io.ReadCloser, error)
22 |
23 | // Close is called when the scrape is finished, and can be used to clean up
24 | // allocated resources or perform other cleanup actions.
25 | Close()
26 | }
27 |
28 | // HttpClientFetcher is a Fetcher that uses the Go standard library's http
29 | // client to fetch URLs.
30 | type HttpClientFetcher struct {
31 | client *http.Client
32 |
33 | // PrepareClient prepares this fetcher's http.Client for usage. Use this
34 | // function to do things like logging in. If the function returns an error,
35 | // the scrape is aborted.
36 | PrepareClient func(*http.Client) error
37 |
38 | // PrepareRequest prepares each request that will be sent, prior to sending.
39 | // This is useful for, e.g. setting custom HTTP headers, changing the User-
40 | // Agent, and so on. If the function returns an error, then the scrape will
41 | // be aborted.
42 | //
43 | // Note: this function does NOT apply to requests made during the
44 | // PrepareClient function (above).
45 | PrepareRequest func(*http.Request) error
46 |
47 | // ProcessResponse modifies a response that is returned from the server before
48 | // it is handled by the scraper. If the function returns an error, then the
49 | // scrape will be aborted.
50 | ProcessResponse func(*http.Response) error
51 | }
52 |
53 | func NewHttpClientFetcher() (*HttpClientFetcher, error) {
54 | // Set up the HTTP client
55 | jarOpts := &cookiejar.Options{PublicSuffixList: publicsuffix.List}
56 | jar, err := cookiejar.New(jarOpts)
57 | if err != nil {
58 | return nil, err
59 | }
60 | client := &http.Client{Jar: jar}
61 |
62 | ret := &HttpClientFetcher{
63 | client: client,
64 | }
65 | return ret, nil
66 | }
67 |
68 | func (hf *HttpClientFetcher) Prepare() error {
69 | if hf.PrepareClient != nil {
70 | return hf.PrepareClient(hf.client)
71 | }
72 | return nil
73 | }
74 |
75 | func (hf *HttpClientFetcher) Fetch(method, url string) (io.ReadCloser, error) {
76 | req, err := http.NewRequest(method, url, nil)
77 | if err != nil {
78 | return nil, err
79 | }
80 |
81 | if hf.PrepareRequest != nil {
82 | if err = hf.PrepareRequest(req); err != nil {
83 | return nil, err
84 | }
85 | }
86 |
87 | resp, err := hf.client.Do(req)
88 | if err != nil {
89 | return nil, err
90 | }
91 |
92 | if hf.ProcessResponse != nil {
93 | if err = hf.ProcessResponse(resp); err != nil {
94 | return nil, err
95 | }
96 | }
97 |
98 | return resp.Body, nil
99 | }
100 |
101 | func (hf *HttpClientFetcher) Close() {
102 | return
103 | }
104 |
105 | // Static type assertion
106 | var _ Fetcher = &HttpClientFetcher{}
107 |
--------------------------------------------------------------------------------
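As a sketch of how the hooks on HttpClientFetcher (above) can be used, the following hypothetical program sets a custom User-Agent on every request via PrepareRequest and plugs the fetcher into a ScrapeConfig. The URL, selector, and User-Agent string are placeholders, not anything the package prescribes.

```go
package main

import (
	"fmt"
	"net/http"
	"os"

	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

func main() {
	fetcher, err := scrape.NewHttpClientFetcher()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating fetcher: %s\n", err)
		os.Exit(1)
	}

	// Set a custom User-Agent on every request made during the scrape.
	fetcher.PrepareRequest = func(req *http.Request) error {
		req.Header.Set("User-Agent", "my-scraper/1.0 (example)")
		return nil
	}

	config := &scrape.ScrapeConfig{
		Fetcher: fetcher,
		Pieces: []scrape.Piece{
			{Name: "heading", Selector: "h1", Extractor: extract.Text{}},
		},
	}

	scraper, err := scrape.New(config)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
		os.Exit(1)
	}

	results, err := scraper.Scrape("http://example.com")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
		os.Exit(1)
	}
	fmt.Println(results.First())
}
```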
/helpers.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "github.com/PuerkitoBio/goquery"
5 | )
6 |
7 | type dummyPaginator struct {
8 | }
9 |
10 | func (p dummyPaginator) NextPage(uri string, doc *goquery.Selection) (string, error) {
11 | return "", nil
12 | }
13 |
14 | // DividePageBySelector returns a function that divides a page into blocks by
15 | // CSS selector. Each element in the page with the given selector is treated
16 | // as a new block.
17 | func DividePageBySelector(sel string) DividePageFunc {
18 | ret := func(doc *goquery.Selection) []*goquery.Selection {
19 | sels := []*goquery.Selection{}
20 | doc.Find(sel).Each(func(i int, s *goquery.Selection) {
21 | sels = append(sels, s)
22 | })
23 |
24 | return sels
25 | }
26 | return ret
27 | }
28 |
--------------------------------------------------------------------------------
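DividePage accepts any function with the DividePageFunc signature, so a page can be split up however you like rather than only by CSS selector. Below is a hypothetical divider that treats each table row as its own block; the function name, selectors, and URL are invented for illustration.

```go
package main

import (
	"fmt"

	"github.com/PuerkitoBio/goquery"
	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

// divideByTableRows is a hypothetical DividePageFunc that treats every table
// row in the page as its own block.
func divideByTableRows(doc *goquery.Selection) []*goquery.Selection {
	blocks := []*goquery.Selection{}
	doc.Find("table tr").Each(func(i int, s *goquery.Selection) {
		blocks = append(blocks, s)
	})
	return blocks
}

func main() {
	scraper, err := scrape.New(&scrape.ScrapeConfig{
		DividePage: divideByTableRows,
		Pieces: []scrape.Piece{
			// Extract the text of every cell in each row.
			{Name: "cells", Selector: "td", Extractor: extract.MultipleText{}},
		},
	})
	if err != nil {
		panic(err)
	}

	// The URL here is a placeholder for any page containing a table.
	results, err := scraper.Scrape("http://example.com/prices.html")
	if err != nil {
		panic(err)
	}
	fmt.Println(results.AllBlocks())
}
```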
/options.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | // ScrapeOptions contains options that are used during the progress of a
4 | // scrape.
5 | type ScrapeOptions struct {
6 | // The maximum number of pages to scrape. The scrape will proceed until
7 | // either this number of pages have been scraped, or until the paginator
8 | // returns no further URLs. Set this value to 0 to indicate an unlimited
9 | // number of pages can be scraped.
10 | MaxPages int
11 | }
12 |
13 | // The default options during a scrape.
14 | var DefaultOptions = ScrapeOptions{
15 | MaxPages: 0,
16 | }
17 |
--------------------------------------------------------------------------------
/package_test.go:
--------------------------------------------------------------------------------
1 | package scrape_test
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "testing"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/andrew-d/goscrape"
11 | "github.com/andrew-d/goscrape/extract"
12 | "github.com/stretchr/testify/assert"
13 | )
14 |
15 | func TestDefaultPaginator(t *testing.T) {
16 | sc := mustNew(&scrape.ScrapeConfig{
17 | Fetcher: newDummyFetcher([][]byte{
18 | []byte("one"),
19 | []byte("two"),
20 | []byte("three"),
21 | []byte("four"),
22 | }),
23 |
24 | Pieces: []scrape.Piece{
25 | {Name: "dummy", Selector: ".", Extractor: extract.Const{Val: "asdf"}},
26 | },
27 | })
28 |
29 | results, err := sc.ScrapeWithOpts(
30 | "initial",
31 | scrape.ScrapeOptions{MaxPages: 3},
32 | )
33 | assert.NoError(t, err)
34 | assert.Equal(t, results.URLs, []string{"initial"})
35 | assert.Equal(t, len(results.Results), 1)
36 | assert.Equal(t, len(results.Results[0]), 1)
37 | }
38 |
39 | func TestPageLimits(t *testing.T) {
40 | sc := mustNew(&scrape.ScrapeConfig{
41 | Fetcher: newDummyFetcher([][]byte{
42 | []byte("one"),
43 | []byte("two"),
44 | []byte("three"),
45 | []byte("four"),
46 | }),
47 |
48 | Paginator: &dummyPaginator{},
49 |
50 | Pieces: []scrape.Piece{
51 | {Name: "dummy", Selector: ".", Extractor: extract.Const{Val: "asdf"}},
52 | },
53 | })
54 |
55 | results, err := sc.ScrapeWithOpts(
56 | "initial",
57 | scrape.ScrapeOptions{MaxPages: 3},
58 | )
59 | assert.NoError(t, err)
60 | assert.Equal(t, []string{
61 | "initial",
62 | "url-1",
63 | "url-2",
64 | }, results.URLs)
65 | }
66 |
67 | func mustNew(c *scrape.ScrapeConfig) *scrape.Scraper {
68 | scraper, err := scrape.New(c)
69 | if err != nil {
70 | panic(err)
71 | }
72 | return scraper
73 | }
74 |
75 | type dummyFetcher struct {
76 | data [][]byte
77 | idx int
78 | }
79 |
80 | func newDummyFetcher(data [][]byte) *dummyFetcher {
81 | return &dummyFetcher{
82 | data: data,
83 | idx: 0,
84 | }
85 | }
86 |
87 | func (d *dummyFetcher) Prepare() error {
88 | return nil
89 | }
90 |
91 | func (d *dummyFetcher) Fetch(method, url string) (io.ReadCloser, error) {
92 | r := dummyReadCloser{bytes.NewReader(d.data[d.idx])}
93 | d.idx++
94 | return r, nil
95 | }
96 |
97 | func (d *dummyFetcher) Close() {
98 | return
99 | }
100 |
101 | type dummyPaginator struct {
102 | idx int
103 | }
104 |
105 | func (d *dummyPaginator) NextPage(url string, document *goquery.Selection) (string, error) {
106 | d.idx++
107 | return fmt.Sprintf("url-%d", d.idx), nil
108 | }
109 |
110 | type dummyReadCloser struct {
111 | u io.Reader
112 | }
113 |
114 | func (d dummyReadCloser) Read(b []byte) (int, error) {
115 | return d.u.Read(b)
116 | }
117 |
118 | func (d dummyReadCloser) Close() error {
119 | return nil
120 | }
121 |
--------------------------------------------------------------------------------
/paginate/delay.go:
--------------------------------------------------------------------------------
1 | package paginate
2 |
3 | import (
4 | "time"
5 |
6 | "github.com/PuerkitoBio/goquery"
7 | "github.com/andrew-d/goscrape"
8 | )
9 |
10 | type withDelayPaginator struct {
11 | delay time.Duration
12 | p scrape.Paginator
13 | }
14 |
15 | // WithDelay returns a Paginator that will wait the given duration whenever the
16 | // next page is requested, and will then dispatch to the underlying Paginator.
17 | func WithDelay(delay time.Duration, p scrape.Paginator) scrape.Paginator {
18 | return &withDelayPaginator{
19 | delay: delay,
20 | p: p,
21 | }
22 | }
23 |
24 | func (p *withDelayPaginator) NextPage(uri string, doc *goquery.Selection) (string, error) {
25 | time.Sleep(p.delay)
26 | return p.p.NextPage(uri, doc)
27 | }
28 |
--------------------------------------------------------------------------------
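A usage sketch for WithDelay (above): wrapping BySelector so each page fetch is throttled before the next link is followed. The selectors, URL, and timings are placeholders chosen for illustration.

```go
package main

import (
	"encoding/json"
	"os"
	"time"

	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
	"github.com/andrew-d/goscrape/paginate"
)

func main() {
	config := &scrape.ScrapeConfig{
		DividePage: scrape.DividePageBySelector("article"),

		Pieces: []scrape.Piece{
			{Name: "title", Selector: "h2", Extractor: extract.Text{}},
		},

		// Wait two seconds before following each "next" link so the target
		// site isn't hammered.
		Paginator: paginate.WithDelay(2*time.Second,
			paginate.BySelector("a[rel='next']", "href")),
	}

	scraper, err := scrape.New(config)
	if err != nil {
		panic(err)
	}

	results, err := scraper.ScrapeWithOpts("http://blog.example.com",
		scrape.ScrapeOptions{MaxPages: 5})
	if err != nil {
		panic(err)
	}
	json.NewEncoder(os.Stdout).Encode(results)
}
```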
/paginate/paginate.go:
--------------------------------------------------------------------------------
1 | package paginate
2 |
3 | import (
4 | "net/url"
5 | "strconv"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/andrew-d/goscrape"
9 | )
10 |
11 | // RelUrl is a helper function that aids in calculating the absolute URL from a
12 | // base URL and relative URL.
13 | func RelUrl(base, rel string) (string, error) {
14 | baseUrl, err := url.Parse(base)
15 | if err != nil {
16 | return "", err
17 | }
18 | relUrl, err := url.Parse(rel)
19 | if err != nil {
20 | return "", err
21 | }
22 |
23 | newUrl := baseUrl.ResolveReference(relUrl)
24 | return newUrl.String(), nil
25 | }
26 |
27 | type bySelectorPaginator struct {
28 | sel string
29 | attr string
30 | }
31 |
32 | // BySelector returns a Paginator that extracts the next page from a document by
33 | // querying a given CSS selector and extracting the given HTML attribute from the
34 | // resulting element.
35 | func BySelector(sel, attr string) scrape.Paginator {
36 | return &bySelectorPaginator{
37 | sel: sel, attr: attr,
38 | }
39 | }
40 |
41 | func (p *bySelectorPaginator) NextPage(uri string, doc *goquery.Selection) (string, error) {
42 | val, found := doc.Find(p.sel).Attr(p.attr)
43 | if !found {
44 | return "", nil
45 | }
46 |
47 | return RelUrl(uri, val)
48 | }
49 |
50 | type byQueryParamPaginator struct {
51 | param string
52 | }
53 |
54 | // ByQueryParam returns a Paginator that returns the next page from a document
55 | // by incrementing a given query parameter. Note that this will paginate
56 | // infinitely - you probably want to specify a maximum number of pages to
57 | // scrape by using the ScrapeWithOpts method.
58 | func ByQueryParam(param string) scrape.Paginator {
59 | return &byQueryParamPaginator{param}
60 | }
61 |
62 | func (p *byQueryParamPaginator) NextPage(u string, _ *goquery.Selection) (string, error) {
63 | // Parse
64 | uri, err := url.Parse(u)
65 | if err != nil {
66 | return "", err
67 | }
68 |
69 | // Parse query
70 | vals, err := url.ParseQuery(uri.RawQuery)
71 | if err != nil {
72 | return "", err
73 | }
74 |
75 | // Find query param and increment. If it doesn't exist, then we just stop.
76 | params, ok := vals[p.param]
77 | if !ok || len(params) < 1 {
78 | return "", nil
79 | }
80 |
81 | parsed, err := strconv.ParseUint(params[0], 10, 64)
82 | if err != nil {
83 | // TODO: should this be fatal?
84 | return "", nil
85 | }
86 |
87 | // Put everything back together
88 | params[0] = strconv.FormatUint(parsed+1, 10)
89 | vals[p.param] = params
90 | query := vals.Encode()
91 | uri.RawQuery = query
92 | return uri.String(), nil
93 | }
94 |
--------------------------------------------------------------------------------
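The paginators above can also be exercised directly, which makes their semantics easy to see. The following sketch (URLs are placeholders) shows ByQueryParam rewriting a query string and RelUrl resolving a relative link.

```go
package main

import (
	"fmt"

	"github.com/andrew-d/goscrape/paginate"
)

func main() {
	p := paginate.ByQueryParam("page")

	// ByQueryParam only rewrites the URL, so the document argument is unused
	// and a nil selection is fine here.
	next, err := p.NextPage("http://example.com/search?q=go&page=2", nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(next) // http://example.com/search?page=3&q=go

	// When the parameter is missing, an empty string is returned and the
	// scrape stops paginating.
	next, _ = p.NextPage("http://example.com/search?q=go", nil)
	fmt.Println(next == "") // true

	// RelUrl resolves a relative link against the page it was found on.
	abs, _ := paginate.RelUrl("http://example.com/a/b", "../c")
	fmt.Println(abs) // http://example.com/c
}
```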
/paginate/paginate_test.go:
--------------------------------------------------------------------------------
1 | package paginate
2 |
3 | import (
4 | "strings"
5 | "testing"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/stretchr/testify/assert"
9 | )
10 |
11 | func selFrom(s string) *goquery.Selection {
12 | r := strings.NewReader(s)
13 | doc, err := goquery.NewDocumentFromReader(r)
14 | if err != nil {
15 | panic(err)
16 | }
17 |
18 | return doc.Selection
19 | }
20 |
21 | func TestBySelector(t *testing.T) {
22 | sel := selFrom(`<a href="http://www.google.com">foo</a>`)
23 |
24 | pg, err := BySelector("a", "href").NextPage("", sel)
25 | assert.NoError(t, err)
26 | assert.Equal(t, pg, "http://www.google.com")
27 |
28 | pg, err = BySelector("div", "xxx").NextPage("", sel)
29 | assert.NoError(t, err)
30 | assert.Equal(t, pg, "")
31 |
32 | sel = selFrom(`<a href="/foobar">foo</a>`)
33 |
34 | pg, err = BySelector("a", "href").NextPage("http://www.google.com", sel)
35 | assert.NoError(t, err)
36 | assert.Equal(t, pg, "http://www.google.com/foobar")
37 |
38 | sel = selFrom(`<a href="/asdf?q=123">foo</a>`)
39 |
40 | pg, err = BySelector("a", "href").NextPage("http://www.google.com", sel)
41 | assert.NoError(t, err)
42 | assert.Equal(t, pg, "http://www.google.com/asdf?q=123")
43 | }
44 |
45 | func TestByQueryParam(t *testing.T) {
46 | pg, err := ByQueryParam("foo").NextPage("http://www.google.com?foo=1", nil)
47 | assert.NoError(t, err)
48 | assert.Equal(t, pg, "http://www.google.com?foo=2")
49 |
50 | pg, err = ByQueryParam("bad").NextPage("http://www.google.com", nil)
51 | assert.NoError(t, err)
52 | assert.Equal(t, pg, "")
53 |
54 | pg, err = ByQueryParam("bad").NextPage("http://www.google.com?bad=asdf", nil)
55 | assert.NoError(t, err)
56 | assert.Equal(t, pg, "")
57 | }
58 |
--------------------------------------------------------------------------------
/phantomjs.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "errors"
7 | "fmt"
8 | "io"
9 | "io/ioutil"
10 | "os/exec"
11 | "path/filepath"
12 | )
13 |
14 | const fetchScript = `
15 | var system = require('system'),
16 | page = require("webpage").create();
17 |
18 | // Workaround for https://github.com/ariya/phantomjs/issues/12697 since
19 | // it doesn't seem like there will be another 1.9.x release fixing this
20 | var phantomExit = function(exitCode) {
21 | page.close();
22 | setTimeout(function() { phantom.exit(exitCode); }, 0);
23 | };
24 |
25 | if( system.args.length !== 2 ) {
26 | system.stderr.writeLine("Usage: fetch.js URL");
27 | phantomExit(1);
28 | }
29 |
30 | var resourceWait = 300,
31 | maxRenderWait = 10000,
32 | url = system.args[1],
33 | count = 0,
34 | forcedRenderTimeout,
35 | renderTimeout;
36 |
37 | var doRender = function() {
38 | var c = page.evaluate(function() {
39 | return document.documentElement.outerHTML;
40 | });
41 |
42 | system.stdout.write(JSON.stringify({contents: c}));
43 | phantomExit();
44 | }
45 |
46 | page.onResourceRequested = function (req) {
47 | count += 1;
48 | system.stderr.writeLine('> ' + req.id + ' - ' + req.url);
49 | clearTimeout(renderTimeout);
50 | };
51 |
52 | page.onResourceReceived = function (res) {
53 | if (!res.stage || res.stage === 'end') {
54 | count -= 1;
55 | system.stderr.writeLine(res.id + ' ' + res.status + ' - ' + res.url);
56 | if (count === 0) {
57 | renderTimeout = setTimeout(doRender, resourceWait);
58 | }
59 | }
60 | };
61 |
62 | page.open(url, function (status) {
63 | if (status !== "success") {
64 | system.stderr.writeLine('Unable to load url');
65 | phantomExit(1);
66 | } else {
67 | forcedRenderTimeout = setTimeout(function () {
68 | // Write to stderr, not stdout, so the JSON output isn't corrupted.
69 | system.stderr.writeLine('forced render; pending resources: ' + count);
69 | doRender();
70 | }, maxRenderWait);
71 | }
72 | });
73 | `
74 |
75 | var (
76 | // PhantomJS was not found on the system. You should consider passing an
77 | // explicit path to NewPhantomJSFetcher().
78 | ErrNoPhantomJS = errors.New("PhantomJS was not found")
79 |
80 | // This error is returned when we try to use PhantomJS to perform a non-GET
81 | // request.
82 | ErrInvalidMethod = errors.New("invalid method")
83 | )
84 |
85 | func findPhantomJS() string {
86 | var path string
87 | var err error
88 |
89 | for _, nm := range []string{"phantomjs", "phantom"} {
90 | path, err = exec.LookPath(nm)
91 | if err == nil {
92 | return path
93 | }
94 | }
95 |
96 | return ""
97 | }
98 |
99 | // HasPhantomJS returns whether we can find a PhantomJS installation on this system.
100 | // If this returns "false", creating a PhantomJSFetcher will fail.
101 | func HasPhantomJS() bool {
102 | return findPhantomJS() != ""
103 | }
104 |
105 | // PhantomJSFetcher is a Fetcher that calls out to PhantomJS
106 | // (http://phantomjs.org/) in order to fetch a page's content. Since PhantomJS
107 | // will evaluate Javascript in a page, this is the recommended Fetcher to use
108 | // for Javascript-heavy pages.
109 | type PhantomJSFetcher struct {
110 | binaryPath string
111 | tempDir string
112 | scriptPath string
113 |
114 | // Arguments to pass to PhantomJS
115 | args []string
116 | }
117 |
118 | // NewPhantomJSFetcher will create a new instance of PhantomJSFetcher,
119 | // searching the system's PATH for the appropriate binary. If PhantomJS is not
120 | // in the PATH, or you would like to use an alternate binary, then you can give
121 | // an overridden path.
122 | func NewPhantomJSFetcher(binary ...string) (*PhantomJSFetcher, error) {
123 | var path string
124 |
125 | // Find the PhantomJS binary
126 | if len(binary) == 0 || len(binary) == 1 && binary[0] == "" {
127 | path = findPhantomJS()
128 | } else if len(binary) == 1 {
129 | path = binary[0]
130 | } else {
131 | return nil, errors.New("invalid number of arguments")
132 | }
133 |
134 | if path == "" {
135 | return nil, ErrNoPhantomJS
136 | }
137 |
138 | // Create a temporary directory
139 | tdir, err := ioutil.TempDir("", "goscrape-phantom-")
140 | if err != nil {
141 | return nil, err
142 | }
143 |
144 | // Write our fetching script there (so it can be called)
145 | spath := filepath.Join(tdir, "fetch.js")
146 | err = ioutil.WriteFile(spath, []byte(fetchScript), 0600)
147 | if err != nil {
148 | return nil, err
149 | }
150 |
151 | ret := &PhantomJSFetcher{
152 | binaryPath: path,
153 | tempDir: tdir,
154 | scriptPath: spath,
155 | }
156 | return ret, nil
157 | }
158 |
159 | func (pf *PhantomJSFetcher) Prepare() error {
160 | // TODO: configure ssl errors / web security
161 | // TODO: cookies file path might break if spaces
162 | pf.args = []string{
163 | "--ignore-ssl-errors=true",
164 | "--web-security=false",
165 | "--cookies-file=" + filepath.Join(pf.tempDir, "cookies.dat"),
166 | pf.scriptPath,
167 | }
168 | return nil
169 | }
170 |
171 | func (pf *PhantomJSFetcher) Fetch(method, url string) (io.ReadCloser, error) {
172 | if method != "GET" {
173 | return nil, ErrInvalidMethod
174 | }
175 |
176 | // Call the fetch script with these parameters.
177 | cmd := exec.Command(pf.binaryPath, append(pf.args, url)...)
178 |
179 | var stdout, stderr bytes.Buffer
180 | cmd.Stdout = &stdout
181 | cmd.Stderr = &stderr
182 |
183 | err := cmd.Run()
184 | if err != nil {
185 | return nil, err
186 | }
187 |
188 | // Load the resulting JSON.
189 | results := map[string]interface{}{}
190 | err = json.NewDecoder(&stdout).Decode(&results)
191 | if err != nil {
192 | return nil, err
193 | }
194 |
195 | // Return the contents
196 | contents, ok := results["contents"].(string)
197 | if !ok {
198 | return nil, fmt.Errorf("unknown type for 'contents': %T", results["contents"])
199 | }
200 |
201 | return newStringReadCloser(contents), nil
202 | }
203 |
204 | func (pf *PhantomJSFetcher) Close() {
205 | return
206 | }
207 |
208 | // Static type assertion
209 | var _ Fetcher = &PhantomJSFetcher{}
210 |
--------------------------------------------------------------------------------
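A usage sketch for the PhantomJS fetcher defined above: check HasPhantomJS and fall back to the plain HTTP fetcher when the binary is missing. The URL and selector are placeholders, and the fallback policy is just one possible choice.

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"

	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

func main() {
	// Use PhantomJS when available so Javascript-rendered content is seen;
	// otherwise fall back to the standard HTTP client fetcher.
	var fetcher scrape.Fetcher
	var err error
	if scrape.HasPhantomJS() {
		fetcher, err = scrape.NewPhantomJSFetcher()
	} else {
		fetcher, err = scrape.NewHttpClientFetcher()
	}
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating fetcher: %s\n", err)
		os.Exit(1)
	}

	config := &scrape.ScrapeConfig{
		Fetcher: fetcher,
		Pieces: []scrape.Piece{
			{Name: "headline", Selector: "h1", Extractor: extract.Text{}},
		},
	}

	scraper, err := scrape.New(config)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
		os.Exit(1)
	}

	results, err := scraper.Scrape("http://example.com")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
		os.Exit(1)
	}
	json.NewEncoder(os.Stdout).Encode(results)
}
```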
/results_test.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/stretchr/testify/assert"
7 | )
8 |
9 | func TestResultsFirst(t *testing.T) {
10 | r := &ScrapeResults{
11 | Results: [][]map[string]interface{}{
12 | {{"foo": 1, "bar": 2}},
13 | },
14 | }
15 |
16 | assert.Equal(t, r.First(), map[string]interface{}{
17 | "foo": 1,
18 | "bar": 2,
19 | })
20 |
21 | r = &ScrapeResults{
22 | Results: [][]map[string]interface{}{{}},
23 | }
24 | assert.Nil(t, r.First())
25 | }
26 |
27 | func TestResultsAllBlocks(t *testing.T) {
28 | r := &ScrapeResults{
29 | Results: [][]map[string]interface{}{
30 | {{"foo": 1, "bar": 2}},
31 | {{"baz": 3, "asdf": 4}},
32 | },
33 | }
34 |
35 | assert.Equal(t, r.AllBlocks(), []map[string]interface{}{
36 | {"foo": 1, "bar": 2},
37 | {"baz": 3, "asdf": 4},
38 | })
39 | }
40 |
--------------------------------------------------------------------------------
/scrape.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | )
9 |
10 | var (
11 | ErrNoPieces = errors.New("no pieces in the config")
12 | )
13 |
14 | // The DividePageFunc type is used to extract a page's blocks during a scrape.
15 | // For more information, please see the documentation on the ScrapeConfig type.
16 | type DividePageFunc func(*goquery.Selection) []*goquery.Selection
17 |
18 | // The PieceExtractor interface represents something that can extract data from
19 | // a selection.
20 | type PieceExtractor interface {
21 | // Extract some data from the given Selection and return it. The returned
22 | // data should be encodable - i.e. passing it to json.Marshal should succeed.
23 | // If the returned data is nil, then the output from this piece will not be
24 | // included.
25 | //
26 | // If this function returns an error, then the scrape is aborted.
27 | Extract(*goquery.Selection) (interface{}, error)
28 | }
29 |
30 | // The Paginator interface should be implemented by things that can retrieve the
31 | // next page from the current one.
32 | type Paginator interface {
33 | // NextPage controls the progress of the scrape. It is called for each input
34 | // page, starting with the origin URL, and is expected to return the URL of
35 | // the next page to process. Note that order matters - calling 'NextPage' on
36 | // page 1 should return page 2, not page 3. The function should return an
37 | // empty string when there are no more pages to process.
38 | NextPage(url string, document *goquery.Selection) (string, error)
39 | // TODO(andrew-d): should this return a string, a url.URL, ???
40 | }
41 |
42 | // A Piece represents a given chunk of data that is to be extracted from every
43 | // block in each page of a scrape.
44 | type Piece struct {
45 | // The name of this piece. Required, and will be used to aggregate results.
46 | Name string
47 |
48 | // A sub-selector within the given block to process. Pass in "." to use
49 | // the block's selection itself, with no modification.
50 | Selector string
51 | // TODO(andrew-d): Consider making this an interface too.
52 |
53 | // Extractor contains the logic on how to extract some results from the
54 | // selector that is provided to this Piece.
55 | Extractor PieceExtractor
56 | }
57 |
58 | // The main configuration for a scrape. Pass this to the New() function.
59 | type ScrapeConfig struct {
60 | // Fetcher is the underlying transport that is used to fetch documents.
61 | // If this is not specified (i.e. left nil), then a default HttpClientFetcher
62 | // will be created and used.
63 | Fetcher Fetcher
64 |
65 | // Paginator is the Paginator to use for this current scrape.
66 | //
67 | // If Paginator is nil, then no pagination is performed and it is assumed that
68 | // the initial URL is the only page.
69 | Paginator Paginator
70 |
71 | // DividePage splits a page into individual 'blocks'. When scraping, we treat
72 | // each page as if it contains some number of 'blocks', each of which can be
73 | // further subdivided into what actually needs to be extracted.
74 | //
75 | // If the DividePage function is nil, then no division is performed and the
76 | // page is assumed to contain a single block containing the entire <body>
77 | // element.
78 | DividePage DividePageFunc
79 |
80 | // Pieces contains the list of data that is extracted for each block. For
81 | // every block that is the result of the DividePage function (above), all of
82 | // the Pieces entries receive the selection representing the block, and can
83 | // return a result. If the returned result is nil, then the Piece is
84 | // considered not to exist in this block, and is not included.
85 | //
86 | // Note: if a Piece's Extractor returns an error, it results in the scrape
87 | // being aborted - this can be useful if you need to ensure that a given Piece
88 | // is always present, for example.
89 | Pieces []Piece
90 | }
91 |
92 | func (c *ScrapeConfig) clone() *ScrapeConfig {
93 | ret := &ScrapeConfig{
94 | Fetcher: c.Fetcher,
95 | Paginator: c.Paginator,
96 | DividePage: c.DividePage,
97 | Pieces: c.Pieces,
98 | }
99 | return ret
100 | }
101 |
102 | // ScrapeResults describes the results of a scrape. It contains a list of all
103 | // pages (URLs) visited during the process, along with all results generated
104 | // from each Piece in each page.
105 | type ScrapeResults struct {
106 | // All URLs visited during this scrape, in order. Always contains at least
107 | // one element - the initial URL.
108 | URLs []string
109 |
110 | // The results from each Piece of each page. Essentially, the top-level array
111 | // is for each page, the second-level array is for each block in a page, and
112 | // the final map[string]interface{} is the mapping of Piece.Name to results.
113 | Results [][]map[string]interface{}
114 | }
115 |
116 | // First returns the first set of results - i.e. the results from the first
117 | // block on the first page.
118 | //
119 | // This function can return nil if there were no blocks found on the first page
120 | // of the scrape.
121 | func (r *ScrapeResults) First() map[string]interface{} {
122 | if len(r.Results[0]) == 0 {
123 | return nil
124 | }
125 |
126 | return r.Results[0][0]
127 | }
128 |
129 | // AllBlocks returns a single list of results from every block on all pages.
130 | // This function will always return a list, even if no blocks were found.
131 | func (r *ScrapeResults) AllBlocks() []map[string]interface{} {
132 | ret := []map[string]interface{}{}
133 |
134 | for _, page := range r.Results {
135 | for _, block := range page {
136 | ret = append(ret, block)
137 | }
138 | }
139 |
140 | return ret
141 | }
142 |
143 | type Scraper struct {
144 | config *ScrapeConfig
145 | }
146 |
147 | // Create a new scraper with the provided configuration.
148 | func New(c *ScrapeConfig) (*Scraper, error) {
149 | var err error
150 |
151 | // Validate config
152 | if len(c.Pieces) == 0 {
153 | return nil, ErrNoPieces
154 | }
155 |
156 | seenNames := map[string]struct{}{}
157 | for i, piece := range c.Pieces {
158 | if len(piece.Name) == 0 {
159 | return nil, fmt.Errorf("no name provided for piece %d", i)
160 | }
161 | if _, seen := seenNames[piece.Name]; seen {
162 | return nil, fmt.Errorf("piece %d has a duplicate name", i)
163 | }
164 | seenNames[piece.Name] = struct{}{}
165 |
166 | if len(piece.Selector) == 0 {
167 | return nil, fmt.Errorf("no selector provided for piece %d", i)
168 | }
169 | }
170 |
171 | // Clone the configuration and fill in the defaults.
172 | config := c.clone()
173 | if config.Paginator == nil {
174 | config.Paginator = dummyPaginator{}
175 | }
176 | if config.DividePage == nil {
177 | config.DividePage = DividePageBySelector("body")
178 | }
179 |
180 | if config.Fetcher == nil {
181 | config.Fetcher, err = NewHttpClientFetcher()
182 | if err != nil {
183 | return nil, err
184 | }
185 | }
186 |
187 | // All set!
188 | ret := &Scraper{
189 | config: config,
190 | }
191 | return ret, nil
192 | }
193 |
194 | // Scrape a given URL with default options. See 'ScrapeWithOpts' for more
195 | // information.
196 | func (s *Scraper) Scrape(url string) (*ScrapeResults, error) {
197 | return s.ScrapeWithOpts(url, DefaultOptions)
198 | }
199 |
200 | // Actually start scraping at the given URL.
201 | //
202 | // Note that, while this function and the Scraper in general are safe for use
203 | // from multiple goroutines, making multiple requests in parallel can cause
204 | // strange behaviour - e.g. overwriting cookies in the underlying http.Client.
205 | // Please be careful when running multiple scrapes at a time, unless you know
206 | // that it's safe.
207 | func (s *Scraper) ScrapeWithOpts(url string, opts ScrapeOptions) (*ScrapeResults, error) {
208 | if len(url) == 0 {
209 | return nil, errors.New("no URL provided")
210 | }
211 |
212 | // Prepare the fetcher.
213 | err := s.config.Fetcher.Prepare()
214 | if err != nil {
215 | return nil, err
216 | }
217 |
218 | res := &ScrapeResults{
219 | URLs: []string{},
220 | Results: [][]map[string]interface{}{},
221 | }
222 |
223 | var numPages int
224 | for {
225 | // Repeat until we don't have any more URLs, or until we hit our page limit.
226 | if len(url) == 0 || (opts.MaxPages > 0 && numPages >= opts.MaxPages) {
227 | break
228 | }
229 |
230 | resp, err := s.config.Fetcher.Fetch("GET", url)
231 | if err != nil {
232 | return nil, err
233 | }
234 |
235 | // Create a goquery document.
236 | doc, err := goquery.NewDocumentFromReader(resp)
237 | resp.Close()
238 | if err != nil {
239 | return nil, err
240 | }
241 |
242 | res.URLs = append(res.URLs, url)
243 | results := []map[string]interface{}{}
244 |
245 | // Divide this page into blocks
246 | for _, block := range s.config.DividePage(doc.Selection) {
247 | blockResults := map[string]interface{}{}
248 |
249 | // Process each piece of this block
250 | for _, piece := range s.config.Pieces {
251 | sel := block
252 | if piece.Selector != "." {
253 | sel = sel.Find(piece.Selector)
254 | }
255 |
256 | pieceResults, err := piece.Extractor.Extract(sel)
257 | if err != nil {
258 | return nil, err
259 | }
260 |
261 | // A nil response from an extractor means that we don't even include it in
262 | // the results.
263 | if pieceResults == nil {
264 | continue
265 | }
266 |
267 | blockResults[piece.Name] = pieceResults
268 | }
269 |
270 | // Append the results from this block.
271 | results = append(results, blockResults)
272 | }
273 |
274 | // Append the results from this page.
275 | res.Results = append(res.Results, results)
276 | numPages++
277 |
278 | // Get the next page.
279 | url, err = s.config.Paginator.NextPage(url, doc.Selection)
280 | if err != nil {
281 | return nil, err
282 | }
283 | }
284 |
285 | // All good!
286 | return res, nil
287 | }
288 |
--------------------------------------------------------------------------------
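Because PieceExtractor (defined in scrape.go above) is a small interface, custom extractors are easy to plug in alongside the ones in the extract package. The following is a hypothetical example, with the type name and selectors invented for illustration, that parses a block's text as an integer and aborts the scrape on malformed input.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

// TrimmedInt is a hypothetical PieceExtractor that parses the selection's
// text as an integer after trimming whitespace. Returning an error aborts
// the scrape; returning nil omits the piece from the block's results.
type TrimmedInt struct{}

func (e TrimmedInt) Extract(sel *goquery.Selection) (interface{}, error) {
	text := strings.TrimSpace(sel.Text())
	if text == "" {
		return nil, nil
	}
	n, err := strconv.Atoi(text)
	if err != nil {
		return nil, fmt.Errorf("could not parse %q as an integer: %v", text, err)
	}
	return n, nil
}

// Compile-time check that we satisfy the interface, mirroring extractors.go.
var _ scrape.PieceExtractor = TrimmedInt{}

func main() {
	config := &scrape.ScrapeConfig{
		DividePage: scrape.DividePageBySelector(".comment"),
		Pieces: []scrape.Piece{
			{Name: "author", Selector: ".author", Extractor: extract.Text{}},
			{Name: "score", Selector: ".score", Extractor: TrimmedInt{}},
		},
	}

	// The resulting scraper is used exactly like the stock ones in _examples.
	if _, err := scrape.New(config); err != nil {
		panic(err)
	}
}
```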
/util.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "io"
5 | "strings"
6 | )
7 |
8 | func newStringReadCloser(s string) dummyReadCloser {
9 | return dummyReadCloser{strings.NewReader(s)}
10 | }
11 |
12 | type dummyReadCloser struct {
13 | r io.Reader
14 | }
15 |
16 | func (c dummyReadCloser) Read(b []byte) (int, error) {
17 | return c.r.Read(b)
18 | }
19 |
20 | func (c dummyReadCloser) Close() error {
21 | return nil
22 | }
23 |
24 | var _ io.ReadCloser = &dummyReadCloser{}
25 |
--------------------------------------------------------------------------------