├── .gitignore
├── .travis.yml
├── README.md
├── _examples
│   ├── scrape_hn.go
│   ├── scrape_reddit.go
│   └── scrape_wired_latest.go
├── doc.go
├── extract
│   ├── extractors.go
│   └── extractors_test.go
├── fetcher.go
├── helpers.go
├── options.go
├── package_test.go
├── paginate
│   ├── delay.go
│   ├── paginate.go
│   └── paginate_test.go
├── phantomjs.go
├── results_test.go
├── scrape.go
└── util.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Coverage profile
2 | cover.out
3 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | sudo: false
3 |
4 | matrix:
5 | include:
6 | - go: 1.1
7 | install:
8 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
9 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
10 | env: COVER_FLAG=
11 |
12 | - go: 1.2
13 | install:
14 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
15 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
16 | env: COVER_FLAG=
17 |
18 | - go: 1.3
19 | install:
20 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
21 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
22 | env: COVER_FLAG=
23 |
24 | - go: 1.4
25 | install:
26 | - go get golang.org/x/tools/cmd/cover
27 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
28 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
29 | env: COVER_FLAG=-cover
30 |
31 | - go: 1.5
32 | install:
33 | - go get golang.org/x/tools/cmd/cover
34 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
35 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
36 | env: COVER_FLAG=-cover
37 |
38 | - go: tip
39 | install:
40 | - go get golang.org/x/tools/cmd/cover
41 | - go list -f '{{range .Imports}}{{.}} {{end}}' ./... | xargs go get -v
42 | - go list -f '{{range .TestImports}}{{.}} {{end}}' ./... | xargs go get -v
43 | env: COVER_FLAG=-cover
44 |
45 | script:
46 | - go test -v $COVER_FLAG ./...
47 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # goscrape
2 |
3 | [![GoDoc](https://godoc.org/github.com/andrew-d/goscrape?status.svg)](https://godoc.org/github.com/andrew-d/goscrape) [![Build Status](https://travis-ci.org/andrew-d/goscrape.svg?branch=master)](https://travis-ci.org/andrew-d/goscrape)
4 |
5 | goscrape is an extensible structured scraper for Go. What does "structured
6 | scraper" mean? In this case, it means that you define what you want to extract
7 | from a page in a structured, hierarchical manner, and then goscrape takes care
8 | of pagination, splitting the input page, and calling the code to extract chunks
9 | of data. However, goscrape is *extensible*, allowing you to customize nearly
10 | every step of this process.
11 |
12 | The architecture of goscrape is roughly as follows:
13 |
14 | - A single request to start scraping (from a given URL) is called a *scrape*.
15 | - Each scrape consists of some number of *pages*.
16 | - Inside each page, there's 1 or more *blocks* - some logical method of splitting
17 | up a page into subcomponents. By default, there's a single block that consists
18 | of the page's `<body>` element, but you can change this fairly easily.
19 | - Inside each block, you define some number of *pieces* of data that you wish
20 | to extract. Each piece consists of a name, a selector, and what data to
21 | extract from the current block.
22 |
23 | This all sounds rather complicated, but in practice it's quite simple. Here's
24 | a short example of how to get a list of all the latest news articles from Wired
25 | and dump them as JSON to the screen:
26 |
27 | ```go
28 | package main
29 |
30 | import (
31 | "encoding/json"
32 | "fmt"
33 | "os"
34 |
35 | "github.com/andrew-d/goscrape"
36 | "github.com/andrew-d/goscrape/extract"
37 | )
38 |
39 | func main() {
40 | config := &scrape.ScrapeConfig{
41 | DividePage: scrape.DividePageBySelector("#latest-news li"),
42 |
43 | Pieces: []scrape.Piece{
44 | {Name: "title", Selector: "h5.exchange-sm", Extractor: extract.Text{}},
45 | {Name: "byline", Selector: "span.byline", Extractor: extract.Text{}},
46 | {Name: "link", Selector: "a", Extractor: extract.Attr{Attr: "href"}},
47 | },
48 | }
49 |
50 | scraper, err := scrape.New(config)
51 | if err != nil {
52 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
53 | os.Exit(1)
54 | }
55 |
56 | results, err := scraper.Scrape("http://www.wired.com")
57 | if err != nil {
58 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
59 | os.Exit(1)
60 | }
61 |
62 | json.NewEncoder(os.Stdout).Encode(results)
63 | }
64 | ```
65 |
66 | As you can see, the entire example, including proper error handling, only takes
67 | 36 lines of code - short and sweet.
68 |
69 | For more usage examples, see the
70 | [examples directory](https://github.com/andrew-d/goscrape/tree/master/_examples).
71 |
72 | ## Roadmap
73 |
74 | Here's the rough roadmap of things that I'd like to add. If you have a feature
75 | request, please let me know by [opening an issue](https://github.com/andrew-d/goscrape/issues/new)!
76 |
77 | - [ ] Allow deduplication of Pieces (a custom callback?)
78 | - [ ] Improve parallelization (scrape multiple pages at a time, but maintain order)
79 |
80 | ## License
81 |
82 | MIT
83 |
--------------------------------------------------------------------------------
/_examples/scrape_hn.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | "regexp"
8 |
9 | "github.com/andrew-d/goscrape"
10 | "github.com/andrew-d/goscrape/extract"
11 | "github.com/andrew-d/goscrape/paginate"
12 | )
13 |
14 | func main() {
15 | config := &scrape.ScrapeConfig{
16 | DividePage: scrape.DividePageBySelector("tr:nth-child(3) tr:nth-child(3n-2):not([style='height:10px'])"),
17 |
18 | Pieces: []scrape.Piece{
19 | {Name: "title", Selector: "td.title > a", Extractor: extract.Text{}},
20 | {Name: "link", Selector: "td.title > a", Extractor: extract.Attr{Attr: "href"}},
21 | {Name: "rank", Selector: "td.title[align='right']",
22 | Extractor: extract.Regex{Regex: regexp.MustCompile(`(\d+)`)}},
23 | },
24 |
25 | Paginator: paginate.BySelector("a[rel='nofollow']:last-child", "href"),
26 | }
27 |
28 | scraper, err := scrape.New(config)
29 | if err != nil {
30 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
31 | os.Exit(1)
32 | }
33 |
34 | results, err := scraper.ScrapeWithOpts(
35 | "https://news.ycombinator.com",
36 | scrape.ScrapeOptions{MaxPages: 3},
37 | )
38 | if err != nil {
39 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
40 | os.Exit(1)
41 | }
42 |
43 | json.NewEncoder(os.Stdout).Encode(results)
44 | }
45 |
--------------------------------------------------------------------------------
/_examples/scrape_reddit.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/andrew-d/goscrape"
9 | "github.com/andrew-d/goscrape/extract"
10 | )
11 |
12 | func main() {
13 | fetcher, err := scrape.NewPhantomJSFetcher()
14 | if err != nil {
15 | fmt.Fprintf(os.Stderr, "Error creating fetcher: %s\n", err)
16 | os.Exit(1)
17 | }
18 |
19 | config := &scrape.ScrapeConfig{
20 | Fetcher: fetcher,
21 |
22 | DividePage: scrape.DividePageBySelector(".linklisting > div.thing"),
23 |
24 | Pieces: []scrape.Piece{
25 | {Name: "title", Selector: "p.title > a", Extractor: extract.Text{}},
26 | {Name: "link", Selector: "p.title > a", Extractor: extract.Attr{Attr: "href"}},
27 | {Name: "score", Selector: "div.score.unvoted", Extractor: extract.Text{}},
28 | {Name: "rank", Selector: "span.rank", Extractor: extract.Text{}},
29 | {Name: "author", Selector: "a.author", Extractor: extract.Text{}},
30 | {Name: "subreddit", Selector: "a.subreddit", Extractor: extract.Text{}},
31 |
32 | // Note: if a self post is edited, then this will be an array with two elements.
33 | {Name: "date", Selector: "time", Extractor: extract.Attr{Attr: "datetime"}},
34 | },
35 | }
36 |
37 | scraper, err := scrape.New(config)
38 | if err != nil {
39 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
40 | os.Exit(1)
41 | }
42 |
43 | results, err := scraper.Scrape("https://www.reddit.com")
44 | if err != nil {
45 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
46 | os.Exit(1)
47 | }
48 |
49 | json.NewEncoder(os.Stdout).Encode(results)
50 | }
51 |
--------------------------------------------------------------------------------
/_examples/scrape_wired_latest.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/andrew-d/goscrape"
9 | "github.com/andrew-d/goscrape/extract"
10 | )
11 |
12 | func main() {
13 | config := &scrape.ScrapeConfig{
14 | DividePage: scrape.DividePageBySelector("#latest-news li"),
15 |
16 | Pieces: []scrape.Piece{
17 | {Name: "title", Selector: "h5.exchange-sm", Extractor: extract.Text{}},
18 | {Name: "byline", Selector: "span.byline", Extractor: extract.Text{}},
19 | {Name: "link", Selector: "a", Extractor: extract.Attr{Attr: "href"}},
20 | },
21 | }
22 |
23 | scraper, err := scrape.New(config)
24 | if err != nil {
25 | fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
26 | os.Exit(1)
27 | }
28 |
29 | results, err := scraper.Scrape("http://www.wired.com")
30 | if err != nil {
31 | fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
32 | os.Exit(1)
33 | }
34 |
35 | json.NewEncoder(os.Stdout).Encode(results)
36 | }
37 |
--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
1 | // goscrape is a simple, extensible scraping library for Go. For more
2 | // information, please read the README and examples on GitHub, and the
3 | // documentation for the ScrapeConfig and Scraper types.
4 | package scrape
5 |
--------------------------------------------------------------------------------
/extract/extractors.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "bytes"
5 | "errors"
6 | "fmt"
7 | "regexp"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/andrew-d/goscrape"
11 | "golang.org/x/net/html"
12 | )
13 |
14 | // Const is a PieceExtractor that returns a constant value.
15 | type Const struct {
16 | // The value to return when the Extract() function is called.
17 | Val interface{}
18 | }
19 |
20 | func (e Const) Extract(sel *goquery.Selection) (interface{}, error) {
21 | return e.Val, nil
22 | }
23 |
24 | var _ scrape.PieceExtractor = Const{}
25 |
26 | // Text is a PieceExtractor that returns the combined text contents of
27 | // the given selection.
28 | type Text struct{}
29 |
30 | func (e Text) Extract(sel *goquery.Selection) (interface{}, error) {
31 | return sel.Text(), nil
32 | }
33 |
34 | var _ scrape.PieceExtractor = Text{}
35 |
36 | // MultipleText is a PieceExtractor that extracts the text from each element
37 | // in the given selection and returns the texts as an array.
38 | type MultipleText struct {
39 | // If there are no items in the selection, then return 'nil' from Extract,
40 | // instead of the empty list. This signals that the result of this Piece
41 | // should be omitted entirely from the results, as opposed to including the
42 | // empty list.
43 | OmitIfEmpty bool
44 | }
45 |
46 | func (e MultipleText) Extract(sel *goquery.Selection) (interface{}, error) {
47 | results := []string{}
48 |
49 | sel.Each(func(i int, s *goquery.Selection) {
50 | results = append(results, s.Text())
51 | })
52 |
53 | if len(results) == 0 && e.OmitIfEmpty {
54 | return nil, nil
55 | }
56 |
57 | return results, nil
58 | }
59 |
60 | // Html extracts and returns the HTML from inside each element of the
61 | // given selection, as a string.
62 | //
63 | // Note that this results in what is effectively the innerHTML of the element -
64 | // i.e. if our selection consists of ["<p><b>ONE</b></p>", "<p><i>TWO</i></p>"]
65 | // then the output will be: "<b>ONE</b><i>TWO</i>".
66 | //
67 | // The return type is a string of all the inner HTML joined together.
68 | type Html struct{}
69 |
70 | func (e Html) Extract(sel *goquery.Selection) (interface{}, error) {
71 | var ret, h string
72 | var err error
73 |
74 | sel.EachWithBreak(func(i int, s *goquery.Selection) bool {
75 | h, err = s.Html()
76 | if err != nil {
77 | return false
78 | }
79 |
80 | ret += h
81 | return true
82 | })
83 |
84 | if err != nil {
85 | return nil, err
86 | }
87 | return ret, nil
88 | }
89 |
90 | var _ scrape.PieceExtractor = Html{}
91 |
92 | // OuterHtml extracts and returns the HTML of each element of the
93 | // given selection, as a string.
94 | //
95 | // To illustrate, if our selection consists of
96 | // ["ONE
", "TWO
"] then the output will be:
97 | // "ONE
TWO
".
98 | //
99 | // The return type is a string of all the outer HTML joined together.
100 | type OuterHtml struct{}
101 |
102 | func (e OuterHtml) Extract(sel *goquery.Selection) (interface{}, error) {
103 | output := bytes.NewBufferString("")
104 | for _, node := range sel.Nodes {
105 | if err := html.Render(output, node); err != nil {
106 | return nil, err
107 | }
108 | }
109 |
110 | return output.String(), nil
111 | }
112 |
113 | var _ scrape.PieceExtractor = OuterHtml{}
114 |
115 | // Regex runs the given regex over the contents of each element in the
116 | // given selection, and, for each match, extracts the given subexpression.
117 | // The return type of the extractor is a list of string matches (i.e. []string).
118 | type Regex struct {
119 | // The regular expression to match. This regular expression must define
120 | // at least one parenthesized subexpression (sometimes known as a "capturing
121 | // group"); the chosen subexpression is what gets extracted.
122 | Regex *regexp.Regexp
123 |
124 | // The index of the subexpression to extract. If this value is not set, and
125 | // the given regex has more than one subexpression, an error will be returned.
126 | Subexpression int
127 |
128 | // When OnlyText is true, only run the given regex over the text contents of
129 | // each element in the selection, as opposed to the HTML contents.
130 | OnlyText bool
131 |
132 | // By default, if there is only a single match, Regex will return
133 | // the match itself (as opposed to an array containing the single match).
134 | // Set AlwaysReturnList to true to disable this behaviour, ensuring that the
135 | // Extract function always returns an array.
136 | AlwaysReturnList bool
137 |
138 | // If no matches of the provided regex could be extracted, then return 'nil'
139 | // from Extract, instead of the empty list. This signals that the result of
140 | // this Piece should be omitted entirely from the results, as opposed to
141 | // including the empty list.
142 | OmitIfEmpty bool
143 | }
144 |
145 | func (e Regex) Extract(sel *goquery.Selection) (interface{}, error) {
146 | if e.Regex == nil {
147 | return nil, errors.New("no regex given")
148 | }
149 | if e.Regex.NumSubexp() == 0 {
150 | return nil, errors.New("regex has no subexpressions")
151 | }
152 |
153 | var subexp int
154 | if e.Subexpression == 0 {
155 | if e.Regex.NumSubexp() != 1 {
156 | e := fmt.Errorf(
157 | "regex has more than one subexpression (%d), but which to "+
158 | "extract was not specified",
159 | e.Regex.NumSubexp())
160 | return nil, e
161 | }
162 |
163 | subexp = 1
164 | } else {
165 | subexp = e.Subexpression
166 | }
167 |
168 | results := []string{}
169 |
170 | // For each element in the selector...
171 | var err error
172 | sel.EachWithBreak(func(i int, s *goquery.Selection) bool {
173 | var contents string
174 | if e.OnlyText {
175 | contents = s.Text()
176 | } else {
177 | contents, err = s.Html()
178 | if err != nil {
179 | return false
180 | }
181 | }
182 |
183 | ret := e.Regex.FindAllStringSubmatch(contents, -1)
184 |
185 | // A return value of nil == no match
186 | if ret == nil {
187 | return true
188 | }
189 |
190 | // For each regex match...
191 | for _, submatches := range ret {
192 | // The 0th entry is the match of the entire expression; subsequent
193 | // entries are the capturing groups. We extract the group selected
194 | // by 'subexp' (the first one by default).
195 | if len(submatches) > 1 {
196 | results = append(results, submatches[subexp])
197 | }
198 | }
199 |
200 | return true
201 | })
202 |
203 | if err != nil {
204 | return nil, err
205 | }
206 | if len(results) == 0 && e.OmitIfEmpty {
207 | return nil, nil
208 | }
209 | if len(results) == 1 && !e.AlwaysReturnList {
210 | return results[0], nil
211 | }
212 |
213 | return results, nil
214 | }
215 |
216 | var _ scrape.PieceExtractor = Regex{}
217 |
218 | // Attr extracts the value of a given HTML attribute from each element
219 | // in the selection, and returns them as a list.
220 | // The return type of the extractor is a list of attribute values (i.e. []string).
221 | type Attr struct {
222 | // The HTML attribute to extract from each element.
223 | Attr string
224 |
225 | // By default, if there is only a single attribute extracted, Attr
226 | // will return the value itself (as opposed to an array containing the single
227 | // value). Set AlwaysReturnList to true to disable this behaviour, ensuring
228 | // that the Extract function always returns an array.
229 | AlwaysReturnList bool
230 |
231 | // If no elements with this attribute are found, then return 'nil' from
232 | // Extract, instead of the empty list. This signals that the result of this
233 | // Piece should be omitted entirely from the results, as opposed to including
234 | // the empty list.
235 | OmitIfEmpty bool
236 | }
237 |
238 | func (e Attr) Extract(sel *goquery.Selection) (interface{}, error) {
239 | if len(e.Attr) == 0 {
240 | return nil, errors.New("no attribute provided")
241 | }
242 |
243 | results := []string{}
244 |
245 | sel.Each(func(i int, s *goquery.Selection) {
246 | if val, found := s.Attr(e.Attr); found {
247 | results = append(results, val)
248 | }
249 | })
250 |
251 | if len(results) == 0 && e.OmitIfEmpty {
252 | return nil, nil
253 | }
254 | if len(results) == 1 && !e.AlwaysReturnList {
255 | return results[0], nil
256 | }
257 |
258 | return results, nil
259 | }
260 |
261 | var _ scrape.PieceExtractor = Attr{}
262 |
263 | // Count extracts the count of elements that are matched and returns it.
264 | type Count struct {
265 | // If no elements are matched, then return 'nil' from Extract, instead of
266 | // the number 0. This signals that the result of this Piece should be
267 | // omitted entirely from the results, as opposed to including a count of
268 | // zero.
269 | OmitIfEmpty bool
270 | }
271 |
272 | func (e Count) Extract(sel *goquery.Selection) (interface{}, error) {
273 | l := sel.Length()
274 | if l == 0 && e.OmitIfEmpty {
275 | return nil, nil
276 | }
277 |
278 | return l, nil
279 | }
280 |
--------------------------------------------------------------------------------
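To make the extractors defined above in extract/extractors.go concrete, here is a minimal, self-contained sketch that runs Text, Attr, and Regex over an in-memory document. The HTML fixture and printed values are illustrative only and are not part of the package.

```go
package main

import (
	"fmt"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/andrew-d/goscrape/extract"
)

func main() {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(
		`<div class="item"><a href="/post/42">Answer: 42</a></div>`))
	if err != nil {
		panic(err)
	}
	sel := doc.Find(".item")

	// Text returns the combined text contents of the selection.
	text, _ := extract.Text{}.Extract(sel)
	fmt.Println(text) // Answer: 42

	// Attr pulls an attribute value; with a single match it returns a string.
	href, _ := extract.Attr{Attr: "href"}.Extract(sel.Find("a"))
	fmt.Println(href) // /post/42

	// Regex extracts the first capturing group from each match of the text.
	num, _ := extract.Regex{Regex: regexp.MustCompile(`(\d+)`), OnlyText: true}.Extract(sel)
	fmt.Println(num) // 42
}
```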
/extract/extractors_test.go:
--------------------------------------------------------------------------------
1 | package extract
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 | "testing"
7 |
8 | "github.com/PuerkitoBio/goquery"
9 | "github.com/stretchr/testify/assert"
10 | )
11 |
12 | func selFrom(s string) *goquery.Selection {
13 | r := strings.NewReader(s)
14 | doc, err := goquery.NewDocumentFromReader(r)
15 | if err != nil {
16 | panic(err)
17 | }
18 |
19 | return doc.Selection
20 | }
21 |
22 | func TestText(t *testing.T) {
23 | sel := selFrom(`<p>Test 123</p>`)
24 | ret, err := Text{}.Extract(sel)
25 | assert.NoError(t, err)
26 | assert.Equal(t, ret, "Test 123")
27 |
28 | sel = selFrom(`<div><p>First</p><p>Second</p></div>`)
29 | ret, err = Text{}.Extract(sel)
30 | assert.NoError(t, err)
31 | assert.Equal(t, ret, "FirstSecond")
32 | }
33 |
34 | func TestMultipleText(t *testing.T) {
35 | sel := selFrom(`<p>Test 123</p>`)
36 | ret, err := MultipleText{}.Extract(sel.Find("p"))
37 | assert.NoError(t, err)
38 | assert.Equal(t, ret, []string{"Test 123"})
39 |
40 | sel = selFrom(`<div><p>First</p><p>Second</p></div>`)
41 | ret, err = MultipleText{}.Extract(sel.Find("p"))
42 | assert.NoError(t, err)
43 | assert.Equal(t, ret, []string{"First", "Second"})
44 | }
45 |
46 | func TestHtml(t *testing.T) {
47 | sel := selFrom(
48 | `<div class="one">` +
49 | `<div class="two">Bar</div>` +
50 | `<div class="two">Baz</div>` +
51 | `<div class="three">Asdf</div>` +
52 | `</div>`)
53 | ret, err := Html{}.Extract(sel.Find(".one"))
54 | assert.NoError(t, err)
55 | assert.Equal(t, ret, `<div class="two">Bar</div><div class="two">Baz</div><div class="three">Asdf</div>`)
56 |
57 | ret, err = Html{}.Extract(sel.Find(".two"))
58 | assert.NoError(t, err)
59 | assert.Equal(t, ret, `BarBaz`)
60 | }
61 |
62 | func TestOuterHtml(t *testing.T) {
63 | // Simple version
64 | sel := selFrom(`<div><p>Test 123</p></div>`)
65 | ret, err := OuterHtml{}.Extract(sel.Find("p"))
66 | assert.NoError(t, err)
67 | assert.Equal(t, ret, `<p>Test 123</p>`)
68 |
69 | // Should only get the outer HTML of the element, not siblings
70 | sel = selFrom(`<div><p>Test 123</p><b>foo</b></div>`)
71 | ret, err = OuterHtml{}.Extract(sel.Find("p"))
72 | assert.NoError(t, err)
73 | assert.Equal(t, ret, `<p>Test 123</p>`)
74 | }
75 |
76 | func TestRegexInvalid(t *testing.T) {
77 | var err error
78 |
79 | _, err = Regex{}.Extract(selFrom(`foo`))
80 | assert.Error(t, err, "no regex given")
81 |
82 | _, err = Regex{Regex: regexp.MustCompile(`foo`)}.Extract(selFrom(`bar`))
83 | assert.Error(t, err, "regex has no subexpressions")
84 |
85 | _, err = Regex{Regex: regexp.MustCompile(`(a)(b)`)}.Extract(selFrom(`bar`))
86 | assert.Error(t, err, "regex has more than one subexpression (2), but which to extract was not specified")
87 | }
88 |
89 | func TestRegex(t *testing.T) {
90 | sel := selFrom(`<p>foo</p><p class="fooobar">bar</p>`)
91 | ret, err := Regex{Regex: regexp.MustCompile("f(o+)o")}.Extract(sel)
92 | assert.NoError(t, err)
93 | assert.Equal(t, ret, []string{"o", "oo"})
94 |
95 | ret, err = Regex{
96 | Regex: regexp.MustCompile("f(o)?(oo)bar"),
97 | Subexpression: 2,
98 | }.Extract(sel)
99 | assert.NoError(t, err)
100 | assert.Equal(t, ret, "oo")
101 |
102 | ret, err = Regex{
103 | Regex: regexp.MustCompile("f(o+)o"),
104 | OnlyText: true,
105 | }.Extract(sel)
106 | assert.NoError(t, err)
107 | assert.Equal(t, ret, "o")
108 |
109 | ret, err = Regex{
110 | Regex: regexp.MustCompile("f(o+)o"),
111 | OnlyText: true,
112 | AlwaysReturnList: true,
113 | }.Extract(sel)
114 | assert.NoError(t, err)
115 | assert.Equal(t, ret, []string{"o"})
116 |
117 | ret, err = Regex{
118 | Regex: regexp.MustCompile("a(sd)f"),
119 | OmitIfEmpty: true,
120 | }.Extract(sel)
121 | assert.NoError(t, err)
122 | assert.Nil(t, ret)
123 | }
124 |
125 | func TestAttrInvalid(t *testing.T) {
126 | var err error
127 |
128 | _, err = Attr{}.Extract(selFrom(`foo`))
129 | assert.Error(t, err, "no attribute provided")
130 | }
131 |
132 | func TestAttr(t *testing.T) {
133 | sel := selFrom(`
134 | <a href="http://www.google.com">google</a>
135 | <a href="http://www.yahoo.com">yahoo</a>
136 | <a href="http://www.microsoft.com" class="notsearch">microsoft</a>
137 | `)
138 | ret, err := Attr{Attr: "href"}.Extract(sel.Find("a"))
139 | assert.NoError(t, err)
140 | assert.Equal(t, ret, []string{
141 | "http://www.google.com",
142 | "http://www.yahoo.com",
143 | "http://www.microsoft.com",
144 | })
145 |
146 | ret, err = Attr{Attr: "href"}.Extract(sel.Find(".notsearch"))
147 | assert.NoError(t, err)
148 | assert.Equal(t, ret, "http://www.microsoft.com")
149 |
150 | ret, err = Attr{Attr: "href", AlwaysReturnList: true}.Extract(sel.Find(".notsearch"))
151 | assert.NoError(t, err)
152 | assert.Equal(t, ret, []string{"http://www.microsoft.com"})
153 |
154 | ret, err = Attr{
155 | Attr: "href",
156 | AlwaysReturnList: true,
157 | }.Extract(sel.Find(".abc"))
158 | assert.NoError(t, err)
159 | assert.Equal(t, ret, []string{})
160 |
161 | ret, err = Attr{
162 | Attr: "href",
163 | OmitIfEmpty: true,
164 | }.Extract(sel.Find(".abc"))
165 | assert.NoError(t, err)
166 | assert.Nil(t, ret)
167 | }
168 |
169 | func TestCount(t *testing.T) {
170 | sel := selFrom(`
171 | <div>One</div>
172 | <div>Two</div>
173 | <div class="foo">Three</div>
174 | `)
175 |
176 | ret, err := Count{}.Extract(sel.Find("div"))
177 | assert.NoError(t, err)
178 | assert.Equal(t, ret, 3)
179 |
180 | ret, err = Count{}.Extract(sel.Find(".foo"))
181 | assert.NoError(t, err)
182 | assert.Equal(t, ret, 1)
183 |
184 | ret, err = Count{}.Extract(sel.Find(".bad"))
185 | assert.NoError(t, err)
186 | assert.Equal(t, ret, 0)
187 |
188 | ret, err = Count{OmitIfEmpty: true}.Extract(sel.Find(".bad"))
189 | assert.NoError(t, err)
190 | assert.Nil(t, ret)
191 | }
192 |
--------------------------------------------------------------------------------
/fetcher.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "io"
5 | "net/http"
6 | "net/http/cookiejar"
7 |
8 | "golang.org/x/net/publicsuffix"
9 | )
10 |
11 | // Fetcher is the interface that must be satisfied by things that can fetch
12 | // remote URLs and return their contents.
13 | //
14 | // Note: Fetchers may or may not be safe to use concurrently. Please read the
15 | // documentation for each fetcher for more details.
16 | type Fetcher interface {
17 | // Prepare is called once at the beginning of the scrape.
18 | Prepare() error
19 |
20 | // Fetch is called to retrieve each document from the remote server.
21 | Fetch(method, url string) (io.ReadCloser, error)
22 |
23 | // Close is called when the scrape is finished, and can be used to clean up
24 | // allocated resources or perform other cleanup actions.
25 | Close()
26 | }
27 |
28 | // HttpClientFetcher is a Fetcher that uses the Go standard library's http
29 | // client to fetch URLs.
30 | type HttpClientFetcher struct {
31 | client *http.Client
32 |
33 | // PrepareClient prepares this fetcher's http.Client for usage. Use this
34 | // function to do things like logging in. If the function returns an error,
35 | // the scrape is aborted.
36 | PrepareClient func(*http.Client) error
37 |
38 | // PrepareRequest prepares each request that will be sent, prior to sending.
39 | // This is useful for, e.g. setting custom HTTP headers, changing the User-
40 | // Agent, and so on. If the function returns an error, then the scrape will
41 | // be aborted.
42 | //
43 | // Note: this function does NOT apply to requests made during the
44 | // PrepareClient function (above).
45 | PrepareRequest func(*http.Request) error
46 |
47 | // ProcessResponse modifies a response that is returned from the server before
48 | // it is handled by the scraper. If the function returns an error, then the
49 | // scrape will be aborted.
50 | ProcessResponse func(*http.Response) error
51 | }
52 |
53 | func NewHttpClientFetcher() (*HttpClientFetcher, error) {
54 | // Set up the HTTP client
55 | jarOpts := &cookiejar.Options{PublicSuffixList: publicsuffix.List}
56 | jar, err := cookiejar.New(jarOpts)
57 | if err != nil {
58 | return nil, err
59 | }
60 | client := &http.Client{Jar: jar}
61 |
62 | ret := &HttpClientFetcher{
63 | client: client,
64 | }
65 | return ret, nil
66 | }
67 |
68 | func (hf *HttpClientFetcher) Prepare() error {
69 | if hf.PrepareClient != nil {
70 | return hf.PrepareClient(hf.client)
71 | }
72 | return nil
73 | }
74 |
75 | func (hf *HttpClientFetcher) Fetch(method, url string) (io.ReadCloser, error) {
76 | req, err := http.NewRequest(method, url, nil)
77 | if err != nil {
78 | return nil, err
79 | }
80 |
81 | if hf.PrepareRequest != nil {
82 | if err = hf.PrepareRequest(req); err != nil {
83 | return nil, err
84 | }
85 | }
86 |
87 | resp, err := hf.client.Do(req)
88 | if err != nil {
89 | return nil, err
90 | }
91 |
92 | if hf.ProcessResponse != nil {
93 | if err = hf.ProcessResponse(resp); err != nil {
94 | return nil, err
95 | }
96 | }
97 |
98 | return resp.Body, nil
99 | }
100 |
101 | func (hf *HttpClientFetcher) Close() {
102 | return
103 | }
104 |
105 | // Static type assertion
106 | var _ Fetcher = &HttpClientFetcher{}
107 |
--------------------------------------------------------------------------------
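As a sketch of how the hooks on HttpClientFetcher (above) can be used, the following hypothetical program sets a custom User-Agent on every request via PrepareRequest and plugs the fetcher into a ScrapeConfig. The URL, selector, and User-Agent string are placeholders, not anything the package prescribes.

```go
package main

import (
	"fmt"
	"net/http"
	"os"

	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

func main() {
	fetcher, err := scrape.NewHttpClientFetcher()
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating fetcher: %s\n", err)
		os.Exit(1)
	}

	// Set a custom User-Agent on every request made during the scrape.
	fetcher.PrepareRequest = func(req *http.Request) error {
		req.Header.Set("User-Agent", "my-scraper/1.0 (example)")
		return nil
	}

	config := &scrape.ScrapeConfig{
		Fetcher: fetcher,
		Pieces: []scrape.Piece{
			{Name: "heading", Selector: "h1", Extractor: extract.Text{}},
		},
	}

	scraper, err := scrape.New(config)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
		os.Exit(1)
	}

	results, err := scraper.Scrape("http://example.com")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
		os.Exit(1)
	}
	fmt.Println(results.First())
}
```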
/helpers.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "github.com/PuerkitoBio/goquery"
5 | )
6 |
7 | type dummyPaginator struct {
8 | }
9 |
10 | func (p dummyPaginator) NextPage(uri string, doc *goquery.Selection) (string, error) {
11 | return "", nil
12 | }
13 |
14 | // DividePageBySelector returns a function that divides a page into blocks by
15 | // CSS selector. Each element in the page with the given selector is treated
16 | // as a new block.
17 | func DividePageBySelector(sel string) DividePageFunc {
18 | ret := func(doc *goquery.Selection) []*goquery.Selection {
19 | sels := []*goquery.Selection{}
20 | doc.Find(sel).Each(func(i int, s *goquery.Selection) {
21 | sels = append(sels, s)
22 | })
23 |
24 | return sels
25 | }
26 | return ret
27 | }
28 |
--------------------------------------------------------------------------------
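DividePage accepts any function with the DividePageFunc signature, so a page can be split up however you like rather than only by CSS selector. Below is a hypothetical divider that treats each table row as its own block; the function name, selectors, and URL are invented for illustration.

```go
package main

import (
	"fmt"

	"github.com/PuerkitoBio/goquery"
	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

// divideByTableRows is a hypothetical DividePageFunc that treats every table
// row in the page as its own block.
func divideByTableRows(doc *goquery.Selection) []*goquery.Selection {
	blocks := []*goquery.Selection{}
	doc.Find("table tr").Each(func(i int, s *goquery.Selection) {
		blocks = append(blocks, s)
	})
	return blocks
}

func main() {
	scraper, err := scrape.New(&scrape.ScrapeConfig{
		DividePage: divideByTableRows,
		Pieces: []scrape.Piece{
			// Extract the text of every cell in each row.
			{Name: "cells", Selector: "td", Extractor: extract.MultipleText{}},
		},
	})
	if err != nil {
		panic(err)
	}

	// The URL here is a placeholder for any page containing a table.
	results, err := scraper.Scrape("http://example.com/prices.html")
	if err != nil {
		panic(err)
	}
	fmt.Println(results.AllBlocks())
}
```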
/options.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | // ScrapeOptions contains options that are used during the progress of a
4 | // scrape.
5 | type ScrapeOptions struct {
6 | // The maximum number of pages to scrape. The scrape will proceed until
7 | // either this number of pages have been scraped, or until the paginator
8 | // returns no further URLs. Set this value to 0 to indicate an unlimited
9 | // number of pages can be scraped.
10 | MaxPages int
11 | }
12 |
13 | // The default options during a scrape.
14 | var DefaultOptions = ScrapeOptions{
15 | MaxPages: 0,
16 | }
17 |
--------------------------------------------------------------------------------
/package_test.go:
--------------------------------------------------------------------------------
1 | package scrape_test
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "testing"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/andrew-d/goscrape"
11 | "github.com/andrew-d/goscrape/extract"
12 | "github.com/stretchr/testify/assert"
13 | )
14 |
15 | func TestDefaultPaginator(t *testing.T) {
16 | sc := mustNew(&scrape.ScrapeConfig{
17 | Fetcher: newDummyFetcher([][]byte{
18 | []byte("one"),
19 | []byte("two"),
20 | []byte("three"),
21 | []byte("four"),
22 | }),
23 |
24 | Pieces: []scrape.Piece{
25 | {Name: "dummy", Selector: ".", Extractor: extract.Const{Val: "asdf"}},
26 | },
27 | })
28 |
29 | results, err := sc.ScrapeWithOpts(
30 | "initial",
31 | scrape.ScrapeOptions{MaxPages: 3},
32 | )
33 | assert.NoError(t, err)
34 | assert.Equal(t, results.URLs, []string{"initial"})
35 | assert.Equal(t, len(results.Results), 1)
36 | assert.Equal(t, len(results.Results[0]), 1)
37 | }
38 |
39 | func TestPageLimits(t *testing.T) {
40 | sc := mustNew(&scrape.ScrapeConfig{
41 | Fetcher: newDummyFetcher([][]byte{
42 | []byte("one"),
43 | []byte("two"),
44 | []byte("three"),
45 | []byte("four"),
46 | }),
47 |
48 | Paginator: &dummyPaginator{},
49 |
50 | Pieces: []scrape.Piece{
51 | {Name: "dummy", Selector: ".", Extractor: extract.Const{Val: "asdf"}},
52 | },
53 | })
54 |
55 | results, err := sc.ScrapeWithOpts(
56 | "initial",
57 | scrape.ScrapeOptions{MaxPages: 3},
58 | )
59 | assert.NoError(t, err)
60 | assert.Equal(t, []string{
61 | "initial",
62 | "url-1",
63 | "url-2",
64 | }, results.URLs)
65 | }
66 |
67 | func mustNew(c *scrape.ScrapeConfig) *scrape.Scraper {
68 | scraper, err := scrape.New(c)
69 | if err != nil {
70 | panic(err)
71 | }
72 | return scraper
73 | }
74 |
75 | type dummyFetcher struct {
76 | data [][]byte
77 | idx int
78 | }
79 |
80 | func newDummyFetcher(data [][]byte) *dummyFetcher {
81 | return &dummyFetcher{
82 | data: data,
83 | idx: 0,
84 | }
85 | }
86 |
87 | func (d *dummyFetcher) Prepare() error {
88 | return nil
89 | }
90 |
91 | func (d *dummyFetcher) Fetch(method, url string) (io.ReadCloser, error) {
92 | r := dummyReadCloser{bytes.NewReader(d.data[d.idx])}
93 | d.idx++
94 | return r, nil
95 | }
96 |
97 | func (d *dummyFetcher) Close() {
98 | return
99 | }
100 |
101 | type dummyPaginator struct {
102 | idx int
103 | }
104 |
105 | func (d *dummyPaginator) NextPage(url string, document *goquery.Selection) (string, error) {
106 | d.idx++
107 | return fmt.Sprintf("url-%d", d.idx), nil
108 | }
109 |
110 | type dummyReadCloser struct {
111 | u io.Reader
112 | }
113 |
114 | func (d dummyReadCloser) Read(b []byte) (int, error) {
115 | return d.u.Read(b)
116 | }
117 |
118 | func (d dummyReadCloser) Close() error {
119 | return nil
120 | }
121 |
--------------------------------------------------------------------------------
/paginate/delay.go:
--------------------------------------------------------------------------------
1 | package paginate
2 |
3 | import (
4 | "time"
5 |
6 | "github.com/PuerkitoBio/goquery"
7 | "github.com/andrew-d/goscrape"
8 | )
9 |
10 | type withDelayPaginator struct {
11 | delay time.Duration
12 | p scrape.Paginator
13 | }
14 |
15 | // WithDelay returns a Paginator that will wait the given duration whenever the
16 | // next page is requested, and will then dispatch to the underlying Paginator.
17 | func WithDelay(delay time.Duration, p scrape.Paginator) scrape.Paginator {
18 | return &withDelayPaginator{
19 | delay: delay,
20 | p: p,
21 | }
22 | }
23 |
24 | func (p *withDelayPaginator) NextPage(uri string, doc *goquery.Selection) (string, error) {
25 | time.Sleep(p.delay)
26 | return p.p.NextPage(uri, doc)
27 | }
28 |
--------------------------------------------------------------------------------
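A usage sketch for WithDelay (above): wrapping BySelector so each page fetch is throttled before the next link is followed. The selectors, URL, and timings are placeholders chosen for illustration.

```go
package main

import (
	"encoding/json"
	"os"
	"time"

	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
	"github.com/andrew-d/goscrape/paginate"
)

func main() {
	config := &scrape.ScrapeConfig{
		DividePage: scrape.DividePageBySelector("article"),

		Pieces: []scrape.Piece{
			{Name: "title", Selector: "h2", Extractor: extract.Text{}},
		},

		// Wait two seconds before following each "next" link so the target
		// site isn't hammered.
		Paginator: paginate.WithDelay(2*time.Second,
			paginate.BySelector("a[rel='next']", "href")),
	}

	scraper, err := scrape.New(config)
	if err != nil {
		panic(err)
	}

	results, err := scraper.ScrapeWithOpts("http://blog.example.com",
		scrape.ScrapeOptions{MaxPages: 5})
	if err != nil {
		panic(err)
	}
	json.NewEncoder(os.Stdout).Encode(results)
}
```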
/paginate/paginate.go:
--------------------------------------------------------------------------------
1 | package paginate
2 |
3 | import (
4 | "net/url"
5 | "strconv"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/andrew-d/goscrape"
9 | )
10 |
11 | // RelUrl is a helper function that aids in calculating the absolute URL from a
12 | // base URL and relative URL.
13 | func RelUrl(base, rel string) (string, error) {
14 | baseUrl, err := url.Parse(base)
15 | if err != nil {
16 | return "", err
17 | }
18 | relUrl, err := url.Parse(rel)
19 | if err != nil {
20 | return "", err
21 | }
22 |
23 | newUrl := baseUrl.ResolveReference(relUrl)
24 | return newUrl.String(), nil
25 | }
26 |
27 | type bySelectorPaginator struct {
28 | sel string
29 | attr string
30 | }
31 |
32 | // BySelector returns a Paginator that extracts the next page from a document by
33 | // querying a given CSS selector and extracting the given HTML attribute from the
34 | // resulting element.
35 | func BySelector(sel, attr string) scrape.Paginator {
36 | return &bySelectorPaginator{
37 | sel: sel, attr: attr,
38 | }
39 | }
40 |
41 | func (p *bySelectorPaginator) NextPage(uri string, doc *goquery.Selection) (string, error) {
42 | val, found := doc.Find(p.sel).Attr(p.attr)
43 | if !found {
44 | return "", nil
45 | }
46 |
47 | return RelUrl(uri, val)
48 | }
49 |
50 | type byQueryParamPaginator struct {
51 | param string
52 | }
53 |
54 | // ByQueryParam returns a Paginator that returns the next page from a document
55 | // by incrementing a given query parameter. Note that this will paginate
56 | // infinitely - you probably want to specify a maximum number of pages to
57 | // scrape by using the ScrapeWithOpts method.
58 | func ByQueryParam(param string) scrape.Paginator {
59 | return &byQueryParamPaginator{param}
60 | }
61 |
62 | func (p *byQueryParamPaginator) NextPage(u string, _ *goquery.Selection) (string, error) {
63 | // Parse
64 | uri, err := url.Parse(u)
65 | if err != nil {
66 | return "", err
67 | }
68 |
69 | // Parse query
70 | vals, err := url.ParseQuery(uri.RawQuery)
71 | if err != nil {
72 | return "", err
73 | }
74 |
75 | // Find query param and increment. If it doesn't exist, then we just stop.
76 | params, ok := vals[p.param]
77 | if !ok || len(params) < 1 {
78 | return "", nil
79 | }
80 |
81 | parsed, err := strconv.ParseUint(params[0], 10, 64)
82 | if err != nil {
83 | // TODO: should this be fatal?
84 | return "", nil
85 | }
86 |
87 | // Put everything back together
88 | params[0] = strconv.FormatUint(parsed+1, 10)
89 | vals[p.param] = params
90 | query := vals.Encode()
91 | uri.RawQuery = query
92 | return uri.String(), nil
93 | }
94 |
--------------------------------------------------------------------------------
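The paginators above can also be exercised directly, which makes their semantics easy to see. The following sketch (URLs are placeholders) shows ByQueryParam rewriting a query string and RelUrl resolving a relative link.

```go
package main

import (
	"fmt"

	"github.com/andrew-d/goscrape/paginate"
)

func main() {
	p := paginate.ByQueryParam("page")

	// ByQueryParam only rewrites the URL, so the document argument is unused
	// and a nil selection is fine here.
	next, err := p.NextPage("http://example.com/search?q=go&page=2", nil)
	if err != nil {
		panic(err)
	}
	fmt.Println(next) // http://example.com/search?page=3&q=go

	// When the parameter is missing, an empty string is returned and the
	// scrape stops paginating.
	next, _ = p.NextPage("http://example.com/search?q=go", nil)
	fmt.Println(next == "") // true

	// RelUrl resolves a relative link against the page it was found on.
	abs, _ := paginate.RelUrl("http://example.com/a/b", "../c")
	fmt.Println(abs) // http://example.com/c
}
```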
/paginate/paginate_test.go:
--------------------------------------------------------------------------------
1 | package paginate
2 |
3 | import (
4 | "strings"
5 | "testing"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | "github.com/stretchr/testify/assert"
9 | )
10 |
11 | func selFrom(s string) *goquery.Selection {
12 | r := strings.NewReader(s)
13 | doc, err := goquery.NewDocumentFromReader(r)
14 | if err != nil {
15 | panic(err)
16 | }
17 |
18 | return doc.Selection
19 | }
20 |
21 | func TestBySelector(t *testing.T) {
22 | sel := selFrom(`<a href="http://www.google.com">foo</a>`)
23 |
24 | pg, err := BySelector("a", "href").NextPage("", sel)
25 | assert.NoError(t, err)
26 | assert.Equal(t, pg, "http://www.google.com")
27 |
28 | pg, err = BySelector("div", "xxx").NextPage("", sel)
29 | assert.NoError(t, err)
30 | assert.Equal(t, pg, "")
31 |
32 | sel = selFrom(`<a href="/foobar">foo</a>`)
33 |
34 | pg, err = BySelector("a", "href").NextPage("http://www.google.com", sel)
35 | assert.NoError(t, err)
36 | assert.Equal(t, pg, "http://www.google.com/foobar")
37 |
38 | sel = selFrom(`<a href="/asdf?q=123">foo</a>`)
39 |
40 | pg, err = BySelector("a", "href").NextPage("http://www.google.com", sel)
41 | assert.NoError(t, err)
42 | assert.Equal(t, pg, "http://www.google.com/asdf?q=123")
43 | }
44 |
45 | func TestByQueryParam(t *testing.T) {
46 | pg, err := ByQueryParam("foo").NextPage("http://www.google.com?foo=1", nil)
47 | assert.NoError(t, err)
48 | assert.Equal(t, pg, "http://www.google.com?foo=2")
49 |
50 | pg, err = ByQueryParam("bad").NextPage("http://www.google.com", nil)
51 | assert.NoError(t, err)
52 | assert.Equal(t, pg, "")
53 |
54 | pg, err = ByQueryParam("bad").NextPage("http://www.google.com?bad=asdf", nil)
55 | assert.NoError(t, err)
56 | assert.Equal(t, pg, "")
57 | }
58 |
--------------------------------------------------------------------------------
/phantomjs.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "errors"
7 | "fmt"
8 | "io"
9 | "io/ioutil"
10 | "os/exec"
11 | "path/filepath"
12 | )
13 |
14 | const fetchScript = `
15 | var system = require('system'),
16 | page = require("webpage").create();
17 |
18 | // Workaround for https://github.com/ariya/phantomjs/issues/12697 since
19 | // it doesn't seem like there will be another 1.9.x release fixing this
20 | var phantomExit = function(exitCode) {
21 | page.close();
22 | setTimeout(function() { phantom.exit(exitCode); }, 0);
23 | };
24 |
25 | if( system.args.length !== 2 ) {
26 | system.stderr.writeLine("Usage: fetch.js URL");
27 | phantomExit(1);
28 | }
29 |
30 | var resourceWait = 300,
31 | maxRenderWait = 10000,
32 | url = system.args[1],
33 | count = 0,
34 | forcedRenderTimeout,
35 | renderTimeout;
36 |
37 | var doRender = function() {
38 | var c = page.evaluate(function() {
39 | return document.documentElement.outerHTML;
40 | });
41 |
42 | system.stdout.write(JSON.stringify({contents: c}));
43 | phantomExit();
44 | }
45 |
46 | page.onResourceRequested = function (req) {
47 | count += 1;
48 | system.stderr.writeLine('> ' + req.id + ' - ' + req.url);
49 | clearTimeout(renderTimeout);
50 | };
51 |
52 | page.onResourceReceived = function (res) {
53 | if (!res.stage || res.stage === 'end') {
54 | count -= 1;
55 | system.stderr.writeLine(res.id + ' ' + res.status + ' - ' + res.url);
56 | if (count === 0) {
57 | renderTimeout = setTimeout(doRender, resourceWait);
58 | }
59 | }
60 | };
61 |
62 | page.open(url, function (status) {
63 | if (status !== "success") {
64 | system.stderr.writeLine('Unable to load url');
65 | phantomExit(1);
66 | } else {
67 | forcedRenderTimeout = setTimeout(function () {
68 | // Write to stderr, not stdout, so the JSON output isn't corrupted.
69 | system.stderr.writeLine('forced render; pending resources: ' + count);
69 | doRender();
70 | }, maxRenderWait);
71 | }
72 | });
73 | `
74 |
75 | var (
76 | // PhantomJS was not found on the system. You should consider passing an
77 | // explicit path to NewPhantomJSFetcher().
78 | ErrNoPhantomJS = errors.New("PhantomJS was not found")
79 |
80 | // This error is returned when we try to use PhantomJS to perform a non-GET
81 | // request.
82 | ErrInvalidMethod = errors.New("invalid method")
83 | )
84 |
85 | func findPhantomJS() string {
86 | var path string
87 | var err error
88 |
89 | for _, nm := range []string{"phantomjs", "phantom"} {
90 | path, err = exec.LookPath(nm)
91 | if err == nil {
92 | return path
93 | }
94 | }
95 |
96 | return ""
97 | }
98 |
99 | // HasPhantomJS returns whether we can find a PhantomJS installation on this system.
100 | // If this returns "false", creating a PhantomJSFetcher will fail.
101 | func HasPhantomJS() bool {
102 | return findPhantomJS() != ""
103 | }
104 |
105 | // PhantomJSFetcher is a Fetcher that calls out to PhantomJS
106 | // (http://phantomjs.org/) in order to fetch a page's content. Since PhantomJS
107 | // will evaluate Javascript in a page, this is the recommended Fetcher to use
108 | // for Javascript-heavy pages.
109 | type PhantomJSFetcher struct {
110 | binaryPath string
111 | tempDir string
112 | scriptPath string
113 |
114 | // Arguments to pass to PhantomJS
115 | args []string
116 | }
117 |
118 | // NewPhantomJSFetcher will create a new instance of PhantomJSFetcher,
119 | // searching the system's PATH for the appropriate binary. If PhantomJS is not
120 | // in the PATH, or you would like to use an alternate binary, then you can give
121 | // an overridden path.
122 | func NewPhantomJSFetcher(binary ...string) (*PhantomJSFetcher, error) {
123 | var path string
124 |
125 | // Find the PhantomJS binary
126 | if len(binary) == 0 || len(binary) == 1 && binary[0] == "" {
127 | path = findPhantomJS()
128 | } else if len(binary) == 1 {
129 | path = binary[0]
130 | } else {
131 | return nil, errors.New("invalid number of arguments")
132 | }
133 |
134 | if path == "" {
135 | return nil, ErrNoPhantomJS
136 | }
137 |
138 | // Create a temporary directory
139 | tdir, err := ioutil.TempDir("", "goscrape-phantom-")
140 | if err != nil {
141 | return nil, err
142 | }
143 |
144 | // Write our fetching script there (so it can be called)
145 | spath := filepath.Join(tdir, "fetch.js")
146 | err = ioutil.WriteFile(spath, []byte(fetchScript), 0600)
147 | if err != nil {
148 | return nil, err
149 | }
150 |
151 | ret := &PhantomJSFetcher{
152 | binaryPath: path,
153 | tempDir: tdir,
154 | scriptPath: spath,
155 | }
156 | return ret, nil
157 | }
158 |
159 | func (pf *PhantomJSFetcher) Prepare() error {
160 | // TODO: configure ssl errors / web security
161 | // TODO: cookies file path might break if spaces
162 | pf.args = []string{
163 | "--ignore-ssl-errors=true",
164 | "--web-security=false",
165 | "--cookies-file=" + filepath.Join(pf.tempDir, "cookies.dat"),
166 | pf.scriptPath,
167 | }
168 | return nil
169 | }
170 |
171 | func (pf *PhantomJSFetcher) Fetch(method, url string) (io.ReadCloser, error) {
172 | if method != "GET" {
173 | return nil, ErrInvalidMethod
174 | }
175 |
176 | // Call the fetch script with these parameters.
177 | cmd := exec.Command(pf.binaryPath, append(pf.args, url)...)
178 |
179 | var stdout, stderr bytes.Buffer
180 | cmd.Stdout = &stdout
181 | cmd.Stderr = &stderr
182 |
183 | err := cmd.Run()
184 | if err != nil {
185 | return nil, err
186 | }
187 |
188 | // Load the resulting JSON.
189 | results := map[string]interface{}{}
190 | err = json.NewDecoder(&stdout).Decode(&results)
191 | if err != nil {
192 | return nil, err
193 | }
194 |
195 | // Return the contents
196 | contents, ok := results["contents"].(string)
197 | if !ok {
198 | return nil, fmt.Errorf("unknown type for 'contents': %T", results["contents"])
199 | }
200 |
201 | return newStringReadCloser(contents), nil
202 | }
203 |
204 | func (pf *PhantomJSFetcher) Close() {
205 | return
206 | }
207 |
208 | // Static type assertion
209 | var _ Fetcher = &PhantomJSFetcher{}
210 |
--------------------------------------------------------------------------------
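A usage sketch for the PhantomJS fetcher defined above: check HasPhantomJS and fall back to the plain HTTP fetcher when the binary is missing. The URL and selector are placeholders, and the fallback policy is just one possible choice.

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"

	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

func main() {
	// Use PhantomJS when available so Javascript-rendered content is seen;
	// otherwise fall back to the standard HTTP client fetcher.
	var fetcher scrape.Fetcher
	var err error
	if scrape.HasPhantomJS() {
		fetcher, err = scrape.NewPhantomJSFetcher()
	} else {
		fetcher, err = scrape.NewHttpClientFetcher()
	}
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating fetcher: %s\n", err)
		os.Exit(1)
	}

	config := &scrape.ScrapeConfig{
		Fetcher: fetcher,
		Pieces: []scrape.Piece{
			{Name: "headline", Selector: "h1", Extractor: extract.Text{}},
		},
	}

	scraper, err := scrape.New(config)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error creating scraper: %s\n", err)
		os.Exit(1)
	}

	results, err := scraper.Scrape("http://example.com")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error scraping: %s\n", err)
		os.Exit(1)
	}
	json.NewEncoder(os.Stdout).Encode(results)
}
```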
/results_test.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/stretchr/testify/assert"
7 | )
8 |
9 | func TestResultsFirst(t *testing.T) {
10 | r := &ScrapeResults{
11 | Results: [][]map[string]interface{}{
12 | {{"foo": 1, "bar": 2}},
13 | },
14 | }
15 |
16 | assert.Equal(t, r.First(), map[string]interface{}{
17 | "foo": 1,
18 | "bar": 2,
19 | })
20 |
21 | r = &ScrapeResults{
22 | Results: [][]map[string]interface{}{{}},
23 | }
24 | assert.Nil(t, r.First())
25 | }
26 |
27 | func TestResultsAllBlocks(t *testing.T) {
28 | r := &ScrapeResults{
29 | Results: [][]map[string]interface{}{
30 | {{"foo": 1, "bar": 2}},
31 | {{"baz": 3, "asdf": 4}},
32 | },
33 | }
34 |
35 | assert.Equal(t, r.AllBlocks(), []map[string]interface{}{
36 | {"foo": 1, "bar": 2},
37 | {"baz": 3, "asdf": 4},
38 | })
39 | }
40 |
--------------------------------------------------------------------------------
/scrape.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 |
7 | "github.com/PuerkitoBio/goquery"
8 | )
9 |
10 | var (
11 | ErrNoPieces = errors.New("no pieces in the config")
12 | )
13 |
14 | // The DividePageFunc type is used to extract a page's blocks during a scrape.
15 | // For more information, please see the documentation on the ScrapeConfig type.
16 | type DividePageFunc func(*goquery.Selection) []*goquery.Selection
17 |
18 | // The PieceExtractor interface represents something that can extract data from
19 | // a selection.
20 | type PieceExtractor interface {
21 | // Extract some data from the given Selection and return it. The returned
22 | // data should be encodable - i.e. passing it to json.Marshal should succeed.
23 | // If the returned data is nil, then the output from this piece will not be
24 | // included.
25 | //
26 | // If this function returns an error, then the scrape is aborted.
27 | Extract(*goquery.Selection) (interface{}, error)
28 | }
29 |
30 | // The Paginator interface should be implemented by things that can retrieve the
31 | // next page from the current one.
32 | type Paginator interface {
33 | // NextPage controls the progress of the scrape. It is called for each input
34 | // page, starting with the origin URL, and is expected to return the URL of
35 | // the next page to process. Note that order matters - calling 'NextPage' on
36 | // page 1 should return page 2, not page 3. The function should return an
37 | // empty string when there are no more pages to process.
38 | NextPage(url string, document *goquery.Selection) (string, error)
39 | // TODO(andrew-d): should this return a string, a url.URL, ???
40 | }
41 |
42 | // A Piece represents a given chunk of data that is to be extracted from every
43 | // block in each page of a scrape.
44 | type Piece struct {
45 | // The name of this piece. Required, and will be used to aggregate results.
46 | Name string
47 |
48 | // A sub-selector within the given block to process. Pass in "." to use
49 | // the block's selection itself, with no modification.
50 | Selector string
51 | // TODO(andrew-d): Consider making this an interface too.
52 |
53 | // Extractor contains the logic on how to extract some results from the
54 | // selector that is provided to this Piece.
55 | Extractor PieceExtractor
56 | }
57 |
58 | // The main configuration for a scrape. Pass this to the New() function.
59 | type ScrapeConfig struct {
60 | // Fetcher is the underlying transport that is used to fetch documents.
61 | // If this is not specified (i.e. left nil), then a default HttpClientFetcher
62 | // will be created and used.
63 | Fetcher Fetcher
64 |
65 | // Paginator is the Paginator to use for this current scrape.
66 | //
67 | // If Paginator is nil, then no pagination is performed and it is assumed that
68 | // the initial URL is the only page.
69 | Paginator Paginator
70 |
71 | // DividePage splits a page into individual 'blocks'. When scraping, we treat
72 | // each page as if it contains some number of 'blocks', each of which can be
73 | // further subdivided into what actually needs to be extracted.
74 | //
75 | // If the DividePage function is nil, then no division is performed and the
76 | // page is assumed to contain a single block containing the entire <body>
77 | // element.
78 | DividePage DividePageFunc
79 |
80 | // Pieces contains the list of data that is extracted for each block. For
81 | // every block that is the result of the DividePage function (above), all of
82 | // the Pieces entries receive the selection representing the block, and can
83 | // return a result. If the returned result is nil, then the Piece is
84 | // considered not to exist in this block, and is not included.
85 | //
86 | // Note: if a Piece's Extractor returns an error, it results in the scrape
87 | // being aborted - this can be useful if you need to ensure that a given Piece
88 | // is always present, for example.
89 | Pieces []Piece
90 | }
91 |
92 | func (c *ScrapeConfig) clone() *ScrapeConfig {
93 | ret := &ScrapeConfig{
94 | Fetcher: c.Fetcher,
95 | Paginator: c.Paginator,
96 | DividePage: c.DividePage,
97 | Pieces: c.Pieces,
98 | }
99 | return ret
100 | }
101 |
102 | // ScrapeResults describes the results of a scrape. It contains a list of all
103 | // pages (URLs) visited during the process, along with all results generated
104 | // from each Piece in each page.
105 | type ScrapeResults struct {
106 | // All URLs visited during this scrape, in order. Always contains at least
107 | // one element - the initial URL.
108 | URLs []string
109 |
110 | // The results from each Piece of each page. Essentially, the top-level array
111 | // is for each page, the second-level array is for each block in a page, and
112 | // the final map[string]interface{} is the mapping of Piece.Name to results.
113 | Results [][]map[string]interface{}
114 | }
115 |
116 | // First returns the first set of results - i.e. the results from the first
117 | // block on the first page.
118 | //
119 | // This function can return nil if there were no blocks found on the first page
120 | // of the scrape.
121 | func (r *ScrapeResults) First() map[string]interface{} {
122 | if len(r.Results[0]) == 0 {
123 | return nil
124 | }
125 |
126 | return r.Results[0][0]
127 | }
128 |
129 | // AllBlocks returns a single list of results from every block on all pages.
130 | // This function will always return a list, even if no blocks were found.
131 | func (r *ScrapeResults) AllBlocks() []map[string]interface{} {
132 | ret := []map[string]interface{}{}
133 |
134 | for _, page := range r.Results {
135 | for _, block := range page {
136 | ret = append(ret, block)
137 | }
138 | }
139 |
140 | return ret
141 | }
142 |
143 | type Scraper struct {
144 | config *ScrapeConfig
145 | }
146 |
147 | // Create a new scraper with the provided configuration.
148 | func New(c *ScrapeConfig) (*Scraper, error) {
149 | var err error
150 |
151 | // Validate config
152 | if len(c.Pieces) == 0 {
153 | return nil, ErrNoPieces
154 | }
155 |
156 | seenNames := map[string]struct{}{}
157 | for i, piece := range c.Pieces {
158 | if len(piece.Name) == 0 {
159 | return nil, fmt.Errorf("no name provided for piece %d", i)
160 | }
161 | if _, seen := seenNames[piece.Name]; seen {
162 | return nil, fmt.Errorf("piece %d has a duplicate name", i)
163 | }
164 | seenNames[piece.Name] = struct{}{}
165 |
166 | if len(piece.Selector) == 0 {
167 | return nil, fmt.Errorf("no selector provided for piece %d", i)
168 | }
169 | }
170 |
171 | // Clone the configuration and fill in the defaults.
172 | config := c.clone()
173 | if config.Paginator == nil {
174 | config.Paginator = dummyPaginator{}
175 | }
176 | if config.DividePage == nil {
177 | config.DividePage = DividePageBySelector("body")
178 | }
179 |
180 | if config.Fetcher == nil {
181 | config.Fetcher, err = NewHttpClientFetcher()
182 | if err != nil {
183 | return nil, err
184 | }
185 | }
186 |
187 | // All set!
188 | ret := &Scraper{
189 | config: config,
190 | }
191 | return ret, nil
192 | }
193 |
194 | // Scrape a given URL with default options. See 'ScrapeWithOpts' for more
195 | // information.
196 | func (s *Scraper) Scrape(url string) (*ScrapeResults, error) {
197 | return s.ScrapeWithOpts(url, DefaultOptions)
198 | }
199 |
200 | // Actually start scraping at the given URL.
201 | //
202 | // Note that, while this function and the Scraper in general are safe for use
203 | // from multiple goroutines, making multiple requests in parallel can cause
204 | // strange behaviour - e.g. overwriting cookies in the underlying http.Client.
205 | // Please be careful when running multiple scrapes at a time, unless you know
206 | // that it's safe.
207 | func (s *Scraper) ScrapeWithOpts(url string, opts ScrapeOptions) (*ScrapeResults, error) {
208 | if len(url) == 0 {
209 | return nil, errors.New("no URL provided")
210 | }
211 |
212 | // Prepare the fetcher.
213 | err := s.config.Fetcher.Prepare()
214 | if err != nil {
215 | return nil, err
216 | }
217 |
218 | res := &ScrapeResults{
219 | URLs: []string{},
220 | Results: [][]map[string]interface{}{},
221 | }
222 |
223 | var numPages int
224 | for {
225 | // Repeat until we don't have any more URLs, or until we hit our page limit.
226 | if len(url) == 0 || (opts.MaxPages > 0 && numPages >= opts.MaxPages) {
227 | break
228 | }
229 |
230 | resp, err := s.config.Fetcher.Fetch("GET", url)
231 | if err != nil {
232 | return nil, err
233 | }
234 |
235 | // Create a goquery document.
236 | doc, err := goquery.NewDocumentFromReader(resp)
237 | resp.Close()
238 | if err != nil {
239 | return nil, err
240 | }
241 |
242 | res.URLs = append(res.URLs, url)
243 | results := []map[string]interface{}{}
244 |
245 | // Divide this page into blocks
246 | for _, block := range s.config.DividePage(doc.Selection) {
247 | blockResults := map[string]interface{}{}
248 |
249 | // Process each piece of this block
250 | for _, piece := range s.config.Pieces {
251 | sel := block
252 | if piece.Selector != "." {
253 | sel = sel.Find(piece.Selector)
254 | }
255 |
256 | pieceResults, err := piece.Extractor.Extract(sel)
257 | if err != nil {
258 | return nil, err
259 | }
260 |
261 | // A nil response from an extractor means that we don't even include it in
262 | // the results.
263 | if pieceResults == nil {
264 | continue
265 | }
266 |
267 | blockResults[piece.Name] = pieceResults
268 | }
269 |
270 | // Append the results from this block.
271 | results = append(results, blockResults)
272 | }
273 |
274 | // Append the results from this page.
275 | res.Results = append(res.Results, results)
276 | numPages++
277 |
278 | // Get the next page.
279 | url, err = s.config.Paginator.NextPage(url, doc.Selection)
280 | if err != nil {
281 | return nil, err
282 | }
283 | }
284 |
285 | // All good!
286 | return res, nil
287 | }
288 |
--------------------------------------------------------------------------------
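Because PieceExtractor (defined in scrape.go above) is a small interface, custom extractors are easy to plug in alongside the ones in the extract package. The following is a hypothetical example, with the type name and selectors invented for illustration, that parses a block's text as an integer and aborts the scrape on malformed input.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/andrew-d/goscrape"
	"github.com/andrew-d/goscrape/extract"
)

// TrimmedInt is a hypothetical PieceExtractor that parses the selection's
// text as an integer after trimming whitespace. Returning an error aborts
// the scrape; returning nil omits the piece from the block's results.
type TrimmedInt struct{}

func (e TrimmedInt) Extract(sel *goquery.Selection) (interface{}, error) {
	text := strings.TrimSpace(sel.Text())
	if text == "" {
		return nil, nil
	}
	n, err := strconv.Atoi(text)
	if err != nil {
		return nil, fmt.Errorf("could not parse %q as an integer: %v", text, err)
	}
	return n, nil
}

// Compile-time check that we satisfy the interface, mirroring extractors.go.
var _ scrape.PieceExtractor = TrimmedInt{}

func main() {
	config := &scrape.ScrapeConfig{
		DividePage: scrape.DividePageBySelector(".comment"),
		Pieces: []scrape.Piece{
			{Name: "author", Selector: ".author", Extractor: extract.Text{}},
			{Name: "score", Selector: ".score", Extractor: TrimmedInt{}},
		},
	}

	// The resulting scraper is used exactly like the stock ones in _examples.
	if _, err := scrape.New(config); err != nil {
		panic(err)
	}
}
```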
/util.go:
--------------------------------------------------------------------------------
1 | package scrape
2 |
3 | import (
4 | "io"
5 | "strings"
6 | )
7 |
8 | func newStringReadCloser(s string) dummyReadCloser {
9 | return dummyReadCloser{strings.NewReader(s)}
10 | }
11 |
12 | type dummyReadCloser struct {
13 | r io.Reader
14 | }
15 |
16 | func (c dummyReadCloser) Read(b []byte) (int, error) {
17 | return c.r.Read(b)
18 | }
19 |
20 | func (c dummyReadCloser) Close() error {
21 | return nil
22 | }
23 |
24 | var _ io.ReadCloser = &dummyReadCloser{}
25 |
--------------------------------------------------------------------------------