├── LICENSE
├── README.md
├── example
│   ├── hackernews.go
│   └── pipeline.go
├── scrape.go
└── scrape_test.go

/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015, yhat
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrape

A simple, higher-level interface for Go web scraping.

When scraping with Go, I find myself redefining tree traversal and other
utility functions.

This package is a place to put some simple tools which build on top of the
[Go HTML parsing library](https://godoc.org/golang.org/x/net/html).

For the full interface, check out the
[godoc](https://godoc.org/github.com/yhat/scrape).

## Sample

Scrape defines traversal functions like `Find` and `FindAll` while attempting
to be generic. It also defines convenience functions such as `Attr` and `Text`.
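The matchers compose with the traversal functions, so attribute-based lookups
stay short. Below is a minimal sketch, assuming the `fmt`, `html`, and `scrape`
imports from the full example further down; `printByClass` is just an
illustrative helper, and the class name is whatever your target page uses:

```go
// printByClass prints the text of every node carrying the given class.
func printByClass(root *html.Node, class string) {
    for _, n := range scrape.FindAll(root, scrape.ByClass(class)) {
        fmt.Println(scrape.Text(n))
    }
}
```

The basic pattern of parsing a response body and then searching the resulting
tree looks like this: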

```go
// Parse the page
root, err := html.Parse(resp.Body)
if err != nil {
    // handle error
}
// Search for the title
title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
if ok {
    // Print the title
    fmt.Println(scrape.Text(title))
}
```

## A full example: Scraping Hacker News

```go
package main

import (
    "fmt"
    "net/http"

    "github.com/yhat/scrape"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

func main() {
    // request and parse the front page
    resp, err := http.Get("https://news.ycombinator.com/")
    if err != nil {
        panic(err)
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }

    // define a matcher
    matcher := func(n *html.Node) bool {
        // must check for nil values
        if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
            return scrape.Attr(n.Parent.Parent, "class") == "athing"
        }
        return false
    }
    // grab all articles and print them
    articles := scrape.FindAll(root, matcher)
    for i, article := range articles {
        fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
    }
}
```

--------------------------------------------------------------------------------
/example/hackernews.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"

    "github.com/yhat/scrape"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

func main() {
    // request and parse the front page
    resp, err := http.Get("https://news.ycombinator.com/")
    if err != nil {
        panic(err)
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }

    // define a matcher
    matcher := func(n *html.Node) bool {
        // must check for nil values
        if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
            return scrape.Attr(n.Parent.Parent, "class") == "athing"
        }
        return false
    }
    // grab all articles and print them
    articles := scrape.FindAll(root, matcher)
    for i, article := range articles {
        fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
    }
}

--------------------------------------------------------------------------------
/example/pipeline.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"
    "sync"

    "github.com/yhat/scrape"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

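// The pipeline below runs in three stages: respGen fetches each URL
// concurrently and emits *http.Response values, rootGen parses each response
// body into an *html.Node, and titleGen extracts the text of each page's
// <title> node. Each stage closes its output channel once its WaitGroup
// drains, so the range loop in main ends after every page has been handled.
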
// Set your email here to include in the User-Agent string.
var email = "youremail@gmail.com"
var urls = []string{
    "http://techcrunch.com/",
    "https://www.reddit.com/",
    "https://en.wikipedia.org",
    "https://news.ycombinator.com/",
    "https://www.buzzfeed.com/",
    "http://digg.com",
}

func respGen(urls ...string) <-chan *http.Response {
    var wg sync.WaitGroup
    out := make(chan *http.Response)
    wg.Add(len(urls))
    for _, url := range urls {
        go func(url string) {
            req, err := http.NewRequest("GET", url, nil)
            if err != nil {
                panic(err)
            }
            req.Header.Set("user-agent", "testBot("+email+")")
            resp, err := http.DefaultClient.Do(req)
            if err != nil {
                panic(err)
            }
            out <- resp
            wg.Done()
        }(url)
    }
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

func rootGen(in <-chan *http.Response) <-chan *html.Node {
    var wg sync.WaitGroup
    out := make(chan *html.Node)
    for resp := range in {
        wg.Add(1)
        go func(resp *http.Response) {
            root, err := html.Parse(resp.Body)
            if err != nil {
                panic(err)
            }
            out <- root
            wg.Done()
        }(resp)
    }
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

func titleGen(in <-chan *html.Node) <-chan string {
    var wg sync.WaitGroup
    out := make(chan string)
    for root := range in {
        wg.Add(1)
        go func(root *html.Node) {
            title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
            if ok {
                out <- scrape.Text(title)
            }
            wg.Done()
        }(root)
    }
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

func main() {
    // Set up the pipeline and consume its output, printing the title of
    // each web page in the main goroutine.
    for title := range titleGen(rootGen(respGen(urls...))) {
        fmt.Println(title)
    }
}

--------------------------------------------------------------------------------
/scrape.go:
--------------------------------------------------------------------------------
// Package scrape provides a searching API on top of golang.org/x/net/html.
package scrape

import (
    "strings"

    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

// Matcher should return true when a desired node is found.
type Matcher func(node *html.Node) bool

// FindAll returns all nodes which match the provided Matcher. After discovering a matching
// node, it will _not_ discover matching subnodes of that node.
func FindAll(node *html.Node, matcher Matcher) []*html.Node {
    return findAllInternal(node, matcher, false)
}

// FindAllNested returns all nodes which match the provided Matcher and _will_ discover
// matching subnodes of matching nodes.
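// For example, with nested <div> elements, FindAll(root, ByTag(atom.Div))
// returns only the outermost matching div, while FindAllNested also returns
// the divs nested inside it.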
func FindAllNested(node *html.Node, matcher Matcher) []*html.Node {
    return findAllInternal(node, matcher, true)
}

// Find returns the first node which matches the matcher using depth-first search.
// If no node is found, ok will be false.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     matcher := func(n *html.Node) bool {
//         return n.DataAtom == atom.Body
//     }
//     body, ok := scrape.Find(root, matcher)
func Find(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    if matcher(node) {
        return node, true
    }

    for c := node.FirstChild; c != nil; c = c.NextSibling {
        n, ok := Find(c, matcher)
        if ok {
            return n, true
        }
    }
    return nil, false
}

// FindParent searches up the HTML tree from the current node until either a
// match is found or the top is hit.
func FindParent(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    for p := node.Parent; p != nil; p = p.Parent {
        if matcher(p) {
            return p, true
        }
    }
    return nil, false
}

// Text returns text from all descendant text nodes joined.
// For control over the join function, see TextJoin.
func Text(node *html.Node) string {
    joiner := func(s []string) string {
        n := 0
        for i := range s {
            trimmed := strings.TrimSpace(s[i])
            if trimmed != "" {
                s[n] = trimmed
                n++
            }
        }
        return strings.Join(s[:n], " ")
    }
    return TextJoin(node, joiner)
}

// TextJoin returns a string from all descendant text nodes joined by a
// caller-provided join function.
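//
// For example, a minimal sketch that joins the text parts with newlines
// instead of the trimmed, space-separated join that Text uses:
//
//     text := scrape.TextJoin(node, func(parts []string) string {
//         return strings.Join(parts, "\n")
//     })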
func TextJoin(node *html.Node, join func([]string) string) string {
    nodes := FindAll(node, func(n *html.Node) bool { return n.Type == html.TextNode })
    parts := make([]string, len(nodes))
    for i, n := range nodes {
        parts[i] = n.Data
    }
    return join(parts)
}

// Attr returns the value of an HTML attribute.
func Attr(node *html.Node, key string) string {
    for _, a := range node.Attr {
        if a.Key == key {
            return a.Val
        }
    }
    return ""
}

// ByTag returns a Matcher which matches all nodes of the provided tag type.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
func ByTag(a atom.Atom) Matcher {
    return func(node *html.Node) bool { return node.DataAtom == a }
}

// ById returns a Matcher which matches all nodes with the provided id.
func ById(id string) Matcher {
    return func(node *html.Node) bool { return Attr(node, "id") == id }
}

// ByClass returns a Matcher which matches all nodes with the provided class.
func ByClass(class string) Matcher {
    return func(node *html.Node) bool {
        classes := strings.Fields(Attr(node, "class"))
        for _, c := range classes {
            if c == class {
                return true
            }
        }
        return false
    }
}

// findAllInternal encapsulates the node tree traversal.
func findAllInternal(node *html.Node, matcher Matcher, searchNested bool) []*html.Node {
    matched := []*html.Node{}

    if matcher(node) {
        matched = append(matched, node)

        if !searchNested {
            return matched
        }
    }

    for c := node.FirstChild; c != nil; c = c.NextSibling {
        found := findAllInternal(c, matcher, searchNested)
        if len(found) > 0 {
            matched = append(matched, found...)
        }
    }
    return matched
}

// FindNextSibling returns the first sibling node after the given node which
// matches the matcher. If no node is found, ok will be false.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     matcher := func(n *html.Node) bool {
//         return n.DataAtom == atom.Body
//     }
//     body, ok := scrape.FindNextSibling(root, matcher)
func FindNextSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    for s := node.NextSibling; s != nil; s = s.NextSibling {
        if matcher(s) {
            return s, true
        }
    }
    return nil, false
}

// FindPrevSibling returns the first sibling node before the given node which
// matches the matcher. If no node is found, ok will be false.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     matcher := func(n *html.Node) bool {
//         return n.DataAtom == atom.Body
//     }
//     body, ok := scrape.FindPrevSibling(root, matcher)
func FindPrevSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    for s := node.PrevSibling; s != nil; s = s.PrevSibling {
        if matcher(s) {
            return s, true
        }
    }
    return nil, false
}

--------------------------------------------------------------------------------
/scrape_test.go:
--------------------------------------------------------------------------------
package scrape

import (
    "strings"
    "testing"

    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

const testHTML = `