├── LICENSE
├── README.md
├── example
│   ├── hackernews.go
│   └── pipeline.go
├── scrape.go
└── scrape_test.go

/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015, yhat
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrape

A simple, higher-level interface for Go web scraping.

When scraping with Go, I find myself redefining tree traversal and other
utility functions.

This package is a place to put some simple tools which build on top of the
[Go HTML parsing library](https://godoc.org/golang.org/x/net/html).

For the full interface, check out the
[godoc](https://godoc.org/github.com/yhat/scrape).

## Sample

Scrape defines traversal functions like `Find` and `FindAll` while attempting
to be generic. It also defines convenience functions such as `Attr` and `Text`.
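The matchers compose with the traversal functions, so attribute-based lookups
stay short. Below is a minimal sketch, assuming the `fmt`, `html`, and `scrape`
imports from the full example further down; `printByClass` is just an
illustrative helper, and the class name is whatever your target page uses:

```go
// printByClass prints the text of every node carrying the given class.
func printByClass(root *html.Node, class string) {
    for _, n := range scrape.FindAll(root, scrape.ByClass(class)) {
        fmt.Println(scrape.Text(n))
    }
}
```

The basic pattern of parsing a response body and then searching the resulting
tree looks like this: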

```go
// Parse the page
root, err := html.Parse(resp.Body)
if err != nil {
    // handle error
}
// Search for the title
title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
if ok {
    // Print the title
    fmt.Println(scrape.Text(title))
}
```

## A full example: Scraping Hacker News

```go
package main

import (
    "fmt"
    "net/http"

    "github.com/yhat/scrape"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

func main() {
    // request and parse the front page
    resp, err := http.Get("https://news.ycombinator.com/")
    if err != nil {
        panic(err)
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }

    // define a matcher
    matcher := func(n *html.Node) bool {
        // must check for nil values
        if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
            return scrape.Attr(n.Parent.Parent, "class") == "athing"
        }
        return false
    }
    // grab all articles and print them
    articles := scrape.FindAll(root, matcher)
    for i, article := range articles {
        fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
    }
}
```

--------------------------------------------------------------------------------
/example/hackernews.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"

    "github.com/yhat/scrape"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

func main() {
    // request and parse the front page
    resp, err := http.Get("https://news.ycombinator.com/")
    if err != nil {
        panic(err)
    }
    root, err := html.Parse(resp.Body)
    if err != nil {
        panic(err)
    }

    // define a matcher
    matcher := func(n *html.Node) bool {
        // must check for nil values
        if n.DataAtom == atom.A && n.Parent != nil && n.Parent.Parent != nil {
            return scrape.Attr(n.Parent.Parent, "class") == "athing"
        }
        return false
    }
    // grab all articles and print them
    articles := scrape.FindAll(root, matcher)
    for i, article := range articles {
        fmt.Printf("%2d %s (%s)\n", i, scrape.Text(article), scrape.Attr(article, "href"))
    }
}

--------------------------------------------------------------------------------
/example/pipeline.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"
    "sync"

    "github.com/yhat/scrape"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

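// The pipeline below runs in three stages: respGen fetches each URL
// concurrently and emits *http.Response values, rootGen parses each response
// body into an *html.Node, and titleGen extracts the text of each page's
// <title> node. Each stage closes its output channel once its WaitGroup
// drains, so the range loop in main ends after every page has been handled.
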
// Set your email here to include in the User-Agent string.
var email = "youremail@gmail.com"
var urls = []string{
    "http://techcrunch.com/",
    "https://www.reddit.com/",
    "https://en.wikipedia.org",
    "https://news.ycombinator.com/",
    "https://www.buzzfeed.com/",
    "http://digg.com",
}

func respGen(urls ...string) <-chan *http.Response {
    var wg sync.WaitGroup
    out := make(chan *http.Response)
    wg.Add(len(urls))
    for _, url := range urls {
        go func(url string) {
            req, err := http.NewRequest("GET", url, nil)
            if err != nil {
                panic(err)
            }
            req.Header.Set("user-agent", "testBot("+email+")")
            resp, err := http.DefaultClient.Do(req)
            if err != nil {
                panic(err)
            }
            out <- resp
            wg.Done()
        }(url)
    }
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

func rootGen(in <-chan *http.Response) <-chan *html.Node {
    var wg sync.WaitGroup
    out := make(chan *html.Node)
    for resp := range in {
        wg.Add(1)
        go func(resp *http.Response) {
            root, err := html.Parse(resp.Body)
            if err != nil {
                panic(err)
            }
            out <- root
            wg.Done()
        }(resp)
    }
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

func titleGen(in <-chan *html.Node) <-chan string {
    var wg sync.WaitGroup
    out := make(chan string)
    for root := range in {
        wg.Add(1)
        go func(root *html.Node) {
            title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
            if ok {
                out <- scrape.Text(title)
            }
            wg.Done()
        }(root)
    }
    go func() {
        wg.Wait()
        close(out)
    }()
    return out
}

func main() {
    // Set up the pipeline and consume its output, printing the title of
    // each web page in the main goroutine.
    for title := range titleGen(rootGen(respGen(urls...))) {
        fmt.Println(title)
    }
}

--------------------------------------------------------------------------------
/scrape.go:
--------------------------------------------------------------------------------
// Package scrape provides a searching API on top of golang.org/x/net/html.
package scrape

import (
    "strings"

    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

// Matcher should return true when a desired node is found.
type Matcher func(node *html.Node) bool

// FindAll returns all nodes which match the provided Matcher. After discovering a matching
// node, it will _not_ discover matching subnodes of that node.
func FindAll(node *html.Node, matcher Matcher) []*html.Node {
    return findAllInternal(node, matcher, false)
}

// FindAllNested returns all nodes which match the provided Matcher and _will_ discover
// matching subnodes of matching nodes.
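// For example, with nested <div> elements, FindAll(root, ByTag(atom.Div))
// returns only the outermost matching div, while FindAllNested also returns
// the divs nested inside it.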
func FindAllNested(node *html.Node, matcher Matcher) []*html.Node {
    return findAllInternal(node, matcher, true)
}

// Find returns the first node which matches the matcher using depth-first search.
// If no node is found, ok will be false.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     matcher := func(n *html.Node) bool {
//         return n.DataAtom == atom.Body
//     }
//     body, ok := scrape.Find(root, matcher)
func Find(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    if matcher(node) {
        return node, true
    }

    for c := node.FirstChild; c != nil; c = c.NextSibling {
        n, ok := Find(c, matcher)
        if ok {
            return n, true
        }
    }
    return nil, false
}

// FindParent searches up the HTML tree from the current node until either a
// match is found or the top is hit.
func FindParent(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    for p := node.Parent; p != nil; p = p.Parent {
        if matcher(p) {
            return p, true
        }
    }
    return nil, false
}

// Text returns text from all descendant text nodes joined.
// For control over the join function, see TextJoin.
func Text(node *html.Node) string {
    joiner := func(s []string) string {
        n := 0
        for i := range s {
            trimmed := strings.TrimSpace(s[i])
            if trimmed != "" {
                s[n] = trimmed
                n++
            }
        }
        return strings.Join(s[:n], " ")
    }
    return TextJoin(node, joiner)
}

// TextJoin returns a string from all descendant text nodes joined by a
// caller-provided join function.
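//
// For example, a minimal sketch that joins the text parts with newlines
// instead of the trimmed, space-separated join that Text uses:
//
//     text := scrape.TextJoin(node, func(parts []string) string {
//         return strings.Join(parts, "\n")
//     })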
func TextJoin(node *html.Node, join func([]string) string) string {
    nodes := FindAll(node, func(n *html.Node) bool { return n.Type == html.TextNode })
    parts := make([]string, len(nodes))
    for i, n := range nodes {
        parts[i] = n.Data
    }
    return join(parts)
}

// Attr returns the value of an HTML attribute.
func Attr(node *html.Node, key string) string {
    for _, a := range node.Attr {
        if a.Key == key {
            return a.Val
        }
    }
    return ""
}

// ByTag returns a Matcher which matches all nodes of the provided tag type.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
func ByTag(a atom.Atom) Matcher {
    return func(node *html.Node) bool { return node.DataAtom == a }
}

// ById returns a Matcher which matches all nodes with the provided id.
func ById(id string) Matcher {
    return func(node *html.Node) bool { return Attr(node, "id") == id }
}

// ByClass returns a Matcher which matches all nodes with the provided class.
func ByClass(class string) Matcher {
    return func(node *html.Node) bool {
        classes := strings.Fields(Attr(node, "class"))
        for _, c := range classes {
            if c == class {
                return true
            }
        }
        return false
    }
}

// findAllInternal encapsulates the node tree traversal.
func findAllInternal(node *html.Node, matcher Matcher, searchNested bool) []*html.Node {
    matched := []*html.Node{}

    if matcher(node) {
        matched = append(matched, node)

        if !searchNested {
            return matched
        }
    }

    for c := node.FirstChild; c != nil; c = c.NextSibling {
        found := findAllInternal(c, matcher, searchNested)
        if len(found) > 0 {
            matched = append(matched, found...)
        }
    }
    return matched
}

// FindNextSibling returns the first sibling node after the given node which
// matches the matcher. If no node is found, ok will be false.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     matcher := func(n *html.Node) bool {
//         return n.DataAtom == atom.Body
//     }
//     body, ok := scrape.FindNextSibling(root, matcher)
func FindNextSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    for s := node.NextSibling; s != nil; s = s.NextSibling {
        if matcher(s) {
            return s, true
        }
    }
    return nil, false
}

// FindPrevSibling returns the first sibling node before the given node which
// matches the matcher. If no node is found, ok will be false.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     matcher := func(n *html.Node) bool {
//         return n.DataAtom == atom.Body
//     }
//     body, ok := scrape.FindPrevSibling(root, matcher)
func FindPrevSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
    for s := node.PrevSibling; s != nil; s = s.PrevSibling {
        if matcher(s) {
            return s, true
        }
    }
    return nil, false
}

--------------------------------------------------------------------------------
/scrape_test.go:
--------------------------------------------------------------------------------
package scrape

import (
    "strings"
    "testing"

    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)

const testHTML = `