├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── main.go └── test.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | *.csv 26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.6 4 | - tip 5 | 6 | sudo: false 7 | 8 | install: 9 | - go get github.com/PuerkitoBio/goquery 10 | script: 11 | - bash test.sh 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Philip I. Thomas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # iterscraper 2 | 3 | [![Build Status](https://travis-ci.org/philipithomas/iterscraper.svg?branch=master)](https://travis-ci.org/philipithomas/iterscraper) 4 | 5 | A basic package used for scraping information from a website where URLs contain an incrementing integer. Information is retrieved from HTML5 elements, and outputted as a CSV. 6 | 7 | Thanks [Francesc](https://github.com/campoy) for featuring this repo in episode #1 of [Just For Func](https://twitter.com/justforfunc). [Watch The Video](https://www.youtube.com/watch?list=PL64wiCrrxh4Jisi7OcCJIUpguV_f5jGnZ&v=eIWFnNz8mF4) or [Review Francesc's pull request](https://github.com/philipithomas/iterscraper/pull/1). 8 | 9 | ## Flags 10 | 11 | Flags are all optional, and are set with a single dash on the command line, e.g. 12 | 13 | ``` 14 | iterscraper \ 15 | -url "http://foo.com/%d" \ 16 | -from 1 \ 17 | -to 10 \ 18 | -concurrency 10 \ 19 | -output foo.csv \ 20 | -nameQuery ".name" \ 21 | -addressQuery ".address" \ 22 | -phoneQuery ".phone" \ 23 | -emailQuery ".email" 24 | ``` 25 | 26 | For an explanation of the options, type `iterscraper -help` 27 | 28 | General usage of iterscraper: 29 | 30 | ``` 31 | -addressQuery string 32 | JQuery-style query for the address element (default ".address") 33 | -concurrency int 34 | How many scrapers to run in parallel. 
(More scrapers are faster, but more prone to rate limiting or bandwidth issues) (default 1) 35 | -emailQuery string 36 | JQuery-style query for the email element (default ".email") 37 | -from int 38 | The first ID that should be searched in the URL - inclusive. 39 | -nameQuery string 40 | JQuery-style query for the name element (default ".name") 41 | -output string 42 | Filename to export the CSV results (default "output.csv") 43 | -phoneQuery string 44 | JQuery-style query for the phone element (default ".phone") 45 | -to int 46 | The last ID that should be searched in the URL - exclusive (default 1) 47 | -url string 48 | The URL you wish to scrape, containing "%d" where the id should be substituted (default "http://example.com/v/%d") 49 | ``` 50 | 51 | ## URL Structure 52 | 53 | Successive pages must look like: 54 | 55 | ``` 56 | http://example.com/foo/1/bar 57 | http://example.com/foo/2/bar 58 | http://example.com/foo/3/bar 59 | ``` 60 | 61 | iterscraper would then accept the url in the following style, in `Printf` style such that numbers may be substituted into the url: 62 | 63 | ``` 64 | http://example.com/foo/%d/bar 65 | ``` 66 | 67 | ## Installation 68 | 69 | Building the source requires the [Go programming language](https://golang.org/doc/install) and the [Glide](http://glide.sh) package manager. 70 | 71 | ``` 72 | # Dependency is GoQuery 73 | go get github.com/PuerkitoBio/goquery 74 | # Get and build source 75 | go get github.com/philipithomas/iterscraper 76 | # If your $PATH is configured correctly, you can call it directly 77 | iterscraper [flags] 78 | 79 | ``` 80 | 81 | 82 | ## Errata 83 | 84 | * This is purpose-built for some internal scraping. It's not meant to be the scraping tool for every use case, but you're welcome to modify it for your purposes 85 | * On a `429 - too many requests` error, the app logs and continues, ignoring the request. 
86 | * The package will [follow up to 10 redirects](https://golang.org/pkg/net/http/#Get) 87 | * On a `404 - not found` error, the system will log the miss, then continue. It is not exported to the CSV. 88 | 89 | ## Extensions 90 | * [calini/grape](https://github.com/calini/grape) is an extension of iterscraper that also adds the ability to swap the incremental indexes with a dictionary file, and query for different attributes. 91 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | // iterscraper scrapes information from a website where URLs contain an incrementing integer. 2 | // Information is retrieved from HTML5 elements, and outputted as a CSV. 3 | package main 4 | 5 | import ( 6 | "encoding/csv" 7 | "flag" 8 | "fmt" 9 | "log" 10 | "net/http" 11 | "os" 12 | "strconv" 13 | "strings" 14 | "sync" 15 | 16 | "github.com/PuerkitoBio/goquery" 17 | ) 18 | 19 | func main() { 20 | var ( 21 | urlTemplate = flag.String("url", "http://example.com/v/%d", "The URL you wish to scrape, containing \"%d\" where the id should be substituted") 22 | idLow = flag.Int("from", 0, "The first ID that should be searched in the URL - inclusive.") 23 | idHigh = flag.Int("to", 1, "The last ID that should be searched in the URL - exclusive") 24 | concurrency = flag.Int("concurrency", 1, "How many scrapers to run in parallel. 
(More scrapers are faster, but more prone to rate limiting or bandwith issues)") 25 | outfile = flag.String("output", "output.csv", "Filename to export the CSV results") 26 | name = flag.String("nameQuery", ".name", "JQuery-style query for the name element") 27 | address = flag.String("addressQuery", ".address", "JQuery-style query for the address element") 28 | phone = flag.String("phoneQuery", ".phone", "JQuery-style query for the phone element") 29 | email = flag.String("emailQuery", ".email", "JQuery-style query for the email element") 30 | ) 31 | flag.Parse() 32 | 33 | columns := []string{*name, *address, *phone, *email} 34 | headers := []string{"name", "address", "phone", "email"} 35 | // url and id are added as the first two columns. 36 | headers = append([]string{"url", "id"}, headers...) 37 | 38 | // create all tasks and send them to the channel. 39 | type task struct { 40 | url string 41 | id int 42 | } 43 | tasks := make(chan task) 44 | go func() { 45 | for i := *idLow; i < *idHigh; i++ { 46 | tasks <- task{url: fmt.Sprintf(*urlTemplate, i), id: i} 47 | } 48 | close(tasks) 49 | }() 50 | 51 | // create workers and schedule closing results when all work is done. 
52 | results := make(chan []string) 53 | var wg sync.WaitGroup 54 | wg.Add(*concurrency) 55 | go func() { 56 | wg.Wait() 57 | close(results) 58 | }() 59 | 60 | for i := 0; i < *concurrency; i++ { 61 | go func() { 62 | defer wg.Done() 63 | for t := range tasks { 64 | r, err := fetch(t.url, t.id, columns) 65 | if err != nil { 66 | log.Printf("could not fetch %v: %v", t.url, err) 67 | continue 68 | } 69 | results <- r 70 | } 71 | }() 72 | } 73 | 74 | if err := dumpCSV(*outfile, headers, results); err != nil { 75 | log.Printf("could not write to %s: %v", *outfile, err) 76 | } 77 | } 78 | 79 | func fetch(url string, id int, queries []string) ([]string, error) { 80 | res, err := http.Get(url) 81 | if err != nil { 82 | return nil, fmt.Errorf("could not get %s: %v", url, err) 83 | } 84 | defer res.Body.Close() 85 | 86 | if res.StatusCode != http.StatusOK { 87 | if res.StatusCode == http.StatusTooManyRequests { 88 | return nil, fmt.Errorf("you are being rate limited") 89 | } 90 | 91 | return nil, fmt.Errorf("bad response from server: %s", res.Status) 92 | } 93 | 94 | // parse body with goquery. 95 | doc, err := goquery.NewDocumentFromReader(res.Body) 96 | if err != nil { 97 | return nil, fmt.Errorf("could not parse page: %v", err) 98 | } 99 | 100 | // extract info we want. 101 | r := []string{url, strconv.Itoa(id)} 102 | for _, q := range queries { 103 | r = append(r, strings.TrimSpace(doc.Find(q).Text())) 104 | } 105 | return r, nil 106 | } 107 | 108 | func dumpCSV(path string, headers []string, records <-chan []string) error { 109 | f, err := os.Create(path) 110 | if err != nil { 111 | return fmt.Errorf("unable to create file %s: %v", path, err) 112 | } 113 | defer f.Close() 114 | 115 | w := csv.NewWriter(f) 116 | 117 | // write headers to file. 118 | if err := w.Write(headers); err != nil { 119 | log.Fatalf("error writing record to csv: %v", err) 120 | } 121 | 122 | // write all records. 
123 | for r := range records { 124 | if err := w.Write(r); err != nil { 125 | log.Fatalf("could not write record to csv: %v", err) 126 | } 127 | } 128 | 129 | w.Flush() 130 | 131 | // check for extra errors. 132 | if err := w.Error(); err != nil { 133 | return fmt.Errorf("writer failed: %v", err) 134 | } 135 | return nil 136 | } 137 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | set -e 3 | 4 | count=`git ls-files | grep '.go$' | xargs gofmt -l -s | wc -l` 5 | if [ $count -gt 0 ]; then 6 | echo "Files not formatted correctly\n" 7 | exit 1 8 | fi 9 | go vet . 10 | go test -race -v . 11 | go install -race -v . 12 | --------------------------------------------------------------------------------