├── Chapter02
│   └── ch2ex1
│       ├── index.html
│       └── main.go
├── Chapter03
│   ├── ch3ex1
│   │   └── main.go
│   ├── ch3ex2
│   │   └── main.go
│   ├── ch3ex3
│   │   └── main.go
│   └── ch3ex4
│       └── main.go
├── Chapter04
│   ├── ch4ex1
│   │   └── main.go
│   ├── ch4ex2
│   │   └── main.go
│   ├── ch4ex3
│   │   └── main.go
│   ├── ch4ex4
│   │   └── main.go
│   ├── ch4ex5
│   │   └── main.go
│   ├── ch4ex6
│   │   └── main.go
│   ├── ch4ex7
│   │   └── main.go
│   └── ch4ex8
│       └── main.go
├── Chapter05
│   ├── ch5ex1
│   │   └── main.go
│   ├── ch5ex2
│   │   └── main.go
│   ├── ch5ex3
│   │   └── main.go
│   └── ch5ex4
│       └── main.go
├── Chapter06
│   ├── ch6ex1
│   │   └── main.go
│   └── ch6ex2
│       └── main.go
├── Chapter07
│   ├── ch7ex1
│   │   └── main.go
│   ├── ch7ex2
│   │   └── main.go
│   ├── ch7ex3
│   │   └── main.go
│   └── ch7ex4
│       └── main.go
├── Chapter08
│   ├── ch8ex1
│   │   └── main.go
│   ├── ch8ex2
│   │   └── main.go
│   ├── ch8ex3
│   │   └── main.go
│   └── ch8ex4
│       └── main.go
├── LICENSE
└── README.md
--------------------------------------------------------------------------------
/Chapter02/ch2ex1/index.html:
--------------------------------------------------------------------------------
<!doctype html>
<html>
<head>
    <title>Example Domain</title>
    <!-- meta tags and inline styles omitted -->
</head>
<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>
    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
--------------------------------------------------------------------------------
/Chapter02/ch2ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "log"
    "net/http"
    "os"
)

func main() {
    // Create the variables for the response and error
    var r *http.Response
    var err error

    // Request index.html from example.com
    r, err = http.Get("http://www.example.com/index.html")

    // If there is a problem accessing the server, kill the program and print the error to the console
    if err != nil {
        panic(err)
    }

    // Check the status code returned by the server
    if r.StatusCode == 200 {
        // The request was successful!
        var webPageContent []byte

        // We know the size of the response is 1270 from the previous example
        var bodyLength int = 1270

        // Initialize the byte array to the size of the data
        webPageContent = make([]byte, bodyLength)

        // Read the data from the server
        r.Body.Read(webPageContent)

        // Open a writable file on your computer (create if it does not exist)
        var out *os.File
        out, err = os.OpenFile("index.html", os.O_CREATE|os.O_WRONLY, 0664)

        if err != nil {
            panic(err)
        }

        // Write the contents to a file
        out.Write(webPageContent)
        out.Close()
    } else {
        log.Fatal("Failed to retrieve the webpage. Received status code ", r.Status)
    }
}
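
Note: the example above relies on the body being exactly 1270 bytes, and a single r.Body.Read call may return fewer bytes than requested. A minimal alternative sketch (our addition, not part of the book's repository) that reads a body of any size with ioutil.ReadAll:

package main

import (
    "io/ioutil"
    "net/http"
)

func main() {
    resp, err := http.Get("http://www.example.com/index.html")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    // Read the entire body, regardless of its length
    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    // Write the contents to index.html (create if it does not exist)
    if err := ioutil.WriteFile("index.html", data, 0664); err != nil {
        panic(err)
    }
}
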
--------------------------------------------------------------------------------
/Chapter03/ch3ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "net/http"

    "github.com/temoto/robotstxt"
)

func main() {
    // Get the contents of robots.txt from packtpub.com
    resp, err := http.Get("https://www.packtpub.com/robots.txt")
    if err != nil {
        panic(err)
    }

    // Process the response using temoto/robotstxt
    data, err := robotstxt.FromResponse(resp)
    if err != nil {
        panic(err)
    }

    // Look for the definition in the robots.txt file that matches the default Go User-Agent string
    grp := data.FindGroup("Go-http-client/1.1")
    if grp != nil {
        testUrls := []string{
            // These paths are all permissible
            "/all",
            "/all?search=Go",
            "/bundles",

            // These paths are not
            "/contact/",
            "/search/",
            "/user/password/",
        }

        for _, url := range testUrls {
            print("checking " + url + "...")

            // Test the path against the User-Agent group
            if grp.Test(url) {
                println("OK")
            } else {
                println("X")
            }
        }
    }
}
--------------------------------------------------------------------------------
/Chapter03/ch3ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"
    "time"
)

func main() {
    // Tracks the timestamp of the last request to the webserver
    var lastRequestTime time.Time

    // The maximum number of requests we will make to the webserver
    maximumNumberOfRequests := 5

    // Our scrape rate: 1 page per 5 seconds
    pageDelay := 5 * time.Second

    for i := 0; i < maximumNumberOfRequests; i++ {
        // Calculate the time difference since our last request
        elapsedTime := time.Now().Sub(lastRequestTime)
        fmt.Printf("Elapsed Time: %.2f (s)\n", elapsedTime.Seconds())

        // Check if enough time has passed
        if elapsedTime < pageDelay {
            // Sleep the difference between the pageDelay and elapsedTime
            var timeDiff time.Duration = pageDelay - elapsedTime
            fmt.Printf("Sleeping for %.2f (s)\n", timeDiff.Seconds())
            time.Sleep(pageDelay - elapsedTime)
        }

        // Just for this example, we are not processing the response
        println("GET example.com/index.html")
        _, err := http.Get("http://www.example.com/index.html")
        if err != nil {
            panic(err)
        }

        // Update the last request time
        lastRequestTime = time.Now()
    }
}
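
Note: the same one-page-per-five-seconds policy can also be expressed with a time.Ticker, which blocks until the next tick. A minimal sketch (our addition, same target URL and rate):

package main

import (
    "net/http"
    "time"
)

func main() {
    // Fire at most once every 5 seconds
    ticker := time.NewTicker(5 * time.Second)
    defer ticker.Stop()

    for i := 0; i < 5; i++ {
        <-ticker.C // Block until the next tick
        println("GET example.com/index.html")
        if _, err := http.Get("http://www.example.com/index.html"); err != nil {
            panic(err)
        }
    }
}
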
--------------------------------------------------------------------------------
/Chapter03/ch3ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"
    "time"
)

func main() {
    // Tracks the timestamp of the last request to each webserver
    var lastRequestMap map[string]time.Time = map[string]time.Time{
        "example.com":  time.Time{},
        "packtpub.com": time.Time{},
    }

    // The maximum number of requests we will make
    maximumNumberOfRequests := 5
    pageDelay := 5 * time.Second

    examplePage := "http://www.example.com/index.html"
    packtPage := "https://www.packtpub.com/"

    for i := 0; i < maximumNumberOfRequests; i++ {
        var elapsedTime time.Duration
        webpage := examplePage

        // Check if "i" is an even number
        if i%2 == 0 {
            // Use the Packt Publishing site and elapsed time
            webpage = packtPage
            elapsedTime = time.Now().Sub(lastRequestMap["packtpub.com"])
        } else {
            // Use the example.com elapsed time
            elapsedTime = time.Now().Sub(lastRequestMap["example.com"])
        }

        fmt.Printf("Elapsed Time: %.2f (s)\n", elapsedTime.Seconds())

        if elapsedTime < pageDelay {
            var timeDiff time.Duration = pageDelay - elapsedTime
            fmt.Printf("Sleeping for %.2f (s)\n", timeDiff.Seconds())
            time.Sleep(pageDelay - elapsedTime)
        }

        println("GET " + webpage)
        _, err := http.Get(webpage)
        if err != nil {
            panic(err)
        }

        // Update the last request time
        if i%2 == 0 {
            // Update the Packt Publishing timestamp
            lastRequestMap["packtpub.com"] = time.Now()
        } else {
            // Update the example.com timestamp
            lastRequestMap["example.com"] = time.Now()
        }
    }
}
--------------------------------------------------------------------------------
/Chapter03/ch3ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "io/ioutil"

    "github.com/gregjones/httpcache"
    "github.com/gregjones/httpcache/diskcache"
)

func main() {
    // Set up the local disk cache
    storage := diskcache.New("./cache")
    cache := httpcache.NewTransport(storage)

    // Set this to true to inform us if the responses are being read from a cache
    cache.MarkCachedResponses = true
    cachedClient := cache.Client()

    // Make the initial request
    println("Caching: http://www.example.com/index.html")
    resp, err := cachedClient.Get("http://www.example.com/index.html")
    if err != nil {
        panic(err)
    }

    // httpcache requires you to read the body in order to cache the response
    ioutil.ReadAll(resp.Body)
    resp.Body.Close()

    // Request index.html again
    println("Requesting: http://www.example.com/index.html")
    resp, err = cachedClient.Get("http://www.example.com/index.html")
    if err != nil {
        panic(err)
    }

    // Look for the flag added by httpcache to show the result was read from the cache
    _, ok := resp.Header["X-From-Cache"]
    if ok {
        println("Result was pulled from the cache!")
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "strings"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    // Count the anchor tags in the page
    numLinks := strings.Count(stringBody, "<a")
    fmt.Printf("Found %d links\n", numLinks)
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "io/ioutil"
    "net/http"
    "strings"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    // Determine the document type from the doctype declaration
    if strings.Contains(stringBody, "<!DOCTYPE html>") {
        println("This webpage is HTML5")
    } else if strings.Contains(stringBody, "html/strict.dtd") {
        println("This webpage is HTML4 (Strict)")
    } else if strings.Contains(stringBody, "html/loose.dtd") {
        println("This webpage is HTML4 (Transitional)")
    } else if strings.Contains(stringBody, "html/frameset.dtd") {
        println("This webpage is HTML4 (Frameset)")
    } else {
        println("Could not determine doctype!")
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "regexp"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    // Match anchor tags and capture the value of their href attribute
    re := regexp.MustCompile(`<a[^>]*href\s*=\s*["']([^"']*)["']`)
    linkMatches := re.FindAllStringSubmatch(stringBody, -1)

    fmt.Printf("Found %d links:\n", len(linkMatches))
    for _, linkGroup := range linkMatches {
        println(linkGroup[1])
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "regexp"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/application-development/hands-go-programming")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    re := regexp.MustCompile(`.*main-book-price.*\n.*(\$[0-9]*\.[0-9]{0,2})`)
    priceMatches := re.FindStringSubmatch(stringBody)

    fmt.Printf("Book Price: %s\n", priceMatches[1])
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex5/main.go:
--------------------------------------------------------------------------------
package main

import (
    "regexp"
    "strings"

    "github.com/antchfx/htmlquery"
)

func main() {
    doc, err := htmlquery.LoadURL("https://www.packtpub.com/packt/offers/free-learning")
    if err != nil {
        panic(err)
    }

    dealTextNodes := htmlquery.Find(doc, `//div[@class="dotd-main-book-summary float-left"]//text()`)

    println("Here is the free book of the day!")
    println("----------------------------------")

    // Compile once: matches text nodes that contain only a tag name
    matchTagNames := regexp.MustCompile("^(div|span|h2|br|ul|li)$")

    for _, node := range dealTextNodes {
        text := strings.TrimSpace(node.Data)
        text = matchTagNames.ReplaceAllString(text, "")
        if text != "" {
            println(text)
        }
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex6/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "strconv"

    "github.com/antchfx/htmlquery"
)

func main() {
    doc, err := htmlquery.LoadURL("https://www.packtpub.com/latest-releases")
    if err != nil {
        panic(err)
    }

    nodes := htmlquery.Find(doc, `//div[@class="landing-page-row cf"]/div[@itemtype="http://schema.org/Product"]`)

    println("Here are the latest releases!")
    println("-----------------------------")

    for _, node := range nodes {
        var title string
        var price float64

        for _, attribute := range node.Attr {
            switch attribute.Key {
            case "data-product-title":
                title = attribute.Val
            case "data-product-price":
                price, err = strconv.ParseFloat(attribute.Val, 64)
                if err != nil {
                    println("Failed to parse price")
                }
            }
        }
        fmt.Printf("%s ($%0.2f)\n", title, price)
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex7/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "strconv"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    doc, err := goquery.NewDocument("https://www.packtpub.com/latest-releases")
    if err != nil {
        panic(err)
    }

    println("Here are the latest releases!")
    println("-----------------------------")
    doc.Find(`div.landing-page-row div[itemtype$="/Product"]`).
        Each(func(i int, e *goquery.Selection) {
            var title string
            var price float64

            title, _ = e.Attr("data-product-title")
            priceString, _ := e.Attr("data-product-price")
            price, err = strconv.ParseFloat(priceString, 64)
            if err != nil {
                println("Failed to parse price")
            }
            fmt.Printf("%s ($%0.2f)\n", title, price)
        })
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex8/main.go:
--------------------------------------------------------------------------------
package main

import (
    "bufio"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    doc, err := goquery.NewDocument("https://www.packtpub.com/packt/offers/free-learning")
    if err != nil {
        panic(err)
    }

    println("Here is the free book of the day!")
    println("----------------------------------")
    rawText := doc.Find(`div.dotd-main-book-summary div:not(.eighteen-days-countdown-bar)`).Text()
    reader := bufio.NewReader(strings.NewReader(rawText))

    var line []byte
    for err == nil {
        line, _, err = reader.ReadLine()
        trimmedLine := strings.TrimSpace(string(line))
        if trimmedLine != "" {
            println(trimmedLine)
        }
    }
}
--------------------------------------------------------------------------------
/Chapter05/ch5ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    doc, err := goquery.NewDocument("https://www.packtpub.com/latest-releases")
    if err != nil {
        panic(err)
    }

    println("Here are the latest releases!")
    println("-----------------------------")
    time.Sleep(1 * time.Second)
    doc.Find(`div.landing-page-row div[itemtype$="/Product"] a`).
        Each(func(i int, e *goquery.Selection) {
            var title, description, author, price string
            link, _ := e.Attr("href")
            link = "https://www.packtpub.com" + link

            bookPage, err := goquery.NewDocument(link)
            if err != nil {
                panic(err)
            }
            title = bookPage.Find("div.book-top-block-info h1").Text()
            description = strings.TrimSpace(bookPage.Find("div.book-top-block-info div.book-top-block-info-one-liner").Text())
            price = strings.TrimSpace(bookPage.Find("div.book-top-block-info div.onlyDesktop div.book-top-pricing-main-ebook-price").Text())
            authorNodes := bookPage.Find("div.book-top-block-info div.book-top-block-info-authors")
            if len(authorNodes.Nodes) < 1 {
                return
            }
            author = strings.TrimSpace(authorNodes.Nodes[0].FirstChild.Data)
            fmt.Printf("%s\nby: %s\n%s\n%s\n---------------------\n\n", title, author, price, description)
            time.Sleep(1 * time.Second)
        })
}
--------------------------------------------------------------------------------
/Chapter05/ch5ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "net/http"
    "net/url"
)

func main() {
    data := url.Values{}
    data.Set("s", "Golang")

    response, err := http.PostForm("https://hub.packtpub.com/", data)
    if err != nil {
        panic(err)
    }
    // ... Continue processing the response ...
    println(response.StatusCode)
}
--------------------------------------------------------------------------------
/Chapter05/ch5ex3/main.go:
--------------------------------------------------------------------------------
package main

func main() {
    visitedURLs := map[string]interface{}{}

    visitedURLs["https://www.packtpub.com/"] = nil
}
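
Note: ch5ex3 only seeds the visited-URL set. A short sketch (our addition; the toVisit queue and its URLs are illustrative) of how a crawler might consult the set before fetching:

package main

func main() {
    visitedURLs := map[string]interface{}{}

    toVisit := []string{
        "https://www.packtpub.com/",
        "https://www.packtpub.com/latest-releases",
        "https://www.packtpub.com/", // Duplicate; will be skipped
    }

    for len(toVisit) > 0 {
        // Pop the next URL from the queue
        url := toVisit[0]
        toVisit = toVisit[1:]

        // Skip URLs we have already seen
        if _, seen := visitedURLs[url]; seen {
            println("already visited " + url)
            continue
        }
        visitedURLs[url] = nil

        println("visiting " + url)
        // ... fetch the page and append newly discovered links to toVisit ...
    }
}
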
--------------------------------------------------------------------------------
/Chapter05/ch5ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "github.com/tebeka/selenium"
)

func main() {
    // The paths to these binaries will be different on your machine!
    const (
        seleniumPath    = "/home/vincent/Documents/workspace/Go/src/github.com/tebeka/selenium/vendor/selenium-server-standalone-3.14.0.jar"
        geckoDriverPath = "/home/vincent/Documents/workspace/Go/src/github.com/tebeka/selenium/vendor/geckodriver-v0.23.0-linux64"
    )

    service, err := selenium.NewSeleniumService(
        seleniumPath,
        8080,
        selenium.GeckoDriver(geckoDriverPath))
    if err != nil {
        panic(err)
    }
    defer service.Stop()

    caps := selenium.Capabilities{"browserName": "firefox"}
    wd, err := selenium.NewRemote(caps, "http://localhost:8080/wd/hub")
    if err != nil {
        panic(err)
    }
    defer wd.Quit()

    err = wd.Get("https://www.packtpub.com/networking-and-servers/mastering-go")
    if err != nil {
        panic(err)
    }

    var elems []selenium.WebElement
    wd.Wait(func(wd2 selenium.WebDriver) (bool, error) {
        elems, err = wd.FindElements(selenium.ByCSSSelector, "div.product-reviews-review div.review-body")
        if err != nil {
            return false, err
        }
        return len(elems) > 0, nil
    })

    for _, review := range elems {
        body, err := review.Text()
        if err != nil {
            panic(err)
        }
        println(body)
    }
}
--------------------------------------------------------------------------------
/Chapter06/ch6ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "io/ioutil"
    "math/rand"
    "net/http"
    "net/url"
    "time"
)

// Public proxies from https://hidemyna.me
var proxies []string = []string{
    "http://207.154.231.208:8080",
    "http://138.68.230.88:8080",
    "http://162.243.107.45:8080",
}

// GetProxy returns a randomly chosen proxy URL for each request
func GetProxy(_ *http.Request) (*url.URL, error) {
    randomIndex := rand.Int31n(int32(len(proxies)))
    randomProxy := proxies[randomIndex]

    return url.Parse(randomProxy)
}

func main() {
    rand.Seed(time.Now().Unix())
    http.DefaultTransport.(*http.Transport).Proxy = GetProxy

    // Continue with your HTTP requests
    for i := 0; i < 5; i++ {
        resp, err := http.Get("http://ip-api.com/line")
        if err != nil {
            panic(err)
        }
        data, err := ioutil.ReadAll(resp.Body)
        if err != nil {
            panic(err)
        }
        println("Proxy IP is: " + string(data))
        time.Sleep(1 * time.Second)
    }
}
--------------------------------------------------------------------------------
/Chapter06/ch6ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "net/url"
    "path"
)

func main() {
    parsedUrl, err := url.Parse("https://hub.packtpub.com/8-programming-languages-to-learn-in-2019")
    if err != nil {
        panic(err)
    }

    site := parsedUrl.Host + parsedUrl.Path
    doesMatch, err := path.Match("hub.packtpub.com/*", site)
    if err != nil {
        panic(err)
    }
    if doesMatch {
        // Continue scraping ...
        println("It's a match")
    }
}
--------------------------------------------------------------------------------
/Chapter07/ch7ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "time"
)

func startTicker() {
    ticks := 0
    for {
        fmt.Println(ticks)
        ticks++
        time.Sleep(1 * time.Second)
    }
}

func main() {
    println("Starting ticker")
    go startTicker()
    time.Sleep(10 * time.Second)
}
--------------------------------------------------------------------------------
/Chapter07/ch7ex2/main.go:
--------------------------------------------------------------------------------
package main

func scrapeSite(url string, statusChan chan map[string]string) {
    // Performing scraping operations...
    statusChan <- map[string]string{url: "DONE"}
}

func main() {
    siteStatus := map[string]string{
        "http://example.com/page1.html": "READY",
        "http://example.com/page2.html": "READY",
        "http://example.com/page3.html": "READY",
    }

    updatesChan := make(chan map[string]string)

    numberCompleted := 0
    for site := range siteStatus {
        siteStatus[site] = "WORKING"
        go scrapeSite(site, updatesChan)
    }

    for update := range updatesChan {
        for url, status := range update {
            siteStatus[url] = status
            numberCompleted++
        }
        if numberCompleted == len(siteStatus) {
            close(updatesChan)
        }
    }
}
--------------------------------------------------------------------------------
/Chapter07/ch7ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "sync"
    "time"
)

var sites []string = []string{
    "http://example.com/site1.html",
    "http://example.com/site2.html",
    "http://example.com/site3.html",
}
var activeThreads = 0
var doneCount = 0

const maxActiveThreads = 1

func scrapeSite(site string, condition *sync.Cond) {
    condition.L.Lock()
    // Re-check the condition in a loop to guard against spurious wakeups
    for activeThreads >= maxActiveThreads {
        println(site + " Max active threads reached!")
        println(site + " is waiting...")
        condition.Wait()
        println(site + " is awake...")
    }
    activeThreads++
    condition.L.Unlock()

    println("scraping " + site)
    // Scraping code goes here ...
    // We will simulate it with a sleep
    time.Sleep(1 * time.Second)

    condition.L.Lock()
    activeThreads--
    doneCount++
    condition.L.Unlock()
    condition.Signal()
}

func main() {
    var l = sync.Mutex{}
    var c = sync.NewCond(&l)

    for _, site := range sites {
        println("starting scraper for " + site)
        go scrapeSite(site, c)
    }
    for doneCount < len(sites) {
        time.Sleep(1 * time.Second)
    }
    println("Done!")
}
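
Note: the same concurrency cap is often expressed in Go with a buffered channel used as a semaphore instead of a sync.Cond. A minimal sketch (our addition, simulating the work with a sleep as above):

package main

import (
    "sync"
    "time"
)

func main() {
    sites := []string{
        "http://example.com/site1.html",
        "http://example.com/site2.html",
        "http://example.com/site3.html",
    }

    // A buffered channel acting as a semaphore: capacity = max concurrent scrapers
    semaphore := make(chan struct{}, 1)
    var wg sync.WaitGroup

    for _, site := range sites {
        wg.Add(1)
        go func(site string) {
            defer wg.Done()
            semaphore <- struct{}{}        // Acquire a slot
            defer func() { <-semaphore }() // Release the slot
            println("scraping " + site)
            time.Sleep(1 * time.Second) // Simulated work
        }(site)
    }
    wg.Wait()
    println("Done!")
}
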
--------------------------------------------------------------------------------
/Chapter07/ch7ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "sync"
    "sync/atomic"
    "time"
)

var sites []string = []string{
    "http://example.com/site1.html",
    "http://example.com/site2.html",
    "http://example.com/site3.html",
}
var activeThreads int32 = 0
var doneCount = 0

const maxActiveThreads = 1

func scrapeSite(site string, condition *sync.Cond) {
    condition.L.Lock()
    // Re-check the condition in a loop to guard against spurious wakeups
    for activeThreads >= maxActiveThreads {
        println(site + " Max threads reached")
        condition.Wait()
    }
    condition.L.Unlock()

    atomic.AddInt32(&activeThreads, 1)
    // Scraping code goes here ...
    println("scraping " + site)
    atomic.AddInt32(&activeThreads, -1)

    // Record completion so main knows when to exit
    condition.L.Lock()
    doneCount++
    condition.L.Unlock()
    condition.Signal()
}

func main() {
    var l = sync.Mutex{}
    var c = sync.NewCond(&l)

    for _, site := range sites {
        println("starting scraper for " + site)
        go scrapeSite(site, c)
    }
    for doneCount < len(sites) {
        time.Sleep(1 * time.Second)
    }
    println("Done!")
}
--------------------------------------------------------------------------------
/Chapter08/ch8ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"

    "github.com/gocolly/colly"
)

func main() {
    c := colly.NewCollector(colly.AllowedDomains("go-colly.org"))

    // Find and visit all links
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        e.Request.Visit(e.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    c.Visit("http://go-colly.org/")
}
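
Note: Colly can also enforce politeness rules itself through its LimitRule API. A minimal sketch (our addition; the parallelism and delay values are illustrative) extending the collector above:

package main

import (
    "fmt"
    "time"

    "github.com/gocolly/colly"
)

func main() {
    c := colly.NewCollector(colly.AllowedDomains("go-colly.org"))

    // Politeness: at most 2 parallel requests, with a delay between requests
    c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 2,
        Delay:       1 * time.Second,
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    c.Visit("http://go-colly.org/")
}
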
--------------------------------------------------------------------------------
/Chapter08/ch8ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "encoding/json"
    "fmt"
    "strings"
    "time"

    "github.com/4ydx/cdp/protocol/dom"
    "github.com/4ydx/chrome-protocol"
    "github.com/4ydx/chrome-protocol/actions"
    "github.com/PuerkitoBio/goquery"
)

func getHTML() string {
    browser := cdp.NewBrowser("/usr/bin/google-chrome", 9222, "browser.log")
    handle := cdp.Start(browser, cdp.LogBasic)
    err := actions.EnableAll(handle, 2*time.Second)
    if err != nil {
        panic(err)
    }
    _, err = actions.Navigate(handle, "https://www.amazon.com/gp/goldbox", 30*time.Second)
    if err != nil {
        panic(err)
    }

    var nodes []dom.Node
    retries := 5

    for len(nodes) == 0 && retries > 0 {
        nodes, err = actions.FindAll(
            handle,
            "div.GB-M-COMMON.GB-SUPPLE:first-child #widgetContent",
            10*time.Second)
        retries--
        time.Sleep(1 * time.Second)
    }

    if len(nodes) == 0 || retries == 0 {
        panic("could not find results")
    }

    reply, err := actions.Evaluate(handle, "document.body.outerHTML;", 30*time.Second)
    if err != nil {
        panic(err)
    }

    a := struct {
        Value string
    }{}
    json.Unmarshal([]byte("{\"value\":"+string(*reply.Result.Value)+"}"), &a)
    body := a.Value

    handle.Stop(false)
    browser.Stop()
    return body
}

func parseProducts(htmlBody string) []string {
    println("parsing response")
    rdr := strings.NewReader(htmlBody)
    body, err := goquery.NewDocumentFromReader(rdr)
    if err != nil {
        panic(err)
    }

    products := []string{}
    details := body.Find("div.dealDetailContainer")
    println("Looking for products")
    details.Each(func(_ int, detail *goquery.Selection) {
        println(".")
        title := detail.Find("a#dealTitle").Text()
        price := detail.Find("div.priceBlock").Text()

        title = strings.TrimSpace(title)
        price = strings.TrimSpace(price)

        products = append(products, title+"\n"+price)
    })
    return products
}

func main() {
    println("getting HTML...")
    html := getHTML()
    println("parsing HTML...")
    products := parseProducts(html)

    println("Results:")
    for _, product := range products {
        fmt.Println(product + "\n")
    }
}
--------------------------------------------------------------------------------
/Chapter08/ch8ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "net/http"

    "github.com/slotix/dataflowkit/fetch"
)

func main() {
    r := fetch.Request{
        Type:      "base",
        URL:       "http://example.com",
        Method:    "GET",
        UserToken: "randomString",
        Actions:   "",
    }

    data, err := json.Marshal(&r)
    if err != nil {
        panic(err)
    }
    resp, err := http.Post("http://localhost:8000/fetch", "application/json", bytes.NewBuffer(data))
    if err != nil {
        panic(err)
    }

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}
--------------------------------------------------------------------------------
/Chapter08/ch8ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "net/http"

    "github.com/slotix/dataflowkit/fetch"
    "github.com/slotix/dataflowkit/scrape"
)

func main() {
    r := scrape.Payload{
        Name: "Daily Deals",
        Request: fetch.Request{
            Type:   "Base",
            URL:    "https://www.packtpub.com/latest-releases",
            Method: "GET",
        },
        Fields: []scrape.Field{
            {
                Name:     "Title",
                Selector: `div.landing-page-row div[itemtype$="/Product"] div.book-block-title`,
                Extractor: scrape.Extractor{
                    Types:   []string{"text"},
                    Filters: []string{"trim"},
                },
            }, {
                Name:     "Price",
                Selector: `div.landing-page-row div[itemtype$="/Product"] div.book-block-price-discounted`,
                Extractor: scrape.Extractor{
                    Types:   []string{"text"},
                    Filters: []string{"trim"},
                },
            },
        },
        Format: "CSV",
    }

    data, err := json.Marshal(&r)
    if err != nil {
        panic(err)
    }
    resp, err := http.Post("http://localhost:8001/parse", "application/json", bytes.NewBuffer(data))
    if err != nil {
        panic(err)
    }

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Go Web Scraping Quick Start Guide

[Go Web Scraping Quick Start Guide](https://www.packtpub.com/big-data-and-business-intelligence/go-web-scraping-quick-start-guide?utm_source=github&utm_medium=repository&utm_campaign=9781789615708)

This is the code repository for [Go Web Scraping Quick Start Guide](https://www.packtpub.com/big-data-and-business-intelligence/go-web-scraping-quick-start-guide?utm_source=github&utm_medium=repository&utm_campaign=9781789615708), published by Packt.

**Implement the power of Go to scrape and crawl data from the web**

## What is this book about?
Web scraping is the process of extracting information from the web using various tools that perform scraping and crawling. Go is emerging as the language of choice for scraping, with a variety of libraries to draw on. This book will quickly show you how to scrape data from various websites using Go libraries such as Colly and Goquery.

This book covers the following exciting features:
* Implement Cache-Control to avoid unnecessary network calls
* Coordinate concurrent scrapers
* Design a custom, larger-scale scraping system
* Scrape basic HTML pages with Colly and JavaScript pages with chromedp
* Discover how to search using the "strings" and "regexp" packages
* Set up a Go development environment
* Retrieve information from an HTML document
* Protect your web scraper from being blocked by using proxies
* Control web browsers to scrape JavaScript sites

If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789615704) today!
## Instructions and Navigations
All of the code is organized into folders. For example, Chapter02.

The code will look like the following:
```
POST /login HTTP/1.1
Host: myprotectedsite.com
Content-Type: application/x-www-form-urlencoded
Content-Length: 38

username=myuser&password=supersecretpw
```
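
In Go, a request like this could be sent with `http.PostForm`, in the style of the Chapter 5 examples (a sketch; `myprotectedsite.com` and the `/login` path are the placeholder values from the request above):
```
package main

import (
	"net/http"
	"net/url"
)

func main() {
	// Build the form body: username=myuser&password=supersecretpw
	data := url.Values{}
	data.Set("username", "myuser")
	data.Set("password", "supersecretpw")

	response, err := http.PostForm("https://myprotectedsite.com/login", data)
	if err != nil {
		panic(err)
	}
	defer response.Body.Close()
	println(response.StatusCode)
}
```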

**Following is what you need for this book:**
Data scientists and web developers with a basic knowledge of Golang who want to collect web data and analyze it for effective reporting and visualization.

With the following software and hardware list you can run all code files present in the book (Chapters 1-8).

### Software and Hardware List
| Chapter | Software required | OS required |
| ------- | ----------------- | ----------- |
| 1-8 | Git (2.0+) | Windows, Mac OS X, and Linux |
| 1-8 | Go (1.11) | Windows, Mac OS X, and Linux |

### Related products
* Python Web Scraping Cookbook [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/python-web-scraping-cookbook?utm_source=github&utm_medium=repository&utm_campaign=) [[Amazon]](https://www.amazon.com/dp/1787285219)

* R Web Scraping Quick Start Guide [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/r-web-scraping-quick-start-guide?utm_source=github&utm_medium=repository&utm_campaign=) [[Amazon]](https://www.amazon.com/dp/1789138736)

## Get to Know the Author
**Vincent Smith**
has been a software engineer for 10 years, having worked in various fields ranging from health IT to machine learning and large-scale web scrapers. He has worked for Fortune 500 companies and start-ups alike, and has sharpened his skills from the best of both worlds. While obtaining a degree in electrical engineering, he learned the foundations of writing good code through his Java courses. These basics helped spur his software development work early in his professional career, where he provided support for his team. He fell in love with the process of teaching computers how to behave, and it set him on the path he still walks today.

### Suggestions and Feedback
[Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.

### Download a free PDF

If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.

https://packt.link/free-ebook/9781789615708

--------------------------------------------------------------------------------