├── Chapter02
│   └── ch2ex1
│       ├── index.html
│       └── main.go
├── Chapter03
│   ├── ch3ex1
│   │   └── main.go
│   ├── ch3ex2
│   │   └── main.go
│   ├── ch3ex3
│   │   └── main.go
│   └── ch3ex4
│       └── main.go
├── Chapter04
│   ├── ch4ex1
│   │   └── main.go
│   ├── ch4ex2
│   │   └── main.go
│   ├── ch4ex3
│   │   └── main.go
│   ├── ch4ex4
│   │   └── main.go
│   ├── ch4ex5
│   │   └── main.go
│   ├── ch4ex6
│   │   └── main.go
│   ├── ch4ex7
│   │   └── main.go
│   └── ch4ex8
│       └── main.go
├── Chapter05
│   ├── ch5ex1
│   │   └── main.go
│   ├── ch5ex2
│   │   └── main.go
│   ├── ch5ex3
│   │   └── main.go
│   └── ch5ex4
│       └── main.go
├── Chapter06
│   ├── ch6ex1
│   │   └── main.go
│   └── ch6ex2
│       └── main.go
├── Chapter07
│   ├── ch7ex1
│   │   └── main.go
│   ├── ch7ex2
│   │   └── main.go
│   ├── ch7ex3
│   │   └── main.go
│   └── ch7ex4
│       └── main.go
├── Chapter08
│   ├── ch8ex1
│   │   └── main.go
│   ├── ch8ex2
│   │   └── main.go
│   ├── ch8ex3
│   │   └── main.go
│   └── ch8ex4
│       └── main.go
├── LICENSE
└── README.md
/Chapter02/ch2ex1/index.html:
--------------------------------------------------------------------------------
<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    /* ... inline page styles omitted ... */
    </style>
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>
    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
--------------------------------------------------------------------------------
/Chapter02/ch2ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "log"
    "net/http"
    "os"
)

func main() {
    // Create the variables for the response and error
    var r *http.Response
    var err error

    // Request index.html from example.com
    r, err = http.Get("http://www.example.com/index.html")

    // If there is a problem accessing the server, kill the program and print the error to the console
    if err != nil {
        panic(err)
    }

    // Check the status code returned by the server
    if r.StatusCode == 200 {
        // The request was successful!
        var webPageContent []byte

        // We know the size of the response is 1270 from the previous example
        var bodyLength int = 1270

        // Initialize the byte array to the size of the data
        webPageContent = make([]byte, bodyLength)

        // Read the data from the server
        r.Body.Read(webPageContent)

        // Open a writable file on your computer (create if it does not exist)
        var out *os.File
        out, err = os.OpenFile("index.html", os.O_CREATE|os.O_WRONLY, 0664)

        if err != nil {
            panic(err)
        }

        // Write the contents to a file
        out.Write(webPageContent)
        out.Close()
    } else {
        log.Fatal("Failed to retrieve the webpage. Received status code: ", r.Status)
    }
}
--------------------------------------------------------------------------------
/Chapter03/ch3ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "net/http"

    "github.com/temoto/robotstxt"
)

func main() {
    // Get the contents of robots.txt from packtpub.com
    resp, err := http.Get("https://www.packtpub.com/robots.txt")
    if err != nil {
        panic(err)
    }

    // Process the response using temoto/robotstxt
    data, err := robotstxt.FromResponse(resp)
    if err != nil {
        panic(err)
    }

    // Look for the definition in the robots.txt file that matches the default Go User-Agent string
    grp := data.FindGroup("Go-http-client/1.1")
    if grp != nil {
        testUrls := []string{
            // These paths are all permissible
            "/all",
            "/all?search=Go",
            "/bundles",

            // These paths are not
            "/contact/",
            "/search/",
            "/user/password/",
        }

        for _, url := range testUrls {
            print("checking " + url + "...")

            // Test the path against the User-Agent group
            if grp.Test(url) {
                println("OK")
            } else {
                println("X")
            }
        }
    }
}
--------------------------------------------------------------------------------
/Chapter03/ch3ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"
    "time"
)

func main() {
    // Tracks the timestamp of the last request to the webserver
    var lastRequestTime time.Time

    // The maximum number of requests we will make to the webserver
    maximumNumberOfRequests := 5

    // Our scrape rate at 1 page per 5 seconds
    pageDelay := 5 * time.Second

    for i := 0; i < maximumNumberOfRequests; i++ {
        // Calculate the time difference since our last request
        elapsedTime := time.Now().Sub(lastRequestTime)
        fmt.Printf("Elapsed Time: %.2f (s)\n", elapsedTime.Seconds())

        // Check if there has been enough time
        if elapsedTime < pageDelay {
            // Sleep the difference between the pageDelay and elapsedTime
            var timeDiff time.Duration = pageDelay - elapsedTime
            fmt.Printf("Sleeping for %.2f (s)\n", timeDiff.Seconds())
            time.Sleep(timeDiff)
        }

        // Just for this example, we are not processing the response
        println("GET example.com/index.html")
        _, err := http.Get("http://www.example.com/index.html")
        if err != nil {
            panic(err)
        }

        // Update the last request time
        lastRequestTime = time.Now()
    }
}
--------------------------------------------------------------------------------
/Chapter03/ch3ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "net/http"
    "time"
)

func main() {
    // Tracks the timestamp of the last request to each webserver
    lastRequestMap := map[string]time.Time{
        "example.com":  time.Time{},
        "packtpub.com": time.Time{},
    }

    // The maximum number of requests we will make
    maximumNumberOfRequests := 5
    pageDelay := 5 * time.Second

    examplePage := "http://www.example.com/index.html"
    packtPage := "https://www.packtpub.com/"

    for i := 0; i < maximumNumberOfRequests; i++ {
        var elapsedTime time.Duration
        webpage := examplePage

        // Check if "i" is an even number
        if i%2 == 0 {
            // Use the Packt Publishing site and elapsed time
            webpage = packtPage
            elapsedTime = time.Now().Sub(lastRequestMap["packtpub.com"])
        } else {
            // Use the example.com elapsed time
            elapsedTime = time.Now().Sub(lastRequestMap["example.com"])
        }

        fmt.Printf("Elapsed Time: %.2f (s)\n", elapsedTime.Seconds())

        if elapsedTime < pageDelay {
            var timeDiff time.Duration = pageDelay - elapsedTime
            fmt.Printf("Sleeping for %.2f (s)\n", timeDiff.Seconds())
            time.Sleep(timeDiff)
        }

        println("GET " + webpage)
        _, err := http.Get(webpage)
        if err != nil {
            panic(err)
        }

        // Update the last request time for the site we just hit
        if i%2 == 0 {
            lastRequestMap["packtpub.com"] = time.Now()
        } else {
            lastRequestMap["example.com"] = time.Now()
        }
    }
}
--------------------------------------------------------------------------------
/Chapter03/ch3ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "io/ioutil"

    "github.com/gregjones/httpcache"
    "github.com/gregjones/httpcache/diskcache"
)

func main() {
    // Set up the local disk cache
    storage := diskcache.New("./cache")
    cache := httpcache.NewTransport(storage)

    // Set this to true to inform us if the responses are being read from a cache
    cache.MarkCachedResponses = true
    cachedClient := cache.Client()

    // Make the initial request
    println("Caching: http://www.example.com/index.html")
    resp, err := cachedClient.Get("http://www.example.com/index.html")
    if err != nil {
        panic(err)
    }

    // httpcache requires you to read the body in order to cache the response
    ioutil.ReadAll(resp.Body)
    resp.Body.Close()

    // Request index.html again
    println("Requesting: http://www.example.com/index.html")
    resp, err = cachedClient.Get("http://www.example.com/index.html")
    if err != nil {
        panic(err)
    }

    // Look for the flag added by httpcache to show the result is read from the cache
    _, ok := resp.Header["X-From-Cache"]
    if ok {
        println("Result was pulled from the cache!")
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "strings"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    // Count the opening anchor tags to estimate the number of links
    numLinks := strings.Count(stringBody, "<a href")
    fmt.Printf("Found %d links\n", numLinks)
}

--------------------------------------------------------------------------------
/Chapter04/ch4ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "io/ioutil"
    "net/http"
    "strings"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    // Determine the document type from the doctype declaration
    if strings.Contains(stringBody, "<!DOCTYPE html>") {
        println("This webpage is HTML5")
    } else if strings.Contains(stringBody, "html/strict.dtd") {
        println("This webpage is HTML4 (Strict)")
    } else if strings.Contains(stringBody, "html/loose.dtd") {
        println("This webpage is HTML4 (Transitional)")
    } else if strings.Contains(stringBody, "html/frameset.dtd") {
        println("This webpage is HTML4 (Frameset)")
    } else {
        println("Could not determine doctype!")
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "regexp"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    // Match anchor tags and capture the absolute URL in the href attribute
    re := regexp.MustCompile(`<a[^>]*href\s*=\s*["'](http[^"']+)["']`)
    linkMatches := re.FindAllStringSubmatch(stringBody, -1)

    fmt.Printf("Found %d links:\n", len(linkMatches))
    for _, linkGroup := range linkMatches {
        println(linkGroup[1])
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "regexp"
)

func main() {
    resp, err := http.Get("https://www.packtpub.com/application-development/hands-go-programming")
    if err != nil {
        panic(err)
    }

    data, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    stringBody := string(data)

    re := regexp.MustCompile(`.*main-book-price.*\n.*(\$[0-9]*\.[0-9]{0,2})`)
    priceMatches := re.FindStringSubmatch(stringBody)

    fmt.Printf("Book Price: %s\n", priceMatches[1])
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex5/main.go:
--------------------------------------------------------------------------------
package main

import (
    "regexp"
    "strings"

    "github.com/antchfx/htmlquery"
)

func main() {
    doc, err := htmlquery.LoadURL("https://www.packtpub.com/packt/offers/free-learning")
    if err != nil {
        panic(err)
    }

    dealTextNodes := htmlquery.Find(doc, `//div[@class="dotd-main-book-summary float-left"]//text()`)

    println("Here is the free book of the day!")
    println("----------------------------------")

    // Filter out text nodes that contain only a tag name
    matchTagNames := regexp.MustCompile("^(div|span|h2|br|ul|li)$")

    for _, node := range dealTextNodes {
        text := strings.TrimSpace(node.Data)
        text = matchTagNames.ReplaceAllString(text, "")
        if text != "" {
            println(text)
        }
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex6/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "strconv"

    "github.com/antchfx/htmlquery"
)

func main() {
    doc, err := htmlquery.LoadURL("https://www.packtpub.com/latest-releases")
    if err != nil {
        panic(err)
    }

    nodes := htmlquery.Find(doc, `//div[@class="landing-page-row cf"]/div[@itemtype="http://schema.org/Product"]`)

    println("Here are the latest releases!")
    println("-----------------------------")

    for _, node := range nodes {
        var title string
        var price float64

        for _, attribute := range node.Attr {
            switch attribute.Key {
            case "data-product-title":
                title = attribute.Val
            case "data-product-price":
                price, err = strconv.ParseFloat(attribute.Val, 64)
                if err != nil {
                    println("Failed to parse price")
                }
            }
        }
        fmt.Printf("%s ($%0.2f)\n", title, price)
    }
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex7/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "strconv"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    doc, err := goquery.NewDocument("https://www.packtpub.com/latest-releases")
    if err != nil {
        panic(err)
    }

    println("Here are the latest releases!")
    println("-----------------------------")
    doc.Find(`div.landing-page-row div[itemtype$="/Product"]`).
        Each(func(i int, e *goquery.Selection) {
            var title string
            var price float64

            title, _ = e.Attr("data-product-title")
            priceString, _ := e.Attr("data-product-price")
            price, err = strconv.ParseFloat(priceString, 64)
            if err != nil {
                println("Failed to parse price")
            }
            fmt.Printf("%s ($%0.2f)\n", title, price)
        })
}
--------------------------------------------------------------------------------
/Chapter04/ch4ex8/main.go:
--------------------------------------------------------------------------------
package main

import (
    "bufio"
    "strings"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    doc, err := goquery.NewDocument("https://www.packtpub.com/packt/offers/free-learning")
    if err != nil {
        panic(err)
    }

    println("Here is the free book of the day!")
    println("----------------------------------")
    rawText := doc.Find(`div.dotd-main-book-summary div:not(.eighteen-days-countdown-bar)`).Text()
    reader := bufio.NewReader(strings.NewReader(rawText))

    // Print each non-empty line of the extracted text
    var line []byte
    for err == nil {
        line, _, err = reader.ReadLine()
        trimmedLine := strings.TrimSpace(string(line))
        if trimmedLine != "" {
            println(trimmedLine)
        }
    }
}
--------------------------------------------------------------------------------
/Chapter05/ch5ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
)

func main() {
    doc, err := goquery.NewDocument("https://www.packtpub.com/latest-releases")
    if err != nil {
        panic(err)
    }

    println("Here are the latest releases!")
    println("-----------------------------")
    time.Sleep(1 * time.Second)
    doc.Find(`div.landing-page-row div[itemtype$="/Product"] a`).
        Each(func(i int, e *goquery.Selection) {
            var title, description, author, price string
            link, _ := e.Attr("href")
            link = "https://www.packtpub.com" + link

            bookPage, err := goquery.NewDocument(link)
            if err != nil {
                panic(err)
            }
            title = bookPage.Find("div.book-top-block-info h1").Text()
            description = strings.TrimSpace(bookPage.Find("div.book-top-block-info div.book-top-block-info-one-liner").Text())
            price = strings.TrimSpace(bookPage.Find("div.book-top-block-info div.onlyDesktop div.book-top-pricing-main-ebook-price").Text())
            authorNodes := bookPage.Find("div.book-top-block-info div.book-top-block-info-authors")
            if len(authorNodes.Nodes) < 1 {
                return
            }
            author = strings.TrimSpace(authorNodes.Nodes[0].FirstChild.Data)
            fmt.Printf("%s\nby: %s\n%s\n%s\n---------------------\n\n", title, author, price, description)
            time.Sleep(1 * time.Second)
        })
}
--------------------------------------------------------------------------------
/Chapter05/ch5ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "net/http"
    "net/url"
)

func main() {
    data := url.Values{}
    data.Set("s", "Golang")

    response, err := http.PostForm("https://hub.packtpub.com/", data)
    if err != nil {
        panic(err)
    }
    // ... Continue processing the response ...
    println(response.StatusCode)
}
--------------------------------------------------------------------------------
/Chapter05/ch5ex3/main.go:
--------------------------------------------------------------------------------
package main

func main() {
    // Use a map as a set to track which URLs have already been visited
    visitedURLs := map[string]interface{}{}

    visitedURLs["https://www.packtpub.com/"] = nil

}
--------------------------------------------------------------------------------
/Chapter05/ch5ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "github.com/tebeka/selenium"
)

func main() {

    // The paths to these binaries will be different on your machine!

    const (
        seleniumPath    = "/home/vincent/Documents/workspace/Go/src/github.com/tebeka/selenium/vendor/selenium-server-standalone-3.14.0.jar"
        geckoDriverPath = "/home/vincent/Documents/workspace/Go/src/github.com/tebeka/selenium/vendor/geckodriver-v0.23.0-linux64"
    )

    service, err := selenium.NewSeleniumService(
        seleniumPath,
        8080,
        selenium.GeckoDriver(geckoDriverPath))

    if err != nil {
        panic(err)
    }
    defer service.Stop()

    caps := selenium.Capabilities{"browserName": "firefox"}
    wd, err := selenium.NewRemote(caps, "http://localhost:8080/wd/hub")
    if err != nil {
        panic(err)
    }
    defer wd.Quit()

    err = wd.Get("https://www.packtpub.com/networking-and-servers/mastering-go")
    if err != nil {
        panic(err)
    }

    // Wait until the review elements have been rendered by the browser
    var elems []selenium.WebElement
    wd.Wait(func(wd2 selenium.WebDriver) (bool, error) {
        elems, err = wd2.FindElements(selenium.ByCSSSelector, "div.product-reviews-review div.review-body")
        if err != nil {
            return false, err
        }
        return len(elems) > 0, nil
    })

    for _, review := range elems {
        body, err := review.Text()
        if err != nil {
            panic(err)
        }
        println(body)
    }
}
--------------------------------------------------------------------------------
/Chapter06/ch6ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "io/ioutil"
    "math/rand"
    "net/http"
    "net/url"
    "time"
)

// Public proxies from https://hidemyna.me
var proxies = []string{
    "http://207.154.231.208:8080",
    "http://138.68.230.88:8080",
    "http://162.243.107.45:8080",
}

// GetProxy returns a randomly chosen proxy URL for each outgoing request
func GetProxy(_ *http.Request) (*url.URL, error) {
    randomIndex := rand.Int31n(int32(len(proxies)))
    randomProxy := proxies[randomIndex]

    return url.Parse(randomProxy)
}

func main() {
    rand.Seed(time.Now().Unix())
    http.DefaultTransport.(*http.Transport).Proxy = GetProxy

    // Continue with your HTTP requests
    for i := 0; i < 5; i++ {
        resp, err := http.Get("http://ip-api.com/line")
        if err != nil {
            panic(err)
        }
        data, err := ioutil.ReadAll(resp.Body)
        if err != nil {
            panic(err)
        }
        println("Proxy IP is: " + string(data))
        time.Sleep(1 * time.Second)
    }
}
--------------------------------------------------------------------------------
/Chapter06/ch6ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "net/url"
    "path"
)

func main() {
    parsedUrl, err := url.Parse("https://hub.packtpub.com/8-programming-languages-to-learn-in-2019")
    if err != nil {
        panic(err)
    }

    // Match the host and path against a whitelist pattern
    site := parsedUrl.Host + parsedUrl.Path
    doesMatch, err := path.Match("hub.packtpub.com/*", site)
    if err != nil {
        panic(err)
    }
    if doesMatch {
        // Continue scraping …
        println("It's a match")
    }
}
--------------------------------------------------------------------------------
/Chapter07/ch7ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"
    "time"
)

func startTicker() {
    ticks := 0
    for {
        fmt.Println(ticks)
        ticks++
        time.Sleep(1 * time.Second)
    }
}

func main() {
    println("Starting ticker")
    go startTicker()
    time.Sleep(10 * time.Second)
}
--------------------------------------------------------------------------------
/Chapter07/ch7ex2/main.go:
--------------------------------------------------------------------------------
package main

func scrapeSite(url string, statusChan chan map[string]string) {
    // Performing scraping operations...
    statusChan <- map[string]string{url: "DONE"}
}

func main() {
    siteStatus := map[string]string{
        "http://example.com/page1.html": "READY",
        "http://example.com/page2.html": "READY",
        "http://example.com/page3.html": "READY",
    }

    updatesChan := make(chan map[string]string)

    numberCompleted := 0
    for site := range siteStatus {
        siteStatus[site] = "WORKING"
        go scrapeSite(site, updatesChan)
    }

    for update := range updatesChan {
        for url, status := range update {
            siteStatus[url] = status
            numberCompleted++
        }
        if numberCompleted == len(siteStatus) {
            close(updatesChan)
        }
    }
}
--------------------------------------------------------------------------------
/Chapter07/ch7ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "sync"
    "time"
)

var sites = []string{
    "http://example.com/site1.html",
    "http://example.com/site2.html",
    "http://example.com/site3.html",
}
var activeThreads = 0
var doneCount = 0

const maxActiveThreads = 1

func scrapeSite(site string, condition *sync.Cond) {
    condition.L.Lock()
    // Re-check the condition after waking, in case another goroutine
    // claimed the free slot first
    for activeThreads >= maxActiveThreads {
        println(site + " Max active threads reached!")
        println(site + " is waiting...")
        condition.Wait()
        println(site + " is awake...")
    }
    activeThreads++
    condition.L.Unlock()

    println("scraping " + site)
    // Scraping code goes here ...
    // We will simulate it with a sleep
    time.Sleep(1 * time.Second)

    condition.L.Lock()
    activeThreads--
    doneCount++
    condition.L.Unlock()
    condition.Signal()
}

func main() {
    var l = sync.Mutex{}
    var c = sync.NewCond(&l)

    for _, site := range sites {
        println("starting scraper for " + site)
        go scrapeSite(site, c)
    }
    for doneCount < len(sites) {
        time.Sleep(1 * time.Second)
    }
    println("Done!")
}
--------------------------------------------------------------------------------
/Chapter07/ch7ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "sync"
    "sync/atomic"
    "time"
)

var sites = []string{
    "http://example.com/site1.html",
    "http://example.com/site2.html",
    "http://example.com/site3.html",
}
var activeThreads int32 = 0
var doneCount int32 = 0

const maxActiveThreads = 1

func scrapeSite(site string, condition *sync.Cond) {
    condition.L.Lock()
    // Re-check the condition after waking, in case another goroutine
    // claimed the free slot first
    for activeThreads >= maxActiveThreads {
        println(site + " Max threads reached")
        condition.Wait()
    }
    condition.L.Unlock()

    atomic.AddInt32(&activeThreads, 1)
    // Scraping code goes here ...
    println("scraping " + site)
    atomic.AddInt32(&activeThreads, -1)

    // Record completion so main() knows when to exit
    atomic.AddInt32(&doneCount, 1)
    condition.Signal()
}

func main() {
    var l = sync.Mutex{}
    var c = sync.NewCond(&l)

    for _, site := range sites {
        println("starting scraper for " + site)
        go scrapeSite(site, c)
    }
    for atomic.LoadInt32(&doneCount) < int32(len(sites)) {
        time.Sleep(1 * time.Second)
    }
    println("Done!")
}
--------------------------------------------------------------------------------
/Chapter08/ch8ex1/main.go:
--------------------------------------------------------------------------------
package main

import (
    "fmt"

    "github.com/gocolly/colly"
)

func main() {
    c := colly.NewCollector(colly.AllowedDomains("go-colly.org"))

    // Find and visit all links
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        e.Request.Visit(e.Attr("href"))
    })

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("Visiting", r.URL)
    })

    c.Visit("http://go-colly.org/")
}
--------------------------------------------------------------------------------
/Chapter08/ch8ex2/main.go:
--------------------------------------------------------------------------------
package main

import (
    "encoding/json"
    "fmt"
    "strings"
    "time"

    "github.com/4ydx/cdp/protocol/dom"
    "github.com/4ydx/chrome-protocol"
    "github.com/4ydx/chrome-protocol/actions"
    "github.com/PuerkitoBio/goquery"
)

func getHTML() string {
    browser := cdp.NewBrowser("/usr/bin/google-chrome", 9222, "browser.log")
    handle := cdp.Start(browser, cdp.LogBasic)
    err := actions.EnableAll(handle, 2*time.Second)
    if err != nil {
        panic(err)
    }
    _, err = actions.Navigate(handle, "https://www.amazon.com/gp/goldbox", 30*time.Second)
    if err != nil {
        panic(err)
    }

    var nodes []dom.Node
    retries := 5

    for len(nodes) == 0 && retries > 0 {
        nodes, err = actions.FindAll(
            handle,
            "div.GB-M-COMMON.GB-SUPPLE:first-child #widgetContent",
            10*time.Second)
        retries--
        time.Sleep(1 * time.Second)
    }

    if len(nodes) == 0 {
        panic("could not find results")
    }

    reply, err := actions.Evaluate(handle, "document.body.outerHTML;", 30*time.Second)
    if err != nil {
        panic(err)
    }

    // The reply value is a JSON-encoded string; wrap it in an object and decode it
    a := struct {
        Value string
    }{}
    json.Unmarshal([]byte("{\"value\":"+string(*reply.Result.Value)+"}"), &a)
    body := a.Value

    handle.Stop(false)
    browser.Stop()
    return body
}

func parseProducts(htmlBody string) []string {
    println("parsing response")
    rdr := strings.NewReader(htmlBody)
    body, err := goquery.NewDocumentFromReader(rdr)
    if err != nil {
        panic(err)
    }

    products := []string{}
    details := body.Find("div.dealDetailContainer")
    println("Looking for products")
    details.Each(func(_ int, detail *goquery.Selection) {
        println(".")
        title := detail.Find("a#dealTitle").Text()
        price := detail.Find("div.priceBlock").Text()

        title = strings.TrimSpace(title)
        price = strings.TrimSpace(price)

        products = append(products, title+"\n"+price)
    })
    return products
}

func main() {
    println("getting HTML...")
    html := getHTML()
    println("parsing HTML...")
    products := parseProducts(html)

    println("Results:")
    for _, product := range products {
        fmt.Println(product + "\n")
    }
}
--------------------------------------------------------------------------------
/Chapter08/ch8ex3/main.go:
--------------------------------------------------------------------------------
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "net/http"

    "github.com/slotix/dataflowkit/fetch"
)

func main() {
    r := fetch.Request{
        Type:      "base",
        URL:       "http://example.com",
        Method:    "GET",
        UserToken: "randomString",
        Actions:   "",
    }

    data, err := json.Marshal(&r)
    if err != nil {
        panic(err)
    }
    resp, err := http.Post("http://localhost:8000/fetch", "application/json", bytes.NewBuffer(data))
    if err != nil {
        panic(err)
    }

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}
--------------------------------------------------------------------------------
/Chapter08/ch8ex4/main.go:
--------------------------------------------------------------------------------
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "net/http"

    "github.com/slotix/dataflowkit/fetch"
    "github.com/slotix/dataflowkit/scrape"
)

func main() {
    r := scrape.Payload{
        Name: "Daily Deals",
        Request: fetch.Request{
            Type:   "Base",
            URL:    "https://www.packtpub.com/latest-releases",
            Method: "GET",
        },
        Fields: []scrape.Field{
            {
                Name:     "Title",
                Selector: `div.landing-page-row div[itemtype$="/Product"] div.book-block-title`,
                Extractor: scrape.Extractor{
                    Types:   []string{"text"},
                    Filters: []string{"trim"},
                },
            }, {
                Name:     "Price",
                Selector: `div.landing-page-row div[itemtype$="/Product"] div.book-block-price-discounted`,
                Extractor: scrape.Extractor{
                    Types:   []string{"text"},
                    Filters: []string{"trim"},
                },
            },
        },
        Format: "CSV",
    }

    data, err := json.Marshal(&r)
    if err != nil {
        panic(err)
    }
    resp, err := http.Post("http://localhost:8001/parse", "application/json", bytes.NewBuffer(data))
    if err != nil {
        panic(err)
    }

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        panic(err)
    }

    fmt.Println(string(body))
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Go Web Scraping Quick Start Guide

This is the code repository for [Go Web Scraping Quick Start Guide](https://www.packtpub.com/big-data-and-business-intelligence/go-web-scraping-quick-start-guide?utm_source=github&utm_medium=repository&utm_campaign=9781789615708), published by Packt.

**Implement the power of Go to scrape and crawl data from the web**

## What is this book about?
Web scraping is the process of extracting information from the web using various tools that perform scraping and crawling. Go is emerging as the language of choice for scraping, with a variety of libraries to support it. This book will quickly show you how to scrape data from various websites using Go libraries such as Colly and Goquery.

This book covers the following exciting features:
* Implement Cache-Control to avoid unnecessary network calls
* Coordinate concurrent scrapers
* Design a custom, larger-scale scraping system
* Scrape basic HTML pages with Colly and JavaScript pages with chromedp
* Discover how to search using the "strings" and "regexp" packages
* Set up a Go development environment
* Retrieve information from an HTML document
* Protect your web scraper from being blocked by using proxies
* Control web browsers to scrape JavaScript sites

If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789615704) today!

## Instructions and Navigations
All of the code is organized into folders. For example, Chapter02.

The code will look like the following:
```
POST /login HTTP/1.1
Host: myprotectedsite.com
Content-Type: application/x-www-form-urlencoded
Content-Length: 38

username=myuser&password=supersecretpw
```
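
For reference, submitting that same form from Go might look like the following minimal sketch, using the standard library's `http.PostForm` (the same approach as Chapter05/ch5ex2). The URL and field values are the illustrative ones from the snippet above, not a real site:

```go
package main

import (
	"net/http"
	"net/url"
)

func main() {
	// Build the application/x-www-form-urlencoded body:
	// username=myuser&password=supersecretpw
	form := url.Values{}
	form.Set("username", "myuser")
	form.Set("password", "supersecretpw")

	// POST the form; http.PostForm sets the Content-Type header for us
	resp, err := http.PostForm("https://myprotectedsite.com/login", form)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	println(resp.Status)
}
```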

**Following is what you need for this book:**
This book is for data scientists and web developers with a basic knowledge of Golang who want to collect web data and analyze it for effective reporting and visualization.

With the following software and hardware list you can run all the code files present in the book (Chapters 1-8).
### Software and Hardware List
| Chapter | Software required | OS required                  |
| ------- | ----------------- | ---------------------------- |
| 1-8     | Git (2.0+)        | Windows, Mac OS X, and Linux |
| 1-8     | Go (1.11)         | Windows, Mac OS X, and Linux |

### Related products
* Python Web Scraping Cookbook [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/python-web-scraping-cookbook?utm_source=github&utm_medium=repository&utm_campaign=) [[Amazon]](https://www.amazon.com/dp/1787285219)

* R Web Scraping Quick Start Guide [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/r-web-scraping-quick-start-guide?utm_source=github&utm_medium=repository&utm_campaign=) [[Amazon]](https://www.amazon.com/dp/1789138736)

## Get to Know the Author
**Vincent Smith**
has been a software engineer for 10 years, having worked in fields ranging from health IT to machine learning and large-scale web scrapers. He has worked for Fortune 500 companies and start-ups alike, and has sharpened his skills from the best of both worlds. While obtaining a degree in electrical engineering, he learned the foundations of writing good code through his Java courses. These foundations spurred his software development career early on, enabling him to support his team. He fell in love with the process of teaching computers how to behave, and that love set him on the path he still walks today.

### Suggestions and Feedback
[Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.

### Download a free PDF

If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.

https://packt.link/free-ebook/9781789615708
--------------------------------------------------------------------------------