element with collection-product-card class call callback
73 | c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) {
74 | // Activate detailCollector if the link contains "coursera.org/learn"
75 | courseURL := e.Request.AbsoluteURL(e.Attr("href"))
76 | if strings.Index(courseURL, "coursera.org/learn") != -1 {
77 | detailCollector.Visit(courseURL)
78 | }
79 | })
80 |
81 | // Extract details of the course
82 | detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
83 | log.Println("Course found", e.Request.URL)
84 | title := e.ChildText(".banner-title")
85 | if title == "" {
86 | log.Println("No title found", e.Request.URL)
87 | }
88 | course := Course{
89 | Title: title,
90 | URL: e.Request.URL.String(),
91 | Description: e.ChildText("div.content"),
92 | Creator: e.ChildText("li.banner-instructor-info > a > div > div > span"),
93 | Rating: e.ChildText("span.number-rating"),
94 | }
95 | // Iterate over div components and add details to course
96 | e.ForEach(".AboutCourse .ProductGlance > div", func(_ int, el *colly.HTMLElement) {
97 | svgTitle := strings.Split(el.ChildText("div:nth-child(1) svg title"), " ")
98 | lastWord := svgTitle[len(svgTitle)-1]
99 | switch lastWord {
100 | // svg Title: Available Languages
101 | case "languages":
102 | course.Language = el.ChildText("div:nth-child(2) > div:nth-child(1)")
103 | // svg Title: Mixed/Beginner/Intermediate/Advanced Level
104 | case "Level":
105 | course.Level = el.ChildText("div:nth-child(2) > div:nth-child(1)")
106 | // svg Title: Hours to complete
107 | case "complete":
108 | course.Commitment = el.ChildText("div:nth-child(2) > div:nth-child(1)")
109 | }
110 | })
111 | courses = append(courses, course)
112 | })
113 |
114 | // Start scraping on http://coursera.com/browse
115 | c.Visit("https://coursera.org/browse")
116 |
117 | enc := json.NewEncoder(file)
118 | enc.SetIndent("", " ")
119 |
120 | // Dump json to the standard output
121 | enc.Encode(courses)
122 | }
123 |
--------------------------------------------------------------------------------
/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/csv"
5 | "log"
6 | "os"
7 |
8 | "github.com/gocolly/colly/v2"
9 | )
10 |
11 | func main() {
12 | fName := "cryptocoinmarketcap.csv"
13 | file, err := os.Create(fName)
14 | if err != nil {
15 | log.Fatalf("Cannot create file %q: %s\n", fName, err)
16 | return
17 | }
18 | defer file.Close()
19 | writer := csv.NewWriter(file)
20 | defer writer.Flush()
21 |
22 | // Write CSV header
23 | writer.Write([]string{"Name", "Symbol", "Market Cap (USD)", "Price (USD)", "Circulating Supply (USD)", "Volume (24h)", "Change (1h)", "Change (24h)", "Change (7d)"})
24 |
25 | // Instantiate default collector
26 | c := colly.NewCollector()
27 |
28 | c.OnHTML("tbody tr", func(e *colly.HTMLElement) {
29 | writer.Write([]string{
30 | e.ChildText(".cmc-table__column-name"),
31 | e.ChildText(".cmc-table__cell--sort-by__symbol"),
32 | e.ChildText(".cmc-table__cell--sort-by__market-cap"),
33 | e.ChildText(".cmc-table__cell--sort-by__price"),
34 | e.ChildText(".cmc-table__cell--sort-by__circulating-supply"),
35 | e.ChildText(".cmc-table__cell--sort-by__volume-24-h"),
36 | e.ChildText(".cmc-table__cell--sort-by__percent-change-1-h"),
37 | e.ChildText(".cmc-table__cell--sort-by__percent-change-24-h"),
38 | e.ChildText(".cmc-table__cell--sort-by__percent-change-7-d"),
39 | })
40 | })
41 |
42 | c.Visit("https://coinmarketcap.com/all/views/all/")
43 |
44 | log.Printf("Scraping finished, check file %q for results\n", fName)
45 | }
46 |
--------------------------------------------------------------------------------
/_examples/error_handling/error_handling.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/gocolly/colly/v2"
7 | )
8 |
9 | func main() {
10 | // Create a collector
11 | c := colly.NewCollector()
12 |
13 | // Set HTML callback
14 | // Won't be called if error occurs
15 | c.OnHTML("*", func(e *colly.HTMLElement) {
16 | fmt.Println(e)
17 | })
18 |
19 | // Set error handler
20 | c.OnError(func(r *colly.Response, err error) {
21 | fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
22 | })
23 |
24 | // Start scraping
25 | c.Visit("https://definitely-not-a.website/")
26 | }
27 |
--------------------------------------------------------------------------------
/_examples/factba.se/factbase.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | "strconv"
8 |
9 | "github.com/gocolly/colly/v2"
10 | )
11 |
// URL prefixes of the factba.se endpoints used by this example.
var (
	// baseSearchURL is the JSON search endpoint; a page number is appended to it.
	baseSearchURL = "https://factba.se/json/json-transcript.php?q=&f=&dt=&p="
	// baseTranscriptURL is the prefix of a transcript page; a slug is appended to it.
	baseTranscriptURL = "https://factba.se/transcript/"
)
14 |
// result is a single entry of the JSON search response. Only the two keys
// needed to fetch and name a transcript are decoded.
type result struct {
	// Slug is decoded from the "slug" key.
	Slug string `json:"slug"`
	// Date is decoded from the "date" key.
	Date string `json:"date"`
}
19 |
20 | type results struct {
21 | Data []*result `json:"data"`
22 | }
23 |
// transcript is one block of a transcript page: the speaker label and the
// text attributed to that speaker.
type transcript struct {
	Speaker, Text string
}
28 |
29 | func main() {
30 | c := colly.NewCollector(
31 | colly.AllowedDomains("factba.se"),
32 | )
33 |
34 | d := c.Clone()
35 |
36 | d.OnHTML("body", func(e *colly.HTMLElement) {
37 | t := make([]transcript, 0)
38 | e.ForEach(".topic-media-row", func(_ int, el *colly.HTMLElement) {
39 | t = append(t, transcript{
40 | Speaker: el.ChildText(".speaker-label"),
41 | Text: el.ChildText(".transcript-text-block"),
42 | })
43 | })
44 | jsonData, err := json.MarshalIndent(t, "", " ")
45 | if err != nil {
46 | return
47 | }
48 | os.WriteFile(colly.SanitizeFileName(e.Request.Ctx.Get("date")+"_"+e.Request.Ctx.Get("slug"))+".json", jsonData, 0644)
49 | })
50 |
51 | stop := false
52 | c.OnResponse(func(r *colly.Response) {
53 | rs := &results{}
54 | err := json.Unmarshal(r.Body, rs)
55 | if err != nil || len(rs.Data) == 0 {
56 | stop = true
57 | return
58 | }
59 | for _, res := range rs.Data {
60 | u := baseTranscriptURL + res.Slug
61 | ctx := colly.NewContext()
62 | ctx.Put("date", res.Date)
63 | ctx.Put("slug", res.Slug)
64 | d.Request("GET", u, nil, ctx, nil)
65 | }
66 | })
67 |
68 | for i := 1; i < 1000; i++ {
69 | if stop {
70 | break
71 | }
72 | if err := c.Visit(baseSearchURL + strconv.Itoa(i)); err != nil {
73 | fmt.Println("Error:", err)
74 | break
75 | }
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/_examples/google_groups/google_groups.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "flag"
6 | "log"
7 | "os"
8 | "strings"
9 |
10 | "github.com/gocolly/colly/v2"
11 | )
12 |
13 | // Mail is the container of a single e-mail
// Mail is the container of a single e-mail scraped from a thread page.
type Mail struct {
	// Title and Link are read from the first cell of a message row.
	Title, Link string
	// Author, Date and Message are read from the second, third and fourth
	// cells of a message row.
	Author, Date, Message string
}
21 |
22 | func main() {
23 | var groupName string
24 | flag.StringVar(&groupName, "group", "hspbp", "Google Groups group name")
25 | flag.Parse()
26 |
27 | threads := make(map[string][]Mail)
28 |
29 | threadCollector := colly.NewCollector()
30 | mailCollector := colly.NewCollector()
31 |
32 | // Collect threads
33 | threadCollector.OnHTML("tr", func(e *colly.HTMLElement) {
34 | ch := e.DOM.Children()
35 | author := ch.Eq(1).Text()
36 | // deleted topic
37 | if author == "" {
38 | return
39 | }
40 |
41 | title := ch.Eq(0).Text()
42 | link, _ := ch.Eq(0).Children().Eq(0).Attr("href")
43 | // fix link to point to the pure HTML version of the thread
44 | link = strings.Replace(link, ".com/d/topic", ".com/forum/?_escaped_fragment_=topic", 1)
45 | date := ch.Eq(2).Text()
46 |
47 | log.Printf("Thread found: %s %q %s %s\n", link, title, author, date)
48 | mailCollector.Visit(link)
49 | })
50 |
51 | // Visit next page
52 | threadCollector.OnHTML("body > a[href]", func(e *colly.HTMLElement) {
53 | log.Println("Next page link found:", e.Attr("href"))
54 | e.Request.Visit(e.Attr("href"))
55 | })
56 |
57 | // Extract mails
58 | mailCollector.OnHTML("body", func(e *colly.HTMLElement) {
59 | // Find subject
60 | threadSubject := e.ChildText("h2")
61 | if _, ok := threads[threadSubject]; !ok {
62 | threads[threadSubject] = make([]Mail, 0, 8)
63 | }
64 |
65 | // Extract mails
66 | e.ForEach("table tr", func(_ int, el *colly.HTMLElement) {
67 | mail := Mail{
68 | Title: el.ChildText("td:nth-of-type(1)"),
69 | Link: el.ChildAttr("td:nth-of-type(1)", "href"),
70 | Author: el.ChildText("td:nth-of-type(2)"),
71 | Date: el.ChildText("td:nth-of-type(3)"),
72 | Message: el.ChildText("td:nth-of-type(4)"),
73 | }
74 | threads[threadSubject] = append(threads[threadSubject], mail)
75 | })
76 |
77 | // Follow next page link
78 | if link, found := e.DOM.Find("> a[href]").Attr("href"); found {
79 | e.Request.Visit(link)
80 | } else {
81 | log.Printf("Thread %q done\n", threadSubject)
82 | }
83 | })
84 |
85 | threadCollector.Visit("https://groups.google.com/forum/?_escaped_fragment_=forum/" + groupName)
86 |
87 | enc := json.NewEncoder(os.Stdout)
88 | enc.SetIndent("", " ")
89 |
90 | // Dump json to the standard output
91 | enc.Encode(threads)
92 | }
93 |
--------------------------------------------------------------------------------
/_examples/hackernews_comments/hackernews_comments.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "flag"
6 | "log"
7 | "os"
8 | "strconv"
9 | "strings"
10 |
11 | "github.com/gocolly/colly/v2"
12 | )
13 |
// comment is one node of the comment tree. The tagged fields carry the
// `selector` (and optional `attr`) tags consumed when an HTML element is
// unmarshalled into the struct.
type comment struct {
	// Author is taken from the "a.hnuser" element.
	Author string `selector:"a.hnuser"`
	// URL is the href attribute of the ".age a[href]" element.
	URL string `selector:".age a[href]" attr:"href"`
	// Comment is taken from the ".comment" element.
	Comment string `selector:".comment"`
	// Replies holds the direct children of this comment.
	Replies []*comment
	// depth is the nesting level of the comment; being unexported it is not
	// part of the JSON output.
	depth int
}
21 |
22 | func main() {
23 | var itemID string
24 | flag.StringVar(&itemID, "id", "", "hackernews post id")
25 | flag.Parse()
26 |
27 | if itemID == "" {
28 | log.Println("Hackernews post id required")
29 | os.Exit(1)
30 | }
31 |
32 | comments := make([]*comment, 0)
33 |
34 | // Instantiate default collector
35 | c := colly.NewCollector()
36 |
37 | // Extract comment
38 | c.OnHTML(".comment-tree tr.athing", func(e *colly.HTMLElement) {
39 | width, err := strconv.Atoi(e.ChildAttr("td.ind img", "width"))
40 | if err != nil {
41 | return
42 | }
43 | // hackernews uses 40px spacers to indent comment replies,
44 | // so we have to divide the width with it to get the depth
45 | // of the comment
46 | depth := width / 40
47 | c := &comment{
48 | Replies: make([]*comment, 0),
49 | depth: depth,
50 | }
51 | e.Unmarshal(c)
52 | c.Comment = strings.TrimSpace(c.Comment[:len(c.Comment)-5])
53 | if depth == 0 {
54 | comments = append(comments, c)
55 | return
56 | }
57 | parent := comments[len(comments)-1]
58 | // append comment to its parent
59 | for i := 0; i < depth-1; i++ {
60 | parent = parent.Replies[len(parent.Replies)-1]
61 | }
62 | parent.Replies = append(parent.Replies, c)
63 | })
64 |
65 | c.Visit("https://news.ycombinator.com/item?id=" + itemID)
66 |
67 | enc := json.NewEncoder(os.Stdout)
68 | enc.SetIndent("", " ")
69 |
70 | // Dump json to the standard output
71 | enc.Encode(comments)
72 | }
73 |
--------------------------------------------------------------------------------
/_examples/instagram/instagram.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "crypto/md5"
5 | "encoding/json"
6 | "fmt"
7 | "log"
8 | "net/url"
9 | "os"
10 | "regexp"
11 | "strings"
12 |
13 | "github.com/gocolly/colly/v2"
14 | )
15 |
16 | // "id": user id, "after": end cursor
const (
	// nextPageURL is the format of the GraphQL query URL; its verbs take the
	// query hash and the (escaped) variables.
	nextPageURL string = `https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s`
	// nextPagePayload is the format of the variables object; its verbs take
	// the user id and the end cursor.
	nextPagePayload string = `{"id":"%s","first":50,"after":"%s"}`
)
19 |
var (
	// requestID is the query id used as query_hash when requesting next pages.
	requestID string
	// requestIds holds every match of queryIdPattern found in the script
	// downloaded last.
	requestIds [][]byte
	// queryIdPattern matches `queryId:"` followed by 32 characters and a
	// closing quote.
	queryIdPattern = regexp.MustCompile(`queryId:".{32}"`)
)
23 |
// pageInfo is the paging part of a media listing.
type pageInfo struct {
	// EndCursor is decoded from "end_cursor".
	EndCursor string `json:"end_cursor"`
	// NextPage is decoded from "has_next_page".
	NextPage bool `json:"has_next_page"`
}
28 |
29 | type mainPageData struct {
30 | Rhxgis string `json:"rhx_gis"`
31 | EntryData struct {
32 | ProfilePage []struct {
33 | Graphql struct {
34 | User struct {
35 | Id string `json:"id"`
36 | Media struct {
37 | Edges []struct {
38 | Node struct {
39 | ImageURL string `json:"display_url"`
40 | ThumbnailURL string `json:"thumbnail_src"`
41 | IsVideo bool `json:"is_video"`
42 | Date int `json:"date"`
43 | Dimensions struct {
44 | Width int `json:"width"`
45 | Height int `json:"height"`
46 | } `json:"dimensions"`
47 | } `json::node"`
48 | } `json:"edges"`
49 | PageInfo pageInfo `json:"page_info"`
50 | } `json:"edge_owner_to_timeline_media"`
51 | } `json:"user"`
52 | } `json:"graphql"`
53 | } `json:"ProfilePage"`
54 | } `json:"entry_data"`
55 | }
56 |
57 | type nextPageData struct {
58 | Data struct {
59 | User struct {
60 | Container struct {
61 | PageInfo pageInfo `json:"page_info"`
62 | Edges []struct {
63 | Node struct {
64 | ImageURL string `json:"display_url"`
65 | ThumbnailURL string `json:"thumbnail_src"`
66 | IsVideo bool `json:"is_video"`
67 | Date int `json:"taken_at_timestamp"`
68 | Dimensions struct {
69 | Width int `json:"width"`
70 | Height int `json:"height"`
71 | }
72 | }
73 | } `json:"edges"`
74 | } `json:"edge_owner_to_timeline_media"`
75 | }
76 | } `json:"data"`
77 | }
78 |
79 | func main() {
80 | if len(os.Args) != 2 {
81 | log.Println("Missing account name argument")
82 | os.Exit(1)
83 | }
84 |
85 | var actualUserId string
86 | instagramAccount := os.Args[1]
87 | outputDir := fmt.Sprintf("./instagram_%s/", instagramAccount)
88 |
89 | c := colly.NewCollector(
90 | //colly.CacheDir("./_instagram_cache/"),
91 | colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
92 | )
93 |
94 | c.OnRequest(func(r *colly.Request) {
95 | r.Headers.Set("X-Requested-With", "XMLHttpRequest")
96 | r.Headers.Set("Referer", "https://www.instagram.com/"+instagramAccount)
97 | if r.Ctx.Get("gis") != "" {
98 | gis := fmt.Sprintf("%s:%s", r.Ctx.Get("gis"), r.Ctx.Get("variables"))
99 | h := md5.New()
100 | h.Write([]byte(gis))
101 | gisHash := fmt.Sprintf("%x", h.Sum(nil))
102 | r.Headers.Set("X-Instagram-GIS", gisHash)
103 | }
104 | })
105 |
106 | c.OnHTML("html", func(e *colly.HTMLElement) {
107 | d := c.Clone()
108 | d.OnResponse(func(r *colly.Response) {
109 | requestIds = queryIdPattern.FindAll(r.Body, -1)
110 | requestID = string(requestIds[1][9:41])
111 | })
112 | requestIDURL := e.Request.AbsoluteURL(e.ChildAttr(`link[as="script"]`, "href"))
113 | d.Visit(requestIDURL)
114 |
115 | dat := e.ChildText("body > script:first-of-type")
116 | jsonData := dat[strings.Index(dat, "{") : len(dat)-1]
117 | data := &mainPageData{}
118 | err := json.Unmarshal([]byte(jsonData), data)
119 | if err != nil {
120 | log.Fatal(err)
121 | }
122 |
123 | log.Println("saving output to ", outputDir)
124 | os.MkdirAll(outputDir, os.ModePerm)
125 | page := data.EntryData.ProfilePage[0]
126 | actualUserId = page.Graphql.User.Id
127 | for _, obj := range page.Graphql.User.Media.Edges {
128 | // skip videos
129 | if obj.Node.IsVideo {
130 | continue
131 | }
132 | c.Visit(obj.Node.ImageURL)
133 | }
134 | nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, page.Graphql.User.Media.PageInfo.EndCursor)
135 | e.Request.Ctx.Put("variables", nextPageVars)
136 | if page.Graphql.User.Media.PageInfo.NextPage {
137 | u := fmt.Sprintf(
138 | nextPageURL,
139 | requestID,
140 | url.QueryEscape(nextPageVars),
141 | )
142 | log.Println("Next page found", u)
143 | e.Request.Ctx.Put("gis", data.Rhxgis)
144 | e.Request.Visit(u)
145 | }
146 | })
147 |
148 | c.OnError(func(r *colly.Response, e error) {
149 | log.Println("error:", e, r.Request.URL, string(r.Body))
150 | })
151 |
152 | c.OnResponse(func(r *colly.Response) {
153 | if strings.Index(r.Headers.Get("Content-Type"), "image") > -1 {
154 | r.Save(outputDir + r.FileName())
155 | return
156 | }
157 |
158 | if strings.Index(r.Headers.Get("Content-Type"), "json") == -1 {
159 | return
160 | }
161 |
162 | data := &nextPageData{}
163 | err := json.Unmarshal(r.Body, data)
164 | if err != nil {
165 | log.Fatal(err)
166 | }
167 |
168 | for _, obj := range data.Data.User.Container.Edges {
169 | // skip videos
170 | if obj.Node.IsVideo {
171 | continue
172 | }
173 | c.Visit(obj.Node.ImageURL)
174 | }
175 | if data.Data.User.Container.PageInfo.NextPage {
176 | nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, data.Data.User.Container.PageInfo.EndCursor)
177 | r.Request.Ctx.Put("variables", nextPageVars)
178 | u := fmt.Sprintf(
179 | nextPageURL,
180 | requestID,
181 | url.QueryEscape(nextPageVars),
182 | )
183 | log.Println("Next page found", u)
184 | r.Request.Visit(u)
185 | }
186 | })
187 |
188 | c.Visit("https://instagram.com/" + instagramAccount)
189 | }
190 |
--------------------------------------------------------------------------------
/_examples/local_files/html/child_page/one.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Document
8 |
9 |
10 | Child Page One
11 |
12 |
--------------------------------------------------------------------------------
/_examples/local_files/html/child_page/three.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Document
8 |
9 |
10 | Child Page Three
11 |
12 |
--------------------------------------------------------------------------------
/_examples/local_files/html/child_page/two.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Document
8 |
9 |
10 | Child Page Two
11 |
12 |
--------------------------------------------------------------------------------
/_examples/local_files/html/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Document
8 |
9 |
10 | Index.html
11 |
16 |
17 |
--------------------------------------------------------------------------------
/_examples/local_files/local_files.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "net/http"
6 | "os"
7 | "path/filepath"
8 |
9 | "github.com/gocolly/colly/v2"
10 | )
11 |
12 | func main() {
13 | dir, err := filepath.Abs(filepath.Dir(os.Args[0]))
14 | if err != nil {
15 | panic(err)
16 | }
17 |
18 | t := &http.Transport{}
19 | t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
20 |
21 | c := colly.NewCollector()
22 | c.WithTransport(t)
23 |
24 | pages := []string{}
25 |
26 | c.OnHTML("h1", func(e *colly.HTMLElement) {
27 | pages = append(pages, e.Text)
28 | })
29 |
30 | c.OnHTML("a", func(e *colly.HTMLElement) {
31 | c.Visit("file://" + dir + "/html" + e.Attr("href"))
32 | })
33 |
34 | fmt.Println("file://" + dir + "/html/index.html")
35 | c.Visit("file://" + dir + "/html/index.html")
36 | c.Wait()
37 | for i, p := range pages {
38 | fmt.Printf("%d : %s\n", i, p)
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/_examples/login/login.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 |
6 | "github.com/gocolly/colly/v2"
7 | )
8 |
9 | func main() {
10 | // create a new collector
11 | c := colly.NewCollector()
12 |
13 | // authenticate
14 | err := c.Post("http://example.com/login", map[string]string{"username": "admin", "password": "admin"})
15 | if err != nil {
16 | log.Fatal(err)
17 | }
18 |
19 | // attach callbacks after login
20 | c.OnResponse(func(r *colly.Response) {
21 | log.Println("response received", r.StatusCode)
22 | })
23 |
24 | // start scraping
25 | c.Visit("https://example.com/")
26 | }
27 |
--------------------------------------------------------------------------------
/_examples/max_depth/max_depth.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/gocolly/colly/v2"
7 | )
8 |
9 | func main() {
10 | // Instantiate default collector
11 | c := colly.NewCollector(
12 | // MaxDepth is 1, so only the links on the scraped page
13 | // is visited, and no further links are followed
14 | colly.MaxDepth(1),
15 | )
16 |
17 | // On every a element which has href attribute call callback
18 | c.OnHTML("a[href]", func(e *colly.HTMLElement) {
19 | link := e.Attr("href")
20 | // Print link
21 | fmt.Println(link)
22 | // Visit link found on page
23 | e.Request.Visit(link)
24 | })
25 |
26 | // Start scraping on https://en.wikipedia.org
27 | c.Visit("https://en.wikipedia.org/")
28 | }
29 |
--------------------------------------------------------------------------------
/_examples/multipart/asciimoo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gocolly/colly/3a490c99cf2a7493271f151949590baae6a72538/_examples/multipart/asciimoo.jpg
--------------------------------------------------------------------------------
/_examples/multipart/multipart.go:
--------------------------------------------------------------------------------
1 | package main
2 |
import (
	"fmt"
	"io"
	"net"
	"net/http"
	"os"
	"time"

	"github.com/gocolly/colly/v2"
)
12 |
// generateFormData returns the fields of the multipart form posted by this
// example: three fixed text fields and, under "file", the content of
// gocolly.jpg read from the current directory.
//
// When the image cannot be opened or read, the problem is printed and the
// "file" field carries whatever was read (nothing if the file is missing).
//
// NOTE(review): the example directory appears to ship asciimoo.jpg rather than
// gocolly.jpg - confirm which file name is intended.
func generateFormData() map[string][]byte {
	var imgData []byte

	f, err := os.Open("gocolly.jpg")
	if err != nil {
		fmt.Println("Cannot open image:", err)
	} else {
		// Close only a file that was actually opened.
		defer f.Close()
		if imgData, err = io.ReadAll(f); err != nil {
			fmt.Println("Cannot read image:", err)
		}
	}

	return map[string][]byte{
		"firstname": []byte("one"),
		"lastname":  []byte("two"),
		"email":     []byte("onetwo@example.com"),
		"file":      imgData,
	}
}
26 |
// setupServer starts an HTTP server on port 8080 that accepts multipart form
// posts on every path. It answers 200 "Success" when the form can be parsed
// and 500 "Internal Server Error" otherwise.
//
// The port is bound before setupServer returns, so a request sent right after
// the call cannot race with the server start. If the port cannot be bound the
// error is printed and no server is started. The server runs until the
// process exits.
func setupServer() {
	// Upper bound, in bytes, of the form data kept in memory while parsing.
	const maxFormMemory = 10000000

	var handler http.HandlerFunc = func(w http.ResponseWriter, r *http.Request) {
		fmt.Println("received request")
		if err := r.ParseMultipartForm(maxFormMemory); err != nil {
			fmt.Println("server: Error")
			w.WriteHeader(http.StatusInternalServerError)
			w.Write([]byte("Internal Server Error"))
			return
		}
		w.WriteHeader(http.StatusOK)
		fmt.Println("server: OK")
		w.Write([]byte("Success"))
	}

	ln, err := net.Listen("tcp", ":8080")
	if err != nil {
		fmt.Println("server: cannot listen on :8080:", err)
		return
	}

	go func() {
		// Serve only returns when the listener fails or is closed.
		if err := http.Serve(ln, handler); err != nil {
			fmt.Println("server: stopped:", err)
		}
	}()
}
44 |
45 | func main() {
46 | // Start a single route http server to post an image to.
47 | setupServer()
48 |
49 | c := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5))
50 |
51 | // On every a element which has href attribute call callback
52 | c.OnHTML("html", func(e *colly.HTMLElement) {
53 | fmt.Println(e.Text)
54 | time.Sleep(1 * time.Second)
55 | e.Request.PostMultipart("http://localhost:8080/", generateFormData())
56 | })
57 |
58 | // Before making a request print "Visiting ..."
59 | c.OnRequest(func(r *colly.Request) {
60 | fmt.Println("Posting gocolly.jpg to", r.URL.String())
61 | })
62 |
63 | // Start scraping
64 | c.PostMultipart("http://localhost:8080/", generateFormData())
65 | c.Wait()
66 | }
67 |
--------------------------------------------------------------------------------
/_examples/openedx_courses/openedx_courses.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "strings"
7 | "time"
8 |
9 | "github.com/gocolly/colly/v2"
10 | )
11 |
// DATE_FORMAT is the default date format used in openedx, written as a Go
// reference-time layout (day, abbreviated month, comma, year).
const (
	DATE_FORMAT = "02 Jan, 2006"
)
14 |
// Course stores the data scraped for one openedx course.
type Course struct {
	// Identification and naming of the course.
	CourseID, Run, Name, Number string
	// Start and end of the course.
	StartDate, EndDate *time.Time
	// URL is the path of the course's about page.
	URL string
}
25 |
26 | func main() {
27 | // Instantiate default collector
28 | c := colly.NewCollector(
29 | // Using IndonesiaX as sample
30 | colly.AllowedDomains("indonesiax.co.id", "www.indonesiax.co.id"),
31 |
32 | // Cache responses to prevent multiple download of pages
33 | // even if the collector is restarted
34 | colly.CacheDir("./cache"),
35 | )
36 |
37 | courses := make([]Course, 0, 200)
38 |
39 | // On every a element which has href attribute call callback
40 | c.OnHTML("a[href]", func(e *colly.HTMLElement) {
41 | link := e.Attr("href")
42 | if !strings.HasPrefix(link, "/courses/") {
43 | return
44 | }
45 | // start scraping the page under the link found
46 | e.Request.Visit(link)
47 | })
48 |
49 | c.OnHTML("div[class=main-container]", func(e *colly.HTMLElement) {
50 | if e.DOM.Find("section#course-info").Length() == 0 {
51 | return
52 | }
53 | title := strings.Split(e.ChildText(".course-info__title"), "\n")[0]
54 | course_id := e.ChildAttr("input[name=course_id]", "value")
55 | texts := e.ChildTexts("span[data-datetime]")
56 | start_date, _ := time.Parse(DATE_FORMAT, texts[0])
57 | end_date, _ := time.Parse(DATE_FORMAT, texts[1])
58 | var run string
59 | if len(strings.Split(course_id, "_")) > 1 {
60 | run = strings.Split(course_id, "_")[1]
61 | }
62 | course := Course{
63 | CourseID: course_id,
64 | Run: run,
65 | Name: title,
66 | Number: e.ChildText("span.course-number"),
67 | StartDate: &start_date,
68 | EndDate: &end_date,
69 | URL: fmt.Sprintf("/courses/%s/about", course_id),
70 | }
71 | courses = append(courses, course)
72 | })
73 |
74 | // Start scraping on https://openedxdomain/courses
75 | c.Visit("https://www.indonesiax.co.id/courses")
76 |
77 | // Convert results to JSON data if the scraping job has finished
78 | jsonData, err := json.MarshalIndent(courses, "", " ")
79 | if err != nil {
80 | panic(err)
81 | }
82 |
83 | // Dump json to the standard output (can be redirected to a file)
84 | fmt.Println(string(jsonData))
85 | }
86 |
--------------------------------------------------------------------------------
/_examples/parallel/parallel.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/gocolly/colly/v2"
7 | )
8 |
9 | func main() {
10 | // Instantiate default collector
11 | c := colly.NewCollector(
12 | // MaxDepth is 2, so only the links on the scraped page
13 | // and links on those pages are visited
14 | colly.MaxDepth(2),
15 | colly.Async(),
16 | )
17 |
18 | // Limit the maximum parallelism to 2
19 | // This is necessary if the goroutines are dynamically
20 | // created to control the limit of simultaneous requests.
21 | //
22 | // Parallelism can be controlled also by spawning fixed
23 | // number of go routines.
24 | c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})
25 |
26 | // On every a element which has href attribute call callback
27 | c.OnHTML("a[href]", func(e *colly.HTMLElement) {
28 | link := e.Attr("href")
29 | // Print link
30 | fmt.Println(link)
31 | // Visit link found on page on a new thread
32 | e.Request.Visit(link)
33 | })
34 |
35 | // Start scraping on https://en.wikipedia.org
36 | c.Visit("https://en.wikipedia.org/")
37 | // Wait until threads are finished
38 | c.Wait()
39 | }
40 |
--------------------------------------------------------------------------------
/_examples/proxy_switcher/proxy_switcher.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "log"
6 |
7 | "github.com/gocolly/colly/v2"
8 | "github.com/gocolly/colly/v2/proxy"
9 | )
10 |
11 | func main() {
12 | // Instantiate default collector
13 | c := colly.NewCollector(colly.AllowURLRevisit())
14 |
15 | // Rotate two socks5 proxies
16 | rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
17 | if err != nil {
18 | log.Fatal(err)
19 | }
20 | c.SetProxyFunc(rp)
21 |
22 | // Print the response
23 | c.OnResponse(func(r *colly.Response) {
24 | log.Printf("Proxy Address: %s\n", r.Request.ProxyURL)
25 | log.Printf("%s\n", bytes.Replace(r.Body, []byte("\n"), nil, -1))
26 | })
27 |
28 | // Fetch httpbin.org/ip five times
29 | for i := 0; i < 5; i++ {
30 | c.Visit("https://httpbin.org/ip")
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/_examples/queue/queue.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/gocolly/colly/v2"
7 | "github.com/gocolly/colly/v2/queue"
8 | )
9 |
10 | func main() {
11 | url := "https://httpbin.org/delay/1"
12 |
13 | // Instantiate default collector
14 | c := colly.NewCollector(colly.AllowURLRevisit())
15 |
16 | // create a request queue with 2 consumer threads
17 | q, _ := queue.New(
18 | 2, // Number of consumer threads
19 | &queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage
20 | )
21 |
22 | c.OnRequest(func(r *colly.Request) {
23 | fmt.Println("visiting", r.URL)
24 | if r.ID < 15 {
25 | r2, err := r.New("GET", fmt.Sprintf("%s?x=%v", url, r.ID), nil)
26 | if err == nil {
27 | q.AddRequest(r2)
28 | }
29 | }
30 | })
31 |
32 | for i := 0; i < 5; i++ {
33 | // Add URLs to the queue
34 | q.AddURL(fmt.Sprintf("%s?n=%d", url, i))
35 | }
36 | // Consume URLs
37 | q.Run(c)
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/_examples/random_delay/random_delay.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "time"
6 |
7 | "github.com/gocolly/colly/v2"
8 | "github.com/gocolly/colly/v2/debug"
9 | )
10 |
11 | func main() {
12 | url := "https://httpbin.org/delay/2"
13 |
14 | // Instantiate default collector
15 | c := colly.NewCollector(
16 | // Attach a debugger to the collector
17 | colly.Debugger(&debug.LogDebugger{}),
18 | colly.Async(),
19 | )
20 |
21 | // Limit the number of threads started by colly to two
22 | // when visiting links which domains' matches "*httpbin.*" glob
23 | c.Limit(&colly.LimitRule{
24 | DomainGlob: "*httpbin.*",
25 | Parallelism: 2,
26 | RandomDelay: 5 * time.Second,
27 | })
28 |
29 | // Start scraping in four threads on https://httpbin.org/delay/2
30 | for i := 0; i < 4; i++ {
31 | c.Visit(fmt.Sprintf("%s?n=%d", url, i))
32 | }
33 | // Start scraping on https://httpbin.org/delay/2
34 | c.Visit(url)
35 | // Wait until threads are finished
36 | c.Wait()
37 | }
38 |
--------------------------------------------------------------------------------
/_examples/rate_limit/rate_limit.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/gocolly/colly/v2"
7 | "github.com/gocolly/colly/v2/debug"
8 | )
9 |
10 | func main() {
11 | url := "https://httpbin.org/delay/2"
12 |
13 | // Instantiate default collector
14 | c := colly.NewCollector(
15 | // Turn on asynchronous requests
16 | colly.Async(),
17 | // Attach a debugger to the collector
18 | colly.Debugger(&debug.LogDebugger{}),
19 | )
20 |
21 | // Limit the number of threads started by colly to two
22 | // when visiting links which domains' matches "*httpbin.*" glob
23 | c.Limit(&colly.LimitRule{
24 | DomainGlob: "*httpbin.*",
25 | Parallelism: 2,
26 | //Delay: 5 * time.Second,
27 | })
28 |
29 | // Start scraping in five threads on https://httpbin.org/delay/2
30 | for i := 0; i < 5; i++ {
31 | c.Visit(fmt.Sprintf("%s?n=%d", url, i))
32 | }
33 | // Wait until threads are finished
34 | c.Wait()
35 | }
36 |
--------------------------------------------------------------------------------
/_examples/reddit/reddit.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "time"
7 |
8 | "github.com/gocolly/colly/v2"
9 | )
10 |
// item is one story scraped from a listing page.
type item struct {
	// StoryURL is the link of the story; Source is the listing it is
	// attributed to. The unexported comments field is kept as declared;
	// this example never assigns it.
	StoryURL, Source, comments string
	// CrawledAt is the time at which the story was scraped.
	CrawledAt time.Time
	// Comments is the link to the comments of the story; Title is its title.
	Comments, Title string
}
19 |
20 | func main() {
21 | stories := []item{}
22 | // Instantiate default collector
23 | c := colly.NewCollector(
24 | // Visit only domains: old.reddit.com
25 | colly.AllowedDomains("old.reddit.com"),
26 | // Parallelism
27 | colly.Async(true),
28 | )
29 |
30 | // On every a element which has .top-matter attribute call callback
31 | // This class is unique to the div that holds all information about a story
32 | c.OnHTML(".top-matter", func(e *colly.HTMLElement) {
33 | temp := item{}
34 | temp.StoryURL = e.ChildAttr("a[data-event-action=title]", "href")
35 | temp.Source = "https://old.reddit.com/r/programming/"
36 | temp.Title = e.ChildText("a[data-event-action=title]")
37 | temp.Comments = e.ChildAttr("a[data-event-action=comments]", "href")
38 | temp.CrawledAt = time.Now()
39 | stories = append(stories, temp)
40 | })
41 |
42 | // On every span tag with the class next-button
43 | c.OnHTML("span.next-button", func(h *colly.HTMLElement) {
44 | t := h.ChildAttr("a", "href")
45 | c.Visit(t)
46 | })
47 |
48 | // Set max Parallelism and introduce a Random Delay
49 | c.Limit(&colly.LimitRule{
50 | Parallelism: 2,
51 | RandomDelay: 5 * time.Second,
52 | })
53 |
54 | // Before making a request print "Visiting ..."
55 | c.OnRequest(func(r *colly.Request) {
56 | fmt.Println("Visiting", r.URL.String())
57 |
58 | })
59 |
60 | // Crawl all reddits the user passes in
61 | reddits := os.Args[1:]
62 | for _, reddit := range reddits {
63 | c.Visit(reddit)
64 |
65 | }
66 |
67 | c.Wait()
68 | fmt.Println(stories)
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/_examples/request_context/request_context.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/gocolly/colly/v2"
7 | )
8 |
9 | func main() {
10 | // Instantiate default collector
11 | c := colly.NewCollector()
12 |
13 | // Before making a request put the URL with
14 | // the key of "url" into the context of the request
15 | c.OnRequest(func(r *colly.Request) {
16 | r.Ctx.Put("url", r.URL.String())
17 | })
18 |
19 | // After making a request get "url" from
20 | // the context of the request
21 | c.OnResponse(func(r *colly.Response) {
22 | fmt.Println(r.Ctx.Get("url"))
23 | })
24 |
25 | // Start scraping on https://en.wikipedia.org
26 | c.Visit("https://en.wikipedia.org/")
27 | }
28 |
--------------------------------------------------------------------------------
/_examples/scraper_server/scraper_server.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "log"
6 | "net/http"
7 |
8 | "github.com/gocolly/colly/v2"
9 | )
10 |
// pageInfo is the result of scraping one page; handler serializes it to JSON
// and sends it back to the client.
type pageInfo struct {
	// StatusCode is the status code taken from the collector's response or
	// error callback.
	StatusCode int
	// Links counts how often each absolute link URL occurs on the page.
	Links map[string]int
}
15 |
16 | func handler(w http.ResponseWriter, r *http.Request) {
17 | URL := r.URL.Query().Get("url")
18 | if URL == "" {
19 | log.Println("missing URL argument")
20 | return
21 | }
22 | log.Println("visiting", URL)
23 |
24 | c := colly.NewCollector()
25 |
26 | p := &pageInfo{Links: make(map[string]int)}
27 |
28 | // count links
29 | c.OnHTML("a[href]", func(e *colly.HTMLElement) {
30 | link := e.Request.AbsoluteURL(e.Attr("href"))
31 | if link != "" {
32 | p.Links[link]++
33 | }
34 | })
35 |
36 | // extract status code
37 | c.OnResponse(func(r *colly.Response) {
38 | log.Println("response received", r.StatusCode)
39 | p.StatusCode = r.StatusCode
40 | })
41 | c.OnError(func(r *colly.Response, err error) {
42 | log.Println("error:", r.StatusCode, err)
43 | p.StatusCode = r.StatusCode
44 | })
45 |
46 | c.Visit(URL)
47 |
48 | // dump results
49 | b, err := json.Marshal(p)
50 | if err != nil {
51 | log.Println("failed to serialize response:", err)
52 | return
53 | }
54 | w.Header().Add("Content-Type", "application/json")
55 | w.Write(b)
56 | }
57 |
58 | func main() {
59 | // example usage: curl -s 'http://127.0.0.1:7171/?url=http://go-colly.org/'
60 | addr := ":7171"
61 |
62 | http.HandleFunc("/", handler)
63 |
64 | log.Println("listening on", addr)
65 | log.Fatal(http.ListenAndServe(addr, nil))
66 | }
67 |
--------------------------------------------------------------------------------
/_examples/shopify_sitemap/shopify_sitemap.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/gocolly/colly/v2"
7 | )
8 |
9 | func main() {
10 | // Array containing all the known URLs in a sitemap
11 | knownUrls := []string{}
12 |
13 | // Create a Collector specifically for Shopify
14 | c := colly.NewCollector(colly.AllowedDomains("www.shopify.com"))
15 |
16 | // Create a callback on the XPath query searching for the URLs
17 | c.OnXML("//urlset/url/loc", func(e *colly.XMLElement) {
18 | knownUrls = append(knownUrls, e.Text)
19 | })
20 |
21 | // Start the collector
22 | c.Visit("https://www.shopify.com/sitemap.xml")
23 |
24 | fmt.Println("All known URLs:")
25 | for _, url := range knownUrls {
26 | fmt.Println("\t", url)
27 | }
28 | fmt.Println("Collected", len(knownUrls), "URLs")
29 | }
30 |
--------------------------------------------------------------------------------
/_examples/url_filter/url_filter.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 |
7 | "github.com/gocolly/colly/v2"
8 | )
9 |
10 | func main() {
11 | // Instantiate default collector
12 | c := colly.NewCollector(
13 | // Visit only root url and urls which start with "e" or "h" on httpbin.org
14 | colly.URLFilters(
15 | regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
16 | regexp.MustCompile("http://httpbin\\.org/h.+"),
17 | ),
18 | )
19 |
20 | // On every a element which has href attribute call callback
21 | c.OnHTML("a[href]", func(e *colly.HTMLElement) {
22 | link := e.Attr("href")
23 | // Print link
24 | fmt.Printf("Link found: %q -> %s\n", e.Text, link)
25 | // Visit link found on page
26 | // Only those links are visited which are matched by any of the URLFilter regexps
27 | c.Visit(e.Request.AbsoluteURL(link))
28 | })
29 |
30 | // Before making a request print "Visiting ..."
31 | c.OnRequest(func(r *colly.Request) {
32 | fmt.Println("Visiting", r.URL.String())
33 | })
34 |
35 | // Start scraping on http://httpbin.org
36 | c.Visit("http://httpbin.org/")
37 | }
38 |
--------------------------------------------------------------------------------
/_examples/xkcd_store/xkcd_store.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/csv"
5 | "log"
6 | "os"
7 |
8 | "github.com/gocolly/colly/v2"
9 | )
10 |
11 | func main() {
12 | fName := "xkcd_store_items.csv"
13 | file, err := os.Create(fName)
14 | if err != nil {
15 | log.Fatalf("Cannot create file %q: %s\n", fName, err)
16 | return
17 | }
18 | defer file.Close()
19 | writer := csv.NewWriter(file)
20 | defer writer.Flush()
21 | // Write CSV header
22 | writer.Write([]string{"Name", "Price", "URL", "Image URL"})
23 |
24 | // Instantiate default collector
25 | c := colly.NewCollector(
26 | // Allow requests only to store.xkcd.com
27 | colly.AllowedDomains("store.xkcd.com"),
28 | )
29 |
30 | // Extract product details
31 | c.OnHTML(".product-grid-item", func(e *colly.HTMLElement) {
32 | writer.Write([]string{
33 | e.ChildAttr("a", "title"),
34 | e.ChildText("span"),
35 | e.Request.AbsoluteURL(e.ChildAttr("a", "href")),
36 | "https:" + e.ChildAttr("img", "src"),
37 | })
38 | })
39 |
40 | // Find and visit next page links
41 | c.OnHTML(`.next a[href]`, func(e *colly.HTMLElement) {
42 | e.Request.Visit(e.Attr("href"))
43 | })
44 |
45 | c.Visit("https://store.xkcd.com/collections/everything")
46 |
47 | log.Printf("Scraping finished, check file %q for results\n", fName)
48 |
49 | // Display collector's statistics
50 | log.Println(c)
51 | }
52 |
--------------------------------------------------------------------------------
/assets/scrapfly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gocolly/colly/3a490c99cf2a7493271f151949590baae6a72538/assets/scrapfly.png
--------------------------------------------------------------------------------
/cmd/colly/colly.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package main
16 |
17 | import (
18 | "bytes"
19 | "fmt"
20 | "log"
21 | "os"
22 | "strings"
23 |
24 | "github.com/jawher/mow.cli"
25 | )
26 |
// scraperHeadTemplate is the beginning of every generated scraper: package
// clause, imports and the start of main with a new collector.
var scraperHeadTemplate = `package main

import (
"log"

"github.com/gocolly/colly/v2"
)

func main() {
c := colly.NewCollector()
`

// scraperEndTemplate finishes the generated main with a placeholder Visit call.
var scraperEndTemplate = `
c.Visit("https://yourdomain.com/")
}
`

// htmlCallbackTemplate is emitted for the "html" callback name.
var htmlCallbackTemplate = `
c.OnHTML("element-selector", func(e *colly.HTMLElement) {
log.Println(e.Text)
})
`

// requestCallbackTemplate is emitted for the "request" callback name.
var requestCallbackTemplate = `
c.OnRequest(func(r *colly.Request) {
log.Println("Visiting", r.URL)
})
`

// responseCallbackTemplate is emitted for the "response" callback name.
var responseCallbackTemplate = `
c.OnResponse(func(r *colly.Response) {
log.Println("Visited", r.Request.URL, r.StatusCode)
})
`

// errorCallbackTemplate is emitted for the "error" callback name.
var errorCallbackTemplate = `
c.OnError(func(r *colly.Response, err error) {
log.Printf("Error on %s: %s", r.Request.URL, err)
})
`
67 |
68 | func main() {
69 | app := cli.App("colly", "Scraping Framework for Gophers")
70 |
71 | app.Command("new", "Create new scraper", func(cmd *cli.Cmd) {
72 | var (
73 | callbacks = cmd.StringOpt("callbacks", "", "Add callbacks to the template. (E.g. '--callbacks=html,response,error')")
74 | hosts = cmd.StringOpt("hosts", "", "Specify scraper's allowed hosts. (e.g. '--hosts=xy.com,abcd.com')")
75 | path = cmd.StringArg("PATH", "", "Path of the new scraper")
76 | )
77 |
78 | cmd.Spec = "[--callbacks] [--hosts] [PATH]"
79 |
80 | cmd.Action = func() {
81 | scraper := bytes.NewBufferString(scraperHeadTemplate)
82 | outfile := os.Stdout
83 | if *path != "" {
84 | var err error
85 | outfile, err = os.Create(*path)
86 | if err != nil {
87 | log.Fatal(err)
88 | }
89 | defer outfile.Close()
90 | }
91 | if *hosts != "" {
92 | scraper.WriteString("\n c.AllowedDomains = []string{")
93 | for i, h := range strings.Split(*hosts, ",") {
94 | if i > 0 {
95 | scraper.WriteString(", ")
96 | }
97 | scraper.WriteString(fmt.Sprintf("%q", h))
98 | }
99 | scraper.WriteString("}\n")
100 | }
101 | if len(*callbacks) > 0 {
102 | for _, c := range strings.Split(*callbacks, ",") {
103 | switch c {
104 | case "html":
105 | scraper.WriteString(htmlCallbackTemplate)
106 | case "request":
107 | scraper.WriteString(requestCallbackTemplate)
108 | case "response":
109 | scraper.WriteString(responseCallbackTemplate)
110 | case "error":
111 | scraper.WriteString(errorCallbackTemplate)
112 | }
113 | }
114 | }
115 | scraper.WriteString(scraperEndTemplate)
116 | outfile.Write(scraper.Bytes())
117 | }
118 | })
119 |
120 | app.Run(os.Args)
121 | }
122 |
--------------------------------------------------------------------------------
/context.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "sync"
19 | )
20 |
// Context provides a tiny layer for passing data between callbacks.
// All accessors are guarded by a read/write lock. Use NewContext to create
// one: the zero value has neither a map nor a lock.
type Context struct {
	contextMap map[string]interface{}
	lock       *sync.RWMutex
}

// NewContext initializes a new Context instance
func NewContext() *Context {
	return &Context{
		contextMap: make(map[string]interface{}),
		lock:       &sync.RWMutex{},
	}
}

// UnmarshalBinary decodes Context value to nil
// This function is used by request caching
func (c *Context) UnmarshalBinary(_ []byte) error {
	return nil
}

// MarshalBinary encodes Context value
// This function is used by request caching
func (c *Context) MarshalBinary() (_ []byte, _ error) {
	return nil, nil
}

// Put stores a value of any type in Context
func (c *Context) Put(key string, value interface{}) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.contextMap[key] = value
}

// Get retrieves a string value from Context.
// Get returns an empty string if the key is not found or if the stored value
// is not a string; use GetAny to read values of other types.
func (c *Context) Get(key string) string {
	c.lock.RLock()
	defer c.lock.RUnlock()
	// The comma-ok assertion yields "" for a missing key (nil interface) and
	// for a non-string value, where an unchecked assertion would panic.
	s, _ := c.contextMap[key].(string)
	return s
}

// GetAny retrieves a value from Context.
// GetAny returns nil if key not found
func (c *Context) GetAny(key string) interface{} {
	c.lock.RLock()
	defer c.lock.RUnlock()
	// Indexing a missing key yields the nil interface value.
	return c.contextMap[key]
}

// ForEach calls fn once for every key/value pair stored in the Context and
// returns the collected results. The iteration order is that of a Go map and
// therefore unspecified.
//
// The read lock is held while fn runs, so fn must not call Put on the same
// Context.
func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{} {
	c.lock.RLock()
	defer c.lock.RUnlock()

	ret := make([]interface{}, 0, len(c.contextMap))
	for k, v := range c.contextMap {
		ret = append(ret, fn(k, v))
	}

	return ret
}
88 |
--------------------------------------------------------------------------------
/context_test.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "strconv"
19 | "testing"
20 | )
21 |
22 | func TestContextIteration(t *testing.T) {
23 | ctx := NewContext()
24 | for i := 0; i < 10; i++ {
25 | ctx.Put(strconv.Itoa(i), i)
26 | }
27 | values := ctx.ForEach(func(k string, v interface{}) interface{} {
28 | return v.(int)
29 | })
30 | if len(values) != 10 {
31 | t.Fatal("fail to iterate context")
32 | }
33 | for _, i := range values {
34 | v := i.(int)
35 | if v != ctx.GetAny(strconv.Itoa(v)).(int) {
36 | t.Fatal("value not equal")
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/debug/debug.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package debug
16 |
// Event represents an action inside a collector. Events are handed to a
// Debugger through its Event method.
type Event struct {
	// Type is the type of the event, e.g. "request", "response" or "error".
	Type string
	// RequestID identifies the HTTP request of the Event
	RequestID uint32
	// CollectorID identifies the collector of the Event
	CollectorID uint32
	// Values contains the event's key-value pairs. Different types of events
	// can carry different key-value pairs.
	Values map[string]string
}
29 |
// Debugger is the interface implemented by the different debugging backends.
type Debugger interface {
	// Init initializes the backend and reports whether that succeeded.
	Init() error
	// Event receives a new collector event.
	Event(e *Event)
}
37 |
--------------------------------------------------------------------------------
/debug/logdebugger.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package debug
16 |
17 | import (
18 | "io"
19 | "log"
20 | "os"
21 | "sync/atomic"
22 | "time"
23 | )
24 |
// LogDebugger is the simplest debugger which prints log messages to the STDERR
type LogDebugger struct {
	// Output is the log destination; anything that implements the io.Writer
	// interface can be used. Leave it blank to use STDERR
	Output io.Writer
	// Prefix appears at the beginning of each generated log line
	Prefix string
	// Flag defines the logging properties.
	Flag int
	// logger is created by Init from Output, Prefix and Flag.
	logger *log.Logger
	// counter numbers the received events; it is incremented atomically.
	counter int32
	// start is the time Init was called; events are logged with the time
	// elapsed since then.
	start time.Time
}
38 |
39 | // Init initializes the LogDebugger
40 | func (l *LogDebugger) Init() error {
41 | l.counter = 0
42 | l.start = time.Now()
43 | if l.Output == nil {
44 | l.Output = os.Stderr
45 | }
46 | l.logger = log.New(l.Output, l.Prefix, l.Flag)
47 | return nil
48 | }
49 |
50 | // Event receives Collector events and prints them to STDERR
51 | func (l *LogDebugger) Event(e *Event) {
52 | i := atomic.AddInt32(&l.counter, 1)
53 | l.logger.Printf("[%06d] %d [%6d - %s] %q (%s)\n", i, e.CollectorID, e.RequestID, e.Type, e.Values, time.Since(l.start))
54 | }
55 |
--------------------------------------------------------------------------------
/debug/webdebugger.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package debug
16 |
17 | import (
18 | "encoding/json"
19 | "log"
20 | "net/http"
21 | "sync"
22 | "time"
23 | )
24 |
// WebDebugger is a web based debugging frontend for colly
type WebDebugger struct {
	// Address is the address of the web server. It is 127.0.0.1:7676 by default.
	Address string
	// initialized is set by Init so that repeated calls do nothing.
	initialized bool
	// CurrentRequests holds the requests that have started but not yet
	// finished, keyed by request ID.
	CurrentRequests map[uint32]requestInfo
	// RequestLog lists the finished requests in the order they completed.
	RequestLog []requestInfo
	// The embedded mutex guards CurrentRequests and RequestLog.
	sync.Mutex
}
34 |
// requestInfo describes one request tracked by the WebDebugger.
type requestInfo struct {
	// URL is the "url" value of the request event.
	URL string
	// Started is the time the request event was received.
	Started time.Time
	// Duration is the time between the request event and the matching
	// response or error event.
	Duration time.Duration
	// ResponseStatus is the "status" value of the response or error event.
	ResponseStatus string
	// ID is the request ID of the event.
	ID uint32
	// CollectorID is the ID of the collector that issued the request.
	CollectorID uint32
}
43 |
44 | // Init initializes the WebDebugger
45 | func (w *WebDebugger) Init() error {
46 | if w.initialized {
47 | return nil
48 | }
49 | defer func() {
50 | w.initialized = true
51 | }()
52 | if w.Address == "" {
53 | w.Address = "127.0.0.1:7676"
54 | }
55 | w.RequestLog = make([]requestInfo, 0)
56 | w.CurrentRequests = make(map[uint32]requestInfo)
57 | http.HandleFunc("/", w.indexHandler)
58 | http.HandleFunc("/status", w.statusHandler)
59 | log.Println("Starting debug webserver on", w.Address)
60 | go http.ListenAndServe(w.Address, nil)
61 | return nil
62 | }
63 |
64 | // Event updates the debugger's status
65 | func (w *WebDebugger) Event(e *Event) {
66 | w.Lock()
67 | defer w.Unlock()
68 |
69 | switch e.Type {
70 | case "request":
71 | w.CurrentRequests[e.RequestID] = requestInfo{
72 | URL: e.Values["url"],
73 | Started: time.Now(),
74 | ID: e.RequestID,
75 | CollectorID: e.CollectorID,
76 | }
77 | case "response", "error":
78 | r := w.CurrentRequests[e.RequestID]
79 | r.Duration = time.Since(r.Started)
80 | r.ResponseStatus = e.Values["status"]
81 | w.RequestLog = append(w.RequestLog, r)
82 | delete(w.CurrentRequests, e.RequestID)
83 | }
84 | }
85 |
86 | func (w *WebDebugger) indexHandler(wr http.ResponseWriter, r *http.Request) {
87 | wr.Write([]byte(`
88 |
89 |
90 | Colly Debugger WebUI
91 |
92 |
93 |
94 |
95 |
100 |
101 |
102 |
103 |
Current Requests
104 |
105 |
106 |
107 |
Finished Requests
108 |
109 |
110 |
111 |
112 |
140 |
141 |
142 | `))
143 | }
144 |
145 | func (w *WebDebugger) statusHandler(wr http.ResponseWriter, r *http.Request) {
146 | w.Lock()
147 | jsonData, err := json.MarshalIndent(w, "", " ")
148 | w.Unlock()
149 | if err != nil {
150 | panic(err)
151 | }
152 | wr.Write(jsonData)
153 | }
154 |
--------------------------------------------------------------------------------
/extensions/extensions.go:
--------------------------------------------------------------------------------
1 | // Package extensions implements various helper addons for Colly
2 | package extensions
3 |
--------------------------------------------------------------------------------
/extensions/random_user_agent.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
3 | import (
4 | "fmt"
5 | "math/rand"
6 | "strings"
7 |
8 | "github.com/gocolly/colly/v2"
9 | )
10 |
11 | var uaGens = []func() string{
12 | genFirefoxUA,
13 | genChromeUA,
14 | genEdgeUA,
15 | genOperaUA,
16 | }
17 |
18 | var uaGensMobile = []func() string{
19 | genMobilePixel7UA,
20 | genMobilePixel6UA,
21 | genMobilePixel5UA,
22 | genMobilePixel4UA,
23 | genMobileNexus10UA,
24 | }
25 |
26 | // RandomUserAgent generates a random DESKTOP browser user-agent on every requests
27 | func RandomUserAgent(c *colly.Collector) {
28 | c.OnRequest(func(r *colly.Request) {
29 | r.Headers.Set("User-Agent", uaGens[rand.Intn(len(uaGens))]())
30 | })
31 | }
32 |
33 | // RandomMobileUserAgent generates a random MOBILE browser user-agent on every requests
34 | func RandomMobileUserAgent(c *colly.Collector) {
35 | c.OnRequest(func(r *colly.Request) {
36 | r.Headers.Set("User-Agent", uaGensMobile[rand.Intn(len(uaGensMobile))]())
37 | })
38 | }
39 |
// ffVersions lists Firefox version numbers.
//
// NOTE: Only versions released after Jun 1, 2022 are listed.
// Data source: https://en.wikipedia.org/wiki/Firefox_version_history
var ffVersions = []float32{
	// 2022
	102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0,

	// 2023
	109.0, 110.0, 111.0, 112.0, 113.0,
}
60 |
// chromeVersions lists Chrome stable-channel version strings, grouped by the
// release note that announced them.
//
// NOTE: Only versions released after Jun 1, 2022 are listed.
// Data source: https://chromereleases.googleblog.com/search/label/Stable%20updates
var chromeVersions = []string{
	// https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop.html
	"102.0.5005.115",
	// https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_21.html
	"103.0.5060.53",
	// https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_27.html
	"103.0.5060.66",
	// https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop.html
	"103.0.5060.114",
	// https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop_19.html
	"103.0.5060.134",
	// https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop.html
	"104.0.5112.79", "104.0.5112.80", "104.0.5112.81",
	// https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_16.html
	"104.0.5112.101", "104.0.5112.102",
	// https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_30.html
	"105.0.5195.52", "105.0.5195.53", "105.0.5195.54",
	// https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop.html
	"105.0.5195.102",
	// https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_14.html
	"105.0.5195.125", "105.0.5195.126", "105.0.5195.127",
	// https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_27.html
	"106.0.5249.61", "106.0.5249.62",
	// https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_30.html
	"106.0.5249.91",
	// https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop.html
	"106.0.5249.103",
	// https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_11.html
	"106.0.5249.119",
	// https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_25.html
	"107.0.5304.62", "107.0.5304.63", "107.0.5304.68",
	// https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_27.html
	"107.0.5304.87", "107.0.5304.88",
	// https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop.html
	"107.0.5304.106", "107.0.5304.107", "107.0.5304.110",
	// https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_24.html
	"107.0.5304.121", "107.0.5304.122",
	// https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_29.html
	"108.0.5359.71", "108.0.5359.72",
	// https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop.html
	"108.0.5359.94", "108.0.5359.95",
	// https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_7.html
	"108.0.5359.98", "108.0.5359.99",
	// https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_13.html
	"108.0.5359.124", "108.0.5359.125",
	// https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop.html
	"109.0.5414.74", "109.0.5414.75", "109.0.5414.87",
	// https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop_24.html
	"109.0.5414.119", "109.0.5414.120",
	// https://chromereleases.googleblog.com/2023/02/stable-channel-update-for-desktop.html
	"110.0.5481.77", "110.0.5481.78",
	// https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update.html
	"110.0.5481.96", "110.0.5481.97",
	// https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_14.html
	"110.0.5481.100",
	// https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_16.html
	"110.0.5481.104",
	// https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_22.html
	"110.0.5481.177", "110.0.5481.178",
	// https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_97.html
	"109.0.5414.129",
	// https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop.html
	"111.0.5563.64", "111.0.5563.65",
	// https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_21.html
	"111.0.5563.110", "111.0.5563.111",
	// https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_27.html
	"111.0.5563.146", "111.0.5563.147",
	// https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop.html
	"112.0.5615.49", "112.0.5615.50",
	// https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_12.html
	"112.0.5615.86", "112.0.5615.87",
	// https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_14.html
	"112.0.5615.121",
	// https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_18.html
	"112.0.5615.137", "112.0.5615.138", "112.0.5615.165",
	// https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop.html
	"113.0.5672.63", "113.0.5672.64",
	// https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop_8.html
	"113.0.5672.92", "113.0.5672.93",
}
215 |
// edgeVersions lists Edge releases. Each entry holds two comma-separated
// version strings (presumably the Chrome version followed by the Edge
// version - confirm against the generator that consumes them).
//
// NOTE: Only versions released after Jun 1, 2022 are listed.
// Data source: https://learn.microsoft.com/en-us/deployedge/microsoft-edge-release-schedule
var edgeVersions = []string{
	// 2022
	"103.0.0.0,103.0.1264.37", "104.0.0.0,104.0.1293.47",
	"105.0.0.0,105.0.1343.25", "106.0.0.0,106.0.1370.34",
	"107.0.0.0,107.0.1418.24", "108.0.0.0,108.0.1462.42",

	// 2023
	"109.0.0.0,109.0.1518.49", "110.0.0.0,110.0.1587.41",
	"111.0.0.0,111.0.1661.41", "112.0.0.0,112.0.1722.34",
	"113.0.0.0,113.0.1774.3",
}
235 |
// operaVersions lists Opera releases. Each entry holds two comma-separated
// version strings (presumably the Chromium version followed by the Opera
// version - confirm against the generator that consumes them).
//
// NOTE: Only versions released after Jan 1, 2023 are listed.
// Data source: https://blogs.opera.com/desktop/
var operaVersions = []string{
	// https://blogs.opera.com/desktop/changelog-for-96/
	"110.0.5449.0,96.0.4640.0", "110.0.5464.2,96.0.4653.0",
	"110.0.5464.2,96.0.4660.0", "110.0.5481.30,96.0.4674.0",
	"110.0.5481.30,96.0.4691.0", "110.0.5481.30,96.0.4693.12",
	"110.0.5481.77,96.0.4693.16", "110.0.5481.100,96.0.4693.20",
	"110.0.5481.178,96.0.4693.31", "110.0.5481.178,96.0.4693.50",
	"110.0.5481.192,96.0.4693.80",

	// https://blogs.opera.com/desktop/changelog-for-97/
	"111.0.5532.2,97.0.4711.0", "111.0.5532.2,97.0.4704.0",
	"111.0.5532.2,97.0.4697.0", "111.0.5562.0,97.0.4718.0",
	"111.0.5563.19,97.0.4719.4", "111.0.5563.19,97.0.4719.11",
	"111.0.5563.41,97.0.4719.17", "111.0.5563.65,97.0.4719.26",
	"111.0.5563.65,97.0.4719.28", "111.0.5563.111,97.0.4719.43",
	"111.0.5563.147,97.0.4719.63", "111.0.5563.147,97.0.4719.83",

	// https://blogs.opera.com/desktop/changelog-for-98/
	"112.0.5596.2,98.0.4756.0", "112.0.5596.2,98.0.4746.0",
	"112.0.5615.20,98.0.4759.1", "112.0.5615.50,98.0.4759.3",
	"112.0.5615.87,98.0.4759.6", "112.0.5615.165,98.0.4759.15",
	"112.0.5615.165,98.0.4759.21", "112.0.5615.165,98.0.4759.39",
}
277 |
// pixel7AndroidVersions holds the Android version strings that
// genMobilePixel7UA picks from at random for the "Android <version>" token.
var pixel7AndroidVersions = []string{
	// Data source:
	// - https://developer.android.com/about/versions
	// - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
	"13",
}
284 |
// pixel6AndroidVersions holds the Android version strings that
// genMobilePixel6UA picks from at random for the "Android <version>" token.
var pixel6AndroidVersions = []string{
	// Data source:
	// - https://developer.android.com/about/versions
	// - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
	"12",
	"13",
}
292 |
// pixel5AndroidVersions holds the Android version strings that
// genMobilePixel5UA picks from at random for the "Android <version>" token.
var pixel5AndroidVersions = []string{
	// Data source:
	// - https://developer.android.com/about/versions
	// - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
	"11",
	"12",
	"13",
}
301 |
// pixel4AndroidVersions holds the Android version strings that
// genMobilePixel4UA picks from at random for the "Android <version>" token.
var pixel4AndroidVersions = []string{
	// Data source:
	// - https://developer.android.com/about/versions
	// - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
	"10",
	"11",
	"12",
	"13",
}
311 |
// nexus10AndroidVersions holds the Android version strings that
// genMobileNexus10UA picks from at random for the "Android <version>" token.
// The version is drawn independently of the build ID taken from nexus10Builds.
var nexus10AndroidVersions = []string{
	// Data source:
	// - https://developer.android.com/about/versions
	// - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
	"4.4.2",
	"4.4.4",
	"5.0",
	"5.0.1",
	"5.0.2",
	"5.1",
	"5.1.1",
}
324 |
// nexus10Builds holds the Android build IDs that genMobileNexus10UA picks from
// at random for the "Build/<id>" token. Entries are inserted into the
// User-Agent verbatim, so they must not contain surrounding whitespace.
var nexus10Builds = []string{
	// Data source: https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds

	"LMY49M", // android-5.1.1_r38 (Lollipop)
	"LMY49J", // android-5.1.1_r37 (Lollipop)
	"LMY49I", // android-5.1.1_r36 (Lollipop)
	"LMY49H", // android-5.1.1_r35 (Lollipop)
	"LMY49G", // android-5.1.1_r34 (Lollipop)
	"LMY49F", // android-5.1.1_r33 (Lollipop)
	"LMY48Z", // android-5.1.1_r30 (Lollipop)
	"LMY48X", // android-5.1.1_r25 (Lollipop)
	"LMY48T", // android-5.1.1_r19 (Lollipop)
	"LMY48M", // android-5.1.1_r14 (Lollipop)
	"LMY48I", // android-5.1.1_r9 (Lollipop)
	"LMY47V", // android-5.1.1_r1 (Lollipop)
	"LMY47D", // android-5.1.0_r1 (Lollipop)
	"LRX22G", // android-5.0.2_r1 (Lollipop)
	"LRX22C", // android-5.0.1_r1 (Lollipop)
	"LRX21P", // android-5.0.0_r4.0.1 (Lollipop)
	"KTU84P", // android-4.4.4_r1 (KitKat)
	"KTU84L", // android-4.4.3_r1 (KitKat)
	"KOT49H", // android-4.4.2_r1 (KitKat)
	"KOT49E", // android-4.4.1_r1 (KitKat)
	"KRT16S", // android-4.4_r1.2 (KitKat)
	"JWR66Y", // android-4.3_r1.1 (Jelly Bean)
	"JWR66V", // android-4.3_r1 (Jelly Bean)
	"JWR66N", // android-4.3_r0.9.1 (Jelly Bean)
	"JDQ39",  // android-4.2.2_r1 (Jelly Bean)
	"JOP40F", // android-4.2.1_r1.1 (Jelly Bean)
	"JOP40D", // android-4.2.1_r1 (Jelly Bean)
	"JOP40C", // android-4.2_r1 (Jelly Bean)
}
357 |
// osStrings holds the desktop platform tokens that the desktop User-Agent
// generators (Firefox, Chrome, Edge, Opera) pick from at random and place
// inside the parenthesised section right after "Mozilla/5.0".
var osStrings = []string{
	// MacOS - High Sierra
	"Macintosh; Intel Mac OS X 10_13",
	"Macintosh; Intel Mac OS X 10_13_1",
	"Macintosh; Intel Mac OS X 10_13_2",
	"Macintosh; Intel Mac OS X 10_13_3",
	"Macintosh; Intel Mac OS X 10_13_4",
	"Macintosh; Intel Mac OS X 10_13_5",
	"Macintosh; Intel Mac OS X 10_13_6",

	// MacOS - Mojave
	"Macintosh; Intel Mac OS X 10_14",
	"Macintosh; Intel Mac OS X 10_14_1",
	"Macintosh; Intel Mac OS X 10_14_2",
	"Macintosh; Intel Mac OS X 10_14_3",
	"Macintosh; Intel Mac OS X 10_14_4",
	"Macintosh; Intel Mac OS X 10_14_5",
	"Macintosh; Intel Mac OS X 10_14_6",

	// MacOS - Catalina
	"Macintosh; Intel Mac OS X 10_15",
	"Macintosh; Intel Mac OS X 10_15_1",
	"Macintosh; Intel Mac OS X 10_15_2",
	"Macintosh; Intel Mac OS X 10_15_3",
	"Macintosh; Intel Mac OS X 10_15_4",
	"Macintosh; Intel Mac OS X 10_15_5",
	"Macintosh; Intel Mac OS X 10_15_6",
	"Macintosh; Intel Mac OS X 10_15_7",

	// MacOS - Big Sur
	"Macintosh; Intel Mac OS X 11_0",
	"Macintosh; Intel Mac OS X 11_0_1",
	"Macintosh; Intel Mac OS X 11_1",
	"Macintosh; Intel Mac OS X 11_2",
	"Macintosh; Intel Mac OS X 11_2_1",
	"Macintosh; Intel Mac OS X 11_2_2",
	"Macintosh; Intel Mac OS X 11_2_3",
	"Macintosh; Intel Mac OS X 11_3",
	"Macintosh; Intel Mac OS X 11_3_1",
	"Macintosh; Intel Mac OS X 11_4",
	"Macintosh; Intel Mac OS X 11_5",
	"Macintosh; Intel Mac OS X 11_5_1",
	"Macintosh; Intel Mac OS X 11_5_2",
	"Macintosh; Intel Mac OS X 11_6",
	"Macintosh; Intel Mac OS X 11_6_1",
	"Macintosh; Intel Mac OS X 11_6_2",
	"Macintosh; Intel Mac OS X 11_6_3",
	"Macintosh; Intel Mac OS X 11_6_4",
	"Macintosh; Intel Mac OS X 11_6_5",
	"Macintosh; Intel Mac OS X 11_6_6",
	"Macintosh; Intel Mac OS X 11_6_7",
	"Macintosh; Intel Mac OS X 11_6_8",
	"Macintosh; Intel Mac OS X 11_7",
	"Macintosh; Intel Mac OS X 11_7_1",
	"Macintosh; Intel Mac OS X 11_7_2",
	"Macintosh; Intel Mac OS X 11_7_3",
	"Macintosh; Intel Mac OS X 11_7_4",
	"Macintosh; Intel Mac OS X 11_7_5",
	"Macintosh; Intel Mac OS X 11_7_6",

	// MacOS - Monterey
	"Macintosh; Intel Mac OS X 12_0",
	"Macintosh; Intel Mac OS X 12_0_1",
	"Macintosh; Intel Mac OS X 12_1",
	"Macintosh; Intel Mac OS X 12_2",
	"Macintosh; Intel Mac OS X 12_2_1",
	"Macintosh; Intel Mac OS X 12_3",
	"Macintosh; Intel Mac OS X 12_3_1",
	"Macintosh; Intel Mac OS X 12_4",
	"Macintosh; Intel Mac OS X 12_5",
	"Macintosh; Intel Mac OS X 12_5_1",
	"Macintosh; Intel Mac OS X 12_6",
	"Macintosh; Intel Mac OS X 12_6_1",
	"Macintosh; Intel Mac OS X 12_6_2",
	"Macintosh; Intel Mac OS X 12_6_3",
	"Macintosh; Intel Mac OS X 12_6_4",
	"Macintosh; Intel Mac OS X 12_6_5",

	// MacOS - Ventura
	"Macintosh; Intel Mac OS X 13_0",
	"Macintosh; Intel Mac OS X 13_0_1",
	"Macintosh; Intel Mac OS X 13_1",
	"Macintosh; Intel Mac OS X 13_2",
	"Macintosh; Intel Mac OS X 13_2_1",
	"Macintosh; Intel Mac OS X 13_3",
	"Macintosh; Intel Mac OS X 13_3_1",

	// Windows
	"Windows NT 10.0; Win64; x64",
	"Windows NT 5.1",
	"Windows NT 6.1; WOW64",
	"Windows NT 6.1; Win64; x64",

	// Linux
	"X11; Linux x86_64",
}
454 |
455 | // Generates Firefox Browser User-Agent (Desktop)
456 | //
457 | // -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0"
458 | func genFirefoxUA() string {
459 | version := ffVersions[rand.Intn(len(ffVersions))]
460 | os := osStrings[rand.Intn(len(osStrings))]
461 | return fmt.Sprintf("Mozilla/5.0 (%s; rv:%.1f) Gecko/20100101 Firefox/%.1f", os, version, version)
462 | }
463 |
464 | // Generates Chrome Browser User-Agent (Desktop)
465 | //
466 | // -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
467 | func genChromeUA() string {
468 | version := chromeVersions[rand.Intn(len(chromeVersions))]
469 | os := osStrings[rand.Intn(len(osStrings))]
470 | return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", os, version)
471 | }
472 |
473 | // Generates Microsoft Edge User-Agent (Desktop)
474 | //
475 | // -> "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.39"
476 | func genEdgeUA() string {
477 | version := edgeVersions[rand.Intn(len(edgeVersions))]
478 | chromeVersion := strings.Split(version, ",")[0]
479 | edgeVersion := strings.Split(version, ",")[1]
480 | os := osStrings[rand.Intn(len(osStrings))]
481 | return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 Edg/%s", os, chromeVersion, edgeVersion)
482 | }
483 |
484 | // Generates Opera Browser User-Agent (Desktop)
485 | //
486 | // -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 OPR/98.0.4759.3"
487 | func genOperaUA() string {
488 | version := operaVersions[rand.Intn(len(operaVersions))]
489 | chromeVersion := strings.Split(version, ",")[0]
490 | operaVersion := strings.Split(version, ",")[1]
491 | os := osStrings[rand.Intn(len(osStrings))]
492 | return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 OPR/%s", os, chromeVersion, operaVersion)
493 | }
494 |
495 | // Generates Pixel 7 Browser User-Agent (Mobile)
496 | //
497 | // -> Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36
498 | func genMobilePixel7UA() string {
499 | android := pixel7AndroidVersions[rand.Intn(len(pixel7AndroidVersions))]
500 | chrome := chromeVersions[rand.Intn(len(chromeVersions))]
501 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome)
502 | }
503 |
504 | // Generates Pixel 6 Browser User-Agent (Mobile)
505 | //
506 | // -> "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36"
507 | func genMobilePixel6UA() string {
508 | android := pixel6AndroidVersions[rand.Intn(len(pixel6AndroidVersions))]
509 | chrome := chromeVersions[rand.Intn(len(chromeVersions))]
510 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome)
511 | }
512 |
513 | // Generates Pixel 5 Browser User-Agent (Mobile)
514 | //
515 | // -> "Mozilla/5.0 (Linux; Android 13; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36"
516 | func genMobilePixel5UA() string {
517 | android := pixel5AndroidVersions[rand.Intn(len(pixel5AndroidVersions))]
518 | chrome := chromeVersions[rand.Intn(len(chromeVersions))]
519 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome)
520 | }
521 |
522 | // Generates Pixel 4 Browser User-Agent (Mobile)
523 | //
524 | // -> "Mozilla/5.0 (Linux; Android 13; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36"
525 | func genMobilePixel4UA() string {
526 | android := pixel4AndroidVersions[rand.Intn(len(pixel4AndroidVersions))]
527 | chrome := chromeVersions[rand.Intn(len(chromeVersions))]
528 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome)
529 | }
530 |
531 | // Generates Nexus 10 Browser User-Agent (Mobile)
532 | //
533 | // -> "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 10 Build/LMY48T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.91 Safari/537.36"
534 | func genMobileNexus10UA() string {
535 | build := nexus10Builds[rand.Intn(len(nexus10Builds))]
536 | android := nexus10AndroidVersions[rand.Intn(len(nexus10AndroidVersions))]
537 | chrome := chromeVersions[rand.Intn(len(chromeVersions))]
538 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Nexus 10 Build/%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, build, chrome)
539 | }
540 |
--------------------------------------------------------------------------------
/extensions/referer.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
3 | import (
4 | "github.com/gocolly/colly/v2"
5 | )
6 |
7 | // Referer sets valid Referer HTTP header to requests.
8 | // Warning: this extension works only if you use Request.Visit
9 | // from callbacks instead of Collector.Visit.
10 | func Referer(c *colly.Collector) {
11 | c.OnResponse(func(r *colly.Response) {
12 | r.Ctx.Put("_referer", r.Request.URL.String())
13 | })
14 | c.OnRequest(func(r *colly.Request) {
15 | if ref := r.Ctx.Get("_referer"); ref != "" {
16 | r.Headers.Set("Referer", ref)
17 | }
18 | })
19 | }
20 |
--------------------------------------------------------------------------------
/extensions/url_length_filter.go:
--------------------------------------------------------------------------------
1 | package extensions
2 |
3 | import (
4 | "github.com/gocolly/colly/v2"
5 | )
6 |
7 | // URLLengthFilter filters out requests with URLs longer than URLLengthLimit
8 | func URLLengthFilter(c *colly.Collector, URLLengthLimit int) {
9 | c.OnRequest(func(r *colly.Request) {
10 | if len(r.URL.String()) > URLLengthLimit {
11 | r.Abort()
12 | }
13 | })
14 | }
15 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/gocolly/colly/v2
2 |
3 | go 1.23.0
4 |
5 | toolchain go1.24.1
6 |
7 | require (
8 | github.com/PuerkitoBio/goquery v1.10.2
9 | github.com/antchfx/htmlquery v1.3.4
10 | github.com/antchfx/xmlquery v1.4.4
11 | github.com/gobwas/glob v0.2.3
12 | github.com/gocolly/colly v1.2.0
13 | github.com/jawher/mow.cli v1.1.0
14 | github.com/kennygrant/sanitize v1.2.4
15 | github.com/nlnwa/whatwg-url v0.6.1
16 | github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d
17 | github.com/temoto/robotstxt v1.1.2
18 | golang.org/x/net v0.37.0
19 | google.golang.org/appengine v1.6.8
20 | )
21 |
22 | require (
23 | github.com/andybalholm/cascadia v1.3.3 // indirect
24 | github.com/antchfx/xpath v1.3.3 // indirect
25 | github.com/bits-and-blooms/bitset v1.22.0 // indirect
26 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
27 | github.com/golang/protobuf v1.5.4 // indirect
28 | golang.org/x/text v0.23.0 // indirect
29 | google.golang.org/protobuf v1.36.6 // indirect
30 | )
31 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/PuerkitoBio/goquery v1.10.2 h1:7fh2BdHcG6VFZsK7toXBT/Bh1z5Wmy8Q9MV9HqT2AM8=
2 | github.com/PuerkitoBio/goquery v1.10.2/go.mod h1:0guWGjcLu9AYC7C1GHnpysHy056u9aEkUHwhdnePMCU=
3 | github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
4 | github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
5 | github.com/antchfx/htmlquery v1.3.4 h1:Isd0srPkni2iNTWCwVj/72t7uCphFeor5Q8nCzj1jdQ=
6 | github.com/antchfx/htmlquery v1.3.4/go.mod h1:K9os0BwIEmLAvTqaNSua8tXLWRWZpocZIH73OzWQbwM=
7 | github.com/antchfx/xmlquery v1.4.4 h1:mxMEkdYP3pjKSftxss4nUHfjBhnMk4imGoR96FRY2dg=
8 | github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fusrx9b12fc=
9 | github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs=
10 | github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
11 | github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
12 | github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
13 | github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
14 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
15 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
16 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
17 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
18 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
19 | github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
20 | github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
21 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
22 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ=
23 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw=
24 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
25 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
26 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
27 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
28 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
29 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
30 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
31 | github.com/jawher/mow.cli v1.1.0 h1:NdtHXRc0CwZQ507wMvQ/IS+Q3W3x2fycn973/b8Zuk8=
32 | github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
33 | github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
34 | github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
35 | github.com/nlnwa/whatwg-url v0.6.1 h1:Zlefa3aglQFHF/jku45VxbEJwPicDnOz64Ra3F7npqQ=
36 | github.com/nlnwa/whatwg-url v0.6.1/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
37 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
38 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
39 | github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
40 | github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
41 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
42 | github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
43 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
44 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
45 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
46 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
47 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
48 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
49 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
50 | golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
51 | golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
52 | golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
53 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
54 | golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
55 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
56 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
57 | golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
58 | golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
59 | golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
60 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
61 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
62 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
63 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
64 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
65 | golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
66 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
67 | golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
68 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
69 | golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
70 | golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c=
71 | golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
72 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
73 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
74 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
75 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
76 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
77 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
78 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
79 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
80 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
81 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
82 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
83 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
84 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
85 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
86 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
87 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
88 | golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
89 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
90 | golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
91 | golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
92 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
93 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
94 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
95 | golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
96 | golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
97 | golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
98 | golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
99 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
100 | golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek=
101 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
102 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
103 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
104 | golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
105 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
106 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
107 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
108 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
109 | golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
110 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
111 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
112 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
113 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
114 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
115 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
116 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
117 | golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
118 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
119 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
120 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
121 | google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM=
122 | google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds=
123 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
124 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
125 | google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
126 | google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
127 |
--------------------------------------------------------------------------------
/htmlelement.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "strings"
19 |
20 | "github.com/PuerkitoBio/goquery"
21 | "golang.org/x/net/html"
22 | )
23 |
// HTMLElement is the representation of a HTML tag.
type HTMLElement struct {
	// Name is the name of the tag
	Name string
	// Text is the text content of the element; the constructor fills it from
	// the text of a goquery document built on the element's node.
	Text string
	// attributes holds the node's HTML attributes; use Attr to look one up.
	attributes []html.Attribute
	// Request is the request object of the element's HTML document
	Request *Request
	// Response is the Response object of the element's HTML document
	Response *Response
	// DOM is the goquery parsed DOM object of the page. DOM is relative
	// to the current HTMLElement
	DOM *goquery.Selection
	// Index stores the position of the current element within all the elements matched by an OnHTML callback
	Index int
}
40 |
41 | // NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node.
42 | func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement {
43 | return &HTMLElement{
44 | Name: n.Data,
45 | Request: resp.Request,
46 | Response: resp,
47 | Text: goquery.NewDocumentFromNode(n).Text(),
48 | DOM: s,
49 | Index: idx,
50 | attributes: n.Attr,
51 | }
52 | }
53 |
54 | // Attr returns the selected attribute of a HTMLElement or empty string
55 | // if no attribute found
56 | func (h *HTMLElement) Attr(k string) string {
57 | for _, a := range h.attributes {
58 | if a.Key == k {
59 | return a.Val
60 | }
61 | }
62 | return ""
63 | }
64 |
65 | // ChildText returns the concatenated and stripped text content of the matching
66 | // elements.
67 | func (h *HTMLElement) ChildText(goquerySelector string) string {
68 | return strings.TrimSpace(h.DOM.Find(goquerySelector).Text())
69 | }
70 |
71 | // ChildTexts returns the stripped text content of all the matching
72 | // elements.
73 | func (h *HTMLElement) ChildTexts(goquerySelector string) []string {
74 | var res []string
75 | h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
76 |
77 | res = append(res, strings.TrimSpace(s.Text()))
78 | })
79 | return res
80 | }
81 |
82 | // ChildAttr returns the stripped text content of the first matching
83 | // element's attribute.
84 | func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string {
85 | if attr, ok := h.DOM.Find(goquerySelector).Attr(attrName); ok {
86 | return strings.TrimSpace(attr)
87 | }
88 | return ""
89 | }
90 |
91 | // ChildAttrs returns the stripped text content of all the matching
92 | // element's attributes.
93 | func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string {
94 | var res []string
95 | h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
96 | if attr, ok := s.Attr(attrName); ok {
97 | res = append(res, strings.TrimSpace(attr))
98 | }
99 | })
100 | return res
101 | }
102 |
103 | // ForEach iterates over the elements matched by the first argument
104 | // and calls the callback function on every HTMLElement match.
105 | func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement)) {
106 | i := 0
107 | h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
108 | for _, n := range s.Nodes {
109 | callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i))
110 | i++
111 | }
112 | })
113 | }
114 |
115 | // ForEachWithBreak iterates over the elements matched by the first argument
116 | // and calls the callback function on every HTMLElement match.
117 | // It is identical to ForEach except that it is possible to break
118 | // out of the loop by returning false in the callback function. It returns the
119 | // current Selection object.
120 | func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool) {
121 | i := 0
122 | h.DOM.Find(goquerySelector).EachWithBreak(func(_ int, s *goquery.Selection) bool {
123 | for _, n := range s.Nodes {
124 | if callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i)) {
125 | i++
126 | return true
127 | }
128 | }
129 | return false
130 | })
131 | }
132 |
--------------------------------------------------------------------------------
/http_backend.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "crypto/sha1"
19 | "encoding/gob"
20 | "encoding/hex"
21 | "io"
22 | "math/rand"
23 | "net/http"
24 | "os"
25 | "path"
26 | "regexp"
27 | "strings"
28 | "sync"
29 | "time"
30 |
31 | "compress/gzip"
32 |
33 | "github.com/gobwas/glob"
34 | )
35 |
// httpBackend bundles the HTTP client with the limit rules that apply to it.
type httpBackend struct {
	// LimitRules is the list of per-domain limit rules (see LimitRule).
	LimitRules []*LimitRule
	// Client is the underlying net/http client.
	Client *http.Client
	// lock is a read/write mutex.
	// NOTE(review): presumably guards LimitRules — confirm against the methods
	// that use it.
	lock *sync.RWMutex
}
41 |
// checkHeadersFunc receives the request, the status code and the headers of a
// response. Do aborts the download when it returns false.
type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool
43 |
44 | // LimitRule provides connection restrictions for domains.
45 | // Both DomainRegexp and DomainGlob can be used to specify
46 | // the included domains patterns, but at least one is required.
47 | // There can be two kind of limitations:
48 | // - Parallelism: Set limit for the number of concurrent requests to matching domains
49 | // - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
50 | type LimitRule struct {
51 | // DomainRegexp is a regular expression to match against domains
52 | DomainRegexp string
53 | // DomainGlob is a glob pattern to match against domains
54 | DomainGlob string
55 | // Delay is the duration to wait before creating a new request to the matching domains
56 | Delay time.Duration
57 | // RandomDelay is the extra randomized duration to wait added to Delay before creating a new request
58 | RandomDelay time.Duration
59 | // Parallelism is the number of the maximum allowed concurrent requests of the matching domains
60 | Parallelism int
61 | waitChan chan bool
62 | compiledRegexp *regexp.Regexp
63 | compiledGlob glob.Glob
64 | }
65 |
66 | // Init initializes the private members of LimitRule
67 | func (r *LimitRule) Init() error {
68 | waitChanSize := 1
69 | if r.Parallelism > 1 {
70 | waitChanSize = r.Parallelism
71 | }
72 | r.waitChan = make(chan bool, waitChanSize)
73 | hasPattern := false
74 | if r.DomainRegexp != "" {
75 | c, err := regexp.Compile(r.DomainRegexp)
76 | if err != nil {
77 | return err
78 | }
79 | r.compiledRegexp = c
80 | hasPattern = true
81 | }
82 | if r.DomainGlob != "" {
83 | c, err := glob.Compile(r.DomainGlob)
84 | if err != nil {
85 | return err
86 | }
87 | r.compiledGlob = c
88 | hasPattern = true
89 | }
90 | if !hasPattern {
91 | return ErrNoPattern
92 | }
93 | return nil
94 | }
95 |
96 | func (h *httpBackend) Init(jar http.CookieJar) {
97 | rand.Seed(time.Now().UnixNano())
98 | h.Client = &http.Client{
99 | Jar: jar,
100 | Timeout: 10 * time.Second,
101 | }
102 | h.lock = &sync.RWMutex{}
103 | }
104 |
105 | // Match checks that the domain parameter triggers the rule
106 | func (r *LimitRule) Match(domain string) bool {
107 | match := false
108 | if r.compiledRegexp != nil && r.compiledRegexp.MatchString(domain) {
109 | match = true
110 | }
111 | if r.compiledGlob != nil && r.compiledGlob.Match(domain) {
112 | match = true
113 | }
114 | return match
115 | }
116 |
117 | func (h *httpBackend) GetMatchingRule(domain string) *LimitRule {
118 | if h.LimitRules == nil {
119 | return nil
120 | }
121 | h.lock.RLock()
122 | defer h.lock.RUnlock()
123 | for _, r := range h.LimitRules {
124 | if r.Match(domain) {
125 | return r
126 | }
127 | }
128 | return nil
129 | }
130 |
131 | func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string, cacheExpiration time.Duration) (*Response, error) {
132 | if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" {
133 | return h.Do(request, bodySize, checkHeadersFunc)
134 | }
135 | sum := sha1.Sum([]byte(request.URL.String()))
136 | hash := hex.EncodeToString(sum[:])
137 | dir := path.Join(cacheDir, hash[:2])
138 | filename := path.Join(dir, hash)
139 |
140 | if fileInfo, err := os.Stat(filename); err == nil && cacheExpiration > 0 {
141 | if time.Since(fileInfo.ModTime()) > cacheExpiration {
142 | _ = os.Remove(filename)
143 | }
144 | }
145 |
146 | if file, err := os.Open(filename); err == nil {
147 | resp := new(Response)
148 | err := gob.NewDecoder(file).Decode(resp)
149 | file.Close()
150 | checkHeadersFunc(request, resp.StatusCode, *resp.Headers)
151 | if resp.StatusCode < 500 {
152 | return resp, err
153 | }
154 | }
155 | resp, err := h.Do(request, bodySize, checkHeadersFunc)
156 | if err != nil || resp.StatusCode >= 500 {
157 | return resp, err
158 | }
159 | if _, err := os.Stat(dir); err != nil {
160 | if err := os.MkdirAll(dir, 0750); err != nil {
161 | return resp, err
162 | }
163 | }
164 | file, err := os.Create(filename + "~")
165 | if err != nil {
166 | return resp, err
167 | }
168 | if err := gob.NewEncoder(file).Encode(resp); err != nil {
169 | file.Close()
170 | return resp, err
171 | }
172 | file.Close()
173 | return resp, os.Rename(filename+"~", filename)
174 | }
175 |
176 | func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) {
177 | r := h.GetMatchingRule(request.URL.Host)
178 | if r != nil {
179 | r.waitChan <- true
180 | defer func(r *LimitRule) {
181 | randomDelay := time.Duration(0)
182 | if r.RandomDelay != 0 {
183 | randomDelay = time.Duration(rand.Int63n(int64(r.RandomDelay)))
184 | }
185 | time.Sleep(r.Delay + randomDelay)
186 | <-r.waitChan
187 | }(r)
188 | }
189 |
190 | res, err := h.Client.Do(request)
191 | if err != nil {
192 | return nil, err
193 | }
194 | defer res.Body.Close()
195 |
196 | finalRequest := request
197 | if res.Request != nil {
198 | finalRequest = res.Request
199 | }
200 | if !checkHeadersFunc(finalRequest, res.StatusCode, res.Header) {
201 | // closing res.Body (see defer above) without reading it aborts
202 | // the download
203 | return nil, ErrAbortedAfterHeaders
204 | }
205 |
206 | var bodyReader io.Reader = res.Body
207 | if bodySize > 0 {
208 | bodyReader = io.LimitReader(bodyReader, int64(bodySize))
209 | }
210 | contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
211 | if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
212 | bodyReader, err = gzip.NewReader(bodyReader)
213 | if err != nil {
214 | return nil, err
215 | }
216 | defer bodyReader.(*gzip.Reader).Close()
217 | }
218 | body, err := io.ReadAll(bodyReader)
219 | if err != nil {
220 | return nil, err
221 | }
222 | return &Response{
223 | StatusCode: res.StatusCode,
224 | Body: body,
225 | Headers: &res.Header,
226 | }, nil
227 | }
228 |
229 | func (h *httpBackend) Limit(rule *LimitRule) error {
230 | h.lock.Lock()
231 | if h.LimitRules == nil {
232 | h.LimitRules = make([]*LimitRule, 0, 8)
233 | }
234 | h.LimitRules = append(h.LimitRules, rule)
235 | h.lock.Unlock()
236 | return rule.Init()
237 | }
238 |
239 | func (h *httpBackend) Limits(rules []*LimitRule) error {
240 | for _, r := range rules {
241 | if err := h.Limit(r); err != nil {
242 | return err
243 | }
244 | }
245 | return nil
246 | }
247 |
--------------------------------------------------------------------------------
/http_trace.go:
--------------------------------------------------------------------------------
1 | package colly
2 |
3 | import (
4 | "net/http"
5 | "net/http/httptrace"
6 | "time"
7 | )
8 |
// HTTPTrace stores timing information collected while an HTTP request is
// performed.
type HTTPTrace struct {
	// start and connect hold the instants at which a connection was
	// requested and at which connecting began.
	start, connect time.Time
	// ConnectDuration is the time between ConnectStart and ConnectDone.
	ConnectDuration time.Duration
	// FirstByteDuration is the time between GetConn and the first response byte.
	FirstByteDuration time.Duration
}
15 |
16 | // trace returns a httptrace.ClientTrace object to be used with an http
17 | // request via httptrace.WithClientTrace() that fills in the HttpTrace.
18 | func (ht *HTTPTrace) trace() *httptrace.ClientTrace {
19 | trace := &httptrace.ClientTrace{
20 | ConnectStart: func(network, addr string) { ht.connect = time.Now() },
21 | ConnectDone: func(network, addr string, err error) {
22 | ht.ConnectDuration = time.Since(ht.connect)
23 | },
24 |
25 | GetConn: func(hostPort string) { ht.start = time.Now() },
26 | GotFirstResponseByte: func() {
27 | ht.FirstByteDuration = time.Since(ht.start)
28 | },
29 | }
30 | return trace
31 | }
32 |
33 | // WithTrace returns the given HTTP Request with this HTTPTrace added to its
34 | // context.
35 | func (ht *HTTPTrace) WithTrace(req *http.Request) *http.Request {
36 | return req.WithContext(httptrace.WithClientTrace(req.Context(), ht.trace()))
37 | }
38 |
--------------------------------------------------------------------------------
/http_trace_test.go:
--------------------------------------------------------------------------------
1 | package colly
2 |
3 | import (
4 | "net/http"
5 | "net/http/httptest"
6 | "testing"
7 | "time"
8 | )
9 |
// testDelay is the artificial handler delay used by the trace tests and the
// threshold the measured durations are compared against.
const testDelay = 200 * time.Millisecond
11 |
// newTraceTestServer starts a test server whose handlers sleep for delay
// before answering: "/error" replies with status 500, every other path with
// status 200. The caller is responsible for closing the server.
func newTraceTestServer(delay time.Duration) *httptest.Server {
	respondAfterDelay := func(status int) http.HandlerFunc {
		return func(w http.ResponseWriter, _ *http.Request) {
			time.Sleep(delay)
			w.WriteHeader(status)
		}
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/", respondAfterDelay(200))
	mux.HandleFunc("/error", respondAfterDelay(500))
	return httptest.NewServer(mux)
}
26 |
27 | func TestTraceWithNoDelay(t *testing.T) {
28 | ts := newTraceTestServer(0)
29 | defer ts.Close()
30 |
31 | client := ts.Client()
32 | req, err := http.NewRequest("GET", ts.URL, nil)
33 | if err != nil {
34 | t.Errorf("Failed to construct request %v", err)
35 | }
36 | trace := &HTTPTrace{}
37 | req = trace.WithTrace(req)
38 |
39 | if _, err = client.Do(req); err != nil {
40 | t.Errorf("Failed to make request %v", err)
41 | }
42 |
43 | if trace.ConnectDuration > testDelay {
44 | t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration)
45 | }
46 | if trace.FirstByteDuration > testDelay {
47 | t.Errorf("trace FirstByteDuration should be (almost) 0, got %v", trace.FirstByteDuration)
48 | }
49 | }
50 |
51 | func TestTraceWithDelay(t *testing.T) {
52 | ts := newTraceTestServer(testDelay)
53 | defer ts.Close()
54 |
55 | client := ts.Client()
56 | req, err := http.NewRequest("GET", ts.URL, nil)
57 | if err != nil {
58 | t.Errorf("Failed to construct request %v", err)
59 | }
60 | trace := &HTTPTrace{}
61 | req = trace.WithTrace(req)
62 |
63 | if _, err = client.Do(req); err != nil {
64 | t.Errorf("Failed to make request %v", err)
65 | }
66 |
67 | if trace.ConnectDuration > testDelay {
68 | t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration)
69 | }
70 | if trace.FirstByteDuration < testDelay {
71 | t.Errorf("trace FirstByteDuration should be at least 200ms, got %v", trace.FirstByteDuration)
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/proxy/proxy.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package proxy
16 |
17 | import (
18 | "context"
19 | "net/http"
20 | "net/url"
21 | "sync/atomic"
22 |
23 | "github.com/gocolly/colly/v2"
24 | )
25 |
// roundRobinSwitcher hands out its proxy URLs in rotation.
type roundRobinSwitcher struct {
	// proxyURLs are the proxies to rotate through.
	proxyURLs []*url.URL
	// index counts the GetProxy calls; it is advanced atomically.
	index uint32
}
30 |
31 | func (r *roundRobinSwitcher) GetProxy(pr *http.Request) (*url.URL, error) {
32 | index := atomic.AddUint32(&r.index, 1) - 1
33 | u := r.proxyURLs[index%uint32(len(r.proxyURLs))]
34 |
35 | ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, u.String())
36 | *pr = *pr.WithContext(ctx)
37 | return u, nil
38 | }
39 |
40 | // RoundRobinProxySwitcher creates a proxy switcher function which rotates
41 | // ProxyURLs on every request.
42 | // The proxy type is determined by the URL scheme. "http", "https"
43 | // and "socks5" are supported. If the scheme is empty,
44 | // "http" is assumed.
45 | func RoundRobinProxySwitcher(ProxyURLs ...string) (colly.ProxyFunc, error) {
46 | if len(ProxyURLs) < 1 {
47 | return nil, colly.ErrEmptyProxyURL
48 | }
49 | urls := make([]*url.URL, len(ProxyURLs))
50 | for i, u := range ProxyURLs {
51 | parsedU, err := url.Parse(u)
52 | if err != nil {
53 | return nil, err
54 | }
55 | urls[i] = parsedU
56 | }
57 | return (&roundRobinSwitcher{urls, 0}).GetProxy, nil
58 | }
59 |
--------------------------------------------------------------------------------
/queue/queue.go:
--------------------------------------------------------------------------------
1 | package queue
2 |
3 | import (
4 | "net/url"
5 | "sync"
6 |
7 | whatwgUrl "github.com/nlnwa/whatwg-url/url"
8 |
9 | "github.com/gocolly/colly/v2"
10 | )
11 |
12 | const stop = true
13 |
14 | var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
15 |
// Storage is the interface a storage backend of the queue has to implement.
// Implementations must be safe for concurrent use by multiple goroutines.
type Storage interface {
	// Init prepares the storage for use.
	Init() error
	// AddRequest appends a serialized request to the queue.
	AddRequest([]byte) error
	// GetRequest removes the next request from the queue and returns it,
	// or returns an error if the queue is empty.
	GetRequest() ([]byte, error)
	// QueueSize reports the number of queued requests.
	QueueSize() (int, error)
}
29 |
30 | // Queue is a request queue which uses a Collector to consume
31 | // requests in multiple threads
32 | type Queue struct {
33 | // Threads defines the number of consumer threads
34 | Threads int
35 | storage Storage
36 | wake chan struct{}
37 | mut sync.Mutex // guards wake and running
38 | running bool
39 | }
40 |
41 | // InMemoryQueueStorage is the default implementation of the Storage interface.
42 | // InMemoryQueueStorage holds the request queue in memory.
43 | type InMemoryQueueStorage struct {
44 | // MaxSize defines the capacity of the queue.
45 | // New requests are discarded if the queue size reaches MaxSize
46 | MaxSize int
47 | lock *sync.RWMutex
48 | size int
49 | first *inMemoryQueueItem
50 | last *inMemoryQueueItem
51 | }
52 |
// inMemoryQueueItem is one node of the linked list kept by
// InMemoryQueueStorage.
type inMemoryQueueItem struct {
	// Request is the serialized request.
	Request []byte
	// Next points to the item queued after this one, or is nil.
	Next *inMemoryQueueItem
}
57 |
58 | // New creates a new queue with a Storage specified in argument
59 | // A standard InMemoryQueueStorage is used if Storage argument is nil.
60 | func New(threads int, s Storage) (*Queue, error) {
61 | if s == nil {
62 | s = &InMemoryQueueStorage{MaxSize: 100000}
63 | }
64 | if err := s.Init(); err != nil {
65 | return nil, err
66 | }
67 | return &Queue{
68 | Threads: threads,
69 | storage: s,
70 | running: true,
71 | }, nil
72 | }
73 |
74 | // IsEmpty returns true if the queue is empty
75 | func (q *Queue) IsEmpty() bool {
76 | s, _ := q.Size()
77 | return s == 0
78 | }
79 |
80 | // AddURL adds a new URL to the queue
81 | func (q *Queue) AddURL(URL string) error {
82 | u, err := urlParser.Parse(URL)
83 | if err != nil {
84 | return err
85 | }
86 | u2, err := url.Parse(u.Href(false))
87 | if err != nil {
88 | return err
89 | }
90 | r := &colly.Request{
91 | URL: u2,
92 | Method: "GET",
93 | }
94 | d, err := r.Marshal()
95 | if err != nil {
96 | return err
97 | }
98 | return q.storage.AddRequest(d)
99 | }
100 |
101 | // AddRequest adds a new Request to the queue
102 | func (q *Queue) AddRequest(r *colly.Request) error {
103 | q.mut.Lock()
104 | waken := q.wake != nil
105 | q.mut.Unlock()
106 | if !waken {
107 | return q.storeRequest(r)
108 | }
109 | err := q.storeRequest(r)
110 | if err != nil {
111 | return err
112 | }
113 | q.wake <- struct{}{}
114 | return nil
115 | }
116 |
117 | func (q *Queue) storeRequest(r *colly.Request) error {
118 | d, err := r.Marshal()
119 | if err != nil {
120 | return err
121 | }
122 | return q.storage.AddRequest(d)
123 | }
124 |
125 | // Size returns the size of the queue
126 | func (q *Queue) Size() (int, error) {
127 | return q.storage.QueueSize()
128 | }
129 |
130 | // Run starts consumer threads and calls the Collector
131 | // to perform requests. Run blocks while the queue has active requests
132 | // The given Storage must not be used directly while Run blocks.
133 | func (q *Queue) Run(c *colly.Collector) error {
134 | q.mut.Lock()
135 | if q.wake != nil && q.running == true {
136 | q.mut.Unlock()
137 | panic("cannot call duplicate Queue.Run")
138 | }
139 | q.wake = make(chan struct{})
140 | q.running = true
141 | q.mut.Unlock()
142 |
143 | requestc := make(chan *colly.Request)
144 | complete, errc := make(chan struct{}), make(chan error, 1)
145 | for i := 0; i < q.Threads; i++ {
146 | go independentRunner(requestc, complete)
147 | }
148 | go q.loop(c, requestc, complete, errc)
149 | defer close(requestc)
150 | return <-errc
151 | }
152 |
153 | // Stop will stop the running queue
154 | func (q *Queue) Stop() {
155 | q.mut.Lock()
156 | q.running = false
157 | q.mut.Unlock()
158 | }
159 |
160 | func (q *Queue) loop(c *colly.Collector, requestc chan<- *colly.Request, complete <-chan struct{}, errc chan<- error) {
161 | var active int
162 | for {
163 | size, err := q.storage.QueueSize()
164 | if err != nil {
165 | errc <- err
166 | break
167 | }
168 | if size == 0 && active == 0 || !q.running {
169 | // Terminate when
170 | // 1. No active requests
171 | // 2. Empty queue
172 | errc <- nil
173 | break
174 | }
175 | sent := requestc
176 | var req *colly.Request
177 | if size > 0 {
178 | req, err = q.loadRequest(c)
179 | if err != nil {
180 | // ignore an error returned by GetRequest() or
181 | // UnmarshalRequest()
182 | continue
183 | }
184 | } else {
185 | sent = nil
186 | }
187 | Sent:
188 | for {
189 | select {
190 | case sent <- req:
191 | active++
192 | break Sent
193 | case <-q.wake:
194 | if sent == nil {
195 | break Sent
196 | }
197 | case <-complete:
198 | active--
199 | if sent == nil && active == 0 {
200 | break Sent
201 | }
202 | }
203 | }
204 | }
205 | }
206 |
207 | func independentRunner(requestc <-chan *colly.Request, complete chan<- struct{}) {
208 | for req := range requestc {
209 | req.Do()
210 | complete <- struct{}{}
211 | }
212 | }
213 |
214 | func (q *Queue) loadRequest(c *colly.Collector) (*colly.Request, error) {
215 | buf, err := q.storage.GetRequest()
216 | if err != nil {
217 | return nil, err
218 | }
219 | copied := make([]byte, len(buf))
220 | copy(copied, buf)
221 | return c.UnmarshalRequest(copied)
222 | }
223 |
224 | // Init implements Storage.Init() function
225 | func (q *InMemoryQueueStorage) Init() error {
226 | q.lock = &sync.RWMutex{}
227 | return nil
228 | }
229 |
230 | // AddRequest implements Storage.AddRequest() function
231 | func (q *InMemoryQueueStorage) AddRequest(r []byte) error {
232 | q.lock.Lock()
233 | defer q.lock.Unlock()
234 | // Discard URLs if size limit exceeded
235 | if q.MaxSize > 0 && q.size >= q.MaxSize {
236 | return colly.ErrQueueFull
237 | }
238 | i := &inMemoryQueueItem{Request: r}
239 | if q.first == nil {
240 | q.first = i
241 | } else {
242 | q.last.Next = i
243 | }
244 | q.last = i
245 | q.size++
246 | return nil
247 | }
248 |
249 | // GetRequest implements Storage.GetRequest() function
250 | func (q *InMemoryQueueStorage) GetRequest() ([]byte, error) {
251 | q.lock.Lock()
252 | defer q.lock.Unlock()
253 | if q.size == 0 {
254 | return nil, nil
255 | }
256 | r := q.first.Request
257 | q.first = q.first.Next
258 | q.size--
259 | return r, nil
260 | }
261 |
262 | // QueueSize implements Storage.QueueSize() function
263 | func (q *InMemoryQueueStorage) QueueSize() (int, error) {
264 | q.lock.Lock()
265 | defer q.lock.Unlock()
266 | return q.size, nil
267 | }
268 |
--------------------------------------------------------------------------------
/queue/queue_test.go:
--------------------------------------------------------------------------------
1 | package queue
2 |
3 | import (
4 | "math/rand"
5 | "net/http"
6 | "net/http/httptest"
7 | "sync"
8 | "sync/atomic"
9 | "testing"
10 | "time"
11 |
12 | "github.com/gocolly/colly/v2"
13 | )
14 |
15 | func TestQueue(t *testing.T) {
16 | server := httptest.NewServer(http.HandlerFunc(serverHandler))
17 | defer server.Close()
18 |
19 | rng := rand.New(rand.NewSource(12387123712321232))
20 | var rngMu sync.Mutex
21 |
22 | var (
23 | items uint32
24 | requests uint32
25 | success uint32
26 | failure uint32
27 | )
28 | storage := &InMemoryQueueStorage{MaxSize: 100000}
29 | q, err := New(10, storage)
30 | if err != nil {
31 | panic(err)
32 | }
33 | put := func() {
34 | rngMu.Lock()
35 | t := time.Duration(rng.Intn(50)) * time.Microsecond
36 | rngMu.Unlock()
37 | url := server.URL + "/delay?t=" + t.String()
38 | atomic.AddUint32(&items, 1)
39 | q.AddURL(url)
40 | }
41 | for i := 0; i < 3000; i++ {
42 | put()
43 | storage.AddRequest([]byte("error request"))
44 | }
45 | c := colly.NewCollector(
46 | colly.AllowURLRevisit(),
47 | )
48 | c.OnRequest(func(req *colly.Request) {
49 | atomic.AddUint32(&requests, 1)
50 | })
51 | c.OnResponse(func(resp *colly.Response) {
52 | if resp.StatusCode == http.StatusOK {
53 | atomic.AddUint32(&success, 1)
54 | } else {
55 | atomic.AddUint32(&failure, 1)
56 | }
57 | rngMu.Lock()
58 | toss := rng.Intn(2) == 0
59 | rngMu.Unlock()
60 | if toss {
61 | put()
62 | }
63 | })
64 | c.OnError(func(resp *colly.Response, err error) {
65 | atomic.AddUint32(&failure, 1)
66 | })
67 | err = q.Run(c)
68 | if err != nil {
69 | t.Fatalf("Queue.Run() return an error: %v", err)
70 | }
71 | if items != requests || success+failure != requests || failure > 0 {
72 | t.Fatalf("wrong Queue implementation: "+
73 | "items = %d, requests = %d, success = %d, failure = %d",
74 | items, requests, success, failure)
75 | }
76 | }
77 |
78 | func serverHandler(w http.ResponseWriter, req *http.Request) {
79 | if !serverRoute(w, req) {
80 | shutdown(w)
81 | }
82 | }
83 |
84 | func serverRoute(w http.ResponseWriter, req *http.Request) bool {
85 | if req.URL.Path == "/delay" {
86 | return serveDelay(w, req) == nil
87 | }
88 | return false
89 | }
90 |
// serveDelay sleeps for the duration given in the query parameter "t" and
// then answers with status 200. When the parameter is not a valid duration
// nothing is written and the parse error is returned.
func serveDelay(w http.ResponseWriter, req *http.Request) error {
	pause, err := time.ParseDuration(req.URL.Query().Get("t"))
	if err != nil {
		return err
	}
	time.Sleep(pause)
	w.WriteHeader(http.StatusOK)
	return nil
}
101 |
// shutdown takes over the connection behind w and closes it without sending
// a response. It does nothing when w cannot be hijacked or hijacking fails.
func shutdown(w http.ResponseWriter) {
	if hijacker, ok := w.(http.Hijacker); ok {
		if conn, _, err := hijacker.Hijack(); err == nil {
			conn.Close()
		}
	}
}
113 |
--------------------------------------------------------------------------------
/request.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "bytes"
19 | "encoding/json"
20 | "io"
21 | "net/http"
22 | "net/url"
23 | "strings"
24 | "sync/atomic"
25 | )
26 |
27 | // Request is the representation of a HTTP request made by a Collector
28 | type Request struct {
29 | // URL is the parsed URL of the HTTP request
30 | URL *url.URL
31 | // Headers contains the Request's HTTP headers
32 | Headers *http.Header
33 | // the Host header
34 | Host string
35 | // Ctx is a context between a Request and a Response
36 | Ctx *Context
37 | // Depth is the number of the parents of the request
38 | Depth int
39 | // Method is the HTTP method of the request
40 | Method string
41 | // Body is the request body which is used on POST/PUT requests
42 | Body io.Reader
43 | // ResponseCharacterencoding is the character encoding of the response body.
44 | // Leave it blank to allow automatic character encoding of the response body.
45 | // It is empty by default and it can be set in OnRequest callback.
46 | ResponseCharacterEncoding string
47 | // ID is the Unique identifier of the request
48 | ID uint32
49 | collector *Collector
50 | abort bool
51 | baseURL *url.URL
52 | // ProxyURL is the proxy address that handles the request
53 | ProxyURL string
54 | }
55 |
// serializableRequest is the form in which Marshal encodes a Request as JSON.
// Each field carries the value of the Request field it is named after; the
// body is stored as the bytes read from Request.Body and the context as a
// plain map.
type serializableRequest struct {
	URL     string
	Method  string
	Depth   int
	Body    []byte
	ID      uint32
	Ctx     map[string]interface{}
	Headers http.Header
	Host    string
}
66 |
67 | // New creates a new request with the context of the original request
68 | func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
69 | u, err := urlParser.Parse(URL)
70 | if err != nil {
71 | return nil, err
72 | }
73 | u2, err := url.Parse(u.Href(false))
74 | if err != nil {
75 | return nil, err
76 | }
77 | return &Request{
78 | Method: method,
79 | URL: u2,
80 | Body: body,
81 | Ctx: r.Ctx,
82 | Headers: &http.Header{},
83 | Host: r.Host,
84 | ID: atomic.AddUint32(&r.collector.requestCount, 1),
85 | collector: r.collector,
86 | }, nil
87 | }
88 |
89 | // Abort cancels the HTTP request when called in an OnRequest callback
90 | func (r *Request) Abort() {
91 | r.abort = true
92 | }
93 |
94 | // AbsoluteURL returns with the resolved absolute URL of an URL chunk.
95 | // AbsoluteURL returns empty string if the URL chunk is a fragment or
96 | // could not be parsed
97 | func (r *Request) AbsoluteURL(u string) string {
98 | if strings.HasPrefix(u, "#") {
99 | return ""
100 | }
101 | var base *url.URL
102 | if r.baseURL != nil {
103 | base = r.baseURL
104 | } else {
105 | base = r.URL
106 | }
107 |
108 | absURL, err := urlParser.ParseRef(base.String(), u)
109 | if err != nil {
110 | return ""
111 | }
112 | return absURL.Href(false)
113 | }
114 |
115 | // Visit continues Collector's collecting job by creating a
116 | // request and preserves the Context of the previous request.
117 | // Visit also calls the previously provided callbacks
118 | func (r *Request) Visit(URL string) error {
119 | return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx, nil, true)
120 | }
121 |
122 | // HasVisited checks if the provided URL has been visited
123 | func (r *Request) HasVisited(URL string) (bool, error) {
124 | return r.collector.HasVisited(URL)
125 | }
126 |
127 | // Post continues a collector job by creating a POST request and preserves the Context
128 | // of the previous request.
129 | // Post also calls the previously provided callbacks
130 | func (r *Request) Post(URL string, requestData map[string]string) error {
131 | return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createFormReader(requestData), r.Ctx, nil, true)
132 | }
133 |
134 | // PostRaw starts a collector job by creating a POST request with raw binary data.
135 | // PostRaw preserves the Context of the previous request
136 | // and calls the previously provided callbacks
137 | func (r *Request) PostRaw(URL string, requestData []byte) error {
138 | return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, bytes.NewReader(requestData), r.Ctx, nil, true)
139 | }
140 |
141 | // PostMultipart starts a collector job by creating a Multipart POST request
142 | // with raw binary data. PostMultipart also calls the previously provided.
143 | // callbacks
144 | func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error {
145 | boundary := randomBoundary()
146 | hdr := http.Header{}
147 | hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary)
148 | hdr.Set("User-Agent", r.collector.UserAgent)
149 | return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createMultipartReader(boundary, requestData), r.Ctx, hdr, true)
150 | }
151 |
152 | // Retry submits HTTP request again with the same parameters
153 | func (r *Request) Retry() error {
154 | r.Headers.Del("Cookie")
155 | if _, ok := r.Body.(io.ReadSeeker); r.Body != nil && !ok {
156 | return ErrRetryBodyUnseekable
157 | }
158 | return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false)
159 | }
160 |
161 | // Do submits the request
162 | func (r *Request) Do() error {
163 | return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, !r.collector.AllowURLRevisit)
164 | }
165 |
166 | // Marshal serializes the Request
167 | func (r *Request) Marshal() ([]byte, error) {
168 | ctx := make(map[string]interface{})
169 | if r.Ctx != nil {
170 | r.Ctx.ForEach(func(k string, v interface{}) interface{} {
171 | ctx[k] = v
172 | return nil
173 | })
174 | }
175 | var err error
176 | var body []byte
177 | if r.Body != nil {
178 | body, err = io.ReadAll(r.Body)
179 | if err != nil {
180 | return nil, err
181 | }
182 | }
183 | sr := &serializableRequest{
184 | URL: r.URL.String(),
185 | Host: r.Host,
186 | Method: r.Method,
187 | Depth: r.Depth,
188 | Body: body,
189 | ID: r.ID,
190 | Ctx: ctx,
191 | }
192 | if r.Headers != nil {
193 | sr.Headers = *r.Headers
194 | }
195 | return json.Marshal(sr)
196 | }
197 |
--------------------------------------------------------------------------------
/response.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "bytes"
19 | "fmt"
20 | "io"
21 | "mime"
22 | "net/http"
23 | "os"
24 | "strings"
25 |
26 | "github.com/saintfish/chardet"
27 | "golang.org/x/net/html/charset"
28 | )
29 |
30 | // Response is the representation of a HTTP response made by a Collector
31 | type Response struct {
32 | // StatusCode is the status code of the Response
33 | StatusCode int
34 | // Body is the content of the Response
35 | Body []byte
36 | // Ctx is a context between a Request and a Response
37 | Ctx *Context
38 | // Request is the Request object of the response
39 | Request *Request
40 | // Headers contains the Response's HTTP headers
41 | Headers *http.Header
42 | // Trace contains the HTTPTrace for the request. Will only be set by the
43 | // collector if Collector.TraceHTTP is set to true.
44 | Trace *HTTPTrace
45 | }
46 |
47 | // Save writes response body to disk
48 | func (r *Response) Save(fileName string) error {
49 | return os.WriteFile(fileName, r.Body, 0644)
50 | }
51 |
52 | // FileName returns the sanitized file name parsed from "Content-Disposition"
53 | // header or from URL
54 | func (r *Response) FileName() string {
55 | _, params, err := mime.ParseMediaType(r.Headers.Get("Content-Disposition"))
56 | if fName, ok := params["filename"]; ok && err == nil {
57 | return SanitizeFileName(fName)
58 | }
59 | if r.Request.URL.RawQuery != "" {
60 | return SanitizeFileName(fmt.Sprintf("%s_%s", r.Request.URL.Path, r.Request.URL.RawQuery))
61 | }
62 | return SanitizeFileName(strings.TrimPrefix(r.Request.URL.Path, "/"))
63 | }
64 |
65 | func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error {
66 | if len(r.Body) == 0 {
67 | return nil
68 | }
69 | if defaultEncoding != "" {
70 | tmpBody, err := encodeBytes(r.Body, "text/plain; charset="+defaultEncoding)
71 | if err != nil {
72 | return err
73 | }
74 | r.Body = tmpBody
75 | return nil
76 | }
77 | contentType := strings.ToLower(r.Headers.Get("Content-Type"))
78 |
79 | if strings.Contains(contentType, "image/") ||
80 | strings.Contains(contentType, "video/") ||
81 | strings.Contains(contentType, "audio/") ||
82 | strings.Contains(contentType, "font/") {
83 | // These MIME types should not have textual data.
84 |
85 | return nil
86 | }
87 |
88 | if !strings.Contains(contentType, "charset") {
89 | if !detectCharset {
90 | return nil
91 | }
92 | d := chardet.NewTextDetector()
93 | r, err := d.DetectBest(r.Body)
94 | if err != nil {
95 | return err
96 | }
97 | contentType = "text/plain; charset=" + r.Charset
98 | }
99 | if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") {
100 | return nil
101 | }
102 | tmpBody, err := encodeBytes(r.Body, contentType)
103 | if err != nil {
104 | return err
105 | }
106 | r.Body = tmpBody
107 | return nil
108 | }
109 |
110 | func encodeBytes(b []byte, contentType string) ([]byte, error) {
111 | r, err := charset.NewReader(bytes.NewReader(b), contentType)
112 | if err != nil {
113 | return nil, err
114 | }
115 | return io.ReadAll(r)
116 | }
117 |
--------------------------------------------------------------------------------
/storage/storage.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package storage
16 |
17 | import (
18 | "net/http"
19 | "net/http/cookiejar"
20 | "net/url"
21 | "strings"
22 | "sync"
23 | )
24 |
// Storage is an interface which handles Collector's internal data,
// like visited urls and cookies.
// The default Storage of the Collector is the InMemoryStorage.
// Collector's storage can be changed by calling Collector.SetStorage()
// function.
type Storage interface {
	// Init initializes the storage
	Init() error
	// Visited receives and stores a request ID that is visited by the Collector
	Visited(requestID uint64) error
	// IsVisited returns true if the request was visited before IsVisited
	// is called
	IsVisited(requestID uint64) (bool, error)
	// Cookies retrieves stored cookies for a given host.
	// The cookies are returned serialized as a single string.
	Cookies(u *url.URL) string
	// SetCookies stores cookies for a given host.
	// The cookies are passed serialized as a single string.
	SetCookies(u *url.URL, cookies string)
}
43 |
// InMemoryStorage is the default storage backend of colly.
// InMemoryStorage keeps cookies and visited urls in memory
// without persisting data on the disk.
//
// Init must be called before use: it allocates the map, the lock and the
// cookie jar when they are nil.
type InMemoryStorage struct {
	// visitedURLs records the request IDs passed to Visited.
	visitedURLs map[uint64]bool
	// lock guards visitedURLs.
	lock *sync.RWMutex
	// jar holds the cookies served by Cookies and updated by SetCookies.
	jar *cookiejar.Jar
}
52 |
53 | // Init initializes InMemoryStorage
54 | func (s *InMemoryStorage) Init() error {
55 | if s.visitedURLs == nil {
56 | s.visitedURLs = make(map[uint64]bool)
57 | }
58 | if s.lock == nil {
59 | s.lock = &sync.RWMutex{}
60 | }
61 | if s.jar == nil {
62 | var err error
63 | s.jar, err = cookiejar.New(nil)
64 | return err
65 | }
66 | return nil
67 | }
68 |
69 | // Visited implements Storage.Visited()
70 | func (s *InMemoryStorage) Visited(requestID uint64) error {
71 | s.lock.Lock()
72 | s.visitedURLs[requestID] = true
73 | s.lock.Unlock()
74 | return nil
75 | }
76 |
77 | // IsVisited implements Storage.IsVisited()
78 | func (s *InMemoryStorage) IsVisited(requestID uint64) (bool, error) {
79 | s.lock.RLock()
80 | visited := s.visitedURLs[requestID]
81 | s.lock.RUnlock()
82 | return visited, nil
83 | }
84 |
85 | // Cookies implements Storage.Cookies()
86 | func (s *InMemoryStorage) Cookies(u *url.URL) string {
87 | return StringifyCookies(s.jar.Cookies(u))
88 | }
89 |
90 | // SetCookies implements Storage.SetCookies()
91 | func (s *InMemoryStorage) SetCookies(u *url.URL, cookies string) {
92 | s.jar.SetCookies(u, UnstringifyCookies(cookies))
93 | }
94 |
// Close does nothing and always returns nil.
// NOTE(review): the Storage interface declared in this file has no Close
// method, so this method is not required by that interface — confirm which
// caller relies on it.
func (s *InMemoryStorage) Close() error {
	return nil
}
99 |
// StringifyCookies serializes a list of http.Cookies into one string: each
// cookie is rendered with its String method and the results are joined with
// a newline. A nil or empty list yields the empty string.
func StringifyCookies(cookies []*http.Cookie) string {
	lines := make([]string, 0, len(cookies))
	for _, cookie := range cookies {
		lines = append(lines, cookie.String())
	}
	return strings.Join(lines, "\n")
}
109 |
// UnstringifyCookies deserializes a cookie string to http.Cookies. The input
// is split on newlines, every piece is added as a "Set-Cookie" header, and
// the cookies are then parsed by http.Response.Cookies.
func UnstringifyCookies(s string) []*http.Cookie {
	header := make(http.Header)
	for _, line := range strings.Split(s, "\n") {
		header.Add("Set-Cookie", line)
	}
	resp := &http.Response{Header: header}
	return resp.Cookies()
}
119 |
// ContainsCookie reports whether any cookie in cookies has exactly the given
// name. It returns false for an empty or nil list.
func ContainsCookie(cookies []*http.Cookie, name string) bool {
	for i := range cookies {
		if cookies[i].Name == name {
			return true
		}
	}
	return false
}
129 |
--------------------------------------------------------------------------------
/unmarshal.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "errors"
19 | "reflect"
20 | "strings"
21 |
22 | "github.com/PuerkitoBio/goquery"
23 | )
24 |
25 | // Unmarshal is a shorthand for colly.UnmarshalHTML
26 | func (h *HTMLElement) Unmarshal(v interface{}) error {
27 | return UnmarshalHTML(v, h.DOM, nil)
28 | }
29 |
30 | // UnmarshalWithMap is a shorthand for colly.UnmarshalHTML, extended to allow maps to be passed in.
31 | func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]string) error {
32 | return UnmarshalHTML(v, h.DOM, structMap)
33 | }
34 |
35 | // UnmarshalHTML declaratively extracts text or attributes to a struct from
36 | // HTML response using struct tags composed of css selectors.
37 | // Allowed struct tags:
38 | // - "selector" (required): CSS (goquery) selector of the desired data
39 | // - "attr" (optional): Selects the matching element's attribute's value.
40 | // Leave it blank or omit to get the text of the element.
41 | //
42 | // Example struct declaration:
43 | //
44 | // type Nested struct {
45 | // String string `selector:"div > p"`
46 | // Classes []string `selector:"li" attr:"class"`
47 | // Struct *Nested `selector:"div > div"`
48 | // }
49 | //
50 | // Supported types: struct, *struct, string, []string
51 | func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error {
52 | rv := reflect.ValueOf(v)
53 |
54 | if rv.Kind() != reflect.Ptr || rv.IsNil() {
55 | return errors.New("Invalid type or nil-pointer")
56 | }
57 |
58 | sv := rv.Elem()
59 | st := reflect.TypeOf(v).Elem()
60 | if structMap != nil {
61 | for k, v := range structMap {
62 | attrV := sv.FieldByName(k)
63 | if !attrV.CanAddr() || !attrV.CanSet() {
64 | continue
65 | }
66 | if err := unmarshalSelector(s, attrV, v); err != nil {
67 | return err
68 | }
69 | }
70 | } else {
71 | for i := 0; i < sv.NumField(); i++ {
72 | attrV := sv.Field(i)
73 | if !attrV.CanAddr() || !attrV.CanSet() {
74 | continue
75 | }
76 | if err := unmarshalAttr(s, attrV, st.Field(i)); err != nil {
77 | return err
78 | }
79 |
80 | }
81 | }
82 |
83 | return nil
84 | }
85 |
86 | func unmarshalSelector(s *goquery.Selection, attrV reflect.Value, selector string) error {
87 | //selector is "-" specify that field should ignore.
88 | if selector == "-" {
89 | return nil
90 | }
91 | htmlAttr := ""
92 | // TODO support more types
93 | switch attrV.Kind() {
94 | case reflect.Slice:
95 | if err := unmarshalSlice(s, selector, htmlAttr, attrV); err != nil {
96 | return err
97 | }
98 | case reflect.String:
99 | var val string
100 | if selector == "" && htmlAttr != "" {
101 | val = getDOMValue(s, htmlAttr)
102 | } else {
103 | val = getDOMValue(s.Find(selector), htmlAttr)
104 | }
105 | attrV.Set(reflect.Indirect(reflect.ValueOf(val)))
106 | case reflect.Struct:
107 | if err := unmarshalStruct(s, selector, attrV); err != nil {
108 | return err
109 | }
110 | case reflect.Ptr:
111 | if err := unmarshalPtr(s, selector, attrV); err != nil {
112 | return err
113 | }
114 | default:
115 | return errors.New("Invalid type: " + attrV.String())
116 | }
117 | return nil
118 | }
119 |
120 | func unmarshalAttr(s *goquery.Selection, attrV reflect.Value, attrT reflect.StructField) error {
121 | selector := attrT.Tag.Get("selector")
122 | //selector is "-" specify that field should ignore.
123 | if selector == "-" {
124 | return nil
125 | }
126 | htmlAttr := attrT.Tag.Get("attr")
127 | // TODO support more types
128 | switch attrV.Kind() {
129 | case reflect.Slice:
130 | if err := unmarshalSlice(s, selector, htmlAttr, attrV); err != nil {
131 | return err
132 | }
133 | case reflect.String:
134 | val := getDOMValue(s.Find(selector), htmlAttr)
135 | attrV.Set(reflect.Indirect(reflect.ValueOf(val)))
136 | case reflect.Struct:
137 | if err := unmarshalStruct(s, selector, attrV); err != nil {
138 | return err
139 | }
140 | case reflect.Ptr:
141 | if err := unmarshalPtr(s, selector, attrV); err != nil {
142 | return err
143 | }
144 | default:
145 | return errors.New("Invalid type: " + attrV.String())
146 | }
147 | return nil
148 | }
149 |
150 | func unmarshalStruct(s *goquery.Selection, selector string, attrV reflect.Value) error {
151 | newS := s
152 | if selector != "" {
153 | newS = newS.Find(selector)
154 | }
155 | if newS.Nodes == nil {
156 | return nil
157 | }
158 | v := reflect.New(attrV.Type())
159 | err := UnmarshalHTML(v.Interface(), newS, nil)
160 | if err != nil {
161 | return err
162 | }
163 | attrV.Set(reflect.Indirect(v))
164 | return nil
165 | }
166 |
167 | func unmarshalPtr(s *goquery.Selection, selector string, attrV reflect.Value) error {
168 | newS := s
169 | if selector != "" {
170 | newS = newS.Find(selector)
171 | }
172 | if newS.Nodes == nil {
173 | return nil
174 | }
175 | e := attrV.Type().Elem()
176 | if e.Kind() != reflect.Struct {
177 | return errors.New("Invalid slice type")
178 | }
179 | v := reflect.New(e)
180 | err := UnmarshalHTML(v.Interface(), newS, nil)
181 | if err != nil {
182 | return err
183 | }
184 | attrV.Set(v)
185 | return nil
186 | }
187 |
188 | func unmarshalSlice(s *goquery.Selection, selector, htmlAttr string, attrV reflect.Value) error {
189 | if attrV.Pointer() == 0 {
190 | v := reflect.MakeSlice(attrV.Type(), 0, 0)
191 | attrV.Set(v)
192 | }
193 | switch attrV.Type().Elem().Kind() {
194 | case reflect.String:
195 | s.Find(selector).Each(func(_ int, s *goquery.Selection) {
196 | val := getDOMValue(s, htmlAttr)
197 | attrV.Set(reflect.Append(attrV, reflect.Indirect(reflect.ValueOf(val))))
198 | })
199 | case reflect.Ptr:
200 | s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) {
201 | someVal := reflect.New(attrV.Type().Elem().Elem())
202 | UnmarshalHTML(someVal.Interface(), innerSel, nil)
203 | attrV.Set(reflect.Append(attrV, someVal))
204 | })
205 | case reflect.Struct:
206 | s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) {
207 | someVal := reflect.New(attrV.Type().Elem())
208 | UnmarshalHTML(someVal.Interface(), innerSel, nil)
209 | attrV.Set(reflect.Append(attrV, reflect.Indirect(someVal)))
210 | })
211 | default:
212 | return errors.New("Invalid slice type")
213 | }
214 | return nil
215 | }
216 |
217 | func getDOMValue(s *goquery.Selection, attr string) string {
218 | if attr == "" {
219 | return strings.TrimSpace(s.First().Text())
220 | }
221 | attrV, _ := s.Attr(attr)
222 | return attrV
223 | }
224 |
--------------------------------------------------------------------------------
/unmarshal_test.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "bytes"
19 | "testing"
20 |
21 | "github.com/PuerkitoBio/goquery"
22 | )
23 |
// HTML fixtures shared by the unmarshal tests. The markup provides exactly
// what the tests below select: list items (the first with class "x", the
// last with text "3", one containing a <span> with text "item"), three
// nested <div>/<p> levels with texts "a", "b", "c", and two "li.info" items
// whose <span> holds "Info 1" / "Info 2".
var basicTestData = []byte(`<ul><li class="x">list <span>item</span> 1</li><li>list item 2</li><li>3</li></ul>`)
var nestedTestData = []byte(`<div><p>a</p><div><p>b</p><div><p>c</p></div></div></div>`)
var pointerSliceTestData = []byte(`<ul class="object"><li class="info">Information: <span>Info 1</span></li><li class="info">Information: <span>Info 2</span></li></ul>`)
27 |
28 | func TestBasicUnmarshal(t *testing.T) {
29 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(basicTestData))
30 | e := &HTMLElement{
31 | DOM: doc.First(),
32 | }
33 | s := struct {
34 | String string `selector:"li:first-child" attr:"class"`
35 | Items []string `selector:"li"`
36 | Struct struct {
37 | String string `selector:"li:last-child"`
38 | }
39 | }{}
40 | if err := e.Unmarshal(&s); err != nil {
41 | t.Error("Cannot unmarshal struct: " + err.Error())
42 | }
43 | if s.String != "x" {
44 | t.Errorf(`Invalid data for String: %q, expected "x"`, s.String)
45 | }
46 | if s.Struct.String != "3" {
47 | t.Errorf(`Invalid data for Struct.String: %q, expected "3"`, s.Struct.String)
48 | }
49 | }
50 |
51 | func TestNestedUnmarshalMap(t *testing.T) {
52 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(nestedTestData))
53 | e := &HTMLElement{
54 | DOM: doc.First(),
55 | }
56 | doc2, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(basicTestData))
57 | e2 := &HTMLElement{
58 | DOM: doc2.First(),
59 | }
60 | type nested struct {
61 | String string
62 | }
63 | mapSelector := make(map[string]string)
64 | mapSelector["String"] = "div > p"
65 |
66 | mapSelector2 := make(map[string]string)
67 | mapSelector2["String"] = "span"
68 |
69 | s := nested{}
70 | s2 := nested{}
71 | if err := e.UnmarshalWithMap(&s, mapSelector); err != nil {
72 | t.Error("Cannot unmarshal struct: " + err.Error())
73 | }
74 | if err := e2.UnmarshalWithMap(&s2, mapSelector2); err != nil {
75 | t.Error("Cannot unmarshal struct: " + err.Error())
76 | }
77 | if s.String != "a" {
78 | t.Errorf(`Invalid data for String: %q, expected "a"`, s.String)
79 | }
80 | if s2.String != "item" {
81 | t.Errorf(`Invalid data for String: %q, expected "a"`, s.String)
82 | }
83 | }
84 |
85 | func TestNestedUnmarshal(t *testing.T) {
86 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(nestedTestData))
87 | e := &HTMLElement{
88 | DOM: doc.First(),
89 | }
90 | type nested struct {
91 | String string `selector:"div > p"`
92 | Struct *nested `selector:"div > div"`
93 | }
94 | s := nested{}
95 | if err := e.Unmarshal(&s); err != nil {
96 | t.Error("Cannot unmarshal struct: " + err.Error())
97 | }
98 | if s.String != "a" {
99 | t.Errorf(`Invalid data for String: %q, expected "a"`, s.String)
100 | }
101 | if s.Struct.String != "b" {
102 | t.Errorf(`Invalid data for Struct.String: %q, expected "b"`, s.Struct.String)
103 | }
104 | if s.Struct.Struct.String != "c" {
105 | t.Errorf(`Invalid data for Struct.Struct.String: %q, expected "c"`, s.Struct.Struct.String)
106 | }
107 | }
108 |
109 | func TestPointerSliceUnmarshall(t *testing.T) {
110 | type info struct {
111 | Text string `selector:"span"`
112 | }
113 | type object struct {
114 | Info []*info `selector:"li.info"`
115 | }
116 |
117 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(pointerSliceTestData))
118 | e := HTMLElement{DOM: doc.First()}
119 | o := object{}
120 | err := e.Unmarshal(&o)
121 | if err != nil {
122 | t.Fatalf("Failed to unmarshal page: %s\n", err.Error())
123 | }
124 |
125 | if len(o.Info) != 2 {
126 | t.Errorf("Invalid length for Info: %d, expected 2", len(o.Info))
127 | }
128 | if o.Info[0].Text != "Info 1" {
129 | t.Errorf("Invalid data for Info.[0].Text: %s, expected Info 1", o.Info[0].Text)
130 | }
131 | if o.Info[1].Text != "Info 2" {
132 | t.Errorf("Invalid data for Info.[1].Text: %s, expected Info 2", o.Info[1].Text)
133 | }
134 |
135 | }
136 |
137 | func TestStructSliceUnmarshall(t *testing.T) {
138 | type info struct {
139 | Text string `selector:"span"`
140 | }
141 | type object struct {
142 | Info []info `selector:"li.info"`
143 | }
144 |
145 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(pointerSliceTestData))
146 | e := HTMLElement{DOM: doc.First()}
147 | o := object{}
148 | err := e.Unmarshal(&o)
149 | if err != nil {
150 | t.Fatalf("Failed to unmarshal page: %s\n", err.Error())
151 | }
152 |
153 | if len(o.Info) != 2 {
154 | t.Errorf("Invalid length for Info: %d, expected 2", len(o.Info))
155 | }
156 | if o.Info[0].Text != "Info 1" {
157 | t.Errorf("Invalid data for Info.[0].Text: %s, expected Info 1", o.Info[0].Text)
158 | }
159 | if o.Info[1].Text != "Info 2" {
160 | t.Errorf("Invalid data for Info.[1].Text: %s, expected Info 2", o.Info[1].Text)
161 | }
162 |
163 | }
164 |
--------------------------------------------------------------------------------
/xmlelement.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly
16 |
17 | import (
18 | "strings"
19 |
20 | "github.com/antchfx/htmlquery"
21 | "github.com/antchfx/xmlquery"
22 | "golang.org/x/net/html"
23 | )
24 |
// XMLElement is the representation of a XML tag.
type XMLElement struct {
	// Name is the name of the tag
	Name string
	// Text is the inner text of the node, captured when the element is
	// created.
	Text string
	// attributes holds the node's attribute list: []html.Attribute when
	// isHTML is true, []xmlquery.Attr otherwise.
	attributes interface{}
	// Request is the request object of the element's HTML document
	Request *Request
	// Response is the Response object of the element's HTML document
	Response *Response
	// DOM is the DOM object of the page. DOM is relative
	// to the current XMLElement and is either a html.Node or xmlquery.Node
	// based on how the XMLElement was created.
	DOM interface{}
	// isHTML records which constructor built the element and therefore
	// which concrete types DOM and attributes hold.
	isHTML bool
}
41 |
42 | // NewXMLElementFromHTMLNode creates a XMLElement from a html.Node.
43 | func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement {
44 | return &XMLElement{
45 | Name: s.Data,
46 | Request: resp.Request,
47 | Response: resp,
48 | Text: htmlquery.InnerText(s),
49 | DOM: s,
50 | attributes: s.Attr,
51 | isHTML: true,
52 | }
53 | }
54 |
55 | // NewXMLElementFromXMLNode creates a XMLElement from a xmlquery.Node.
56 | func NewXMLElementFromXMLNode(resp *Response, s *xmlquery.Node) *XMLElement {
57 | return &XMLElement{
58 | Name: s.Data,
59 | Request: resp.Request,
60 | Response: resp,
61 | Text: s.InnerText(),
62 | DOM: s,
63 | attributes: s.Attr,
64 | isHTML: false,
65 | }
66 | }
67 |
68 | // Attr returns the selected attribute of a HTMLElement or empty string
69 | // if no attribute found
70 | func (h *XMLElement) Attr(k string) string {
71 | if h.isHTML {
72 | for _, a := range h.attributes.([]html.Attribute) {
73 | if a.Key == k {
74 | return a.Val
75 | }
76 | }
77 | } else {
78 | for _, a := range h.attributes.([]xmlquery.Attr) {
79 | if a.Name.Local == k {
80 | return a.Value
81 | }
82 | }
83 | }
84 | return ""
85 | }
86 |
87 | // ChildText returns the concatenated and stripped text content of the matching
88 | // elements.
89 | func (h *XMLElement) ChildText(xpathQuery string) string {
90 | if h.isHTML {
91 | child := htmlquery.FindOne(h.DOM.(*html.Node), xpathQuery)
92 | if child == nil {
93 | return ""
94 | }
95 | return strings.TrimSpace(htmlquery.InnerText(child))
96 | }
97 | child := xmlquery.FindOne(h.DOM.(*xmlquery.Node), xpathQuery)
98 | if child == nil {
99 | return ""
100 | }
101 | return strings.TrimSpace(child.InnerText())
102 |
103 | }
104 |
105 | // ChildAttr returns the stripped text content of the first matching
106 | // element's attribute.
107 | func (h *XMLElement) ChildAttr(xpathQuery, attrName string) string {
108 | if h.isHTML {
109 | child := htmlquery.FindOne(h.DOM.(*html.Node), xpathQuery)
110 | if child != nil {
111 | for _, attr := range child.Attr {
112 | if attr.Key == attrName {
113 | return strings.TrimSpace(attr.Val)
114 | }
115 | }
116 | }
117 | } else {
118 | child := xmlquery.FindOne(h.DOM.(*xmlquery.Node), xpathQuery)
119 | if child != nil {
120 | for _, attr := range child.Attr {
121 | if attr.Name.Local == attrName {
122 | return strings.TrimSpace(attr.Value)
123 | }
124 | }
125 | }
126 | }
127 |
128 | return ""
129 | }
130 |
131 | // ChildAttrs returns the stripped text content of all the matching
132 | // element's attributes.
133 | func (h *XMLElement) ChildAttrs(xpathQuery, attrName string) []string {
134 | var res []string
135 | if h.isHTML {
136 | for _, child := range htmlquery.Find(h.DOM.(*html.Node), xpathQuery) {
137 | for _, attr := range child.Attr {
138 | if attr.Key == attrName {
139 | res = append(res, strings.TrimSpace(attr.Val))
140 | }
141 | }
142 | }
143 | } else {
144 | xmlquery.FindEach(h.DOM.(*xmlquery.Node), xpathQuery, func(i int, child *xmlquery.Node) {
145 | for _, attr := range child.Attr {
146 | if attr.Name.Local == attrName {
147 | res = append(res, strings.TrimSpace(attr.Value))
148 | }
149 | }
150 | })
151 | }
152 | return res
153 | }
154 |
155 | // ChildTexts returns an array of strings corresponding to child elements that match the xpath query.
156 | // Each item in the array is the stripped text content of the corresponding matching child element.
157 | func (h *XMLElement) ChildTexts(xpathQuery string) []string {
158 | texts := make([]string, 0)
159 | if h.isHTML {
160 | for _, child := range htmlquery.Find(h.DOM.(*html.Node), xpathQuery) {
161 | texts = append(texts, strings.TrimSpace(htmlquery.InnerText(child)))
162 | }
163 | } else {
164 | xmlquery.FindEach(h.DOM.(*xmlquery.Node), xpathQuery, func(i int, child *xmlquery.Node) {
165 | texts = append(texts, strings.TrimSpace(child.InnerText()))
166 | })
167 | }
168 | return texts
169 | }
170 |
--------------------------------------------------------------------------------
/xmlelement_test.go:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package colly_test
16 |
17 | import (
18 | "github.com/antchfx/htmlquery"
19 | "github.com/gocolly/colly/v2"
20 | "reflect"
21 | "strings"
22 | "testing"
23 | )
24 |
// Borrowed from http://infohost.nmt.edu/tcc/help/pubs/xhtml/example.html
// Added attributes to the `<li>` tags for testing purposes
const htmlPage = `
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
    <title>Your page title here</title>
  </head>
  <body>
    <h1>Your major heading here</h1>
    <p>
      This is a regular text paragraph.
    </p>
    <ul>
      <li class="list-item-1">
        First bullet of a bullet list.
      </li>
      <li class="list-item-2">
        This is the second bullet.
      </li>
    </ul>
  </body>
</html>
`
50 |
51 | func TestAttr(t *testing.T) {
52 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)}
53 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage))
54 | xmlNode := htmlquery.FindOne(doc, "/html")
55 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode)
56 |
57 | if xmlElem.Attr("xmlns") != "http://www.w3.org/1999/xhtml" {
58 | t.Fatalf("failed xmlns attribute test: %v != http://www.w3.org/1999/xhtml", xmlElem.Attr("xmlns"))
59 | }
60 |
61 | if xmlElem.Attr("xml:lang") != "en" {
62 | t.Fatalf("failed lang attribute test: %v != en", xmlElem.Attr("lang"))
63 | }
64 | }
65 |
66 | func TestChildText(t *testing.T) {
67 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)}
68 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage))
69 | xmlNode := htmlquery.FindOne(doc, "/html")
70 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode)
71 |
72 | if text := xmlElem.ChildText("//p"); text != "This is a regular text paragraph." {
73 | t.Fatalf("failed child tag test: %v != This is a regular text paragraph.", text)
74 | }
75 | if text := xmlElem.ChildText("//dl"); text != "" {
76 | t.Fatalf("failed child tag test: %v != \"\"", text)
77 | }
78 | }
79 |
80 | func TestChildTexts(t *testing.T) {
81 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)}
82 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage))
83 | xmlNode := htmlquery.FindOne(doc, "/html")
84 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode)
85 | expected := []string{"First bullet of a bullet list.", "This is the second bullet."}
86 | if texts := xmlElem.ChildTexts("//li"); reflect.DeepEqual(texts, expected) == false {
87 | t.Fatalf("failed child tags test: %v != %v", texts, expected)
88 | }
89 | if texts := xmlElem.ChildTexts("//dl"); reflect.DeepEqual(texts, make([]string, 0)) == false {
90 | t.Fatalf("failed child tag test: %v != \"\"", texts)
91 | }
92 | }
93 | func TestChildAttr(t *testing.T) {
94 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)}
95 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage))
96 | xmlNode := htmlquery.FindOne(doc, "/html")
97 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode)
98 |
99 | if attr := xmlElem.ChildAttr("/body/ul/li[1]", "class"); attr != "list-item-1" {
100 | t.Fatalf("failed child attribute test: %v != list-item-1", attr)
101 | }
102 | if attr := xmlElem.ChildAttr("/body/ul/li[2]", "class"); attr != "list-item-2" {
103 | t.Fatalf("failed child attribute test: %v != list-item-2", attr)
104 | }
105 | }
106 |
107 | func TestChildAttrs(t *testing.T) {
108 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)}
109 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage))
110 | xmlNode := htmlquery.FindOne(doc, "/html")
111 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode)
112 |
113 | attrs := xmlElem.ChildAttrs("/body/ul/li", "class")
114 | if len(attrs) != 2 {
115 | t.Fatalf("failed child attributes length test: %d != 2", len(attrs))
116 | }
117 |
118 | for _, attr := range attrs {
119 | if !(attr == "list-item-1" || attr == "list-item-2") {
120 | t.Fatalf("failed child attributes values test: %s != list-item-(1 or 2)", attr)
121 | }
122 | }
123 | }
124 |
--------------------------------------------------------------------------------