├── .codecov.yml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md └── workflows │ └── ci.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── VERSION ├── _examples ├── README.md ├── basic │ └── basic.go ├── coursera_courses │ └── coursera_courses.go ├── cryptocoinmarketcap │ └── cryptocoinmarketcap.go ├── error_handling │ └── error_handling.go ├── factba.se │ └── factbase.go ├── google_groups │ └── google_groups.go ├── hackernews_comments │ └── hackernews_comments.go ├── instagram │ └── instagram.go ├── local_files │ ├── html │ │ ├── child_page │ │ │ ├── one.html │ │ │ ├── three.html │ │ │ └── two.html │ │ └── index.html │ └── local_files.go ├── login │ └── login.go ├── max_depth │ └── max_depth.go ├── multipart │ ├── asciimoo.jpg │ └── multipart.go ├── openedx_courses │ └── openedx_courses.go ├── parallel │ └── parallel.go ├── proxy_switcher │ └── proxy_switcher.go ├── queue │ └── queue.go ├── random_delay │ └── random_delay.go ├── rate_limit │ └── rate_limit.go ├── reddit │ └── reddit.go ├── request_context │ └── request_context.go ├── scraper_server │ └── scraper_server.go ├── shopify_sitemap │ └── shopify_sitemap.go ├── url_filter │ └── url_filter.go └── xkcd_store │ └── xkcd_store.go ├── assets └── scrapfly.png ├── cmd └── colly │ └── colly.go ├── colly.go ├── colly_test.go ├── context.go ├── context_test.go ├── debug ├── debug.go ├── logdebugger.go └── webdebugger.go ├── extensions ├── extensions.go ├── random_user_agent.go ├── referer.go └── url_length_filter.go ├── go.mod ├── go.sum ├── htmlelement.go ├── http_backend.go ├── http_trace.go ├── http_trace_test.go ├── proxy └── proxy.go ├── queue ├── queue.go └── queue_test.go ├── request.go ├── response.go ├── storage └── storage.go ├── unmarshal.go ├── unmarshal_test.go ├── xmlelement.go └── xmlelement_test.go /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Question 4 | url: https://stackoverflow.com/ 5 | about: Questions should go to Stack Overflow. You can use go-colly tag. 
6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 14 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | name: Test ${{matrix.go}} 12 | runs-on: [ubuntu-latest] 13 | strategy: 14 | fail-fast: false 15 | max-parallel: 4 16 | matrix: 17 | go: [ 18 | "1.24", 19 | "1.23", 20 | "1.22", 21 | "1.21", 22 | ] 23 | 24 | steps: 25 | - name: Checkout branch 26 | uses: actions/checkout@v2 27 | 28 | - name: Setup go 29 | uses: actions/setup-go@v2 30 | with: 31 | go-version: ${{matrix.go}} 32 | 33 | - name: Test 34 | run: | 35 | go install golang.org/x/lint/golint@latest 36 | OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1) 37 | OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1) 38 | golint -set_exit_status 39 | go vet -v ./... 40 | go test -race -v -coverprofile=coverage.txt -covermode=atomic ./... 41 | 42 | build: 43 | name: Build ${{matrix.go}} 44 | runs-on: [ubuntu-latest] 45 | strategy: 46 | fail-fast: false 47 | max-parallel: 4 48 | matrix: 49 | go: [ 50 | "1.24", 51 | "1.23", 52 | "1.22", 53 | "1.21", 54 | ] 55 | 56 | steps: 57 | - name: Checkout branch 58 | uses: actions/checkout@v2 59 | 60 | - name: Setup go 61 | uses: actions/setup-go@v2 62 | with: 63 | go-version: ${{matrix.go}} 64 | 65 | - name: Build 66 | run: | 67 | go install golang.org/x/lint/golint@latest 68 | OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1) 69 | OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1) 70 | golint -set_exit_status 71 | go build 72 | 73 | codecov: 74 | name: Codecov 75 | runs-on: [ubuntu-latest] 76 | needs: 77 | - test 78 | - build 79 | steps: 80 | - name: Run Codecov 81 | run: bash <(curl -s https://codecov.io/bash) 82 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2.1.0 - 2020.06.09 2 | 3 | - HTTP tracing support 4 | - New callback: OnResponseHeader 5 | - Queue fixes 6 | - New collector option: Collector.CheckHead 7 | - Proxy fixes 8 | - Fixed POST revisit checking 9 | - Updated dependencies 10 | 11 | # 2.0.0 - 2019.11.28 12 | 13 | - Breaking change: Change Collector.RedirectHandler member to Collector.SetRedirectHandler function 14 | - Go module support 15 | - Collector.HasVisited method added to be able to check if an url has been visited 16 | - Collector.SetClient method introduced 17 | - HTMLElement.ChildTexts method added 18 | - New user agents 19 | - Multiple bugfixes 20 | 21 | # 1.2.0 - 2019.02.13 22 | 23 | - Compatibility with the latest htmlquery package 24 | - New request shortcut for HEAD requests 25 | - Check URL availibility before visiting 26 | - Fix proxy URL value 27 | - Request counter fix 28 | - Minor fixes in examples 29 | 30 | # 1.1.0 - 2018.08.13 31 | 32 | - Appengine integration takes context.Context instead of http.Request (API change) 33 | - Added "Accept" http header by default to every request 34 | - Support slices of pointers in 
unmarshal 35 | - Fixed a race condition in queues 36 | - ForEachWithBreak method added to HTMLElement 37 | - Added a local file example 38 | - Support gzip decompression of response bodies 39 | - Don't share waitgroup when cloning a collector 40 | - Fixed instagram example 41 | 42 | 43 | # 1.0.0 - 2018.05.13 44 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute 2 | 3 | ## Introduction 4 | 5 | First, thank you for considering contributing to colly! It's people like you that make the open source community such a great community! 😊 6 | 7 | We welcome any type of contribution, not only code. You can help with 8 | - **QA**: file bug reports, the more details you can give the better (e.g. screenshots with the console open) 9 | - **Marketing**: writing blog posts, howto's, printing stickers, ... 10 | - **Community**: presenting the project at meetups, organizing a dedicated meetup for the local community, ... 11 | - **Code**: take a look at the [open issues](https://github.com/gocolly/colly/issues). Even if you can't write code, commenting on them, showing that you care about a given issue matters. It helps us triage them. 12 | - **Money**: we welcome financial contributions in full transparency on our [open collective](https://opencollective.com/colly). 13 | 14 | ## Your First Contribution 15 | 16 | Working on your first Pull Request? You can learn how from this *free* series, [How to Contribute to an Open Source Project on GitHub](https://app.egghead.io/playlists/how-to-contribute-to-an-open-source-project-on-github). 17 | 18 | ## Submitting code 19 | 20 | Any code change should be submitted as a pull request. The description should explain what the code does and give steps to execute it. The pull request should also contain tests. 21 | 22 | ## Code review process 23 | 24 | The bigger the pull request, the longer it will take to review and merge. Try to break down large pull requests in smaller chunks that are easier to review and merge. 25 | It is also always helpful to have some context for your pull request. What was the purpose? Why does it matter to you? 26 | 27 | ## Financial contributions 28 | 29 | We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/colly). 30 | Anyone can file an expense. If the expense makes sense for the development of the community, it will be "merged" in the ledger of our open collective by the core contributors and the person who filed the expense will be reimbursed. 31 | 32 | ## Questions 33 | 34 | If you have any questions, create an [issue](https://github.com/gocolly/colly/issues/new) (protip: do a quick search first to see if someone else didn't ask the same question before!). 35 | You can also reach us at hello@colly.opencollective.com. 36 | 37 | ## Credits 38 | 39 | ### Contributors 40 | 41 | Thank you to all the people who have already contributed to colly! 42 | 43 | 44 | 45 | ### Backers 46 | 47 | Thank you to all our backers! [[Become a backer](https://opencollective.com/colly#backer)] 48 | 49 | 50 | 51 | 52 | ### Sponsors 53 | 54 | Thank you to all our sponsors! 
(please ask your company to also support this open source project by [becoming a sponsor](https://opencollective.com/colly#sponsor)) 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Colly 2 | 3 | Lightning Fast and Elegant Scraping Framework for Gophers 4 | 5 | Colly provides a clean interface to write any kind of crawler/scraper/spider. 6 | 7 | With Colly you can easily extract structured data from websites, which can be used for a wide range of applications, like data mining, data processing or archiving. 
8 | 9 | [![GoDoc](https://godoc.org/github.com/gocolly/colly?status.svg)](https://pkg.go.dev/github.com/gocolly/colly/v2) 10 | [![Backers on Open Collective](https://opencollective.com/colly/backers/badge.svg)](#backers) [![Sponsors on Open Collective](https://opencollective.com/colly/sponsors/badge.svg)](#sponsors) [![build status](https://github.com/gocolly/colly/actions/workflows/ci.yml/badge.svg)](https://github.com/gocolly/colly/actions/workflows/ci.yml) 11 | [![report card](https://img.shields.io/badge/report%20card-a%2B-ff3333.svg?style=flat-square)](http://goreportcard.com/report/gocolly/colly) 12 | [![view examples](https://img.shields.io/badge/learn%20by-examples-0077b3.svg?style=flat-square)](https://github.com/gocolly/colly/tree/master/_examples) 13 | [![Code Coverage](https://img.shields.io/codecov/c/github/gocolly/colly/master.svg)](https://codecov.io/github/gocolly/colly?branch=master) 14 | [![FOSSA Status](https://app.fossa.io/api/projects/git%2Bgithub.com%2Fgocolly%2Fcolly.svg?type=shield)](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_shield) 15 | [![Twitter URL](https://img.shields.io/badge/twitter-follow-green.svg)](https://twitter.com/gocolly) 16 | 17 | 18 | ------ 19 | 20 | 21 | 22 | ## Features 23 | 24 | - Clean API 25 | - Fast (>1k request/sec on a single core) 26 | - Manages request delays and maximum concurrency per domain 27 | - Automatic cookie and session handling 28 | - Sync/async/parallel scraping 29 | - Caching 30 | - Automatic encoding of non-unicode responses 31 | - Robots.txt support 32 | - Distributed scraping 33 | - Configuration via environment variables 34 | - Extensions 35 | 36 | ## Example 37 | 38 | ```go 39 | func main() { 40 | c := colly.NewCollector() 41 | 42 | // Find and visit all links 43 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 44 | e.Request.Visit(e.Attr("href")) 45 | }) 46 | 47 | c.OnRequest(func(r *colly.Request) { 48 | fmt.Println("Visiting", r.URL) 49 | }) 50 | 51 | c.Visit("http://go-colly.org/") 52 | } 53 | ``` 54 | 55 | See [examples folder](https://github.com/gocolly/colly/tree/master/_examples) for more detailed examples. 56 | 57 | ## Installation 58 | 59 | Add colly to your `go.mod` file: 60 | 61 | ``` 62 | module github.com/x/y 63 | 64 | go 1.14 65 | 66 | require ( 67 | github.com/gocolly/colly/v2 latest 68 | ) 69 | ``` 70 | 71 | ## Bugs 72 | 73 | Bugs or suggestions? Visit the [issue tracker](https://github.com/gocolly/colly/issues) or join `#colly` on freenode 74 | 75 | ## Other Projects Using Colly 76 | 77 | Below is a list of public, open source projects that use Colly: 78 | 79 | - [greenpeace/check-my-pages](https://github.com/greenpeace/check-my-pages) Scraping script to test the Spanish Greenpeace web archive. 80 | - [altsab/gowap](https://github.com/altsab/gowap) Wappalyzer implementation in Go. 81 | - [jesuiscamille/goquotes](https://github.com/jesuiscamille/goquotes) A quotes scraper, making your day a little better! 82 | - [jivesearch/jivesearch](https://github.com/jivesearch/jivesearch) A search engine that doesn't track you. 83 | - [Leagify/colly-draft-prospects](https://github.com/Leagify/colly-draft-prospects) A scraper for future NFL Draft prospects. 84 | - [lucasepe/go-ps4](https://github.com/lucasepe/go-ps4) Search playstation store for your favorite PS4 games using the command line. 85 | - [yringler/inside-chassidus-scraper](https://github.com/yringler/inside-chassidus-scraper) Scrapes Rabbi Paltiel's web site for lesson metadata. 
86 | - [gamedb/gamedb](https://github.com/gamedb/gamedb) A database of Steam games. 87 | - [lawzava/scrape](https://github.com/lawzava/scrape) CLI for email scraping from any website. 88 | - [eureka101v/WeiboSpiderGo](https://github.com/eureka101v/WeiboSpiderGo) A sina weibo(chinese twitter) scraper 89 | - [Go-phie/gophie](https://github.com/Go-phie/gophie) Search, Download and Stream movies from your terminal 90 | - [imthaghost/goclone](https://github.com/imthaghost/goclone) Clone websites to your computer within seconds. 91 | - [superiss/spidy](https://github.com/superiss/spidy) Crawl the web and collect expired domains. 92 | - [docker-slim/docker-slim](https://github.com/docker-slim/docker-slim) Optimize your Docker containers to make them smaller and better. 93 | - [seversky/gachifinder](https://github.com/seversky/gachifinder) an agent for asynchronous scraping, parsing and writing to some storages(elasticsearch for now) 94 | - [eval-exec/goodreads](https://github.com/eval-exec/goodreads) crawl all tags and all pages of quotes from goodreads. 95 | 96 | If you are using Colly in a project please send a pull request to add it to the list. 97 | 98 | ## Contributors 99 | 100 | This project exists thanks to all the people who contribute. [[Contribute]](CONTRIBUTING.md). 101 | 102 | 103 | ## Backers 104 | 105 | Thank you to all our backers! 🙏 [[Become a backer](https://opencollective.com/colly#backer)] 106 | 107 | 108 | 109 | ## Sponsors 110 | 111 | Support this project by becoming a sponsor. Your logo will show up here with a link to your website. [[Become a sponsor](https://opencollective.com/colly#sponsor)] 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | ## License 125 | 126 | [![FOSSA Status](https://app.fossa.io/api/projects/git%2Bgithub.com%2Fgocolly%2Fcolly.svg?type=large)](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_large) 127 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.1.0 2 | -------------------------------------------------------------------------------- /_examples/README.md: -------------------------------------------------------------------------------- 1 | # Colly examples 2 | 3 | This folder provides easy to understand code snippets on how to get started with colly. 
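Most of the examples follow the same basic shape: create a collector, register callbacks for the events you care about, then start it with `Visit`. A minimal sketch of that pattern (the target URL is only a placeholder):

```go
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	// Create a collector with default settings
	c := colly.NewCollector()

	// Called for every element matching the selector
	c.OnHTML("title", func(e *colly.HTMLElement) {
		fmt.Println("Page title:", e.Text)
	})

	// Called after every response has been received
	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Status:", r.StatusCode)
	})

	// Start scraping; the callbacks fire as the page is fetched
	c.Visit("https://example.com/")
}
```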
4 | 5 | To execute an example run `go run [example/example.go]` 6 | 7 | 8 | ## Demo 9 | 10 | ``` 11 | $ go run rate_limit/rate_limit.go 12 | [000001] 1 [ 1 - request] map["url":"https://httpbin.org/delay/2?n=4"] (60.872µs) 13 | [000002] 1 [ 2 - request] map["url":"https://httpbin.org/delay/2?n=2"] (154.425µs) 14 | [000003] 1 [ 3 - request] map["url":"https://httpbin.org/delay/2?n=0"] (158.374µs) 15 | [000004] 1 [ 5 - request] map["url":"https://httpbin.org/delay/2?n=3"] (426.999µs) 16 | [000005] 1 [ 4 - request] map["url":"https://httpbin.org/delay/2?n=1"] (448.75µs) 17 | [000007] 1 [ 2 - response] map["url":"https://httpbin.org/delay/2?n=2" "status":"OK"] (2.855764394s) 18 | [000008] 1 [ 2 - scraped] map["url":"https://httpbin.org/delay/2?n=2"] (2.855797868s) 19 | [000006] 1 [ 1 - response] map["url":"https://httpbin.org/delay/2?n=4" "status":"OK"] (2.855756753s) 20 | [000009] 1 [ 1 - scraped] map["url":"https://httpbin.org/delay/2?n=4"] (2.855819581s) 21 | [000010] 1 [ 3 - response] map["status":"OK" "url":"https://httpbin.org/delay/2?n=0"] (5.002065299s) 22 | [000011] 1 [ 3 - scraped] map["url":"https://httpbin.org/delay/2?n=0"] (5.002103755s) 23 | [000012] 1 [ 5 - response] map["status":"OK" "url":"https://httpbin.org/delay/2?n=3"] (5.012080614s) 24 | [000013] 1 [ 5 - scraped] map["url":"https://httpbin.org/delay/2?n=3"] (5.012101056s) 25 | [000014] 1 [ 4 - response] map["url":"https://httpbin.org/delay/2?n=1" "status":"OK"] (7.155725591s) 26 | [000015] 1 [ 4 - scraped] map["url":"https://httpbin.org/delay/2?n=1"] (7.155759136s) 27 | 28 | ``` 29 | -------------------------------------------------------------------------------- /_examples/basic/basic.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | ) 8 | 9 | func main() { 10 | // Instantiate default collector 11 | c := colly.NewCollector( 12 | // Visit only domains: hackerspaces.org, wiki.hackerspaces.org 13 | colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org"), 14 | ) 15 | 16 | // On every a element which has href attribute call callback 17 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 18 | link := e.Attr("href") 19 | // Print link 20 | fmt.Printf("Link found: %q -> %s\n", e.Text, link) 21 | // Visit link found on page 22 | // Only those links are visited which are in AllowedDomains 23 | c.Visit(e.Request.AbsoluteURL(link)) 24 | }) 25 | 26 | // Before making a request print "Visiting ..." 
27 | c.OnRequest(func(r *colly.Request) { 28 | fmt.Println("Visiting", r.URL.String()) 29 | }) 30 | 31 | // Start scraping on https://hackerspaces.org 32 | c.Visit("https://hackerspaces.org/") 33 | } 34 | -------------------------------------------------------------------------------- /_examples/coursera_courses/coursera_courses.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "log" 6 | "os" 7 | "strings" 8 | "time" 9 | 10 | "github.com/gocolly/colly/v2" 11 | ) 12 | 13 | // Course stores information about a coursera course 14 | type Course struct { 15 | Title string 16 | Description string 17 | Creator string 18 | Level string 19 | URL string 20 | Language string 21 | Commitment string 22 | Rating string 23 | } 24 | 25 | func main() { 26 | fName := "courses.json" 27 | file, err := os.Create(fName) 28 | if err != nil { 29 | log.Fatalf("Cannot create file %q: %s\n", fName, err) 30 | return 31 | } 32 | defer file.Close() 33 | 34 | // Instantiate default collector 35 | c := colly.NewCollector( 36 | // Visit only domains: coursera.org, www.coursera.org 37 | colly.AllowedDomains("coursera.org", "www.coursera.org"), 38 | 39 | // Cache responses to prevent multiple download of pages 40 | // even if the collector is restarted 41 | colly.CacheDir("./coursera_cache"), 42 | // Cached responses older than the specified duration will be refreshed 43 | colly.CacheExpiration(24*time.Hour), 44 | ) 45 | 46 | // Create another collector to scrape course details 47 | detailCollector := c.Clone() 48 | 49 | courses := make([]Course, 0, 200) 50 | 51 | // On every element which has "href" attribute call callback 52 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 53 | // If attribute class is this long string return from callback 54 | // As this a is irrelevant 55 | if e.Attr("class") == "Button_1qxkboh-o_O-primary_cv02ee-o_O-md_28awn8-o_O-primaryLink_109aggg" { 56 | return 57 | } 58 | link := e.Attr("href") 59 | // If link start with browse or includes either signup or login return from callback 60 | if !strings.HasPrefix(link, "/browse") || strings.Index(link, "=signup") > -1 || strings.Index(link, "=login") > -1 { 61 | return 62 | } 63 | // start scaping the page under the link found 64 | e.Request.Visit(link) 65 | }) 66 | 67 | // Before making a request print "Visiting ..." 
68 | c.OnRequest(func(r *colly.Request) { 69 | log.Println("visiting", r.URL.String()) 70 | }) 71 | 72 | // On every element with collection-product-card class call callback 73 | c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) { 74 | // Activate detailCollector if the link contains "coursera.org/learn" 75 | courseURL := e.Request.AbsoluteURL(e.Attr("href")) 76 | if strings.Index(courseURL, "coursera.org/learn") != -1 { 77 | detailCollector.Visit(courseURL) 78 | } 79 | }) 80 | 81 | // Extract details of the course 82 | detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) { 83 | log.Println("Course found", e.Request.URL) 84 | title := e.ChildText(".banner-title") 85 | if title == "" { 86 | log.Println("No title found", e.Request.URL) 87 | } 88 | course := Course{ 89 | Title: title, 90 | URL: e.Request.URL.String(), 91 | Description: e.ChildText("div.content"), 92 | Creator: e.ChildText("li.banner-instructor-info > a > div > div > span"), 93 | Rating: e.ChildText("span.number-rating"), 94 | } 95 | // Iterate over div components and add details to course 96 | e.ForEach(".AboutCourse .ProductGlance > div", func(_ int, el *colly.HTMLElement) { 97 | svgTitle := strings.Split(el.ChildText("div:nth-child(1) svg title"), " ") 98 | lastWord := svgTitle[len(svgTitle)-1] 99 | switch lastWord { 100 | // svg Title: Available Languages 101 | case "languages": 102 | course.Language = el.ChildText("div:nth-child(2) > div:nth-child(1)") 103 | // svg Title: Mixed/Beginner/Intermediate/Advanced Level 104 | case "Level": 105 | course.Level = el.ChildText("div:nth-child(2) > div:nth-child(1)") 106 | // svg Title: Hours to complete 107 | case "complete": 108 | course.Commitment = el.ChildText("div:nth-child(2) > div:nth-child(1)") 109 | } 110 | }) 111 | courses = append(courses, course) 112 | }) 113 | 114 | // Start scraping on http://coursera.com/browse 115 | c.Visit("https://coursera.org/browse") 116 | 117 | enc := json.NewEncoder(file) 118 | enc.SetIndent("", " ") 119 | 120 | // Dump json to the standard output 121 | enc.Encode(courses) 122 | } 123 | -------------------------------------------------------------------------------- /_examples/cryptocoinmarketcap/cryptocoinmarketcap.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "log" 6 | "os" 7 | 8 | "github.com/gocolly/colly/v2" 9 | ) 10 | 11 | func main() { 12 | fName := "cryptocoinmarketcap.csv" 13 | file, err := os.Create(fName) 14 | if err != nil { 15 | log.Fatalf("Cannot create file %q: %s\n", fName, err) 16 | return 17 | } 18 | defer file.Close() 19 | writer := csv.NewWriter(file) 20 | defer writer.Flush() 21 | 22 | // Write CSV header 23 | writer.Write([]string{"Name", "Symbol", "Market Cap (USD)", "Price (USD)", "Circulating Supply (USD)", "Volume (24h)", "Change (1h)", "Change (24h)", "Change (7d)"}) 24 | 25 | // Instantiate default collector 26 | c := colly.NewCollector() 27 | 28 | c.OnHTML("tbody tr", func(e *colly.HTMLElement) { 29 | writer.Write([]string{ 30 | e.ChildText(".cmc-table__column-name"), 31 | e.ChildText(".cmc-table__cell--sort-by__symbol"), 32 | e.ChildText(".cmc-table__cell--sort-by__market-cap"), 33 | e.ChildText(".cmc-table__cell--sort-by__price"), 34 | e.ChildText(".cmc-table__cell--sort-by__circulating-supply"), 35 | e.ChildText(".cmc-table__cell--sort-by__volume-24-h"), 36 | e.ChildText(".cmc-table__cell--sort-by__percent-change-1-h"), 37 | 
e.ChildText(".cmc-table__cell--sort-by__percent-change-24-h"), 38 | e.ChildText(".cmc-table__cell--sort-by__percent-change-7-d"), 39 | }) 40 | }) 41 | 42 | c.Visit("https://coinmarketcap.com/all/views/all/") 43 | 44 | log.Printf("Scraping finished, check file %q for results\n", fName) 45 | } 46 | -------------------------------------------------------------------------------- /_examples/error_handling/error_handling.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | ) 8 | 9 | func main() { 10 | // Create a collector 11 | c := colly.NewCollector() 12 | 13 | // Set HTML callback 14 | // Won't be called if error occurs 15 | c.OnHTML("*", func(e *colly.HTMLElement) { 16 | fmt.Println(e) 17 | }) 18 | 19 | // Set error handler 20 | c.OnError(func(r *colly.Response, err error) { 21 | fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err) 22 | }) 23 | 24 | // Start scraping 25 | c.Visit("https://definitely-not-a.website/") 26 | } 27 | -------------------------------------------------------------------------------- /_examples/factba.se/factbase.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "strconv" 8 | 9 | "github.com/gocolly/colly/v2" 10 | ) 11 | 12 | var baseSearchURL = "https://factba.se/json/json-transcript.php?q=&f=&dt=&p=" 13 | var baseTranscriptURL = "https://factba.se/transcript/" 14 | 15 | type result struct { 16 | Slug string `json:"slug"` 17 | Date string `json:"date"` 18 | } 19 | 20 | type results struct { 21 | Data []*result `json:"data"` 22 | } 23 | 24 | type transcript struct { 25 | Speaker string 26 | Text string 27 | } 28 | 29 | func main() { 30 | c := colly.NewCollector( 31 | colly.AllowedDomains("factba.se"), 32 | ) 33 | 34 | d := c.Clone() 35 | 36 | d.OnHTML("body", func(e *colly.HTMLElement) { 37 | t := make([]transcript, 0) 38 | e.ForEach(".topic-media-row", func(_ int, el *colly.HTMLElement) { 39 | t = append(t, transcript{ 40 | Speaker: el.ChildText(".speaker-label"), 41 | Text: el.ChildText(".transcript-text-block"), 42 | }) 43 | }) 44 | jsonData, err := json.MarshalIndent(t, "", " ") 45 | if err != nil { 46 | return 47 | } 48 | os.WriteFile(colly.SanitizeFileName(e.Request.Ctx.Get("date")+"_"+e.Request.Ctx.Get("slug"))+".json", jsonData, 0644) 49 | }) 50 | 51 | stop := false 52 | c.OnResponse(func(r *colly.Response) { 53 | rs := &results{} 54 | err := json.Unmarshal(r.Body, rs) 55 | if err != nil || len(rs.Data) == 0 { 56 | stop = true 57 | return 58 | } 59 | for _, res := range rs.Data { 60 | u := baseTranscriptURL + res.Slug 61 | ctx := colly.NewContext() 62 | ctx.Put("date", res.Date) 63 | ctx.Put("slug", res.Slug) 64 | d.Request("GET", u, nil, ctx, nil) 65 | } 66 | }) 67 | 68 | for i := 1; i < 1000; i++ { 69 | if stop { 70 | break 71 | } 72 | if err := c.Visit(baseSearchURL + strconv.Itoa(i)); err != nil { 73 | fmt.Println("Error:", err) 74 | break 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /_examples/google_groups/google_groups.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "log" 7 | "os" 8 | "strings" 9 | 10 | "github.com/gocolly/colly/v2" 11 | ) 12 | 13 | // Mail is the container of a single e-mail 14 | type Mail struct { 15 | Title 
string 16 | Link string 17 | Author string 18 | Date string 19 | Message string 20 | } 21 | 22 | func main() { 23 | var groupName string 24 | flag.StringVar(&groupName, "group", "hspbp", "Google Groups group name") 25 | flag.Parse() 26 | 27 | threads := make(map[string][]Mail) 28 | 29 | threadCollector := colly.NewCollector() 30 | mailCollector := colly.NewCollector() 31 | 32 | // Collect threads 33 | threadCollector.OnHTML("tr", func(e *colly.HTMLElement) { 34 | ch := e.DOM.Children() 35 | author := ch.Eq(1).Text() 36 | // deleted topic 37 | if author == "" { 38 | return 39 | } 40 | 41 | title := ch.Eq(0).Text() 42 | link, _ := ch.Eq(0).Children().Eq(0).Attr("href") 43 | // fix link to point to the pure HTML version of the thread 44 | link = strings.Replace(link, ".com/d/topic", ".com/forum/?_escaped_fragment_=topic", 1) 45 | date := ch.Eq(2).Text() 46 | 47 | log.Printf("Thread found: %s %q %s %s\n", link, title, author, date) 48 | mailCollector.Visit(link) 49 | }) 50 | 51 | // Visit next page 52 | threadCollector.OnHTML("body > a[href]", func(e *colly.HTMLElement) { 53 | log.Println("Next page link found:", e.Attr("href")) 54 | e.Request.Visit(e.Attr("href")) 55 | }) 56 | 57 | // Extract mails 58 | mailCollector.OnHTML("body", func(e *colly.HTMLElement) { 59 | // Find subject 60 | threadSubject := e.ChildText("h2") 61 | if _, ok := threads[threadSubject]; !ok { 62 | threads[threadSubject] = make([]Mail, 0, 8) 63 | } 64 | 65 | // Extract mails 66 | e.ForEach("table tr", func(_ int, el *colly.HTMLElement) { 67 | mail := Mail{ 68 | Title: el.ChildText("td:nth-of-type(1)"), 69 | Link: el.ChildAttr("td:nth-of-type(1)", "href"), 70 | Author: el.ChildText("td:nth-of-type(2)"), 71 | Date: el.ChildText("td:nth-of-type(3)"), 72 | Message: el.ChildText("td:nth-of-type(4)"), 73 | } 74 | threads[threadSubject] = append(threads[threadSubject], mail) 75 | }) 76 | 77 | // Follow next page link 78 | if link, found := e.DOM.Find("> a[href]").Attr("href"); found { 79 | e.Request.Visit(link) 80 | } else { 81 | log.Printf("Thread %q done\n", threadSubject) 82 | } 83 | }) 84 | 85 | threadCollector.Visit("https://groups.google.com/forum/?_escaped_fragment_=forum/" + groupName) 86 | 87 | enc := json.NewEncoder(os.Stdout) 88 | enc.SetIndent("", " ") 89 | 90 | // Dump json to the standard output 91 | enc.Encode(threads) 92 | } 93 | -------------------------------------------------------------------------------- /_examples/hackernews_comments/hackernews_comments.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "log" 7 | "os" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/gocolly/colly/v2" 12 | ) 13 | 14 | type comment struct { 15 | Author string `selector:"a.hnuser"` 16 | URL string `selector:".age a[href]" attr:"href"` 17 | Comment string `selector:".comment"` 18 | Replies []*comment 19 | depth int 20 | } 21 | 22 | func main() { 23 | var itemID string 24 | flag.StringVar(&itemID, "id", "", "hackernews post id") 25 | flag.Parse() 26 | 27 | if itemID == "" { 28 | log.Println("Hackernews post id required") 29 | os.Exit(1) 30 | } 31 | 32 | comments := make([]*comment, 0) 33 | 34 | // Instantiate default collector 35 | c := colly.NewCollector() 36 | 37 | // Extract comment 38 | c.OnHTML(".comment-tree tr.athing", func(e *colly.HTMLElement) { 39 | width, err := strconv.Atoi(e.ChildAttr("td.ind img", "width")) 40 | if err != nil { 41 | return 42 | } 43 | // hackernews uses 40px spacers to indent comment replies, 
44 | // so we have to divide the width with it to get the depth 45 | // of the comment 46 | depth := width / 40 47 | c := &comment{ 48 | Replies: make([]*comment, 0), 49 | depth: depth, 50 | } 51 | e.Unmarshal(c) 52 | c.Comment = strings.TrimSpace(c.Comment[:len(c.Comment)-5]) 53 | if depth == 0 { 54 | comments = append(comments, c) 55 | return 56 | } 57 | parent := comments[len(comments)-1] 58 | // append comment to its parent 59 | for i := 0; i < depth-1; i++ { 60 | parent = parent.Replies[len(parent.Replies)-1] 61 | } 62 | parent.Replies = append(parent.Replies, c) 63 | }) 64 | 65 | c.Visit("https://news.ycombinator.com/item?id=" + itemID) 66 | 67 | enc := json.NewEncoder(os.Stdout) 68 | enc.SetIndent("", " ") 69 | 70 | // Dump json to the standard output 71 | enc.Encode(comments) 72 | } 73 | -------------------------------------------------------------------------------- /_examples/instagram/instagram.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "crypto/md5" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | "net/url" 9 | "os" 10 | "regexp" 11 | "strings" 12 | 13 | "github.com/gocolly/colly/v2" 14 | ) 15 | 16 | // "id": user id, "after": end cursor 17 | const nextPageURL string = `https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s` 18 | const nextPagePayload string = `{"id":"%s","first":50,"after":"%s"}` 19 | 20 | var requestID string 21 | var requestIds [][]byte 22 | var queryIdPattern = regexp.MustCompile(`queryId:".{32}"`) 23 | 24 | type pageInfo struct { 25 | EndCursor string `json:"end_cursor"` 26 | NextPage bool `json:"has_next_page"` 27 | } 28 | 29 | type mainPageData struct { 30 | Rhxgis string `json:"rhx_gis"` 31 | EntryData struct { 32 | ProfilePage []struct { 33 | Graphql struct { 34 | User struct { 35 | Id string `json:"id"` 36 | Media struct { 37 | Edges []struct { 38 | Node struct { 39 | ImageURL string `json:"display_url"` 40 | ThumbnailURL string `json:"thumbnail_src"` 41 | IsVideo bool `json:"is_video"` 42 | Date int `json:"date"` 43 | Dimensions struct { 44 | Width int `json:"width"` 45 | Height int `json:"height"` 46 | } `json:"dimensions"` 47 | } `json::node"` 48 | } `json:"edges"` 49 | PageInfo pageInfo `json:"page_info"` 50 | } `json:"edge_owner_to_timeline_media"` 51 | } `json:"user"` 52 | } `json:"graphql"` 53 | } `json:"ProfilePage"` 54 | } `json:"entry_data"` 55 | } 56 | 57 | type nextPageData struct { 58 | Data struct { 59 | User struct { 60 | Container struct { 61 | PageInfo pageInfo `json:"page_info"` 62 | Edges []struct { 63 | Node struct { 64 | ImageURL string `json:"display_url"` 65 | ThumbnailURL string `json:"thumbnail_src"` 66 | IsVideo bool `json:"is_video"` 67 | Date int `json:"taken_at_timestamp"` 68 | Dimensions struct { 69 | Width int `json:"width"` 70 | Height int `json:"height"` 71 | } 72 | } 73 | } `json:"edges"` 74 | } `json:"edge_owner_to_timeline_media"` 75 | } 76 | } `json:"data"` 77 | } 78 | 79 | func main() { 80 | if len(os.Args) != 2 { 81 | log.Println("Missing account name argument") 82 | os.Exit(1) 83 | } 84 | 85 | var actualUserId string 86 | instagramAccount := os.Args[1] 87 | outputDir := fmt.Sprintf("./instagram_%s/", instagramAccount) 88 | 89 | c := colly.NewCollector( 90 | //colly.CacheDir("./_instagram_cache/"), 91 | colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"), 92 | ) 93 | 94 | c.OnRequest(func(r *colly.Request) { 95 | r.Headers.Set("X-Requested-With", 
"XMLHttpRequest") 96 | r.Headers.Set("Referer", "https://www.instagram.com/"+instagramAccount) 97 | if r.Ctx.Get("gis") != "" { 98 | gis := fmt.Sprintf("%s:%s", r.Ctx.Get("gis"), r.Ctx.Get("variables")) 99 | h := md5.New() 100 | h.Write([]byte(gis)) 101 | gisHash := fmt.Sprintf("%x", h.Sum(nil)) 102 | r.Headers.Set("X-Instagram-GIS", gisHash) 103 | } 104 | }) 105 | 106 | c.OnHTML("html", func(e *colly.HTMLElement) { 107 | d := c.Clone() 108 | d.OnResponse(func(r *colly.Response) { 109 | requestIds = queryIdPattern.FindAll(r.Body, -1) 110 | requestID = string(requestIds[1][9:41]) 111 | }) 112 | requestIDURL := e.Request.AbsoluteURL(e.ChildAttr(`link[as="script"]`, "href")) 113 | d.Visit(requestIDURL) 114 | 115 | dat := e.ChildText("body > script:first-of-type") 116 | jsonData := dat[strings.Index(dat, "{") : len(dat)-1] 117 | data := &mainPageData{} 118 | err := json.Unmarshal([]byte(jsonData), data) 119 | if err != nil { 120 | log.Fatal(err) 121 | } 122 | 123 | log.Println("saving output to ", outputDir) 124 | os.MkdirAll(outputDir, os.ModePerm) 125 | page := data.EntryData.ProfilePage[0] 126 | actualUserId = page.Graphql.User.Id 127 | for _, obj := range page.Graphql.User.Media.Edges { 128 | // skip videos 129 | if obj.Node.IsVideo { 130 | continue 131 | } 132 | c.Visit(obj.Node.ImageURL) 133 | } 134 | nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, page.Graphql.User.Media.PageInfo.EndCursor) 135 | e.Request.Ctx.Put("variables", nextPageVars) 136 | if page.Graphql.User.Media.PageInfo.NextPage { 137 | u := fmt.Sprintf( 138 | nextPageURL, 139 | requestID, 140 | url.QueryEscape(nextPageVars), 141 | ) 142 | log.Println("Next page found", u) 143 | e.Request.Ctx.Put("gis", data.Rhxgis) 144 | e.Request.Visit(u) 145 | } 146 | }) 147 | 148 | c.OnError(func(r *colly.Response, e error) { 149 | log.Println("error:", e, r.Request.URL, string(r.Body)) 150 | }) 151 | 152 | c.OnResponse(func(r *colly.Response) { 153 | if strings.Index(r.Headers.Get("Content-Type"), "image") > -1 { 154 | r.Save(outputDir + r.FileName()) 155 | return 156 | } 157 | 158 | if strings.Index(r.Headers.Get("Content-Type"), "json") == -1 { 159 | return 160 | } 161 | 162 | data := &nextPageData{} 163 | err := json.Unmarshal(r.Body, data) 164 | if err != nil { 165 | log.Fatal(err) 166 | } 167 | 168 | for _, obj := range data.Data.User.Container.Edges { 169 | // skip videos 170 | if obj.Node.IsVideo { 171 | continue 172 | } 173 | c.Visit(obj.Node.ImageURL) 174 | } 175 | if data.Data.User.Container.PageInfo.NextPage { 176 | nextPageVars := fmt.Sprintf(nextPagePayload, actualUserId, data.Data.User.Container.PageInfo.EndCursor) 177 | r.Request.Ctx.Put("variables", nextPageVars) 178 | u := fmt.Sprintf( 179 | nextPageURL, 180 | requestID, 181 | url.QueryEscape(nextPageVars), 182 | ) 183 | log.Println("Next page found", u) 184 | r.Request.Visit(u) 185 | } 186 | }) 187 | 188 | c.Visit("https://instagram.com/" + instagramAccount) 189 | } 190 | -------------------------------------------------------------------------------- /_examples/local_files/html/child_page/one.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |

<h1>Child Page One</h1>

11 | 12 | -------------------------------------------------------------------------------- /_examples/local_files/html/child_page/three.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |

<h1>Child Page Three</h1>

11 | 12 | -------------------------------------------------------------------------------- /_examples/local_files/html/child_page/two.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |

<h1>Child Page Two</h1>

11 | 12 | -------------------------------------------------------------------------------- /_examples/local_files/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Document 8 | 9 | 10 |

<h1>Index.html</h1>

11 |
16 | 17 | -------------------------------------------------------------------------------- /_examples/local_files/local_files.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "os" 7 | "path/filepath" 8 | 9 | "github.com/gocolly/colly/v2" 10 | ) 11 | 12 | func main() { 13 | dir, err := filepath.Abs(filepath.Dir(os.Args[0])) 14 | if err != nil { 15 | panic(err) 16 | } 17 | 18 | t := &http.Transport{} 19 | t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/"))) 20 | 21 | c := colly.NewCollector() 22 | c.WithTransport(t) 23 | 24 | pages := []string{} 25 | 26 | c.OnHTML("h1", func(e *colly.HTMLElement) { 27 | pages = append(pages, e.Text) 28 | }) 29 | 30 | c.OnHTML("a", func(e *colly.HTMLElement) { 31 | c.Visit("file://" + dir + "/html" + e.Attr("href")) 32 | }) 33 | 34 | fmt.Println("file://" + dir + "/html/index.html") 35 | c.Visit("file://" + dir + "/html/index.html") 36 | c.Wait() 37 | for i, p := range pages { 38 | fmt.Printf("%d : %s\n", i, p) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /_examples/login/login.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/gocolly/colly/v2" 7 | ) 8 | 9 | func main() { 10 | // create a new collector 11 | c := colly.NewCollector() 12 | 13 | // authenticate 14 | err := c.Post("http://example.com/login", map[string]string{"username": "admin", "password": "admin"}) 15 | if err != nil { 16 | log.Fatal(err) 17 | } 18 | 19 | // attach callbacks after login 20 | c.OnResponse(func(r *colly.Response) { 21 | log.Println("response received", r.StatusCode) 22 | }) 23 | 24 | // start scraping 25 | c.Visit("https://example.com/") 26 | } 27 | -------------------------------------------------------------------------------- /_examples/max_depth/max_depth.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | ) 8 | 9 | func main() { 10 | // Instantiate default collector 11 | c := colly.NewCollector( 12 | // MaxDepth is 1, so only the links on the scraped page 13 | // is visited, and no further links are followed 14 | colly.MaxDepth(1), 15 | ) 16 | 17 | // On every a element which has href attribute call callback 18 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 19 | link := e.Attr("href") 20 | // Print link 21 | fmt.Println(link) 22 | // Visit link found on page 23 | e.Request.Visit(link) 24 | }) 25 | 26 | // Start scraping on https://en.wikipedia.org 27 | c.Visit("https://en.wikipedia.org/") 28 | } 29 | -------------------------------------------------------------------------------- /_examples/multipart/asciimoo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gocolly/colly/3a490c99cf2a7493271f151949590baae6a72538/_examples/multipart/asciimoo.jpg -------------------------------------------------------------------------------- /_examples/multipart/multipart.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "net/http" 7 | "os" 8 | "time" 9 | 10 | "github.com/gocolly/colly/v2" 11 | ) 12 | 13 | func generateFormData() map[string][]byte { 14 | f, _ := os.Open("gocolly.jpg") 15 | defer f.Close() 16 | 17 | imgData, _ := io.ReadAll(f) 18 | 19 | 
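	// Assemble the multipart form data: every key becomes a form field,
	// and the raw image bytes are attached under the "file" key.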
return map[string][]byte{ 20 | "firstname": []byte("one"), 21 | "lastname": []byte("two"), 22 | "email": []byte("onetwo@example.com"), 23 | "file": imgData, 24 | } 25 | } 26 | 27 | func setupServer() { 28 | var handler http.HandlerFunc = func(w http.ResponseWriter, r *http.Request) { 29 | fmt.Println("received request") 30 | err := r.ParseMultipartForm(10000000) 31 | if err != nil { 32 | fmt.Println("server: Error") 33 | w.WriteHeader(500) 34 | w.Write([]byte("Internal Server Error")) 35 | return 36 | } 37 | w.WriteHeader(200) 38 | fmt.Println("server: OK") 39 | w.Write([]byte("Success")) 40 | } 41 | 42 | go http.ListenAndServe(":8080", handler) 43 | } 44 | 45 | func main() { 46 | // Start a single route http server to post an image to. 47 | setupServer() 48 | 49 | c := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5)) 50 | 51 | // On every a element which has href attribute call callback 52 | c.OnHTML("html", func(e *colly.HTMLElement) { 53 | fmt.Println(e.Text) 54 | time.Sleep(1 * time.Second) 55 | e.Request.PostMultipart("http://localhost:8080/", generateFormData()) 56 | }) 57 | 58 | // Before making a request print "Visiting ..." 59 | c.OnRequest(func(r *colly.Request) { 60 | fmt.Println("Posting gocolly.jpg to", r.URL.String()) 61 | }) 62 | 63 | // Start scraping 64 | c.PostMultipart("http://localhost:8080/", generateFormData()) 65 | c.Wait() 66 | } 67 | -------------------------------------------------------------------------------- /_examples/openedx_courses/openedx_courses.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "strings" 7 | "time" 8 | 9 | "github.com/gocolly/colly/v2" 10 | ) 11 | 12 | // DATE_FORMAT default format date used in openedx 13 | const DATE_FORMAT = "02 Jan, 2006" 14 | 15 | // Course store openedx course data 16 | type Course struct { 17 | CourseID string 18 | Run string 19 | Name string 20 | Number string 21 | StartDate *time.Time 22 | EndDate *time.Time 23 | URL string 24 | } 25 | 26 | func main() { 27 | // Instantiate default collector 28 | c := colly.NewCollector( 29 | // Using IndonesiaX as sample 30 | colly.AllowedDomains("indonesiax.co.id", "www.indonesiax.co.id"), 31 | 32 | // Cache responses to prevent multiple download of pages 33 | // even if the collector is restarted 34 | colly.CacheDir("./cache"), 35 | ) 36 | 37 | courses := make([]Course, 0, 200) 38 | 39 | // On every a element which has href attribute call callback 40 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 41 | link := e.Attr("href") 42 | if !strings.HasPrefix(link, "/courses/") { 43 | return 44 | } 45 | // start scraping the page under the link found 46 | e.Request.Visit(link) 47 | }) 48 | 49 | c.OnHTML("div[class=main-container]", func(e *colly.HTMLElement) { 50 | if e.DOM.Find("section#course-info").Length() == 0 { 51 | return 52 | } 53 | title := strings.Split(e.ChildText(".course-info__title"), "\n")[0] 54 | course_id := e.ChildAttr("input[name=course_id]", "value") 55 | texts := e.ChildTexts("span[data-datetime]") 56 | start_date, _ := time.Parse(DATE_FORMAT, texts[0]) 57 | end_date, _ := time.Parse(DATE_FORMAT, texts[1]) 58 | var run string 59 | if len(strings.Split(course_id, "_")) > 1 { 60 | run = strings.Split(course_id, "_")[1] 61 | } 62 | course := Course{ 63 | CourseID: course_id, 64 | Run: run, 65 | Name: title, 66 | Number: e.ChildText("span.course-number"), 67 | StartDate: &start_date, 68 | EndDate: &end_date, 69 | URL: fmt.Sprintf("/courses/%s/about", 
course_id), 70 | } 71 | courses = append(courses, course) 72 | }) 73 | 74 | // Start scraping on https://openedxdomain/courses 75 | c.Visit("https://www.indonesiax.co.id/courses") 76 | 77 | // Convert results to JSON data if the scraping job has finished 78 | jsonData, err := json.MarshalIndent(courses, "", " ") 79 | if err != nil { 80 | panic(err) 81 | } 82 | 83 | // Dump json to the standard output (can be redirected to a file) 84 | fmt.Println(string(jsonData)) 85 | } 86 | -------------------------------------------------------------------------------- /_examples/parallel/parallel.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | ) 8 | 9 | func main() { 10 | // Instantiate default collector 11 | c := colly.NewCollector( 12 | // MaxDepth is 2, so only the links on the scraped page 13 | // and links on those pages are visited 14 | colly.MaxDepth(2), 15 | colly.Async(), 16 | ) 17 | 18 | // Limit the maximum parallelism to 2 19 | // This is necessary if the goroutines are dynamically 20 | // created to control the limit of simultaneous requests. 21 | // 22 | // Parallelism can be controlled also by spawning fixed 23 | // number of go routines. 24 | c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2}) 25 | 26 | // On every a element which has href attribute call callback 27 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 28 | link := e.Attr("href") 29 | // Print link 30 | fmt.Println(link) 31 | // Visit link found on page on a new thread 32 | e.Request.Visit(link) 33 | }) 34 | 35 | // Start scraping on https://en.wikipedia.org 36 | c.Visit("https://en.wikipedia.org/") 37 | // Wait until threads are finished 38 | c.Wait() 39 | } 40 | -------------------------------------------------------------------------------- /_examples/proxy_switcher/proxy_switcher.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "log" 6 | 7 | "github.com/gocolly/colly/v2" 8 | "github.com/gocolly/colly/v2/proxy" 9 | ) 10 | 11 | func main() { 12 | // Instantiate default collector 13 | c := colly.NewCollector(colly.AllowURLRevisit()) 14 | 15 | // Rotate two socks5 proxies 16 | rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338") 17 | if err != nil { 18 | log.Fatal(err) 19 | } 20 | c.SetProxyFunc(rp) 21 | 22 | // Print the response 23 | c.OnResponse(func(r *colly.Response) { 24 | log.Printf("Proxy Address: %s\n", r.Request.ProxyURL) 25 | log.Printf("%s\n", bytes.Replace(r.Body, []byte("\n"), nil, -1)) 26 | }) 27 | 28 | // Fetch httpbin.org/ip five times 29 | for i := 0; i < 5; i++ { 30 | c.Visit("https://httpbin.org/ip") 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /_examples/queue/queue.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | "github.com/gocolly/colly/v2/queue" 8 | ) 9 | 10 | func main() { 11 | url := "https://httpbin.org/delay/1" 12 | 13 | // Instantiate default collector 14 | c := colly.NewCollector(colly.AllowURLRevisit()) 15 | 16 | // create a request queue with 2 consumer threads 17 | q, _ := queue.New( 18 | 2, // Number of consumer threads 19 | &queue.InMemoryQueueStorage{MaxSize: 10000}, // Use default queue storage 20 | ) 21 | 22 | c.OnRequest(func(r *colly.Request) { 23 | 
fmt.Println("visiting", r.URL) 24 | if r.ID < 15 { 25 | r2, err := r.New("GET", fmt.Sprintf("%s?x=%v", url, r.ID), nil) 26 | if err == nil { 27 | q.AddRequest(r2) 28 | } 29 | } 30 | }) 31 | 32 | for i := 0; i < 5; i++ { 33 | // Add URLs to the queue 34 | q.AddURL(fmt.Sprintf("%s?n=%d", url, i)) 35 | } 36 | // Consume URLs 37 | q.Run(c) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /_examples/random_delay/random_delay.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/gocolly/colly/v2" 8 | "github.com/gocolly/colly/v2/debug" 9 | ) 10 | 11 | func main() { 12 | url := "https://httpbin.org/delay/2" 13 | 14 | // Instantiate default collector 15 | c := colly.NewCollector( 16 | // Attach a debugger to the collector 17 | colly.Debugger(&debug.LogDebugger{}), 18 | colly.Async(), 19 | ) 20 | 21 | // Limit the number of threads started by colly to two 22 | // when visiting links which domains' matches "*httpbin.*" glob 23 | c.Limit(&colly.LimitRule{ 24 | DomainGlob: "*httpbin.*", 25 | Parallelism: 2, 26 | RandomDelay: 5 * time.Second, 27 | }) 28 | 29 | // Start scraping in four threads on https://httpbin.org/delay/2 30 | for i := 0; i < 4; i++ { 31 | c.Visit(fmt.Sprintf("%s?n=%d", url, i)) 32 | } 33 | // Start scraping on https://httpbin.org/delay/2 34 | c.Visit(url) 35 | // Wait until threads are finished 36 | c.Wait() 37 | } 38 | -------------------------------------------------------------------------------- /_examples/rate_limit/rate_limit.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | "github.com/gocolly/colly/v2/debug" 8 | ) 9 | 10 | func main() { 11 | url := "https://httpbin.org/delay/2" 12 | 13 | // Instantiate default collector 14 | c := colly.NewCollector( 15 | // Turn on asynchronous requests 16 | colly.Async(), 17 | // Attach a debugger to the collector 18 | colly.Debugger(&debug.LogDebugger{}), 19 | ) 20 | 21 | // Limit the number of threads started by colly to two 22 | // when visiting links which domains' matches "*httpbin.*" glob 23 | c.Limit(&colly.LimitRule{ 24 | DomainGlob: "*httpbin.*", 25 | Parallelism: 2, 26 | //Delay: 5 * time.Second, 27 | }) 28 | 29 | // Start scraping in five threads on https://httpbin.org/delay/2 30 | for i := 0; i < 5; i++ { 31 | c.Visit(fmt.Sprintf("%s?n=%d", url, i)) 32 | } 33 | // Wait until threads are finished 34 | c.Wait() 35 | } 36 | -------------------------------------------------------------------------------- /_examples/reddit/reddit.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "time" 7 | 8 | "github.com/gocolly/colly/v2" 9 | ) 10 | 11 | type item struct { 12 | StoryURL string 13 | Source string 14 | comments string 15 | CrawledAt time.Time 16 | Comments string 17 | Title string 18 | } 19 | 20 | func main() { 21 | stories := []item{} 22 | // Instantiate default collector 23 | c := colly.NewCollector( 24 | // Visit only domains: old.reddit.com 25 | colly.AllowedDomains("old.reddit.com"), 26 | // Parallelism 27 | colly.Async(true), 28 | ) 29 | 30 | // On every a element which has .top-matter attribute call callback 31 | // This class is unique to the div that holds all information about a story 32 | c.OnHTML(".top-matter", func(e *colly.HTMLElement) { 33 | temp := item{} 34 
| temp.StoryURL = e.ChildAttr("a[data-event-action=title]", "href") 35 | temp.Source = "https://old.reddit.com/r/programming/" 36 | temp.Title = e.ChildText("a[data-event-action=title]") 37 | temp.Comments = e.ChildAttr("a[data-event-action=comments]", "href") 38 | temp.CrawledAt = time.Now() 39 | stories = append(stories, temp) 40 | }) 41 | 42 | // On every span tag with the class next-button 43 | c.OnHTML("span.next-button", func(h *colly.HTMLElement) { 44 | t := h.ChildAttr("a", "href") 45 | c.Visit(t) 46 | }) 47 | 48 | // Set max Parallelism and introduce a Random Delay 49 | c.Limit(&colly.LimitRule{ 50 | Parallelism: 2, 51 | RandomDelay: 5 * time.Second, 52 | }) 53 | 54 | // Before making a request print "Visiting ..." 55 | c.OnRequest(func(r *colly.Request) { 56 | fmt.Println("Visiting", r.URL.String()) 57 | 58 | }) 59 | 60 | // Crawl all reddits the user passes in 61 | reddits := os.Args[1:] 62 | for _, reddit := range reddits { 63 | c.Visit(reddit) 64 | 65 | } 66 | 67 | c.Wait() 68 | fmt.Println(stories) 69 | 70 | } 71 | -------------------------------------------------------------------------------- /_examples/request_context/request_context.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | ) 8 | 9 | func main() { 10 | // Instantiate default collector 11 | c := colly.NewCollector() 12 | 13 | // Before making a request put the URL with 14 | // the key of "url" into the context of the request 15 | c.OnRequest(func(r *colly.Request) { 16 | r.Ctx.Put("url", r.URL.String()) 17 | }) 18 | 19 | // After making a request get "url" from 20 | // the context of the request 21 | c.OnResponse(func(r *colly.Response) { 22 | fmt.Println(r.Ctx.Get("url")) 23 | }) 24 | 25 | // Start scraping on https://en.wikipedia.org 26 | c.Visit("https://en.wikipedia.org/") 27 | } 28 | -------------------------------------------------------------------------------- /_examples/scraper_server/scraper_server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "log" 6 | "net/http" 7 | 8 | "github.com/gocolly/colly/v2" 9 | ) 10 | 11 | type pageInfo struct { 12 | StatusCode int 13 | Links map[string]int 14 | } 15 | 16 | func handler(w http.ResponseWriter, r *http.Request) { 17 | URL := r.URL.Query().Get("url") 18 | if URL == "" { 19 | log.Println("missing URL argument") 20 | return 21 | } 22 | log.Println("visiting", URL) 23 | 24 | c := colly.NewCollector() 25 | 26 | p := &pageInfo{Links: make(map[string]int)} 27 | 28 | // count links 29 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 30 | link := e.Request.AbsoluteURL(e.Attr("href")) 31 | if link != "" { 32 | p.Links[link]++ 33 | } 34 | }) 35 | 36 | // extract status code 37 | c.OnResponse(func(r *colly.Response) { 38 | log.Println("response received", r.StatusCode) 39 | p.StatusCode = r.StatusCode 40 | }) 41 | c.OnError(func(r *colly.Response, err error) { 42 | log.Println("error:", r.StatusCode, err) 43 | p.StatusCode = r.StatusCode 44 | }) 45 | 46 | c.Visit(URL) 47 | 48 | // dump results 49 | b, err := json.Marshal(p) 50 | if err != nil { 51 | log.Println("failed to serialize response:", err) 52 | return 53 | } 54 | w.Header().Add("Content-Type", "application/json") 55 | w.Write(b) 56 | } 57 | 58 | func main() { 59 | // example usage: curl -s 'http://127.0.0.1:7171/?url=http://go-colly.org/' 60 | addr := ":7171" 61 | 62 | http.HandleFunc("/", handler) 
63 | 64 | log.Println("listening on", addr) 65 | log.Fatal(http.ListenAndServe(addr, nil)) 66 | } 67 | -------------------------------------------------------------------------------- /_examples/shopify_sitemap/shopify_sitemap.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/gocolly/colly/v2" 7 | ) 8 | 9 | func main() { 10 | // Array containing all the known URLs in a sitemap 11 | knownUrls := []string{} 12 | 13 | // Create a Collector specifically for Shopify 14 | c := colly.NewCollector(colly.AllowedDomains("www.shopify.com")) 15 | 16 | // Create a callback on the XPath query searching for the URLs 17 | c.OnXML("//urlset/url/loc", func(e *colly.XMLElement) { 18 | knownUrls = append(knownUrls, e.Text) 19 | }) 20 | 21 | // Start the collector 22 | c.Visit("https://www.shopify.com/sitemap.xml") 23 | 24 | fmt.Println("All known URLs:") 25 | for _, url := range knownUrls { 26 | fmt.Println("\t", url) 27 | } 28 | fmt.Println("Collected", len(knownUrls), "URLs") 29 | } 30 | -------------------------------------------------------------------------------- /_examples/url_filter/url_filter.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | 7 | "github.com/gocolly/colly/v2" 8 | ) 9 | 10 | func main() { 11 | // Instantiate default collector 12 | c := colly.NewCollector( 13 | // Visit only root url and urls which start with "e" or "h" on httpbin.org 14 | colly.URLFilters( 15 | regexp.MustCompile("http://httpbin\\.org/(|e.+)$"), 16 | regexp.MustCompile("http://httpbin\\.org/h.+"), 17 | ), 18 | ) 19 | 20 | // On every a element which has href attribute call callback 21 | c.OnHTML("a[href]", func(e *colly.HTMLElement) { 22 | link := e.Attr("href") 23 | // Print link 24 | fmt.Printf("Link found: %q -> %s\n", e.Text, link) 25 | // Visit link found on page 26 | // Only those links are visited which are matched by any of the URLFilter regexps 27 | c.Visit(e.Request.AbsoluteURL(link)) 28 | }) 29 | 30 | // Before making a request print "Visiting ..." 
31 | c.OnRequest(func(r *colly.Request) { 32 | fmt.Println("Visiting", r.URL.String()) 33 | }) 34 | 35 | // Start scraping on http://httpbin.org 36 | c.Visit("http://httpbin.org/") 37 | } 38 | -------------------------------------------------------------------------------- /_examples/xkcd_store/xkcd_store.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "log" 6 | "os" 7 | 8 | "github.com/gocolly/colly/v2" 9 | ) 10 | 11 | func main() { 12 | fName := "xkcd_store_items.csv" 13 | file, err := os.Create(fName) 14 | if err != nil { 15 | log.Fatalf("Cannot create file %q: %s\n", fName, err) 16 | return 17 | } 18 | defer file.Close() 19 | writer := csv.NewWriter(file) 20 | defer writer.Flush() 21 | // Write CSV header 22 | writer.Write([]string{"Name", "Price", "URL", "Image URL"}) 23 | 24 | // Instantiate default collector 25 | c := colly.NewCollector( 26 | // Allow requests only to store.xkcd.com 27 | colly.AllowedDomains("store.xkcd.com"), 28 | ) 29 | 30 | // Extract product details 31 | c.OnHTML(".product-grid-item", func(e *colly.HTMLElement) { 32 | writer.Write([]string{ 33 | e.ChildAttr("a", "title"), 34 | e.ChildText("span"), 35 | e.Request.AbsoluteURL(e.ChildAttr("a", "href")), 36 | "https:" + e.ChildAttr("img", "src"), 37 | }) 38 | }) 39 | 40 | // Find and visit next page links 41 | c.OnHTML(`.next a[href]`, func(e *colly.HTMLElement) { 42 | e.Request.Visit(e.Attr("href")) 43 | }) 44 | 45 | c.Visit("https://store.xkcd.com/collections/everything") 46 | 47 | log.Printf("Scraping finished, check file %q for results\n", fName) 48 | 49 | // Display collector's statistics 50 | log.Println(c) 51 | } 52 | -------------------------------------------------------------------------------- /assets/scrapfly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gocolly/colly/3a490c99cf2a7493271f151949590baae6a72538/assets/scrapfly.png -------------------------------------------------------------------------------- /cmd/colly/colly.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package main 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "log" 21 | "os" 22 | "strings" 23 | 24 | "github.com/jawher/mow.cli" 25 | ) 26 | 27 | var scraperHeadTemplate = `package main 28 | 29 | import ( 30 | "log" 31 | 32 | "github.com/gocolly/colly/v2" 33 | ) 34 | 35 | func main() { 36 | c := colly.NewCollector() 37 | ` 38 | 39 | var scraperEndTemplate = ` 40 | c.Visit("https://yourdomain.com/") 41 | } 42 | ` 43 | 44 | var htmlCallbackTemplate = ` 45 | c.OnHTML("element-selector", func(e *colly.HTMLElement) { 46 | log.Println(e.Text) 47 | }) 48 | ` 49 | 50 | var requestCallbackTemplate = ` 51 | c.OnRequest(func(r *colly.Request) { 52 | log.Println("Visiting", r.URL) 53 | }) 54 | ` 55 | 56 | var responseCallbackTemplate = ` 57 | c.OnResponse(func(r *colly.Response) { 58 | log.Println("Visited", r.Request.URL, r.StatusCode) 59 | }) 60 | ` 61 | 62 | var errorCallbackTemplate = ` 63 | c.OnError(func(r *colly.Response, err error) { 64 | log.Printf("Error on %s: %s", r.Request.URL, err) 65 | }) 66 | ` 67 | 68 | func main() { 69 | app := cli.App("colly", "Scraping Framework for Gophers") 70 | 71 | app.Command("new", "Create new scraper", func(cmd *cli.Cmd) { 72 | var ( 73 | callbacks = cmd.StringOpt("callbacks", "", "Add callbacks to the template. (E.g. '--callbacks=html,response,error')") 74 | hosts = cmd.StringOpt("hosts", "", "Specify scraper's allowed hosts. (e.g. '--hosts=xy.com,abcd.com')") 75 | path = cmd.StringArg("PATH", "", "Path of the new scraper") 76 | ) 77 | 78 | cmd.Spec = "[--callbacks] [--hosts] [PATH]" 79 | 80 | cmd.Action = func() { 81 | scraper := bytes.NewBufferString(scraperHeadTemplate) 82 | outfile := os.Stdout 83 | if *path != "" { 84 | var err error 85 | outfile, err = os.Create(*path) 86 | if err != nil { 87 | log.Fatal(err) 88 | } 89 | defer outfile.Close() 90 | } 91 | if *hosts != "" { 92 | scraper.WriteString("\n c.AllowedDomains = []string{") 93 | for i, h := range strings.Split(*hosts, ",") { 94 | if i > 0 { 95 | scraper.WriteString(", ") 96 | } 97 | scraper.WriteString(fmt.Sprintf("%q", h)) 98 | } 99 | scraper.WriteString("}\n") 100 | } 101 | if len(*callbacks) > 0 { 102 | for _, c := range strings.Split(*callbacks, ",") { 103 | switch c { 104 | case "html": 105 | scraper.WriteString(htmlCallbackTemplate) 106 | case "request": 107 | scraper.WriteString(requestCallbackTemplate) 108 | case "response": 109 | scraper.WriteString(responseCallbackTemplate) 110 | case "error": 111 | scraper.WriteString(errorCallbackTemplate) 112 | } 113 | } 114 | } 115 | scraper.WriteString(scraperEndTemplate) 116 | outfile.Write(scraper.Bytes()) 117 | } 118 | }) 119 | 120 | app.Run(os.Args) 121 | } 122 | -------------------------------------------------------------------------------- /context.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package colly 16 | 17 | import ( 18 | "sync" 19 | ) 20 | 21 | // Context provides a tiny layer for passing data between callbacks 22 | type Context struct { 23 | contextMap map[string]interface{} 24 | lock *sync.RWMutex 25 | } 26 | 27 | // NewContext initializes a new Context instance 28 | func NewContext() *Context { 29 | return &Context{ 30 | contextMap: make(map[string]interface{}), 31 | lock: &sync.RWMutex{}, 32 | } 33 | } 34 | 35 | // UnmarshalBinary decodes Context value to nil 36 | // This function is used by request caching 37 | func (c *Context) UnmarshalBinary(_ []byte) error { 38 | return nil 39 | } 40 | 41 | // MarshalBinary encodes Context value 42 | // This function is used by request caching 43 | func (c *Context) MarshalBinary() (_ []byte, _ error) { 44 | return nil, nil 45 | } 46 | 47 | // Put stores a value of any type in Context 48 | func (c *Context) Put(key string, value interface{}) { 49 | c.lock.Lock() 50 | c.contextMap[key] = value 51 | c.lock.Unlock() 52 | } 53 | 54 | // Get retrieves a string value from Context. 55 | // Get returns an empty string if key not found 56 | func (c *Context) Get(key string) string { 57 | c.lock.RLock() 58 | defer c.lock.RUnlock() 59 | if v, ok := c.contextMap[key]; ok { 60 | return v.(string) 61 | } 62 | return "" 63 | } 64 | 65 | // GetAny retrieves a value from Context. 66 | // GetAny returns nil if key not found 67 | func (c *Context) GetAny(key string) interface{} { 68 | c.lock.RLock() 69 | defer c.lock.RUnlock() 70 | if v, ok := c.contextMap[key]; ok { 71 | return v 72 | } 73 | return nil 74 | } 75 | 76 | // ForEach iterate context 77 | func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{} { 78 | c.lock.RLock() 79 | defer c.lock.RUnlock() 80 | 81 | ret := make([]interface{}, 0, len(c.contextMap)) 82 | for k, v := range c.contextMap { 83 | ret = append(ret, fn(k, v)) 84 | } 85 | 86 | return ret 87 | } 88 | -------------------------------------------------------------------------------- /context_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package colly 16 | 17 | import ( 18 | "strconv" 19 | "testing" 20 | ) 21 | 22 | func TestContextIteration(t *testing.T) { 23 | ctx := NewContext() 24 | for i := 0; i < 10; i++ { 25 | ctx.Put(strconv.Itoa(i), i) 26 | } 27 | values := ctx.ForEach(func(k string, v interface{}) interface{} { 28 | return v.(int) 29 | }) 30 | if len(values) != 10 { 31 | t.Fatal("fail to iterate context") 32 | } 33 | for _, i := range values { 34 | v := i.(int) 35 | if v != ctx.GetAny(strconv.Itoa(v)).(int) { 36 | t.Fatal("value not equal") 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /debug/debug.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package debug 16 | 17 | // Event represents an action inside a collector 18 | type Event struct { 19 | // Type is the type of the event 20 | Type string 21 | // RequestID identifies the HTTP request of the Event 22 | RequestID uint32 23 | // CollectorID identifies the collector of the Event 24 | CollectorID uint32 25 | // Values contains the event's key-value pairs. Different type of events 26 | // can return different key-value pairs 27 | Values map[string]string 28 | } 29 | 30 | // Debugger is an interface for different type of debugging backends 31 | type Debugger interface { 32 | // Init initializes the backend 33 | Init() error 34 | // Event receives a new collector event. 35 | Event(e *Event) 36 | } 37 | -------------------------------------------------------------------------------- /debug/logdebugger.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package debug 16 | 17 | import ( 18 | "io" 19 | "log" 20 | "os" 21 | "sync/atomic" 22 | "time" 23 | ) 24 | 25 | // LogDebugger is the simplest debugger which prints log messages to the STDERR 26 | type LogDebugger struct { 27 | // Output is the log destination, anything can be used which implements them 28 | // io.Writer interface. Leave it blank to use STDERR 29 | Output io.Writer 30 | // Prefix appears at the beginning of each generated log line 31 | Prefix string 32 | // Flag defines the logging properties. 
33 | Flag int 34 | logger *log.Logger 35 | counter int32 36 | start time.Time 37 | } 38 | 39 | // Init initializes the LogDebugger 40 | func (l *LogDebugger) Init() error { 41 | l.counter = 0 42 | l.start = time.Now() 43 | if l.Output == nil { 44 | l.Output = os.Stderr 45 | } 46 | l.logger = log.New(l.Output, l.Prefix, l.Flag) 47 | return nil 48 | } 49 | 50 | // Event receives Collector events and prints them to STDERR 51 | func (l *LogDebugger) Event(e *Event) { 52 | i := atomic.AddInt32(&l.counter, 1) 53 | l.logger.Printf("[%06d] %d [%6d - %s] %q (%s)\n", i, e.CollectorID, e.RequestID, e.Type, e.Values, time.Since(l.start)) 54 | } 55 | -------------------------------------------------------------------------------- /debug/webdebugger.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package debug 16 | 17 | import ( 18 | "encoding/json" 19 | "log" 20 | "net/http" 21 | "sync" 22 | "time" 23 | ) 24 | 25 | // WebDebugger is a web based debuging frontend for colly 26 | type WebDebugger struct { 27 | // Address is the address of the web server. It is 127.0.0.1:7676 by default. 28 | Address string 29 | initialized bool 30 | CurrentRequests map[uint32]requestInfo 31 | RequestLog []requestInfo 32 | sync.Mutex 33 | } 34 | 35 | type requestInfo struct { 36 | URL string 37 | Started time.Time 38 | Duration time.Duration 39 | ResponseStatus string 40 | ID uint32 41 | CollectorID uint32 42 | } 43 | 44 | // Init initializes the WebDebugger 45 | func (w *WebDebugger) Init() error { 46 | if w.initialized { 47 | return nil 48 | } 49 | defer func() { 50 | w.initialized = true 51 | }() 52 | if w.Address == "" { 53 | w.Address = "127.0.0.1:7676" 54 | } 55 | w.RequestLog = make([]requestInfo, 0) 56 | w.CurrentRequests = make(map[uint32]requestInfo) 57 | http.HandleFunc("/", w.indexHandler) 58 | http.HandleFunc("/status", w.statusHandler) 59 | log.Println("Starting debug webserver on", w.Address) 60 | go http.ListenAndServe(w.Address, nil) 61 | return nil 62 | } 63 | 64 | // Event updates the debugger's status 65 | func (w *WebDebugger) Event(e *Event) { 66 | w.Lock() 67 | defer w.Unlock() 68 | 69 | switch e.Type { 70 | case "request": 71 | w.CurrentRequests[e.RequestID] = requestInfo{ 72 | URL: e.Values["url"], 73 | Started: time.Now(), 74 | ID: e.RequestID, 75 | CollectorID: e.CollectorID, 76 | } 77 | case "response", "error": 78 | r := w.CurrentRequests[e.RequestID] 79 | r.Duration = time.Since(r.Started) 80 | r.ResponseStatus = e.Values["status"] 81 | w.RequestLog = append(w.RequestLog, r) 82 | delete(w.CurrentRequests, e.RequestID) 83 | } 84 | } 85 | 86 | func (w *WebDebugger) indexHandler(wr http.ResponseWriter, r *http.Request) { 87 | wr.Write([]byte(` 88 | 89 | 90 | Colly Debugger WebUI 91 | 92 | 93 | 94 | 95 | 100 |
101 | [HTML markup lost in this dump; the template body contains "Current Requests" and "Finished Requests" panels]
112 | 140 | 141 | 142 | `)) 143 | } 144 | 145 | func (w *WebDebugger) statusHandler(wr http.ResponseWriter, r *http.Request) { 146 | w.Lock() 147 | jsonData, err := json.MarshalIndent(w, "", " ") 148 | w.Unlock() 149 | if err != nil { 150 | panic(err) 151 | } 152 | wr.Write(jsonData) 153 | } 154 | -------------------------------------------------------------------------------- /extensions/extensions.go: -------------------------------------------------------------------------------- 1 | // Package extensions implements various helper addons for Colly 2 | package extensions 3 | -------------------------------------------------------------------------------- /extensions/random_user_agent.go: -------------------------------------------------------------------------------- 1 | package extensions 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "strings" 7 | 8 | "github.com/gocolly/colly/v2" 9 | ) 10 | 11 | var uaGens = []func() string{ 12 | genFirefoxUA, 13 | genChromeUA, 14 | genEdgeUA, 15 | genOperaUA, 16 | } 17 | 18 | var uaGensMobile = []func() string{ 19 | genMobilePixel7UA, 20 | genMobilePixel6UA, 21 | genMobilePixel5UA, 22 | genMobilePixel4UA, 23 | genMobileNexus10UA, 24 | } 25 | 26 | // RandomUserAgent generates a random DESKTOP browser user-agent on every requests 27 | func RandomUserAgent(c *colly.Collector) { 28 | c.OnRequest(func(r *colly.Request) { 29 | r.Headers.Set("User-Agent", uaGens[rand.Intn(len(uaGens))]()) 30 | }) 31 | } 32 | 33 | // RandomMobileUserAgent generates a random MOBILE browser user-agent on every requests 34 | func RandomMobileUserAgent(c *colly.Collector) { 35 | c.OnRequest(func(r *colly.Request) { 36 | r.Headers.Set("User-Agent", uaGensMobile[rand.Intn(len(uaGensMobile))]()) 37 | }) 38 | } 39 | 40 | var ffVersions = []float32{ 41 | // NOTE: Only version released after Jun 1, 2022 will be listed. 42 | // Data source: https://en.wikipedia.org/wiki/Firefox_version_history 43 | 44 | // 2022 45 | 102.0, 46 | 103.0, 47 | 104.0, 48 | 105.0, 49 | 106.0, 50 | 107.0, 51 | 108.0, 52 | 53 | // 2023 54 | 109.0, 55 | 110.0, 56 | 111.0, 57 | 112.0, 58 | 113.0, 59 | } 60 | 61 | var chromeVersions = []string{ 62 | // NOTE: Only version released after Jun 1, 2022 will be listed. 
63 | // Data source: https://chromereleases.googleblog.com/search/label/Stable%20updates 64 | 65 | // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop.html 66 | "102.0.5005.115", 67 | 68 | // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_21.html 69 | "103.0.5060.53", 70 | 71 | // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_27.html 72 | "103.0.5060.66", 73 | 74 | // https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop.html 75 | "103.0.5060.114", 76 | 77 | // https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop_19.html 78 | "103.0.5060.134", 79 | 80 | // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop.html 81 | "104.0.5112.79", 82 | "104.0.5112.80", 83 | "104.0.5112.81", 84 | 85 | // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_16.html 86 | "104.0.5112.101", 87 | "104.0.5112.102", 88 | 89 | // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_30.html 90 | "105.0.5195.52", 91 | "105.0.5195.53", 92 | "105.0.5195.54", 93 | 94 | // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop.html 95 | "105.0.5195.102", 96 | 97 | // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_14.html 98 | "105.0.5195.125", 99 | "105.0.5195.126", 100 | "105.0.5195.127", 101 | 102 | // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_27.html 103 | "106.0.5249.61", 104 | "106.0.5249.62", 105 | 106 | // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_30.html 107 | "106.0.5249.91", 108 | 109 | // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop.html 110 | "106.0.5249.103", 111 | 112 | // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_11.html 113 | "106.0.5249.119", 114 | 115 | // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_25.html 116 | "107.0.5304.62", 117 | "107.0.5304.63", 118 | "107.0.5304.68", 119 | 120 | // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_27.html 121 | "107.0.5304.87", 122 | "107.0.5304.88", 123 | 124 | // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop.html 125 | "107.0.5304.106", 126 | "107.0.5304.107", 127 | "107.0.5304.110", 128 | 129 | // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_24.html 130 | "107.0.5304.121", 131 | "107.0.5304.122", 132 | 133 | // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_29.html 134 | "108.0.5359.71", 135 | "108.0.5359.72", 136 | 137 | // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop.html 138 | "108.0.5359.94", 139 | "108.0.5359.95", 140 | 141 | // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_7.html 142 | "108.0.5359.98", 143 | "108.0.5359.99", 144 | 145 | // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_13.html 146 | "108.0.5359.124", 147 | "108.0.5359.125", 148 | 149 | // https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop.html 150 | "109.0.5414.74", 151 | "109.0.5414.75", 152 | "109.0.5414.87", 153 | 154 | // https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop_24.html 155 | "109.0.5414.119", 156 | "109.0.5414.120", 
157 | 158 | // https://chromereleases.googleblog.com/2023/02/stable-channel-update-for-desktop.html 159 | "110.0.5481.77", 160 | "110.0.5481.78", 161 | 162 | // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update.html 163 | "110.0.5481.96", 164 | "110.0.5481.97", 165 | 166 | // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_14.html 167 | "110.0.5481.100", 168 | 169 | // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_16.html 170 | "110.0.5481.104", 171 | 172 | // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_22.html 173 | "110.0.5481.177", 174 | "110.0.5481.178", 175 | 176 | // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_97.html 177 | "109.0.5414.129", 178 | 179 | // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop.html 180 | "111.0.5563.64", 181 | "111.0.5563.65", 182 | 183 | // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_21.html 184 | "111.0.5563.110", 185 | "111.0.5563.111", 186 | 187 | // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_27.html 188 | "111.0.5563.146", 189 | "111.0.5563.147", 190 | 191 | // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop.html 192 | "112.0.5615.49", 193 | "112.0.5615.50", 194 | 195 | // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_12.html 196 | "112.0.5615.86", 197 | "112.0.5615.87", 198 | 199 | // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_14.html 200 | "112.0.5615.121", 201 | 202 | // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_18.html 203 | "112.0.5615.137", 204 | "112.0.5615.138", 205 | "112.0.5615.165", 206 | 207 | // https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop.html 208 | "113.0.5672.63", 209 | "113.0.5672.64", 210 | 211 | // https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop_8.html 212 | "113.0.5672.92", 213 | "113.0.5672.93", 214 | } 215 | 216 | var edgeVersions = []string{ 217 | // NOTE: Only version released after Jun 1, 2022 will be listed. 218 | // Data source: https://learn.microsoft.com/en-us/deployedge/microsoft-edge-release-schedule 219 | 220 | // 2022 221 | "103.0.0.0,103.0.1264.37", 222 | "104.0.0.0,104.0.1293.47", 223 | "105.0.0.0,105.0.1343.25", 224 | "106.0.0.0,106.0.1370.34", 225 | "107.0.0.0,107.0.1418.24", 226 | "108.0.0.0,108.0.1462.42", 227 | 228 | // 2023 229 | "109.0.0.0,109.0.1518.49", 230 | "110.0.0.0,110.0.1587.41", 231 | "111.0.0.0,111.0.1661.41", 232 | "112.0.0.0,112.0.1722.34", 233 | "113.0.0.0,113.0.1774.3", 234 | } 235 | 236 | var operaVersions = []string{ 237 | // NOTE: Only version released after Jan 1, 2023 will be listed. 
238 | // Data source: https://blogs.opera.com/desktop/ 239 | 240 | // https://blogs.opera.com/desktop/changelog-for-96/ 241 | "110.0.5449.0,96.0.4640.0", 242 | "110.0.5464.2,96.0.4653.0", 243 | "110.0.5464.2,96.0.4660.0", 244 | "110.0.5481.30,96.0.4674.0", 245 | "110.0.5481.30,96.0.4691.0", 246 | "110.0.5481.30,96.0.4693.12", 247 | "110.0.5481.77,96.0.4693.16", 248 | "110.0.5481.100,96.0.4693.20", 249 | "110.0.5481.178,96.0.4693.31", 250 | "110.0.5481.178,96.0.4693.50", 251 | "110.0.5481.192,96.0.4693.80", 252 | 253 | // https://blogs.opera.com/desktop/changelog-for-97/ 254 | "111.0.5532.2,97.0.4711.0", 255 | "111.0.5532.2,97.0.4704.0", 256 | "111.0.5532.2,97.0.4697.0", 257 | "111.0.5562.0,97.0.4718.0", 258 | "111.0.5563.19,97.0.4719.4", 259 | "111.0.5563.19,97.0.4719.11", 260 | "111.0.5563.41,97.0.4719.17", 261 | "111.0.5563.65,97.0.4719.26", 262 | "111.0.5563.65,97.0.4719.28", 263 | "111.0.5563.111,97.0.4719.43", 264 | "111.0.5563.147,97.0.4719.63", 265 | "111.0.5563.147,97.0.4719.83", 266 | 267 | // https://blogs.opera.com/desktop/changelog-for-98/ 268 | "112.0.5596.2,98.0.4756.0", 269 | "112.0.5596.2,98.0.4746.0", 270 | "112.0.5615.20,98.0.4759.1", 271 | "112.0.5615.50,98.0.4759.3", 272 | "112.0.5615.87,98.0.4759.6", 273 | "112.0.5615.165,98.0.4759.15", 274 | "112.0.5615.165,98.0.4759.21", 275 | "112.0.5615.165,98.0.4759.39", 276 | } 277 | 278 | var pixel7AndroidVersions = []string{ 279 | // Data source: 280 | // - https://developer.android.com/about/versions 281 | // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds 282 | "13", 283 | } 284 | 285 | var pixel6AndroidVersions = []string{ 286 | // Data source: 287 | // - https://developer.android.com/about/versions 288 | // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds 289 | "12", 290 | "13", 291 | } 292 | 293 | var pixel5AndroidVersions = []string{ 294 | // Data source: 295 | // - https://developer.android.com/about/versions 296 | // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds 297 | "11", 298 | "12", 299 | "13", 300 | } 301 | 302 | var pixel4AndroidVersions = []string{ 303 | // Data source: 304 | // - https://developer.android.com/about/versions 305 | // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds 306 | "10", 307 | "11", 308 | "12", 309 | "13", 310 | } 311 | 312 | var nexus10AndroidVersions = []string{ 313 | // Data source: 314 | // - https://developer.android.com/about/versions 315 | // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds 316 | "4.4.2", 317 | "4.4.4", 318 | "5.0", 319 | "5.0.1", 320 | "5.0.2", 321 | "5.1", 322 | "5.1.1", 323 | } 324 | 325 | var nexus10Builds = []string{ 326 | // Data source: https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds 327 | 328 | "LMY49M", // android-5.1.1_r38 (Lollipop) 329 | "LMY49J", // android-5.1.1_r37 (Lollipop) 330 | "LMY49I", // android-5.1.1_r36 (Lollipop) 331 | "LMY49H", // android-5.1.1_r35 (Lollipop) 332 | "LMY49G", // android-5.1.1_r34 (Lollipop) 333 | "LMY49F", // android-5.1.1_r33 (Lollipop) 334 | "LMY48Z", // android-5.1.1_r30 (Lollipop) 335 | "LMY48X", // android-5.1.1_r25 (Lollipop) 336 | "LMY48T", // android-5.1.1_r19 (Lollipop) 337 | "LMY48M", // android-5.1.1_r14 (Lollipop) 338 | "LMY48I", // android-5.1.1_r9 (Lollipop) 339 | "LMY47V", // android-5.1.1_r1 (Lollipop) 340 | "LMY47D", // android-5.1.0_r1 (Lollipop) 341 | "LRX22G", // 
android-5.0.2_r1 (Lollipop) 342 | "LRX22C", // android-5.0.1_r1 (Lollipop) 343 | "LRX21P", // android-5.0.0_r4.0.1 (Lollipop) 344 | "KTU84P", // android-4.4.4_r1 (KitKat) 345 | "KTU84L", // android-4.4.3_r1 (KitKat) 346 | "KOT49H", // android-4.4.2_r1 (KitKat) 347 | "KOT49E", // android-4.4.1_r1 (KitKat) 348 | "KRT16S", // android-4.4_r1.2 (KitKat) 349 | "JWR66Y", // android-4.3_r1.1 (Jelly Bean) 350 | "JWR66V", // android-4.3_r1 (Jelly Bean) 351 | "JWR66N", // android-4.3_r0.9.1 (Jelly Bean) 352 | "JDQ39 ", // android-4.2.2_r1 (Jelly Bean) 353 | "JOP40F", // android-4.2.1_r1.1 (Jelly Bean) 354 | "JOP40D", // android-4.2.1_r1 (Jelly Bean) 355 | "JOP40C", // android-4.2_r1 (Jelly Bean) 356 | } 357 | 358 | var osStrings = []string{ 359 | // MacOS - High Sierra 360 | "Macintosh; Intel Mac OS X 10_13", 361 | "Macintosh; Intel Mac OS X 10_13_1", 362 | "Macintosh; Intel Mac OS X 10_13_2", 363 | "Macintosh; Intel Mac OS X 10_13_3", 364 | "Macintosh; Intel Mac OS X 10_13_4", 365 | "Macintosh; Intel Mac OS X 10_13_5", 366 | "Macintosh; Intel Mac OS X 10_13_6", 367 | 368 | // MacOS - Mojave 369 | "Macintosh; Intel Mac OS X 10_14", 370 | "Macintosh; Intel Mac OS X 10_14_1", 371 | "Macintosh; Intel Mac OS X 10_14_2", 372 | "Macintosh; Intel Mac OS X 10_14_3", 373 | "Macintosh; Intel Mac OS X 10_14_4", 374 | "Macintosh; Intel Mac OS X 10_14_5", 375 | "Macintosh; Intel Mac OS X 10_14_6", 376 | 377 | // MacOS - Catalina 378 | "Macintosh; Intel Mac OS X 10_15", 379 | "Macintosh; Intel Mac OS X 10_15_1", 380 | "Macintosh; Intel Mac OS X 10_15_2", 381 | "Macintosh; Intel Mac OS X 10_15_3", 382 | "Macintosh; Intel Mac OS X 10_15_4", 383 | "Macintosh; Intel Mac OS X 10_15_5", 384 | "Macintosh; Intel Mac OS X 10_15_6", 385 | "Macintosh; Intel Mac OS X 10_15_7", 386 | 387 | // MacOS - Big Sur 388 | "Macintosh; Intel Mac OS X 11_0", 389 | "Macintosh; Intel Mac OS X 11_0_1", 390 | "Macintosh; Intel Mac OS X 11_1", 391 | "Macintosh; Intel Mac OS X 11_2", 392 | "Macintosh; Intel Mac OS X 11_2_1", 393 | "Macintosh; Intel Mac OS X 11_2_2", 394 | "Macintosh; Intel Mac OS X 11_2_3", 395 | "Macintosh; Intel Mac OS X 11_3", 396 | "Macintosh; Intel Mac OS X 11_3_1", 397 | "Macintosh; Intel Mac OS X 11_4", 398 | "Macintosh; Intel Mac OS X 11_5", 399 | "Macintosh; Intel Mac OS X 11_5_1", 400 | "Macintosh; Intel Mac OS X 11_5_2", 401 | "Macintosh; Intel Mac OS X 11_6", 402 | "Macintosh; Intel Mac OS X 11_6_1", 403 | "Macintosh; Intel Mac OS X 11_6_2", 404 | "Macintosh; Intel Mac OS X 11_6_3", 405 | "Macintosh; Intel Mac OS X 11_6_4", 406 | "Macintosh; Intel Mac OS X 11_6_5", 407 | "Macintosh; Intel Mac OS X 11_6_6", 408 | "Macintosh; Intel Mac OS X 11_6_7", 409 | "Macintosh; Intel Mac OS X 11_6_8", 410 | "Macintosh; Intel Mac OS X 11_7", 411 | "Macintosh; Intel Mac OS X 11_7_1", 412 | "Macintosh; Intel Mac OS X 11_7_2", 413 | "Macintosh; Intel Mac OS X 11_7_3", 414 | "Macintosh; Intel Mac OS X 11_7_4", 415 | "Macintosh; Intel Mac OS X 11_7_5", 416 | "Macintosh; Intel Mac OS X 11_7_6", 417 | 418 | // MacOS - Monterey 419 | "Macintosh; Intel Mac OS X 12_0", 420 | "Macintosh; Intel Mac OS X 12_0_1", 421 | "Macintosh; Intel Mac OS X 12_1", 422 | "Macintosh; Intel Mac OS X 12_2", 423 | "Macintosh; Intel Mac OS X 12_2_1", 424 | "Macintosh; Intel Mac OS X 12_3", 425 | "Macintosh; Intel Mac OS X 12_3_1", 426 | "Macintosh; Intel Mac OS X 12_4", 427 | "Macintosh; Intel Mac OS X 12_5", 428 | "Macintosh; Intel Mac OS X 12_5_1", 429 | "Macintosh; Intel Mac OS X 12_6", 430 | "Macintosh; Intel Mac OS X 12_6_1", 431 | "Macintosh; Intel Mac 
OS X 12_6_2", 432 | "Macintosh; Intel Mac OS X 12_6_3", 433 | "Macintosh; Intel Mac OS X 12_6_4", 434 | "Macintosh; Intel Mac OS X 12_6_5", 435 | 436 | // MacOS - Ventura 437 | "Macintosh; Intel Mac OS X 13_0", 438 | "Macintosh; Intel Mac OS X 13_0_1", 439 | "Macintosh; Intel Mac OS X 13_1", 440 | "Macintosh; Intel Mac OS X 13_2", 441 | "Macintosh; Intel Mac OS X 13_2_1", 442 | "Macintosh; Intel Mac OS X 13_3", 443 | "Macintosh; Intel Mac OS X 13_3_1", 444 | 445 | // Windows 446 | "Windows NT 10.0; Win64; x64", 447 | "Windows NT 5.1", 448 | "Windows NT 6.1; WOW64", 449 | "Windows NT 6.1; Win64; x64", 450 | 451 | // Linux 452 | "X11; Linux x86_64", 453 | } 454 | 455 | // Generates Firefox Browser User-Agent (Desktop) 456 | // 457 | // -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0" 458 | func genFirefoxUA() string { 459 | version := ffVersions[rand.Intn(len(ffVersions))] 460 | os := osStrings[rand.Intn(len(osStrings))] 461 | return fmt.Sprintf("Mozilla/5.0 (%s; rv:%.1f) Gecko/20100101 Firefox/%.1f", os, version, version) 462 | } 463 | 464 | // Generates Chrome Browser User-Agent (Desktop) 465 | // 466 | // -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36" 467 | func genChromeUA() string { 468 | version := chromeVersions[rand.Intn(len(chromeVersions))] 469 | os := osStrings[rand.Intn(len(osStrings))] 470 | return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", os, version) 471 | } 472 | 473 | // Generates Microsoft Edge User-Agent (Desktop) 474 | // 475 | // -> "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.39" 476 | func genEdgeUA() string { 477 | version := edgeVersions[rand.Intn(len(edgeVersions))] 478 | chromeVersion := strings.Split(version, ",")[0] 479 | edgeVersion := strings.Split(version, ",")[1] 480 | os := osStrings[rand.Intn(len(osStrings))] 481 | return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 Edg/%s", os, chromeVersion, edgeVersion) 482 | } 483 | 484 | // Generates Opera Browser User-Agent (Desktop) 485 | // 486 | // -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 OPR/98.0.4759.3" 487 | func genOperaUA() string { 488 | version := operaVersions[rand.Intn(len(operaVersions))] 489 | chromeVersion := strings.Split(version, ",")[0] 490 | operaVersion := strings.Split(version, ",")[1] 491 | os := osStrings[rand.Intn(len(osStrings))] 492 | return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 OPR/%s", os, chromeVersion, operaVersion) 493 | } 494 | 495 | // Generates Pixel 7 Browser User-Agent (Mobile) 496 | // 497 | // -> Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36 498 | func genMobilePixel7UA() string { 499 | android := pixel7AndroidVersions[rand.Intn(len(pixel7AndroidVersions))] 500 | chrome := chromeVersions[rand.Intn(len(chromeVersions))] 501 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) 502 | } 503 | 504 | // Generates Pixel 6 Browser User-Agent (Mobile) 505 | // 506 | // -> "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile 
Safari/537.36" 507 | func genMobilePixel6UA() string { 508 | android := pixel6AndroidVersions[rand.Intn(len(pixel6AndroidVersions))] 509 | chrome := chromeVersions[rand.Intn(len(chromeVersions))] 510 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) 511 | } 512 | 513 | // Generates Pixel 5 Browser User-Agent (Mobile) 514 | // 515 | // -> "Mozilla/5.0 (Linux; Android 13; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36" 516 | func genMobilePixel5UA() string { 517 | android := pixel5AndroidVersions[rand.Intn(len(pixel5AndroidVersions))] 518 | chrome := chromeVersions[rand.Intn(len(chromeVersions))] 519 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) 520 | } 521 | 522 | // Generates Pixel 4 Browser User-Agent (Mobile) 523 | // 524 | // -> "Mozilla/5.0 (Linux; Android 13; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36" 525 | func genMobilePixel4UA() string { 526 | android := pixel4AndroidVersions[rand.Intn(len(pixel4AndroidVersions))] 527 | chrome := chromeVersions[rand.Intn(len(chromeVersions))] 528 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) 529 | } 530 | 531 | // Generates Nexus 10 Browser User-Agent (Mobile) 532 | // 533 | // -> "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 10 Build/LMY48T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.91 Safari/537.36" 534 | func genMobileNexus10UA() string { 535 | build := nexus10Builds[rand.Intn(len(nexus10Builds))] 536 | android := nexus10AndroidVersions[rand.Intn(len(nexus10AndroidVersions))] 537 | chrome := chromeVersions[rand.Intn(len(chromeVersions))] 538 | return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Nexus 10 Build/%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, build, chrome) 539 | } 540 | -------------------------------------------------------------------------------- /extensions/referer.go: -------------------------------------------------------------------------------- 1 | package extensions 2 | 3 | import ( 4 | "github.com/gocolly/colly/v2" 5 | ) 6 | 7 | // Referer sets valid Referer HTTP header to requests. 8 | // Warning: this extension works only if you use Request.Visit 9 | // from callbacks instead of Collector.Visit. 
10 | func Referer(c *colly.Collector) { 11 | c.OnResponse(func(r *colly.Response) { 12 | r.Ctx.Put("_referer", r.Request.URL.String()) 13 | }) 14 | c.OnRequest(func(r *colly.Request) { 15 | if ref := r.Ctx.Get("_referer"); ref != "" { 16 | r.Headers.Set("Referer", ref) 17 | } 18 | }) 19 | } 20 | -------------------------------------------------------------------------------- /extensions/url_length_filter.go: -------------------------------------------------------------------------------- 1 | package extensions 2 | 3 | import ( 4 | "github.com/gocolly/colly/v2" 5 | ) 6 | 7 | // URLLengthFilter filters out requests with URLs longer than URLLengthLimit 8 | func URLLengthFilter(c *colly.Collector, URLLengthLimit int) { 9 | c.OnRequest(func(r *colly.Request) { 10 | if len(r.URL.String()) > URLLengthLimit { 11 | r.Abort() 12 | } 13 | }) 14 | } 15 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/gocolly/colly/v2 2 | 3 | go 1.23.0 4 | 5 | toolchain go1.24.1 6 | 7 | require ( 8 | github.com/PuerkitoBio/goquery v1.10.2 9 | github.com/antchfx/htmlquery v1.3.4 10 | github.com/antchfx/xmlquery v1.4.4 11 | github.com/gobwas/glob v0.2.3 12 | github.com/gocolly/colly v1.2.0 13 | github.com/jawher/mow.cli v1.1.0 14 | github.com/kennygrant/sanitize v1.2.4 15 | github.com/nlnwa/whatwg-url v0.6.1 16 | github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d 17 | github.com/temoto/robotstxt v1.1.2 18 | golang.org/x/net v0.37.0 19 | google.golang.org/appengine v1.6.8 20 | ) 21 | 22 | require ( 23 | github.com/andybalholm/cascadia v1.3.3 // indirect 24 | github.com/antchfx/xpath v1.3.3 // indirect 25 | github.com/bits-and-blooms/bitset v1.22.0 // indirect 26 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect 27 | github.com/golang/protobuf v1.5.4 // indirect 28 | golang.org/x/text v0.23.0 // indirect 29 | google.golang.org/protobuf v1.36.6 // indirect 30 | ) 31 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.10.2 h1:7fh2BdHcG6VFZsK7toXBT/Bh1z5Wmy8Q9MV9HqT2AM8= 2 | github.com/PuerkitoBio/goquery v1.10.2/go.mod h1:0guWGjcLu9AYC7C1GHnpysHy056u9aEkUHwhdnePMCU= 3 | github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= 4 | github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= 5 | github.com/antchfx/htmlquery v1.3.4 h1:Isd0srPkni2iNTWCwVj/72t7uCphFeor5Q8nCzj1jdQ= 6 | github.com/antchfx/htmlquery v1.3.4/go.mod h1:K9os0BwIEmLAvTqaNSua8tXLWRWZpocZIH73OzWQbwM= 7 | github.com/antchfx/xmlquery v1.4.4 h1:mxMEkdYP3pjKSftxss4nUHfjBhnMk4imGoR96FRY2dg= 8 | github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fusrx9b12fc= 9 | github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs= 10 | github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= 11 | github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= 12 | github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= 13 | github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= 14 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 15 | 
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 16 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 17 | github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= 18 | github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= 19 | github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= 20 | github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= 21 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 22 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ= 23 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw= 24 | github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= 25 | github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= 26 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 27 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 28 | github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 29 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 30 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 31 | github.com/jawher/mow.cli v1.1.0 h1:NdtHXRc0CwZQ507wMvQ/IS+Q3W3x2fycn973/b8Zuk8= 32 | github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= 33 | github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= 34 | github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= 35 | github.com/nlnwa/whatwg-url v0.6.1 h1:Zlefa3aglQFHF/jku45VxbEJwPicDnOz64Ra3F7npqQ= 36 | github.com/nlnwa/whatwg-url v0.6.1/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk= 37 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 38 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 39 | github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= 40 | github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= 41 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 42 | github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= 43 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 44 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 45 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= 46 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= 47 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 48 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 49 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 50 | golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= 51 | golang.org/x/crypto v0.19.0/go.mod 
h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 52 | golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= 53 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= 54 | golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= 55 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 56 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 57 | golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 58 | golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 59 | golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 60 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 61 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 62 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 63 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 64 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= 65 | golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= 66 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 67 | golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= 68 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 69 | golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= 70 | golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c= 71 | golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= 72 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 73 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 74 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 75 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= 76 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 77 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 78 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 79 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 80 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 81 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 82 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 83 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 84 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 85 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 86 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 87 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 88 | golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 89 | golang.org/x/sys v0.28.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 90 | golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 91 | golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= 92 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 93 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 94 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 95 | golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= 96 | golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= 97 | golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= 98 | golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= 99 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= 100 | golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= 101 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 102 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 103 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 104 | golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= 105 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 106 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 107 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 108 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 109 | golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 110 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 111 | golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= 112 | golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= 113 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 114 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 115 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 116 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 117 | golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= 118 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= 119 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 120 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 121 | google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= 122 | google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= 123 | google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= 124 | google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= 125 | google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= 126 | google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= 127 | 
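The module and checksum files above pin the dependency set; downstream code imports the path github.com/gocolly/colly/v2 declared there. Below is a minimal, illustrative sketch of driving the public v2 API (NewCollector, OnHTML, OnError and Visit, all declared elsewhere in this repository); the start URL is a placeholder, not part of the project.

package main

import (
	"fmt"
	"log"

	"github.com/gocolly/colly/v2"
)

func main() {
	// Create a collector with default settings.
	c := colly.NewCollector()

	// Print the absolute URL of every link found on fetched pages.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		fmt.Println(e.Request.AbsoluteURL(e.Attr("href")))
	})

	// Report failed requests.
	c.OnError(func(r *colly.Response, err error) {
		log.Println("request failed:", err)
	})

	// Start crawling from the placeholder URL.
	if err := c.Visit("https://example.com/"); err != nil {
		log.Fatal(err)
	}
}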
-------------------------------------------------------------------------------- /htmlelement.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly 16 | 17 | import ( 18 | "strings" 19 | 20 | "github.com/PuerkitoBio/goquery" 21 | "golang.org/x/net/html" 22 | ) 23 | 24 | // HTMLElement is the representation of a HTML tag. 25 | type HTMLElement struct { 26 | // Name is the name of the tag 27 | Name string 28 | Text string 29 | attributes []html.Attribute 30 | // Request is the request object of the element's HTML document 31 | Request *Request 32 | // Response is the Response object of the element's HTML document 33 | Response *Response 34 | // DOM is the goquery parsed DOM object of the page. DOM is relative 35 | // to the current HTMLElement 36 | DOM *goquery.Selection 37 | // Index stores the position of the current element within all the elements matched by an OnHTML callback 38 | Index int 39 | } 40 | 41 | // NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node. 42 | func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement { 43 | return &HTMLElement{ 44 | Name: n.Data, 45 | Request: resp.Request, 46 | Response: resp, 47 | Text: goquery.NewDocumentFromNode(n).Text(), 48 | DOM: s, 49 | Index: idx, 50 | attributes: n.Attr, 51 | } 52 | } 53 | 54 | // Attr returns the selected attribute of a HTMLElement or empty string 55 | // if no attribute found 56 | func (h *HTMLElement) Attr(k string) string { 57 | for _, a := range h.attributes { 58 | if a.Key == k { 59 | return a.Val 60 | } 61 | } 62 | return "" 63 | } 64 | 65 | // ChildText returns the concatenated and stripped text content of the matching 66 | // elements. 67 | func (h *HTMLElement) ChildText(goquerySelector string) string { 68 | return strings.TrimSpace(h.DOM.Find(goquerySelector).Text()) 69 | } 70 | 71 | // ChildTexts returns the stripped text content of all the matching 72 | // elements. 73 | func (h *HTMLElement) ChildTexts(goquerySelector string) []string { 74 | var res []string 75 | h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) { 76 | 77 | res = append(res, strings.TrimSpace(s.Text())) 78 | }) 79 | return res 80 | } 81 | 82 | // ChildAttr returns the stripped text content of the first matching 83 | // element's attribute. 84 | func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string { 85 | if attr, ok := h.DOM.Find(goquerySelector).Attr(attrName); ok { 86 | return strings.TrimSpace(attr) 87 | } 88 | return "" 89 | } 90 | 91 | // ChildAttrs returns the stripped text content of all the matching 92 | // element's attributes. 
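// For example, h.ChildAttrs("a", "href") collects the href value of every
// anchor element below the current element.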
93 | func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string { 94 | var res []string 95 | h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) { 96 | if attr, ok := s.Attr(attrName); ok { 97 | res = append(res, strings.TrimSpace(attr)) 98 | } 99 | }) 100 | return res 101 | } 102 | 103 | // ForEach iterates over the elements matched by the first argument 104 | // and calls the callback function on every HTMLElement match. 105 | func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement)) { 106 | i := 0 107 | h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) { 108 | for _, n := range s.Nodes { 109 | callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i)) 110 | i++ 111 | } 112 | }) 113 | } 114 | 115 | // ForEachWithBreak iterates over the elements matched by the first argument 116 | // and calls the callback function on every HTMLElement match. 117 | // It is identical to ForEach except that it is possible to break 118 | // out of the loop by returning false in the callback function. It returns the 119 | // current Selection object. 120 | func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool) { 121 | i := 0 122 | h.DOM.Find(goquerySelector).EachWithBreak(func(_ int, s *goquery.Selection) bool { 123 | for _, n := range s.Nodes { 124 | if callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i)) { 125 | i++ 126 | return true 127 | } 128 | } 129 | return false 130 | }) 131 | } 132 | -------------------------------------------------------------------------------- /http_backend.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly 16 | 17 | import ( 18 | "crypto/sha1" 19 | "encoding/gob" 20 | "encoding/hex" 21 | "io" 22 | "math/rand" 23 | "net/http" 24 | "os" 25 | "path" 26 | "regexp" 27 | "strings" 28 | "sync" 29 | "time" 30 | 31 | "compress/gzip" 32 | 33 | "github.com/gobwas/glob" 34 | ) 35 | 36 | type httpBackend struct { 37 | LimitRules []*LimitRule 38 | Client *http.Client 39 | lock *sync.RWMutex 40 | } 41 | 42 | type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool 43 | 44 | // LimitRule provides connection restrictions for domains. 45 | // Both DomainRegexp and DomainGlob can be used to specify 46 | // the included domains patterns, but at least one is required. 
47 | // There can be two kind of limitations: 48 | // - Parallelism: Set limit for the number of concurrent requests to matching domains 49 | // - Delay: Wait specified amount of time between requests (parallelism is 1 in this case) 50 | type LimitRule struct { 51 | // DomainRegexp is a regular expression to match against domains 52 | DomainRegexp string 53 | // DomainGlob is a glob pattern to match against domains 54 | DomainGlob string 55 | // Delay is the duration to wait before creating a new request to the matching domains 56 | Delay time.Duration 57 | // RandomDelay is the extra randomized duration to wait added to Delay before creating a new request 58 | RandomDelay time.Duration 59 | // Parallelism is the number of the maximum allowed concurrent requests of the matching domains 60 | Parallelism int 61 | waitChan chan bool 62 | compiledRegexp *regexp.Regexp 63 | compiledGlob glob.Glob 64 | } 65 | 66 | // Init initializes the private members of LimitRule 67 | func (r *LimitRule) Init() error { 68 | waitChanSize := 1 69 | if r.Parallelism > 1 { 70 | waitChanSize = r.Parallelism 71 | } 72 | r.waitChan = make(chan bool, waitChanSize) 73 | hasPattern := false 74 | if r.DomainRegexp != "" { 75 | c, err := regexp.Compile(r.DomainRegexp) 76 | if err != nil { 77 | return err 78 | } 79 | r.compiledRegexp = c 80 | hasPattern = true 81 | } 82 | if r.DomainGlob != "" { 83 | c, err := glob.Compile(r.DomainGlob) 84 | if err != nil { 85 | return err 86 | } 87 | r.compiledGlob = c 88 | hasPattern = true 89 | } 90 | if !hasPattern { 91 | return ErrNoPattern 92 | } 93 | return nil 94 | } 95 | 96 | func (h *httpBackend) Init(jar http.CookieJar) { 97 | rand.Seed(time.Now().UnixNano()) 98 | h.Client = &http.Client{ 99 | Jar: jar, 100 | Timeout: 10 * time.Second, 101 | } 102 | h.lock = &sync.RWMutex{} 103 | } 104 | 105 | // Match checks that the domain parameter triggers the rule 106 | func (r *LimitRule) Match(domain string) bool { 107 | match := false 108 | if r.compiledRegexp != nil && r.compiledRegexp.MatchString(domain) { 109 | match = true 110 | } 111 | if r.compiledGlob != nil && r.compiledGlob.Match(domain) { 112 | match = true 113 | } 114 | return match 115 | } 116 | 117 | func (h *httpBackend) GetMatchingRule(domain string) *LimitRule { 118 | if h.LimitRules == nil { 119 | return nil 120 | } 121 | h.lock.RLock() 122 | defer h.lock.RUnlock() 123 | for _, r := range h.LimitRules { 124 | if r.Match(domain) { 125 | return r 126 | } 127 | } 128 | return nil 129 | } 130 | 131 | func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string, cacheExpiration time.Duration) (*Response, error) { 132 | if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" { 133 | return h.Do(request, bodySize, checkHeadersFunc) 134 | } 135 | sum := sha1.Sum([]byte(request.URL.String())) 136 | hash := hex.EncodeToString(sum[:]) 137 | dir := path.Join(cacheDir, hash[:2]) 138 | filename := path.Join(dir, hash) 139 | 140 | if fileInfo, err := os.Stat(filename); err == nil && cacheExpiration > 0 { 141 | if time.Since(fileInfo.ModTime()) > cacheExpiration { 142 | _ = os.Remove(filename) 143 | } 144 | } 145 | 146 | if file, err := os.Open(filename); err == nil { 147 | resp := new(Response) 148 | err := gob.NewDecoder(file).Decode(resp) 149 | file.Close() 150 | checkHeadersFunc(request, resp.StatusCode, *resp.Headers) 151 | if resp.StatusCode < 500 { 152 | return resp, err 153 | } 154 | } 155 | resp, err := h.Do(request, bodySize, 
checkHeadersFunc) 156 | if err != nil || resp.StatusCode >= 500 { 157 | return resp, err 158 | } 159 | if _, err := os.Stat(dir); err != nil { 160 | if err := os.MkdirAll(dir, 0750); err != nil { 161 | return resp, err 162 | } 163 | } 164 | file, err := os.Create(filename + "~") 165 | if err != nil { 166 | return resp, err 167 | } 168 | if err := gob.NewEncoder(file).Encode(resp); err != nil { 169 | file.Close() 170 | return resp, err 171 | } 172 | file.Close() 173 | return resp, os.Rename(filename+"~", filename) 174 | } 175 | 176 | func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) { 177 | r := h.GetMatchingRule(request.URL.Host) 178 | if r != nil { 179 | r.waitChan <- true 180 | defer func(r *LimitRule) { 181 | randomDelay := time.Duration(0) 182 | if r.RandomDelay != 0 { 183 | randomDelay = time.Duration(rand.Int63n(int64(r.RandomDelay))) 184 | } 185 | time.Sleep(r.Delay + randomDelay) 186 | <-r.waitChan 187 | }(r) 188 | } 189 | 190 | res, err := h.Client.Do(request) 191 | if err != nil { 192 | return nil, err 193 | } 194 | defer res.Body.Close() 195 | 196 | finalRequest := request 197 | if res.Request != nil { 198 | finalRequest = res.Request 199 | } 200 | if !checkHeadersFunc(finalRequest, res.StatusCode, res.Header) { 201 | // closing res.Body (see defer above) without reading it aborts 202 | // the download 203 | return nil, ErrAbortedAfterHeaders 204 | } 205 | 206 | var bodyReader io.Reader = res.Body 207 | if bodySize > 0 { 208 | bodyReader = io.LimitReader(bodyReader, int64(bodySize)) 209 | } 210 | contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding")) 211 | if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) { 212 | bodyReader, err = gzip.NewReader(bodyReader) 213 | if err != nil { 214 | return nil, err 215 | } 216 | defer bodyReader.(*gzip.Reader).Close() 217 | } 218 | body, err := io.ReadAll(bodyReader) 219 | if err != nil { 220 | return nil, err 221 | } 222 | return &Response{ 223 | StatusCode: res.StatusCode, 224 | Body: body, 225 | Headers: &res.Header, 226 | }, nil 227 | } 228 | 229 | func (h *httpBackend) Limit(rule *LimitRule) error { 230 | h.lock.Lock() 231 | if h.LimitRules == nil { 232 | h.LimitRules = make([]*LimitRule, 0, 8) 233 | } 234 | h.LimitRules = append(h.LimitRules, rule) 235 | h.lock.Unlock() 236 | return rule.Init() 237 | } 238 | 239 | func (h *httpBackend) Limits(rules []*LimitRule) error { 240 | for _, r := range rules { 241 | if err := h.Limit(r); err != nil { 242 | return err 243 | } 244 | } 245 | return nil 246 | } 247 | -------------------------------------------------------------------------------- /http_trace.go: -------------------------------------------------------------------------------- 1 | package colly 2 | 3 | import ( 4 | "net/http" 5 | "net/http/httptrace" 6 | "time" 7 | ) 8 | 9 | // HTTPTrace provides a datastructure for storing an http trace. 10 | type HTTPTrace struct { 11 | start, connect time.Time 12 | ConnectDuration time.Duration 13 | FirstByteDuration time.Duration 14 | } 15 | 16 | // trace returns a httptrace.ClientTrace object to be used with an http 17 | // request via httptrace.WithClientTrace() that fills in the HttpTrace. 
18 | func (ht *HTTPTrace) trace() *httptrace.ClientTrace { 19 | trace := &httptrace.ClientTrace{ 20 | ConnectStart: func(network, addr string) { ht.connect = time.Now() }, 21 | ConnectDone: func(network, addr string, err error) { 22 | ht.ConnectDuration = time.Since(ht.connect) 23 | }, 24 | 25 | GetConn: func(hostPort string) { ht.start = time.Now() }, 26 | GotFirstResponseByte: func() { 27 | ht.FirstByteDuration = time.Since(ht.start) 28 | }, 29 | } 30 | return trace 31 | } 32 | 33 | // WithTrace returns the given HTTP Request with this HTTPTrace added to its 34 | // context. 35 | func (ht *HTTPTrace) WithTrace(req *http.Request) *http.Request { 36 | return req.WithContext(httptrace.WithClientTrace(req.Context(), ht.trace())) 37 | } 38 | -------------------------------------------------------------------------------- /http_trace_test.go: -------------------------------------------------------------------------------- 1 | package colly 2 | 3 | import ( 4 | "net/http" 5 | "net/http/httptest" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | const testDelay = 200 * time.Millisecond 11 | 12 | func newTraceTestServer(delay time.Duration) *httptest.Server { 13 | mux := http.NewServeMux() 14 | 15 | mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { 16 | time.Sleep(delay) 17 | w.WriteHeader(200) 18 | }) 19 | mux.HandleFunc("/error", func(w http.ResponseWriter, r *http.Request) { 20 | time.Sleep(delay) 21 | w.WriteHeader(500) 22 | }) 23 | 24 | return httptest.NewServer(mux) 25 | } 26 | 27 | func TestTraceWithNoDelay(t *testing.T) { 28 | ts := newTraceTestServer(0) 29 | defer ts.Close() 30 | 31 | client := ts.Client() 32 | req, err := http.NewRequest("GET", ts.URL, nil) 33 | if err != nil { 34 | t.Errorf("Failed to construct request %v", err) 35 | } 36 | trace := &HTTPTrace{} 37 | req = trace.WithTrace(req) 38 | 39 | if _, err = client.Do(req); err != nil { 40 | t.Errorf("Failed to make request %v", err) 41 | } 42 | 43 | if trace.ConnectDuration > testDelay { 44 | t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration) 45 | } 46 | if trace.FirstByteDuration > testDelay { 47 | t.Errorf("trace FirstByteDuration should be (almost) 0, got %v", trace.FirstByteDuration) 48 | } 49 | } 50 | 51 | func TestTraceWithDelay(t *testing.T) { 52 | ts := newTraceTestServer(testDelay) 53 | defer ts.Close() 54 | 55 | client := ts.Client() 56 | req, err := http.NewRequest("GET", ts.URL, nil) 57 | if err != nil { 58 | t.Errorf("Failed to construct request %v", err) 59 | } 60 | trace := &HTTPTrace{} 61 | req = trace.WithTrace(req) 62 | 63 | if _, err = client.Do(req); err != nil { 64 | t.Errorf("Failed to make request %v", err) 65 | } 66 | 67 | if trace.ConnectDuration > testDelay { 68 | t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration) 69 | } 70 | if trace.FirstByteDuration < testDelay { 71 | t.Errorf("trace FirstByteDuration should be at least 200ms, got %v", trace.FirstByteDuration) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /proxy/proxy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package proxy 16 | 17 | import ( 18 | "context" 19 | "net/http" 20 | "net/url" 21 | "sync/atomic" 22 | 23 | "github.com/gocolly/colly/v2" 24 | ) 25 | 26 | type roundRobinSwitcher struct { 27 | proxyURLs []*url.URL 28 | index uint32 29 | } 30 | 31 | func (r *roundRobinSwitcher) GetProxy(pr *http.Request) (*url.URL, error) { 32 | index := atomic.AddUint32(&r.index, 1) - 1 33 | u := r.proxyURLs[index%uint32(len(r.proxyURLs))] 34 | 35 | ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, u.String()) 36 | *pr = *pr.WithContext(ctx) 37 | return u, nil 38 | } 39 | 40 | // RoundRobinProxySwitcher creates a proxy switcher function which rotates 41 | // ProxyURLs on every request. 42 | // The proxy type is determined by the URL scheme. "http", "https" 43 | // and "socks5" are supported. If the scheme is empty, 44 | // "http" is assumed. 45 | func RoundRobinProxySwitcher(ProxyURLs ...string) (colly.ProxyFunc, error) { 46 | if len(ProxyURLs) < 1 { 47 | return nil, colly.ErrEmptyProxyURL 48 | } 49 | urls := make([]*url.URL, len(ProxyURLs)) 50 | for i, u := range ProxyURLs { 51 | parsedU, err := url.Parse(u) 52 | if err != nil { 53 | return nil, err 54 | } 55 | urls[i] = parsedU 56 | } 57 | return (&roundRobinSwitcher{urls, 0}).GetProxy, nil 58 | } 59 | -------------------------------------------------------------------------------- /queue/queue.go: -------------------------------------------------------------------------------- 1 | package queue 2 | 3 | import ( 4 | "net/url" 5 | "sync" 6 | 7 | whatwgUrl "github.com/nlnwa/whatwg-url/url" 8 | 9 | "github.com/gocolly/colly/v2" 10 | ) 11 | 12 | const stop = true 13 | 14 | var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign()) 15 | 16 | // Storage is the interface of the queue's storage backend 17 | // Storage must be concurrently safe for multiple goroutines. 18 | type Storage interface { 19 | // Init initializes the storage 20 | Init() error 21 | // AddRequest adds a serialized request to the queue 22 | AddRequest([]byte) error 23 | // GetRequest pops the next request from the queue 24 | // or returns error if the queue is empty 25 | GetRequest() ([]byte, error) 26 | // QueueSize returns with the size of the queue 27 | QueueSize() (int, error) 28 | } 29 | 30 | // Queue is a request queue which uses a Collector to consume 31 | // requests in multiple threads 32 | type Queue struct { 33 | // Threads defines the number of consumer threads 34 | Threads int 35 | storage Storage 36 | wake chan struct{} 37 | mut sync.Mutex // guards wake and running 38 | running bool 39 | } 40 | 41 | // InMemoryQueueStorage is the default implementation of the Storage interface. 42 | // InMemoryQueueStorage holds the request queue in memory. 43 | type InMemoryQueueStorage struct { 44 | // MaxSize defines the capacity of the queue. 
45 | // New requests are discarded if the queue size reaches MaxSize 46 | MaxSize int 47 | lock *sync.RWMutex 48 | size int 49 | first *inMemoryQueueItem 50 | last *inMemoryQueueItem 51 | } 52 | 53 | type inMemoryQueueItem struct { 54 | Request []byte 55 | Next *inMemoryQueueItem 56 | } 57 | 58 | // New creates a new queue with a Storage specified in argument 59 | // A standard InMemoryQueueStorage is used if Storage argument is nil. 60 | func New(threads int, s Storage) (*Queue, error) { 61 | if s == nil { 62 | s = &InMemoryQueueStorage{MaxSize: 100000} 63 | } 64 | if err := s.Init(); err != nil { 65 | return nil, err 66 | } 67 | return &Queue{ 68 | Threads: threads, 69 | storage: s, 70 | running: true, 71 | }, nil 72 | } 73 | 74 | // IsEmpty returns true if the queue is empty 75 | func (q *Queue) IsEmpty() bool { 76 | s, _ := q.Size() 77 | return s == 0 78 | } 79 | 80 | // AddURL adds a new URL to the queue 81 | func (q *Queue) AddURL(URL string) error { 82 | u, err := urlParser.Parse(URL) 83 | if err != nil { 84 | return err 85 | } 86 | u2, err := url.Parse(u.Href(false)) 87 | if err != nil { 88 | return err 89 | } 90 | r := &colly.Request{ 91 | URL: u2, 92 | Method: "GET", 93 | } 94 | d, err := r.Marshal() 95 | if err != nil { 96 | return err 97 | } 98 | return q.storage.AddRequest(d) 99 | } 100 | 101 | // AddRequest adds a new Request to the queue 102 | func (q *Queue) AddRequest(r *colly.Request) error { 103 | q.mut.Lock() 104 | waken := q.wake != nil 105 | q.mut.Unlock() 106 | if !waken { 107 | return q.storeRequest(r) 108 | } 109 | err := q.storeRequest(r) 110 | if err != nil { 111 | return err 112 | } 113 | q.wake <- struct{}{} 114 | return nil 115 | } 116 | 117 | func (q *Queue) storeRequest(r *colly.Request) error { 118 | d, err := r.Marshal() 119 | if err != nil { 120 | return err 121 | } 122 | return q.storage.AddRequest(d) 123 | } 124 | 125 | // Size returns the size of the queue 126 | func (q *Queue) Size() (int, error) { 127 | return q.storage.QueueSize() 128 | } 129 | 130 | // Run starts consumer threads and calls the Collector 131 | // to perform requests. Run blocks while the queue has active requests 132 | // The given Storage must not be used directly while Run blocks. 133 | func (q *Queue) Run(c *colly.Collector) error { 134 | q.mut.Lock() 135 | if q.wake != nil && q.running == true { 136 | q.mut.Unlock() 137 | panic("cannot call duplicate Queue.Run") 138 | } 139 | q.wake = make(chan struct{}) 140 | q.running = true 141 | q.mut.Unlock() 142 | 143 | requestc := make(chan *colly.Request) 144 | complete, errc := make(chan struct{}), make(chan error, 1) 145 | for i := 0; i < q.Threads; i++ { 146 | go independentRunner(requestc, complete) 147 | } 148 | go q.loop(c, requestc, complete, errc) 149 | defer close(requestc) 150 | return <-errc 151 | } 152 | 153 | // Stop will stop the running queue 154 | func (q *Queue) Stop() { 155 | q.mut.Lock() 156 | q.running = false 157 | q.mut.Unlock() 158 | } 159 | 160 | func (q *Queue) loop(c *colly.Collector, requestc chan<- *colly.Request, complete <-chan struct{}, errc chan<- error) { 161 | var active int 162 | for { 163 | size, err := q.storage.QueueSize() 164 | if err != nil { 165 | errc <- err 166 | break 167 | } 168 | if size == 0 && active == 0 || !q.running { 169 | // Terminate when 170 | // 1. No active requests 171 | // 2. 
Empty queue 172 | errc <- nil 173 | break 174 | } 175 | sent := requestc 176 | var req *colly.Request 177 | if size > 0 { 178 | req, err = q.loadRequest(c) 179 | if err != nil { 180 | // ignore an error returned by GetRequest() or 181 | // UnmarshalRequest() 182 | continue 183 | } 184 | } else { 185 | sent = nil 186 | } 187 | Sent: 188 | for { 189 | select { 190 | case sent <- req: 191 | active++ 192 | break Sent 193 | case <-q.wake: 194 | if sent == nil { 195 | break Sent 196 | } 197 | case <-complete: 198 | active-- 199 | if sent == nil && active == 0 { 200 | break Sent 201 | } 202 | } 203 | } 204 | } 205 | } 206 | 207 | func independentRunner(requestc <-chan *colly.Request, complete chan<- struct{}) { 208 | for req := range requestc { 209 | req.Do() 210 | complete <- struct{}{} 211 | } 212 | } 213 | 214 | func (q *Queue) loadRequest(c *colly.Collector) (*colly.Request, error) { 215 | buf, err := q.storage.GetRequest() 216 | if err != nil { 217 | return nil, err 218 | } 219 | copied := make([]byte, len(buf)) 220 | copy(copied, buf) 221 | return c.UnmarshalRequest(copied) 222 | } 223 | 224 | // Init implements Storage.Init() function 225 | func (q *InMemoryQueueStorage) Init() error { 226 | q.lock = &sync.RWMutex{} 227 | return nil 228 | } 229 | 230 | // AddRequest implements Storage.AddRequest() function 231 | func (q *InMemoryQueueStorage) AddRequest(r []byte) error { 232 | q.lock.Lock() 233 | defer q.lock.Unlock() 234 | // Discard URLs if size limit exceeded 235 | if q.MaxSize > 0 && q.size >= q.MaxSize { 236 | return colly.ErrQueueFull 237 | } 238 | i := &inMemoryQueueItem{Request: r} 239 | if q.first == nil { 240 | q.first = i 241 | } else { 242 | q.last.Next = i 243 | } 244 | q.last = i 245 | q.size++ 246 | return nil 247 | } 248 | 249 | // GetRequest implements Storage.GetRequest() function 250 | func (q *InMemoryQueueStorage) GetRequest() ([]byte, error) { 251 | q.lock.Lock() 252 | defer q.lock.Unlock() 253 | if q.size == 0 { 254 | return nil, nil 255 | } 256 | r := q.first.Request 257 | q.first = q.first.Next 258 | q.size-- 259 | return r, nil 260 | } 261 | 262 | // QueueSize implements Storage.QueueSize() function 263 | func (q *InMemoryQueueStorage) QueueSize() (int, error) { 264 | q.lock.Lock() 265 | defer q.lock.Unlock() 266 | return q.size, nil 267 | } 268 | -------------------------------------------------------------------------------- /queue/queue_test.go: -------------------------------------------------------------------------------- 1 | package queue 2 | 3 | import ( 4 | "math/rand" 5 | "net/http" 6 | "net/http/httptest" 7 | "sync" 8 | "sync/atomic" 9 | "testing" 10 | "time" 11 | 12 | "github.com/gocolly/colly/v2" 13 | ) 14 | 15 | func TestQueue(t *testing.T) { 16 | server := httptest.NewServer(http.HandlerFunc(serverHandler)) 17 | defer server.Close() 18 | 19 | rng := rand.New(rand.NewSource(12387123712321232)) 20 | var rngMu sync.Mutex 21 | 22 | var ( 23 | items uint32 24 | requests uint32 25 | success uint32 26 | failure uint32 27 | ) 28 | storage := &InMemoryQueueStorage{MaxSize: 100000} 29 | q, err := New(10, storage) 30 | if err != nil { 31 | panic(err) 32 | } 33 | put := func() { 34 | rngMu.Lock() 35 | t := time.Duration(rng.Intn(50)) * time.Microsecond 36 | rngMu.Unlock() 37 | url := server.URL + "/delay?t=" + t.String() 38 | atomic.AddUint32(&items, 1) 39 | q.AddURL(url) 40 | } 41 | for i := 0; i < 3000; i++ { 42 | put() 43 | storage.AddRequest([]byte("error request")) 44 | } 45 | c := colly.NewCollector( 46 | colly.AllowURLRevisit(), 47 | ) 48 | 
c.OnRequest(func(req *colly.Request) { 49 | atomic.AddUint32(&requests, 1) 50 | }) 51 | c.OnResponse(func(resp *colly.Response) { 52 | if resp.StatusCode == http.StatusOK { 53 | atomic.AddUint32(&success, 1) 54 | } else { 55 | atomic.AddUint32(&failure, 1) 56 | } 57 | rngMu.Lock() 58 | toss := rng.Intn(2) == 0 59 | rngMu.Unlock() 60 | if toss { 61 | put() 62 | } 63 | }) 64 | c.OnError(func(resp *colly.Response, err error) { 65 | atomic.AddUint32(&failure, 1) 66 | }) 67 | err = q.Run(c) 68 | if err != nil { 69 | t.Fatalf("Queue.Run() return an error: %v", err) 70 | } 71 | if items != requests || success+failure != requests || failure > 0 { 72 | t.Fatalf("wrong Queue implementation: "+ 73 | "items = %d, requests = %d, success = %d, failure = %d", 74 | items, requests, success, failure) 75 | } 76 | } 77 | 78 | func serverHandler(w http.ResponseWriter, req *http.Request) { 79 | if !serverRoute(w, req) { 80 | shutdown(w) 81 | } 82 | } 83 | 84 | func serverRoute(w http.ResponseWriter, req *http.Request) bool { 85 | if req.URL.Path == "/delay" { 86 | return serveDelay(w, req) == nil 87 | } 88 | return false 89 | } 90 | 91 | func serveDelay(w http.ResponseWriter, req *http.Request) error { 92 | q := req.URL.Query() 93 | t, err := time.ParseDuration(q.Get("t")) 94 | if err != nil { 95 | return err 96 | } 97 | time.Sleep(t) 98 | w.WriteHeader(http.StatusOK) 99 | return nil 100 | } 101 | 102 | func shutdown(w http.ResponseWriter) { 103 | taker, ok := w.(http.Hijacker) 104 | if !ok { 105 | return 106 | } 107 | raw, _, err := taker.Hijack() 108 | if err != nil { 109 | return 110 | } 111 | raw.Close() 112 | } 113 | -------------------------------------------------------------------------------- /request.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly 16 | 17 | import ( 18 | "bytes" 19 | "encoding/json" 20 | "io" 21 | "net/http" 22 | "net/url" 23 | "strings" 24 | "sync/atomic" 25 | ) 26 | 27 | // Request is the representation of a HTTP request made by a Collector 28 | type Request struct { 29 | // URL is the parsed URL of the HTTP request 30 | URL *url.URL 31 | // Headers contains the Request's HTTP headers 32 | Headers *http.Header 33 | // the Host header 34 | Host string 35 | // Ctx is a context between a Request and a Response 36 | Ctx *Context 37 | // Depth is the number of the parents of the request 38 | Depth int 39 | // Method is the HTTP method of the request 40 | Method string 41 | // Body is the request body which is used on POST/PUT requests 42 | Body io.Reader 43 | // ResponseCharacterencoding is the character encoding of the response body. 44 | // Leave it blank to allow automatic character encoding of the response body. 45 | // It is empty by default and it can be set in OnRequest callback. 
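// For example: r.ResponseCharacterEncoding = "windows-1251" for a page whose
// encoding is known in advance.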
46 | ResponseCharacterEncoding string 47 | // ID is the Unique identifier of the request 48 | ID uint32 49 | collector *Collector 50 | abort bool 51 | baseURL *url.URL 52 | // ProxyURL is the proxy address that handles the request 53 | ProxyURL string 54 | } 55 | 56 | type serializableRequest struct { 57 | URL string 58 | Method string 59 | Depth int 60 | Body []byte 61 | ID uint32 62 | Ctx map[string]interface{} 63 | Headers http.Header 64 | Host string 65 | } 66 | 67 | // New creates a new request with the context of the original request 68 | func (r *Request) New(method, URL string, body io.Reader) (*Request, error) { 69 | u, err := urlParser.Parse(URL) 70 | if err != nil { 71 | return nil, err 72 | } 73 | u2, err := url.Parse(u.Href(false)) 74 | if err != nil { 75 | return nil, err 76 | } 77 | return &Request{ 78 | Method: method, 79 | URL: u2, 80 | Body: body, 81 | Ctx: r.Ctx, 82 | Headers: &http.Header{}, 83 | Host: r.Host, 84 | ID: atomic.AddUint32(&r.collector.requestCount, 1), 85 | collector: r.collector, 86 | }, nil 87 | } 88 | 89 | // Abort cancels the HTTP request when called in an OnRequest callback 90 | func (r *Request) Abort() { 91 | r.abort = true 92 | } 93 | 94 | // AbsoluteURL returns with the resolved absolute URL of an URL chunk. 95 | // AbsoluteURL returns empty string if the URL chunk is a fragment or 96 | // could not be parsed 97 | func (r *Request) AbsoluteURL(u string) string { 98 | if strings.HasPrefix(u, "#") { 99 | return "" 100 | } 101 | var base *url.URL 102 | if r.baseURL != nil { 103 | base = r.baseURL 104 | } else { 105 | base = r.URL 106 | } 107 | 108 | absURL, err := urlParser.ParseRef(base.String(), u) 109 | if err != nil { 110 | return "" 111 | } 112 | return absURL.Href(false) 113 | } 114 | 115 | // Visit continues Collector's collecting job by creating a 116 | // request and preserves the Context of the previous request. 117 | // Visit also calls the previously provided callbacks 118 | func (r *Request) Visit(URL string) error { 119 | return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx, nil, true) 120 | } 121 | 122 | // HasVisited checks if the provided URL has been visited 123 | func (r *Request) HasVisited(URL string) (bool, error) { 124 | return r.collector.HasVisited(URL) 125 | } 126 | 127 | // Post continues a collector job by creating a POST request and preserves the Context 128 | // of the previous request. 129 | // Post also calls the previously provided callbacks 130 | func (r *Request) Post(URL string, requestData map[string]string) error { 131 | return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createFormReader(requestData), r.Ctx, nil, true) 132 | } 133 | 134 | // PostRaw starts a collector job by creating a POST request with raw binary data. 135 | // PostRaw preserves the Context of the previous request 136 | // and calls the previously provided callbacks 137 | func (r *Request) PostRaw(URL string, requestData []byte) error { 138 | return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, bytes.NewReader(requestData), r.Ctx, nil, true) 139 | } 140 | 141 | // PostMultipart starts a collector job by creating a Multipart POST request 142 | // with raw binary data. PostMultipart also calls the previously provided. 
143 | // callbacks 144 | func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error { 145 | boundary := randomBoundary() 146 | hdr := http.Header{} 147 | hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary) 148 | hdr.Set("User-Agent", r.collector.UserAgent) 149 | return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createMultipartReader(boundary, requestData), r.Ctx, hdr, true) 150 | } 151 | 152 | // Retry submits HTTP request again with the same parameters 153 | func (r *Request) Retry() error { 154 | r.Headers.Del("Cookie") 155 | if _, ok := r.Body.(io.ReadSeeker); r.Body != nil && !ok { 156 | return ErrRetryBodyUnseekable 157 | } 158 | return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false) 159 | } 160 | 161 | // Do submits the request 162 | func (r *Request) Do() error { 163 | return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, !r.collector.AllowURLRevisit) 164 | } 165 | 166 | // Marshal serializes the Request 167 | func (r *Request) Marshal() ([]byte, error) { 168 | ctx := make(map[string]interface{}) 169 | if r.Ctx != nil { 170 | r.Ctx.ForEach(func(k string, v interface{}) interface{} { 171 | ctx[k] = v 172 | return nil 173 | }) 174 | } 175 | var err error 176 | var body []byte 177 | if r.Body != nil { 178 | body, err = io.ReadAll(r.Body) 179 | if err != nil { 180 | return nil, err 181 | } 182 | } 183 | sr := &serializableRequest{ 184 | URL: r.URL.String(), 185 | Host: r.Host, 186 | Method: r.Method, 187 | Depth: r.Depth, 188 | Body: body, 189 | ID: r.ID, 190 | Ctx: ctx, 191 | } 192 | if r.Headers != nil { 193 | sr.Headers = *r.Headers 194 | } 195 | return json.Marshal(sr) 196 | } 197 | -------------------------------------------------------------------------------- /response.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "io" 21 | "mime" 22 | "net/http" 23 | "os" 24 | "strings" 25 | 26 | "github.com/saintfish/chardet" 27 | "golang.org/x/net/html/charset" 28 | ) 29 | 30 | // Response is the representation of a HTTP response made by a Collector 31 | type Response struct { 32 | // StatusCode is the status code of the Response 33 | StatusCode int 34 | // Body is the content of the Response 35 | Body []byte 36 | // Ctx is a context between a Request and a Response 37 | Ctx *Context 38 | // Request is the Request object of the response 39 | Request *Request 40 | // Headers contains the Response's HTTP headers 41 | Headers *http.Header 42 | // Trace contains the HTTPTrace for the request. Will only be set by the 43 | // collector if Collector.TraceHTTP is set to true. 
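// When tracing is enabled, an OnResponse callback can read
// r.Trace.ConnectDuration and r.Trace.FirstByteDuration.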
44 | Trace *HTTPTrace 45 | } 46 | 47 | // Save writes response body to disk 48 | func (r *Response) Save(fileName string) error { 49 | return os.WriteFile(fileName, r.Body, 0644) 50 | } 51 | 52 | // FileName returns the sanitized file name parsed from "Content-Disposition" 53 | // header or from URL 54 | func (r *Response) FileName() string { 55 | _, params, err := mime.ParseMediaType(r.Headers.Get("Content-Disposition")) 56 | if fName, ok := params["filename"]; ok && err == nil { 57 | return SanitizeFileName(fName) 58 | } 59 | if r.Request.URL.RawQuery != "" { 60 | return SanitizeFileName(fmt.Sprintf("%s_%s", r.Request.URL.Path, r.Request.URL.RawQuery)) 61 | } 62 | return SanitizeFileName(strings.TrimPrefix(r.Request.URL.Path, "/")) 63 | } 64 | 65 | func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error { 66 | if len(r.Body) == 0 { 67 | return nil 68 | } 69 | if defaultEncoding != "" { 70 | tmpBody, err := encodeBytes(r.Body, "text/plain; charset="+defaultEncoding) 71 | if err != nil { 72 | return err 73 | } 74 | r.Body = tmpBody 75 | return nil 76 | } 77 | contentType := strings.ToLower(r.Headers.Get("Content-Type")) 78 | 79 | if strings.Contains(contentType, "image/") || 80 | strings.Contains(contentType, "video/") || 81 | strings.Contains(contentType, "audio/") || 82 | strings.Contains(contentType, "font/") { 83 | // These MIME types should not have textual data. 84 | 85 | return nil 86 | } 87 | 88 | if !strings.Contains(contentType, "charset") { 89 | if !detectCharset { 90 | return nil 91 | } 92 | d := chardet.NewTextDetector() 93 | r, err := d.DetectBest(r.Body) 94 | if err != nil { 95 | return err 96 | } 97 | contentType = "text/plain; charset=" + r.Charset 98 | } 99 | if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") { 100 | return nil 101 | } 102 | tmpBody, err := encodeBytes(r.Body, contentType) 103 | if err != nil { 104 | return err 105 | } 106 | r.Body = tmpBody 107 | return nil 108 | } 109 | 110 | func encodeBytes(b []byte, contentType string) ([]byte, error) { 111 | r, err := charset.NewReader(bytes.NewReader(b), contentType) 112 | if err != nil { 113 | return nil, err 114 | } 115 | return io.ReadAll(r) 116 | } 117 | -------------------------------------------------------------------------------- /storage/storage.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package storage 16 | 17 | import ( 18 | "net/http" 19 | "net/http/cookiejar" 20 | "net/url" 21 | "strings" 22 | "sync" 23 | ) 24 | 25 | // Storage is an interface which handles Collector's internal data, 26 | // like visited urls and cookies. 27 | // The default Storage of the Collector is the InMemoryStorage. 28 | // Collector's storage can be changed by calling Collector.SetStorage() 29 | // function. 
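// A user-defined implementation (for example one backed by Redis or SQL)
// only needs to satisfy the methods below and can then be attached with
// Collector.SetStorage().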
30 | type Storage interface { 31 | // Init initializes the storage 32 | Init() error 33 | // Visited receives and stores a request ID that is visited by the Collector 34 | Visited(requestID uint64) error 35 | // IsVisited returns true if the request was visited before IsVisited 36 | // is called 37 | IsVisited(requestID uint64) (bool, error) 38 | // Cookies retrieves stored cookies for a given host 39 | Cookies(u *url.URL) string 40 | // SetCookies stores cookies for a given host 41 | SetCookies(u *url.URL, cookies string) 42 | } 43 | 44 | // InMemoryStorage is the default storage backend of colly. 45 | // InMemoryStorage keeps cookies and visited urls in memory 46 | // without persisting data on the disk. 47 | type InMemoryStorage struct { 48 | visitedURLs map[uint64]bool 49 | lock *sync.RWMutex 50 | jar *cookiejar.Jar 51 | } 52 | 53 | // Init initializes InMemoryStorage 54 | func (s *InMemoryStorage) Init() error { 55 | if s.visitedURLs == nil { 56 | s.visitedURLs = make(map[uint64]bool) 57 | } 58 | if s.lock == nil { 59 | s.lock = &sync.RWMutex{} 60 | } 61 | if s.jar == nil { 62 | var err error 63 | s.jar, err = cookiejar.New(nil) 64 | return err 65 | } 66 | return nil 67 | } 68 | 69 | // Visited implements Storage.Visited() 70 | func (s *InMemoryStorage) Visited(requestID uint64) error { 71 | s.lock.Lock() 72 | s.visitedURLs[requestID] = true 73 | s.lock.Unlock() 74 | return nil 75 | } 76 | 77 | // IsVisited implements Storage.IsVisited() 78 | func (s *InMemoryStorage) IsVisited(requestID uint64) (bool, error) { 79 | s.lock.RLock() 80 | visited := s.visitedURLs[requestID] 81 | s.lock.RUnlock() 82 | return visited, nil 83 | } 84 | 85 | // Cookies implements Storage.Cookies() 86 | func (s *InMemoryStorage) Cookies(u *url.URL) string { 87 | return StringifyCookies(s.jar.Cookies(u)) 88 | } 89 | 90 | // SetCookies implements Storage.SetCookies() 91 | func (s *InMemoryStorage) SetCookies(u *url.URL, cookies string) { 92 | s.jar.SetCookies(u, UnstringifyCookies(cookies)) 93 | } 94 | 95 | // Close implements Storage.Close() 96 | func (s *InMemoryStorage) Close() error { 97 | return nil 98 | } 99 | 100 | // StringifyCookies serializes list of http.Cookies to string 101 | func StringifyCookies(cookies []*http.Cookie) string { 102 | // Stringify cookies. 103 | cs := make([]string, len(cookies)) 104 | for i, c := range cookies { 105 | cs[i] = c.String() 106 | } 107 | return strings.Join(cs, "\n") 108 | } 109 | 110 | // UnstringifyCookies deserializes a cookie string to http.Cookies 111 | func UnstringifyCookies(s string) []*http.Cookie { 112 | h := http.Header{} 113 | for _, c := range strings.Split(s, "\n") { 114 | h.Add("Set-Cookie", c) 115 | } 116 | r := http.Response{Header: h} 117 | return r.Cookies() 118 | } 119 | 120 | // ContainsCookie checks if a cookie name is represented in cookies 121 | func ContainsCookie(cookies []*http.Cookie, name string) bool { 122 | for _, c := range cookies { 123 | if c.Name == name { 124 | return true 125 | } 126 | } 127 | return false 128 | } 129 | -------------------------------------------------------------------------------- /unmarshal.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly 16 | 17 | import ( 18 | "errors" 19 | "reflect" 20 | "strings" 21 | 22 | "github.com/PuerkitoBio/goquery" 23 | ) 24 | 25 | // Unmarshal is a shorthand for colly.UnmarshalHTML 26 | func (h *HTMLElement) Unmarshal(v interface{}) error { 27 | return UnmarshalHTML(v, h.DOM, nil) 28 | } 29 | 30 | // UnmarshalWithMap is a shorthand for colly.UnmarshalHTML, extended to allow maps to be passed in. 31 | func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]string) error { 32 | return UnmarshalHTML(v, h.DOM, structMap) 33 | } 34 | 35 | // UnmarshalHTML declaratively extracts text or attributes to a struct from 36 | // HTML response using struct tags composed of css selectors. 37 | // Allowed struct tags: 38 | // - "selector" (required): CSS (goquery) selector of the desired data 39 | // - "attr" (optional): Selects the matching element's attribute's value. 40 | // Leave it blank or omit to get the text of the element. 41 | // 42 | // Example struct declaration: 43 | // 44 | // type Nested struct { 45 | // String string `selector:"div > p"` 46 | // Classes []string `selector:"li" attr:"class"` 47 | // Struct *Nested `selector:"div > div"` 48 | // } 49 | // 50 | // Supported types: struct, *struct, string, []string 51 | func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error { 52 | rv := reflect.ValueOf(v) 53 | 54 | if rv.Kind() != reflect.Ptr || rv.IsNil() { 55 | return errors.New("Invalid type or nil-pointer") 56 | } 57 | 58 | sv := rv.Elem() 59 | st := reflect.TypeOf(v).Elem() 60 | if structMap != nil { 61 | for k, v := range structMap { 62 | attrV := sv.FieldByName(k) 63 | if !attrV.CanAddr() || !attrV.CanSet() { 64 | continue 65 | } 66 | if err := unmarshalSelector(s, attrV, v); err != nil { 67 | return err 68 | } 69 | } 70 | } else { 71 | for i := 0; i < sv.NumField(); i++ { 72 | attrV := sv.Field(i) 73 | if !attrV.CanAddr() || !attrV.CanSet() { 74 | continue 75 | } 76 | if err := unmarshalAttr(s, attrV, st.Field(i)); err != nil { 77 | return err 78 | } 79 | 80 | } 81 | } 82 | 83 | return nil 84 | } 85 | 86 | func unmarshalSelector(s *goquery.Selection, attrV reflect.Value, selector string) error { 87 | //selector is "-" specify that field should ignore. 
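// A "-" selector value marks the field as skipped, mirroring the
// encoding/json convention.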
88 | if selector == "-" { 89 | return nil 90 | } 91 | htmlAttr := "" 92 | // TODO support more types 93 | switch attrV.Kind() { 94 | case reflect.Slice: 95 | if err := unmarshalSlice(s, selector, htmlAttr, attrV); err != nil { 96 | return err 97 | } 98 | case reflect.String: 99 | var val string 100 | if selector == "" && htmlAttr != "" { 101 | val = getDOMValue(s, htmlAttr) 102 | } else { 103 | val = getDOMValue(s.Find(selector), htmlAttr) 104 | } 105 | attrV.Set(reflect.Indirect(reflect.ValueOf(val))) 106 | case reflect.Struct: 107 | if err := unmarshalStruct(s, selector, attrV); err != nil { 108 | return err 109 | } 110 | case reflect.Ptr: 111 | if err := unmarshalPtr(s, selector, attrV); err != nil { 112 | return err 113 | } 114 | default: 115 | return errors.New("Invalid type: " + attrV.String()) 116 | } 117 | return nil 118 | } 119 | 120 | func unmarshalAttr(s *goquery.Selection, attrV reflect.Value, attrT reflect.StructField) error { 121 | selector := attrT.Tag.Get("selector") 122 | //selector is "-" specify that field should ignore. 123 | if selector == "-" { 124 | return nil 125 | } 126 | htmlAttr := attrT.Tag.Get("attr") 127 | // TODO support more types 128 | switch attrV.Kind() { 129 | case reflect.Slice: 130 | if err := unmarshalSlice(s, selector, htmlAttr, attrV); err != nil { 131 | return err 132 | } 133 | case reflect.String: 134 | val := getDOMValue(s.Find(selector), htmlAttr) 135 | attrV.Set(reflect.Indirect(reflect.ValueOf(val))) 136 | case reflect.Struct: 137 | if err := unmarshalStruct(s, selector, attrV); err != nil { 138 | return err 139 | } 140 | case reflect.Ptr: 141 | if err := unmarshalPtr(s, selector, attrV); err != nil { 142 | return err 143 | } 144 | default: 145 | return errors.New("Invalid type: " + attrV.String()) 146 | } 147 | return nil 148 | } 149 | 150 | func unmarshalStruct(s *goquery.Selection, selector string, attrV reflect.Value) error { 151 | newS := s 152 | if selector != "" { 153 | newS = newS.Find(selector) 154 | } 155 | if newS.Nodes == nil { 156 | return nil 157 | } 158 | v := reflect.New(attrV.Type()) 159 | err := UnmarshalHTML(v.Interface(), newS, nil) 160 | if err != nil { 161 | return err 162 | } 163 | attrV.Set(reflect.Indirect(v)) 164 | return nil 165 | } 166 | 167 | func unmarshalPtr(s *goquery.Selection, selector string, attrV reflect.Value) error { 168 | newS := s 169 | if selector != "" { 170 | newS = newS.Find(selector) 171 | } 172 | if newS.Nodes == nil { 173 | return nil 174 | } 175 | e := attrV.Type().Elem() 176 | if e.Kind() != reflect.Struct { 177 | return errors.New("Invalid slice type") 178 | } 179 | v := reflect.New(e) 180 | err := UnmarshalHTML(v.Interface(), newS, nil) 181 | if err != nil { 182 | return err 183 | } 184 | attrV.Set(v) 185 | return nil 186 | } 187 | 188 | func unmarshalSlice(s *goquery.Selection, selector, htmlAttr string, attrV reflect.Value) error { 189 | if attrV.Pointer() == 0 { 190 | v := reflect.MakeSlice(attrV.Type(), 0, 0) 191 | attrV.Set(v) 192 | } 193 | switch attrV.Type().Elem().Kind() { 194 | case reflect.String: 195 | s.Find(selector).Each(func(_ int, s *goquery.Selection) { 196 | val := getDOMValue(s, htmlAttr) 197 | attrV.Set(reflect.Append(attrV, reflect.Indirect(reflect.ValueOf(val)))) 198 | }) 199 | case reflect.Ptr: 200 | s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) { 201 | someVal := reflect.New(attrV.Type().Elem().Elem()) 202 | UnmarshalHTML(someVal.Interface(), innerSel, nil) 203 | attrV.Set(reflect.Append(attrV, someVal)) 204 | }) 205 | case reflect.Struct: 206 | 
s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) { 207 | someVal := reflect.New(attrV.Type().Elem()) 208 | UnmarshalHTML(someVal.Interface(), innerSel, nil) 209 | attrV.Set(reflect.Append(attrV, reflect.Indirect(someVal))) 210 | }) 211 | default: 212 | return errors.New("Invalid slice type") 213 | } 214 | return nil 215 | } 216 | 217 | func getDOMValue(s *goquery.Selection, attr string) string { 218 | if attr == "" { 219 | return strings.TrimSpace(s.First().Text()) 220 | } 221 | attrV, _ := s.Attr(attr) 222 | return attrV 223 | } 224 | -------------------------------------------------------------------------------- /unmarshal_test.go: --------------------------------------------------------------------------------
1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly 16 | 17 | import ( 18 | "bytes" 19 | "testing" 20 | 21 | "github.com/PuerkitoBio/goquery" 22 | ) 23 |
24 | var basicTestData = []byte(`<div><span>item</span><ul><li class="x">1</li><li>2</li><li>3</li></ul></div>`) 25 | var nestedTestData = []byte(`<div><p>a</p><div><p>b</p><div><p>c</p></div></div></div>`) 26 | var pointerSliceTestData = []byte(`<div><ul><li class="info"><span>Info 1</span></li><li class="info"><span>Info 2</span></li></ul></div>`) 27 |
28 | func TestBasicUnmarshal(t *testing.T) { 29 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(basicTestData)) 30 | e := &HTMLElement{ 31 | DOM: doc.First(), 32 | } 33 | s := struct { 34 | String string `selector:"li:first-child" attr:"class"` 35 | Items []string `selector:"li"` 36 | Struct struct { 37 | String string `selector:"li:last-child"` 38 | } 39 | }{} 40 | if err := e.Unmarshal(&s); err != nil { 41 | t.Error("Cannot unmarshal struct: " + err.Error()) 42 | } 43 | if s.String != "x" { 44 | t.Errorf(`Invalid data for String: %q, expected "x"`, s.String) 45 | } 46 | if s.Struct.String != "3" { 47 | t.Errorf(`Invalid data for Struct.String: %q, expected "3"`, s.Struct.String) 48 | } 49 | } 50 |
51 | func TestNestedUnmarshalMap(t *testing.T) { 52 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(nestedTestData)) 53 | e := &HTMLElement{ 54 | DOM: doc.First(), 55 | } 56 | doc2, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(basicTestData)) 57 | e2 := &HTMLElement{ 58 | DOM: doc2.First(), 59 | } 60 | type nested struct { 61 | String string 62 | } 63 | mapSelector := make(map[string]string) 64 | mapSelector["String"] = "div > p" 65 | 66 | mapSelector2 := make(map[string]string) 67 | mapSelector2["String"] = "span" 68 | 69 | s := nested{} 70 | s2 := nested{} 71 | if err := e.UnmarshalWithMap(&s, mapSelector); err != nil { 72 | t.Error("Cannot unmarshal struct: " + err.Error()) 73 | } 74 | if err := e2.UnmarshalWithMap(&s2, mapSelector2); err != nil { 75 | t.Error("Cannot unmarshal struct: " + err.Error()) 76 | } 77 | if s.String != "a" { 78 | t.Errorf(`Invalid data for String: %q, expected "a"`, s.String) 79 | } 80 | if s2.String != "item" { 81 | t.Errorf(`Invalid data for String: %q, expected "item"`, s2.String) 82 | } 83 | } 84 |
85 | func TestNestedUnmarshal(t *testing.T) { 86 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(nestedTestData)) 87 | e := &HTMLElement{ 88 | DOM: doc.First(), 89 | } 90 | type nested struct { 91 | String string `selector:"div > p"` 92 | Struct *nested `selector:"div > div"` 93 | } 94 | s := nested{} 95 | if err := e.Unmarshal(&s); err != nil { 96 | t.Error("Cannot unmarshal struct: " + err.Error()) 97 | } 98 | if s.String != "a" { 99 | t.Errorf(`Invalid data for String: %q, expected "a"`, s.String) 100 | } 101 | if s.Struct.String != "b" { 102 | t.Errorf(`Invalid data for Struct.String: %q, expected "b"`, s.Struct.String) 103 | } 104 | if s.Struct.Struct.String != "c" { 105 | t.Errorf(`Invalid data for Struct.Struct.String: %q, expected "c"`, s.Struct.Struct.String) 106 | } 107 | } 108 |
109 | func TestPointerSliceUnmarshall(t *testing.T) { 110 | type info struct { 111 | Text string `selector:"span"` 112 | } 113 | type object struct { 114 | Info []*info `selector:"li.info"` 115 | } 116 | 117 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(pointerSliceTestData)) 118 | e := HTMLElement{DOM: doc.First()} 119 | o := object{} 120 | err := e.Unmarshal(&o) 121 | if err != nil { 122 | t.Fatalf("Failed to unmarshal page: %s\n", err.Error()) 123 | } 124 | 125 | if len(o.Info) != 2 { 126 | t.Errorf("Invalid length for Info: %d, expected 2", len(o.Info)) 127 | } 128 | if o.Info[0].Text != "Info 1" { 129 | t.Errorf("Invalid data for Info.[0].Text: %s, expected Info 1", o.Info[0].Text) 130 | } 131 | if o.Info[1].Text != "Info 2" { 132 | t.Errorf("Invalid data for Info.[1].Text: %s, expected Info 2", o.Info[1].Text) 133 | } 134 | 135 | } 136 |
137 | func TestStructSliceUnmarshall(t
*testing.T) { 138 | type info struct { 139 | Text string `selector:"span"` 140 | } 141 | type object struct { 142 | Info []info `selector:"li.info"` 143 | } 144 | 145 | doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(pointerSliceTestData)) 146 | e := HTMLElement{DOM: doc.First()} 147 | o := object{} 148 | err := e.Unmarshal(&o) 149 | if err != nil { 150 | t.Fatalf("Failed to unmarshal page: %s\n", err.Error()) 151 | } 152 | 153 | if len(o.Info) != 2 { 154 | t.Errorf("Invalid length for Info: %d, expected 2", len(o.Info)) 155 | } 156 | if o.Info[0].Text != "Info 1" { 157 | t.Errorf("Invalid data for Info.[0].Text: %s, expected Info 1", o.Info[0].Text) 158 | } 159 | if o.Info[1].Text != "Info 2" { 160 | t.Errorf("Invalid data for Info.[1].Text: %s, expected Info 2", o.Info[1].Text) 161 | } 162 | 163 | } 164 | -------------------------------------------------------------------------------- /xmlelement.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly 16 | 17 | import ( 18 | "strings" 19 | 20 | "github.com/antchfx/htmlquery" 21 | "github.com/antchfx/xmlquery" 22 | "golang.org/x/net/html" 23 | ) 24 | 25 | // XMLElement is the representation of a XML tag. 26 | type XMLElement struct { 27 | // Name is the name of the tag 28 | Name string 29 | Text string 30 | attributes interface{} 31 | // Request is the request object of the element's HTML document 32 | Request *Request 33 | // Response is the Response object of the element's HTML document 34 | Response *Response 35 | // DOM is the DOM object of the page. DOM is relative 36 | // to the current XMLElement and is either a html.Node or xmlquery.Node 37 | // based on how the XMLElement was created. 38 | DOM interface{} 39 | isHTML bool 40 | } 41 | 42 | // NewXMLElementFromHTMLNode creates a XMLElement from a html.Node. 43 | func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement { 44 | return &XMLElement{ 45 | Name: s.Data, 46 | Request: resp.Request, 47 | Response: resp, 48 | Text: htmlquery.InnerText(s), 49 | DOM: s, 50 | attributes: s.Attr, 51 | isHTML: true, 52 | } 53 | } 54 | 55 | // NewXMLElementFromXMLNode creates a XMLElement from a xmlquery.Node. 
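//
// Both constructors are normally called by the collector itself; a typical way
// to receive XMLElements is an OnXML callback. Illustrative sketch only; the
// URL and XPath query are placeholders:
//
//	c := colly.NewCollector()
//	c.OnXML("//item/title", func(e *colly.XMLElement) {
//		fmt.Println(e.Text)
//	})
//	c.Visit("https://example.com/feed.xml")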
56 | func NewXMLElementFromXMLNode(resp *Response, s *xmlquery.Node) *XMLElement { 57 | return &XMLElement{ 58 | Name: s.Data, 59 | Request: resp.Request, 60 | Response: resp, 61 | Text: s.InnerText(), 62 | DOM: s, 63 | attributes: s.Attr, 64 | isHTML: false, 65 | } 66 | } 67 | 68 | // Attr returns the selected attribute of a HTMLElement or empty string 69 | // if no attribute found 70 | func (h *XMLElement) Attr(k string) string { 71 | if h.isHTML { 72 | for _, a := range h.attributes.([]html.Attribute) { 73 | if a.Key == k { 74 | return a.Val 75 | } 76 | } 77 | } else { 78 | for _, a := range h.attributes.([]xmlquery.Attr) { 79 | if a.Name.Local == k { 80 | return a.Value 81 | } 82 | } 83 | } 84 | return "" 85 | } 86 | 87 | // ChildText returns the concatenated and stripped text content of the matching 88 | // elements. 89 | func (h *XMLElement) ChildText(xpathQuery string) string { 90 | if h.isHTML { 91 | child := htmlquery.FindOne(h.DOM.(*html.Node), xpathQuery) 92 | if child == nil { 93 | return "" 94 | } 95 | return strings.TrimSpace(htmlquery.InnerText(child)) 96 | } 97 | child := xmlquery.FindOne(h.DOM.(*xmlquery.Node), xpathQuery) 98 | if child == nil { 99 | return "" 100 | } 101 | return strings.TrimSpace(child.InnerText()) 102 | 103 | } 104 | 105 | // ChildAttr returns the stripped text content of the first matching 106 | // element's attribute. 107 | func (h *XMLElement) ChildAttr(xpathQuery, attrName string) string { 108 | if h.isHTML { 109 | child := htmlquery.FindOne(h.DOM.(*html.Node), xpathQuery) 110 | if child != nil { 111 | for _, attr := range child.Attr { 112 | if attr.Key == attrName { 113 | return strings.TrimSpace(attr.Val) 114 | } 115 | } 116 | } 117 | } else { 118 | child := xmlquery.FindOne(h.DOM.(*xmlquery.Node), xpathQuery) 119 | if child != nil { 120 | for _, attr := range child.Attr { 121 | if attr.Name.Local == attrName { 122 | return strings.TrimSpace(attr.Value) 123 | } 124 | } 125 | } 126 | } 127 | 128 | return "" 129 | } 130 | 131 | // ChildAttrs returns the stripped text content of all the matching 132 | // element's attributes. 133 | func (h *XMLElement) ChildAttrs(xpathQuery, attrName string) []string { 134 | var res []string 135 | if h.isHTML { 136 | for _, child := range htmlquery.Find(h.DOM.(*html.Node), xpathQuery) { 137 | for _, attr := range child.Attr { 138 | if attr.Key == attrName { 139 | res = append(res, strings.TrimSpace(attr.Val)) 140 | } 141 | } 142 | } 143 | } else { 144 | xmlquery.FindEach(h.DOM.(*xmlquery.Node), xpathQuery, func(i int, child *xmlquery.Node) { 145 | for _, attr := range child.Attr { 146 | if attr.Name.Local == attrName { 147 | res = append(res, strings.TrimSpace(attr.Value)) 148 | } 149 | } 150 | }) 151 | } 152 | return res 153 | } 154 | 155 | // ChildTexts returns an array of strings corresponding to child elements that match the xpath query. 156 | // Each item in the array is the stripped text content of the corresponding matching child element. 
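//
// For example (illustrative query; the XPath depends on the scraped document):
//
//	names := e.ChildTexts("//ul/li")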
157 | func (h *XMLElement) ChildTexts(xpathQuery string) []string { 158 | texts := make([]string, 0) 159 | if h.isHTML { 160 | for _, child := range htmlquery.Find(h.DOM.(*html.Node), xpathQuery) { 161 | texts = append(texts, strings.TrimSpace(htmlquery.InnerText(child))) 162 | } 163 | } else { 164 | xmlquery.FindEach(h.DOM.(*xmlquery.Node), xpathQuery, func(i int, child *xmlquery.Node) { 165 | texts = append(texts, strings.TrimSpace(child.InnerText())) 166 | }) 167 | } 168 | return texts 169 | } 170 | -------------------------------------------------------------------------------- /xmlelement_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Adam Tauber 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package colly_test 16 | 17 | import ( 18 | "github.com/antchfx/htmlquery" 19 | "github.com/gocolly/colly/v2" 20 | "reflect" 21 | "strings" 22 | "testing" 23 | ) 24 | 25 | // Borrowed from http://infohost.nmt.edu/tcc/help/pubs/xhtml/example.html 26 | // Added attributes to the `
<li>` tags for testing purposes 27 | const htmlPage = `<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 28 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> 29 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> 30 | 31 | <head> 32 | <title>Your page title here</title> 33 | </head> 34 | <body> 35 | <h1>Your major heading here</h1> 36 | <p> 37 | This is a regular text paragraph. 38 | </p> 39 | <ul> 40 | <li class="list-item-1"> 41 | First bullet of a bullet list. 42 | </li> 43 | <li class="list-item-2"> 44 | This is the second bullet. 45 | </li> 46 | </ul> 47 | </body> 48 | </html> 49 | ` 50 |
51 | func TestAttr(t *testing.T) { 52 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} 53 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) 54 | xmlNode := htmlquery.FindOne(doc, "/html") 55 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) 56 | 57 | if xmlElem.Attr("xmlns") != "http://www.w3.org/1999/xhtml" { 58 | t.Fatalf("failed xmlns attribute test: %v != http://www.w3.org/1999/xhtml", xmlElem.Attr("xmlns")) 59 | } 60 | 61 | if xmlElem.Attr("xml:lang") != "en" { 62 | t.Fatalf("failed lang attribute test: %v != en", xmlElem.Attr("xml:lang")) 63 | } 64 | } 65 |
66 | func TestChildText(t *testing.T) { 67 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} 68 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) 69 | xmlNode := htmlquery.FindOne(doc, "/html") 70 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) 71 | 72 | if text := xmlElem.ChildText("//p"); text != "This is a regular text paragraph." { 73 | t.Fatalf("failed child tag test: %v != This is a regular text paragraph.", text) 74 | } 75 | if text := xmlElem.ChildText("//dl"); text != "" { 76 | t.Fatalf("failed child tag test: %v != \"\"", text) 77 | } 78 | } 79 |
80 | func TestChildTexts(t *testing.T) { 81 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} 82 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) 83 | xmlNode := htmlquery.FindOne(doc, "/html") 84 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) 85 | expected := []string{"First bullet of a bullet list.", "This is the second bullet."} 86 | if texts := xmlElem.ChildTexts("//li"); reflect.DeepEqual(texts, expected) == false { 87 | t.Fatalf("failed child tags test: %v != %v", texts, expected) 88 | } 89 | if texts := xmlElem.ChildTexts("//dl"); reflect.DeepEqual(texts, make([]string, 0)) == false { 90 | t.Fatalf("failed child tag test: %v != \"\"", texts) 91 | } 92 | }
93 | func TestChildAttr(t *testing.T) { 94 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} 95 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) 96 | xmlNode := htmlquery.FindOne(doc, "/html") 97 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) 98 | 99 | if attr := xmlElem.ChildAttr("/body/ul/li[1]", "class"); attr != "list-item-1" { 100 | t.Fatalf("failed child attribute test: %v != list-item-1", attr) 101 | } 102 | if attr := xmlElem.ChildAttr("/body/ul/li[2]", "class"); attr != "list-item-2" { 103 | t.Fatalf("failed child attribute test: %v != list-item-2", attr) 104 | } 105 | } 106 |
107 | func TestChildAttrs(t *testing.T) { 108 | resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} 109 | doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) 110 | xmlNode := htmlquery.FindOne(doc, "/html") 111 | xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) 112 | 113 | attrs := xmlElem.ChildAttrs("/body/ul/li", "class") 114 | if len(attrs) != 2 { 115 | t.Fatalf("failed child attributes length test: %d != 2", len(attrs)) 116 | } 117 | 118 | for _, attr := range attrs { 119 | if !(attr == "list-item-1" || attr == "list-item-2") { 120 | t.Fatalf("failed child attributes values test: %s != list-item-(1 or 2)", attr) 121 | } 122 | } 123 | } 124 | --------------------------------------------------------------------------------