├── .gitignore ├── Disclaimer.md ├── LICENSE ├── Makefile ├── README.md ├── README_zh-cn.md ├── cmd └── crawlergo │ ├── flag.go │ └── main.go ├── dockerfile ├── examples ├── host_binding.py ├── request_with_cookie.py ├── subprocess_call.py └── zombie_clean.py ├── get_chrome.sh ├── go.mod ├── go.sum ├── imgs ├── bypass.png ├── chrome_path.png ├── demo.gif └── skp.png └── pkg ├── config ├── config.go └── config_test.go ├── domain_collect.go ├── engine ├── after_dom_tasks.go ├── after_loaded_tasks.go ├── browser.go ├── collect_links.go ├── intercept_request.go ├── tab.go └── tab_test.go ├── filter ├── simple_filter.go ├── smart_filter.go └── smart_filter_test.go ├── js └── javascript.go ├── logger └── logger.go ├── model ├── request.go ├── url.go └── url_test.go ├── path_expansion.go ├── task_main.go ├── taskconfig.go ├── taskconfig_test.go └── tools ├── common.go ├── random.go └── requests ├── requests.go ├── response.go └── utils.go /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .idea 3 | bin 4 | upload 5 | result 6 | .DS_Store 7 | internal/.DS_Store 8 | pkg/.DS_Store -------------------------------------------------------------------------------- /Disclaimer.md: -------------------------------------------------------------------------------- 1 | ## 免责声明 2 | 3 | 本工具仅面向**合法授权**的企业安全建设行为,**请勿对非授权目标进行爬取行为。** 4 | 5 | 禁止对本软件实施逆向工程、反编译、试图破译源代码等行为。 6 | 7 | **如果发现上述禁止行为,我们将保留追究您法律责任的权利。** 8 | 9 | 如您在使用本工具的过程中存在任何非法行为,您需自行承担相应后果,我们将不承担任何法律及连带责任。 10 | 11 | 在安装并使用本工具前,请您**务必审慎阅读、充分理解各条款内容**,限制、免责条款或者其他涉及您重大权益的条款可能会以加粗、加下划线等形式提示您重点注意。 除非您已充分阅读、完全理解并接受本协议所有条款,否则,请您不要安装并使用本工具。您的使用行为或者您以其他任何明示或者默示方式表示接受本协议的,即视为您已阅读并同意本协议的约束。 12 | 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VERSION=$(shell git describe --tags --always) 2 | 3 | .PHONY: build_all 4 | # build 5 | build_all: 6 | rm -rf bin && mkdir bin bin/linux-amd64 bin/linux-arm64 bin/darwin-amd64 bin/darwin-arm64 bin/windows-amd64\ 7 | && CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/darwin-arm64/ ./... \ 8 | && CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/darwin-amd64/ ./... \ 9 | && CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/linux-arm64/ ./... \ 10 | && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/linux-amd64/ ./... \ 11 | && CGO_ENABLED=0 GOOS=windows GOARCH=amd64 go build -ldflags "-X 'main.Version=$(VERSION)'" -o ./bin/windows-amd64/ ./... 12 | 13 | .PHONY: build 14 | # build 15 | build: 16 | rm -rf bin && mkdir bin && go build -ldflags "-X main.Version=$(VERSION)" -o ./bin/ ./... 
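Both build targets inject the git-derived version string via `-ldflags "-X 'main.Version=$(VERSION)'"`, which overwrites a package-level string variable at link time; `cmd/crawlergo/main.go` declares `var Version string` for exactly this purpose. A minimal, self-contained sketch of the mechanism (the `"dev"` fallback is an illustrative assumption, not part of this repository):

```go
package main

import "fmt"

// Version is left empty in source; the Makefile fills it in at link time:
//   go build -ldflags "-X 'main.Version=v1.2.3'"
var Version string

func main() {
	if Version == "" {
		Version = "dev" // assumed fallback for builds without the flag
	}
	fmt.Println("crawlergo", Version)
}
```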
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawlergo 2 | 3 | ![chromedp](https://img.shields.io/badge/chromedp-v0.5.2-brightgreen.svg) [![BlackHat EU Arsenal](https://img.shields.io/badge/BlackHat%20Europe-2021%20Arsenal-blue.svg)](https://www.blackhat.com/eu-21/arsenal/schedule/index.html#crawlergo-a-powerful-browser-crawler-for-web-vulnerability-scanners-25113) 4 | 5 | > A powerful browser crawler for web vulnerability scanners 6 | 7 | English Document | [中文文档](./README_zh-cn.md) 8 | 9 | crawlergo is a browser crawler that uses `chrome headless` mode for URL collection. It hooks key positions throughout the web page during the DOM rendering stage, automatically fills and submits forms, intelligently triggers JS events, and collects as many entry points exposed by the website as possible. The built-in URL de-duplication module filters out large numbers of pseudo-static URLs, maintains fast parsing and crawling even on large websites, and produces a high-quality collection of request results. 10 | 11 | crawlergo currently supports the following features: 12 | * chrome browser environment rendering 13 | * Intelligent form filling, automated submission 14 | * Full DOM event collection with automated triggering 15 | * Smart URL de-duplication to remove most duplicate requests 16 | * Intelligent analysis of web pages and collection of URLs, including javascript file content, page comments, robots.txt files and automatic fuzzing of common paths 17 | * Support for Host binding, automatically fixing and adding the Referer 18 | * Support for browser request proxying 19 | * Support for pushing results to passive web vulnerability scanners 20 | 21 | ## Screenshot 22 | 23 | ![](./imgs/demo.gif) 24 | 25 | ## Installation 26 | 27 | **Please read and confirm the [disclaimer](./Disclaimer.md) carefully before installing and using.** 28 | 29 | **Build** 30 | 31 | - compile for the current platform 32 | 33 | ```shell 34 | make build 35 | ``` 36 | 37 | - compile for all platforms 38 | ```shell 39 | make build_all 40 | ``` 41 | 42 | 1. crawlergo only requires a chrome environment to run; [download](https://www.chromium.org/getting-involved/download-chromium) a recent version of chromium. 43 | 2. Go to the [download page](https://github.com/0Kee-Team/crawlergo/releases) for the latest version of crawlergo and extract it to any directory. If you are on linux or macOS, give crawlergo **executable permissions (+x)**. 44 | 3. Or you can modify the code and build it yourself. 45 | 46 | > If you are using a linux system and chrome prompts you about missing dependencies, please see TroubleShooting below 47 | 48 | ## Quick Start 49 | ### Go! 50 | 51 | Assuming your chromium installation directory is `/tmp/chromium/`, allow up to 10 tabs open at the same time, and crawl `testphp.vulnweb.com`: 52 | 53 | ```shell 54 | bin/crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/ 55 | ``` 56 | 57 | 58 | ### Docker usage 59 | 60 | You can also run it with docker, without any setup headache: 61 | 62 | ```shell 63 | git clone https://github.com/Qianlitp/crawlergo 64 | docker build . 
-t crawlergo 65 | docker run crawlergo http://testphp.vulnweb.com/ 66 | ``` 67 | 68 | 69 | ### Using Proxy 70 | 71 | ```shell 72 | bin/crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/ 73 | ``` 74 | 75 | 76 | ### Calling crawlergo with python 77 | 78 | By default, crawlergo prints the results directly to the screen. Next, set the output mode to `json`; sample code for calling it from python is as follows: 79 | 80 | ```python 81 | #!/usr/bin/python3 82 | # coding: utf-8 83 | 84 | import simplejson 85 | import subprocess 86 | 87 | 88 | def main(): 89 | target = "http://testphp.vulnweb.com/" 90 | cmd = ["bin/crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target] 91 | rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 92 | output, error = rsp.communicate() 93 | # "--[Mission Complete]--" is the end-of-task separator string 94 | result = simplejson.loads(output.decode().split("--[Mission Complete]--")[1]) 95 | req_list = result["req_list"] 96 | print(req_list[0]) 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | ``` 102 | 103 | ### Crawl Results 104 | 105 | When the output mode is set to `json`, the returned result, after JSON deserialization, contains four parts: 106 | 107 | * `all_req_list`: All requests found during this crawl task, containing any resource type from other domains. 108 | * `req_list`: The **current-domain results** of this crawl task, pseudo-statically de-duplicated, without static resource links. It is a subset of `all_req_list`. 109 | * `all_domain_list`: List of all domains found. 110 | * `sub_domain_list`: List of subdomains found. 111 | 112 | 113 | ## Examples 114 | 115 | crawlergo returns full requests and URLs, which can be used in a variety of ways: 116 | 117 | * Used in conjunction with other passive web vulnerability scanners 118 | 119 | First, start a passive scanner and set the listening address to: `http://127.0.0.1:1234/` 120 | 121 | Next, assuming crawlergo is on the same machine as the scanner, start crawlergo and set the parameters: 122 | 123 | `--push-to-proxy http://127.0.0.1:1234/` 124 | 125 | * Host binding (not available in newer chrome versions) [(example)](https://github.com/0Kee-Team/crawlergo/blob/master/examples/host_binding.py) 126 | 127 | * Custom Cookies [(example)](https://github.com/0Kee-Team/crawlergo/blob/master/examples/request_with_cookie.py) 128 | 129 | * Regularly clean up zombie processes generated by crawlergo [(example)](https://github.com/0Kee-Team/crawlergo/blob/master/examples/zombie_clean.py), contributed by @ring04h 130 | 131 | 132 | ## Bypass headless detect 133 | crawlergo can bypass headless mode detection by default. 134 | 135 | https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html 136 | 137 | ![](./imgs/bypass.png) 138 | 139 | 140 | ## TroubleShooting 141 | 142 | * 'Fetch.enable' wasn't found 143 | 144 | Fetch is a feature supported by newer versions of chrome. If this error occurs, your chrome version is too old; please upgrade it. 
145 | 146 | * chrome runs with missing dependencies such as xxx.so 147 | 148 | ```shell 149 | # Ubuntu 150 | apt-get install -yq --no-install-recommends \ 151 | libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 \ 152 | libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 \ 153 | libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libgbm1 \ 154 | libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 libnss3 155 | 156 | # CentOS 7 157 | sudo yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 libXdamage.x86_64 libXext.x86_64 libXi.x86_64 \ 158 | libXtst.x86_64 cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 \ 159 | ipa-gothic-fonts xorg-x11-fonts-100dpi xorg-x11-fonts-75dpi xorg-x11-utils xorg-x11-fonts-cyrillic xorg-x11-fonts-Type1 xorg-x11-fonts-misc -y 160 | 161 | sudo yum update nss -y 162 | ``` 163 | 164 | 165 | * **Navigation timeout** errors / browser not found / unsure of the correct **browser executable path** 166 | 167 | Make sure the browser executable path is configured correctly: type `chrome://version` in the address bar and find the executable file path: 168 | 169 | ![](./imgs/chrome_path.png) 170 | 171 | 172 | 173 | ## Parameters 174 | ### Required parameters 175 | * `--chromium-path Path, -c Path` The path to the chrome executable. (**Required**) 176 | ### Basic parameters 177 | * `--custom-headers Headers` Customize the HTTP headers. Pass in JSON-serialized data; this is defined globally and will be used for all requests. (Default: null) 178 | * `--post-data PostData, -d PostData` POST data. (Default: null) 179 | * `--max-crawled-count Number, -m Number` The maximum number of URLs the crawler will visit, to avoid overly long crawls caused by pseudo-static pages. (Default: 200) 180 | * `--filter-mode Mode, -f Mode` Filtering mode, `simple`: only static resources and exact-duplicate requests are filtered. `smart`: adds the ability to filter pseudo-static URLs. `strict`: stricter pseudo-static filtering rules. (Default: smart) 181 | * `--output-mode value, -o value` Result output mode, `console`: print the formatted results directly to the screen. `json`: print the json serialized string of all results. `none`: don't print the output. (Default: console) 182 | * `--output-json filepath` Write the result to the specified file after JSON serializing it. (Default: null) 183 | * `--request-proxy proxyAddress` socks5 proxy address; all network requests from crawlergo and the chrome browser are sent through the proxy. (Default: null) 184 | 185 | ### Expand input URL 186 | * `--fuzz-path` Use the built-in dictionary for path fuzzing. (Default: false) 187 | * `--fuzz-path-dict` Customize the fuzz dictionary by passing in a file path, e.g. /home/user/fuzz_dir.txt; each line of the file represents a path to fuzz. (Default: null) 188 | * `--robots-path` Resolve paths from the /robots.txt file. (Default: false) 189 | 190 | ### Form auto-fill 191 | * `--ignore-url-keywords, -iuk` URL keywords that you don't want to visit, generally used to exclude logout links when customizing cookies. Usage: `-iuk logout -iuk exit`. (default: "logout", "quit", "exit") 192 | * `--form-values, -fv` Customize the value of the form fill, set by text type. 
Text types are identified by the four attribute value keywords `id`, `name`, `class`, `type` of the input tag. Support definition types: default, mail, code, phone, username, password, qq, id_card, url, date and number. For example, to define the mailbox input box to be automatically filled with A and the password input box to be automatically filled with B: `-fv mail=A -fv password=B`. Here `default` represents the fill value used when the text type is not recognized, which is "Crawlergo". (Default: Crawlergo) 193 | * `--form-keyword-values, -fkv` Customize the value of the form fill, set by keyword fuzzy match. The keyword is matched against the four attribute values `id`, `name`, `class`, `type` of the input tag. For example, to fuzzy-match the pass keyword to fill 123456 and the user keyword to fill admin: `-fkv user=admin -fkv pass=123456`. (Default: Crawlergo) 194 | 195 | ### Advanced settings for the crawling process 196 | * `--max-tab-count Number, -t Number` The maximum number of tabs the crawler can open at the same time. (Default: 8) 197 | * `--tab-run-timeout Timeout` Maximum runtime for a single tab page. (Default: 20s) 198 | * `--wait-dom-content-loaded-timeout Timeout` The maximum timeout to wait for the page to finish loading. (Default: 5s) 199 | * `--event-trigger-interval Interval` The interval between automatically triggered events, generally used when a slow target network or DOM update conflicts cause URLs to be missed. (Default: 100ms) 200 | * `--event-trigger-mode Value` DOM event auto-trigger mode, `async` or `sync`, for URL misses caused by DOM update conflicts. (Default: async) 201 | * `--before-exit-delay` Delay before closing chrome at the end of a single tab task, used to wait for remaining DOM updates and XHR requests to be captured. (Default: 1s) 202 | 203 | ### Other 204 | * `--push-to-proxy` The listener address to which crawl results are pushed, usually the listener address of a passive scanner. (Default: null) 205 | * `--push-pool-max` The maximum concurrency when pushing crawler results to the listening address. (Default: 10) 206 | * `--log-level` Logging level: debug, info, warn, error or fatal. (Default: info) 207 | * `--no-headless` Turn off chrome headless mode to visualize the crawling process. 
(Default: false) 208 | 209 | 210 | ## Follow me 211 | 212 | Weibo:[@9ian1i](https://weibo.com/u/5242748339) 213 | Twitter: [@9ian1i](https://twitter.com/9ian1i) 214 | 215 | Related articles:[A browser crawler practice for web vulnerability scanning](https://www.anquanke.com/post/id/178339) 216 | -------------------------------------------------------------------------------- /README_zh-cn.md: -------------------------------------------------------------------------------- 1 | # crawlergo 2 | 3 | ![chromedp](https://img.shields.io/badge/chromedp-v0.5.2-brightgreen.svg) [![BlackHat EU Arsenal](https://img.shields.io/badge/BlackHat%20Europe-2021%20Arsenal-blue.svg)](https://www.blackhat.com/eu-21/arsenal/schedule/index.html#crawlergo-a-powerful-browser-crawler-for-web-vulnerability-scanners-25113) 4 | 5 | > A powerful browser crawler for web vulnerability scanners 6 | 7 | [English Document](./README.md) | 中文文档 8 | 9 | crawlergo是一个使用`chrome headless`模式进行URL收集的浏览器爬虫。它对整个网页的关键位置与DOM渲染阶段进行HOOK,自动进行表单填充并提交,配合智能的JS事件触发,尽可能的收集网站暴露出的入口。内置URL去重模块,过滤掉了大量伪静态URL,对于大型网站仍保持较快的解析与抓取速度,最后得到高质量的请求结果集合。 10 | 11 | crawlergo 目前支持以下特性: 12 | 13 | * 原生浏览器环境,协程池调度任务 14 | * 表单智能填充、自动化提交 15 | * 完整DOM事件收集,自动化触发 16 | * 智能URL去重,去掉大部分的重复请求 17 | * 全面分析收集,包括javascript文件内容、页面注释、robots.txt文件和常见路径Fuzz 18 | * 支持Host绑定,自动添加Referer 19 | * 支持请求代理,支持爬虫结果主动推送 20 | 21 | ## 运行截图 22 | 23 | ![](./imgs/demo.gif) 24 | 25 | ## 安装 26 | 27 | **安装使用之前,请仔细阅读并确认[免责声明](./Disclaimer.md)。** 28 | 29 | **Build** 30 | 31 | - 编译适用于当前机器的文件 32 | 33 | ```shell 34 | make build 35 | ``` 36 | 37 | - 交叉编译所有平台的文件 38 | ```shell 39 | make build_all 40 | ``` 41 | 42 | 1. crawlergo 只依赖chrome运行即可,前往[下载](https://www.chromium.org/getting-involved/download-chromium)新版本的chromium。 43 | 2. 前往[页面下载](https://github.com/0Kee-Team/crawlergo/releases)最新版本的crawlergo解压到任意目录,如果是linux或者macOS系统,请赋予crawlergo**可执行权限(+x)**。 44 | 3. 或者直接根据源码自行编译。 45 | 46 | > 如果你使用linux系统,运行时chrome提示缺少一些依赖组件,请看下方 Trouble Shooting 47 | 48 | ## Quick Start 49 | 50 | ### Go! 
51 | 52 | 假设你的chromium安装在 `/tmp/chromium/` ,开启最大10标签页,爬取AWVS靶场: 53 | 54 | ```shell 55 | bin/crawlergo -c /tmp/chromium/chrome -t 10 http://testphp.vulnweb.com/ 56 | ``` 57 | 58 | 59 | 60 | ### 使用代理 61 | 62 | ```shell 63 | bin/crawlergo -c /tmp/chromium/chrome -t 10 --request-proxy socks5://127.0.0.1:7891 http://testphp.vulnweb.com/ 64 | ``` 65 | 66 | 67 | 68 | ### 系统调用 69 | 70 | 默认打印当前域名请求,但多数情况我们希望调用crawlergo返回的结果,所以设置输出模式为 `json`,使用python调用并收集结果的示例如下: 71 | 72 | ```python 73 | #!/usr/bin/python3 74 | # coding: utf-8 75 | 76 | import simplejson 77 | import subprocess 78 | 79 | 80 | def main(): 81 | target = "http://testphp.vulnweb.com/" 82 | cmd = ["bin/crawlergo", "-c", "/tmp/chromium/chrome", "-o", "json", target] 83 | rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 84 | output, error = rsp.communicate() 85 | # "--[Mission Complete]--" 是任务结束的分隔字符串 86 | result = simplejson.loads(output.decode().split("--[Mission Complete]--")[1]) 87 | req_list = result["req_list"] 88 | print(req_list[0]) 89 | 90 | 91 | if __name__ == '__main__': 92 | main() 93 | ``` 94 | 95 | ### 返回结果 96 | 97 | 当设置输出模式为 `json`时,返回的结果反序列化之后包含四个部分: 98 | 99 | * `all_req_list`: 本次爬取任务过程中发现的所有请求,包含其他域名的任何资源类型。 100 | * `req_list`:本次爬取任务的**同域名结果**,经过伪静态去重,不包含静态资源链接。理论上是 `all_req_list `的子集 101 | * `all_domain_list`:发现的所有域名列表。 102 | * `sub_domain_list`:发现的任务目标的子域名列表。 103 | 104 | 105 | 106 | ## 完整参数说明 107 | 108 | crawlergo 拥有灵活的参数配置,以下是详细的选项说明: 109 | 110 | * `--chromium-path Path, -c Path` chrome的可执行程序路径 111 | * `--custom-headers Headers` 自定义HTTP头,使用传入json序列化之后的数据,这个是全局定义,将被用于所有请求 112 | * `--post-data PostData, -d PostData` 提供POST数据,目标使用POST请求方法 113 | * `--max-crawled-count Number, -m Number` 爬虫最大任务数量,避免因伪静态造成长时间无意义抓取。 114 | * `--filter-mode Mode, -f Mode` 过滤模式,简单:只过滤静态资源和完全重复的请求。智能:拥有过滤伪静态的能力。严格:更加严格的伪静态过滤规则。 115 | * `--output-mode value, -o value` 结果输出模式,`console`:打印当前域名结果。`json`:打印所有结果的json序列化字符串,可直接被反序列化解析。`none`:不打印输出。 116 | * `--output-json filepath` 将爬虫结果JSON序列化之后写入到json文件。 117 | * `--max-tab-count Number, -t Number` 爬虫同时开启最大标签页,即同时爬取的页面数量。 118 | * `--fuzz-path` 使用常见路径Fuzz目标,获取更多入口。 119 | * `--fuzz-path-dict` 通过字典文件自定义Fuzz目录,传入字典文件路径,如:`/home/user/fuzz_dir.txt`,文件每行代表一个要fuzz的目录。 120 | * `--robots-path` 从 /robots.txt 文件中解析路径,获取更多入口。 121 | * `--request-proxy proxyAddress` 支持**socks5**代理,crawlergo和chrome浏览器的所有网络请求均经过代理发送。 122 | * `--tab-run-timeout Timeout` 单个Tab标签页的最大运行超时。 123 | * `--wait-dom-content-loaded-timeout Timeout` 爬虫等待页面加载完毕的最大超时。 124 | * `--event-trigger-interval Interval` 事件自动触发时的间隔时间,一般用于目标网络缓慢,DOM更新冲突时导致的URL漏抓。 125 | * `--event-trigger-mode Value` 事件自动触发的模式,分为异步和同步,用于DOM更新冲突时导致的URL漏抓。 126 | * `--before-exit-delay` 单个tab标签页任务结束时,延迟退出关闭chrome的时间,用于等待部分DOM更新和XHR请求的发起捕获。 127 | * `--ignore-url-keywords` 不想访问的URL关键字,一般用于在携带Cookie访问时排除注销链接。用法:`-iuk logout -iuk exit`。 128 | * `--form-values` 自定义表单填充的值,按照文本类型设置。支持定义类型:default, mail, code, phone, username, password, qq, id_card, url, date, number,文本类型通过输入框标签的`id`、`name`、`class`、`type`四个属性值关键字进行识别。如,定义邮箱输入框自动填充A,密码输入框自动填充B,`-fv mail=A -fv password=B`。其中default代表无法识别文本类型时的默认填充值,目前为Cralwergo。 129 | * `--form-keyword-values` 自定义表单填充的值,按照关键字模糊匹配设置。关键字匹配输入框标签的`id`、`name`、`class`、`type`四个属性值。如,模糊匹配pass关键词填充123456,user关键词填充admin,`-fkv user=admin -fkv pass=123456`。 130 | * `--push-to-proxy` 拟接收爬虫结果的监听地址,一般为被动扫描器的监听地址。 131 | * `--push-pool-max` 发送爬虫结果到监听地址时的最大并发数。 132 | * `--log-level` 打印日志等级,可选 debug, info, warn, error 和 fatal。 133 | * `--no-headless` 关闭chrome headless模式,可直观的看到爬虫过程。 134 | 135 | 136 | 137 | ## 使用举例 138 | 139 | crawlergo 
返回了全量的请求和URL信息,可以有多种使用方法: 140 | 141 | * 联动其它的开源被动扫描器 142 | 143 | 首先,启动某被动扫描器,设置监听地址为:`http://127.0.0.1:1234/`。 144 | 145 | 接下来,假设crawlergo与扫描器在同一台机器,启动 crawlergo,设置参数: 146 | 147 | `--push-to-proxy http://127.0.0.1:1234/` 148 | 149 | * 子域名收集 example 150 | 151 | * 旁站入口收集 example 152 | 153 | * 结合celery实现分布式扫描 154 | 155 | * Host绑定设置(高版本chrome无法使用) [(查看例子)](https://github.com/0Kee-Team/crawlergo/blob/master/examples/host_binding.py) 156 | 157 | * 带Cookie扫描 [(查看例子)](https://github.com/0Kee-Team/crawlergo/blob/master/examples/request_with_cookie.py) 158 | 159 | * 调用crawlergo调用产生僵尸进程,定时清理 [(查看例子)](https://github.com/0Kee-Team/crawlergo/blob/master/examples/zombie_clean.py) , contributed by @ring04h 160 | 161 | ## Trouble Shooting 162 | 163 | * 'Fetch.enable' wasn't found 164 | 165 | Fetch是新版chrome支持的功能,如果出现此错误,说明你的版本较低,请升级chrome到最新版即可。 166 | 167 | * chrome运行提示缺少 xxx.so 等依赖 168 | 169 | ```shell 170 | // Ubuntu 171 | apt-get install -yq --no-install-recommends \ 172 | libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 \ 173 | libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 \ 174 | libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libgbm1 \ 175 | libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 libnss3 176 | 177 | // CentOS 7 178 | sudo yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 libXdamage.x86_64 libXext.x86_64 libXi.x86_64 \ 179 | libXtst.x86_64 cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 \ 180 | ipa-gothic-fonts xorg-x11-fonts-100dpi xorg-x11-fonts-75dpi xorg-x11-utils xorg-x11-fonts-cyrillic xorg-x11-fonts-Type1 xorg-x11-fonts-misc -y 181 | 182 | sudo yum update nss -y 183 | ``` 184 | 185 | 186 | * 运行提示**导航超时** / 浏览器无法找到 / 不知道正确的**浏览器可执行文件路径** 187 | 188 | 确认配置的浏览器可执行路径正确,在地址栏中输入:`chrome://version`,找到可执行程序文件路径: 189 | 190 | ![](./imgs/chrome_path.png) 191 | 192 | ## Bypass headless detect 193 | 194 | https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html 195 | 196 | ![](./imgs/bypass.png) 197 | 198 | 199 | ## Follow me 200 | 201 | 如果你有关于浏览器爬虫的想法,欢迎和我交流。 202 | 203 | 微博:[@9ian1i](https://weibo.com/u/5242748339) 204 | Github: [@9ian1i](https://github.com/Qianlitp) 205 | 206 | 相关文章:[漏扫动态爬虫实践](https://www.anquanke.com/post/id/178339) 207 | -------------------------------------------------------------------------------- /cmd/crawlergo/flag.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/Qianlitp/crawlergo/pkg/config" 6 | "github.com/urfave/cli/v2" 7 | ) 8 | 9 | var cliFlags = []cli.Flag{ 10 | SetChromePath(), 11 | SetCustomHeaders(), 12 | SetPostData(), 13 | SetMaxCrawledCount(), 14 | SetFilterMod(), 15 | SetOutputMode(), 16 | SetOutputJSON(), 17 | //SetIgcognitoContext(), 18 | SetMaxTabCount(), 19 | SetFuzzPath(), 20 | SetFuzzPathDict(), 21 | SetRobotsPath(), 22 | SetRequestProxy(), 23 | SetEncodeURL(), 24 | SetTabRunTTL(), 25 | SetWaitDomContentLoadedTTL(), 26 | SetEventTriggerMode(), 27 | SetEventTriggerInterval(), 28 | SetBeforeExitDelay(), 29 | SetIgnoreUrlKeywords(), 30 | SetFormValues(), 31 | SetFormKeywordValue(), 32 | SetPushToProxy(), 33 | SetPushPoolMax(), 34 | SetLogLevel(), 35 | SetNoHeadless(), 36 | } 37 | 38 | func SetChromePath() *cli.PathFlag { 39 | return &cli.PathFlag{ 40 | Name: "chromium-path", 41 | Aliases: []string{"c"}, 42 | Usage: "`Path` of 
chromium executable. Such as \"/home/test/chrome-linux/chrome\"", 43 | Destination: &taskConfig.ChromiumPath, 44 | EnvVars: []string{"CRAWLERGO_CHROMIUM_PATH"}, 45 | } 46 | } 47 | 48 | func SetCustomHeaders() *cli.StringFlag { 49 | return &cli.StringFlag{ 50 | Name: "custom-headers", 51 | Usage: "add additional `Headers` to each request. The input string will be called json.Unmarshal", 52 | Value: fmt.Sprintf(`{"Spider-Name": "crawlergo", "User-Agent": "%s"}`, config.DefaultUA), 53 | Destination: &taskConfig.ExtraHeadersString, 54 | } 55 | } 56 | 57 | func SetPostData() *cli.StringFlag { 58 | return &cli.StringFlag{ 59 | Name: "post-data", 60 | Aliases: []string{"d"}, 61 | Usage: "set `PostData` to target and use POST method.", 62 | Destination: &postData, 63 | } 64 | } 65 | 66 | func SetMaxCrawledCount() *cli.IntFlag { 67 | return &cli.IntFlag{ 68 | Name: "max-crawled-count", 69 | Aliases: []string{"m"}, 70 | Value: config.MaxCrawlCount, 71 | Usage: "the maximum `Number` of URLs visited by the crawler in this task.", 72 | Destination: &taskConfig.MaxCrawlCount, 73 | } 74 | } 75 | 76 | func SetFilterMod() *cli.StringFlag { 77 | return &cli.StringFlag{ 78 | Name: "filter-mode", 79 | Aliases: []string{"f"}, 80 | Value: "smart", 81 | Usage: "filtering `Mode` used for collected requests. Allowed mode:\"simple\", \"smart\" or \"strict\".", 82 | Destination: &taskConfig.FilterMode, 83 | } 84 | } 85 | 86 | func SetOutputMode() *cli.StringFlag { 87 | return &cli.StringFlag{ 88 | Name: "output-mode", 89 | Aliases: []string{"o"}, 90 | Value: "console", 91 | Usage: "console print or serialize output. Allowed mode:\"console\" ,\"json\" or \"none\".", 92 | Destination: &outputMode, 93 | } 94 | } 95 | 96 | func SetOutputJSON() *cli.StringFlag { 97 | return &cli.StringFlag{ 98 | Name: "output-json", 99 | Usage: "write output to a json file.Such as result_www_test_com.json", 100 | Destination: &outputJsonPath, 101 | } 102 | } 103 | 104 | func SetMaxTabCount() *cli.IntFlag { 105 | return &cli.IntFlag{ 106 | Name: "max-tab-count", 107 | Aliases: []string{"t"}, 108 | Value: 8, 109 | Usage: "maximum `Number` of tabs allowed.", 110 | Destination: &taskConfig.MaxTabsCount, 111 | } 112 | } 113 | 114 | func SetFuzzPath() *cli.BoolFlag { 115 | return &cli.BoolFlag{ 116 | Name: "fuzz-path", 117 | Value: false, 118 | Usage: "whether to fuzz the target with common paths.", 119 | Destination: &taskConfig.PathByFuzz, 120 | } 121 | } 122 | 123 | func SetFuzzPathDict() *cli.PathFlag { 124 | return &cli.PathFlag{ 125 | Name: "fuzz-path-dict", 126 | Usage: "`Path` of fuzz dict. 
Such as \"/home/test/fuzz_path.txt\"", 127 | Destination: &taskConfig.FuzzDictPath, 128 | } 129 | } 130 | 131 | func SetRobotsPath() *cli.BoolFlag { 132 | return &cli.BoolFlag{ 133 | Name: "robots-path", 134 | Value: false, 135 | Usage: "whether to resolve paths from /robots.txt.", 136 | Destination: &taskConfig.PathFromRobots, 137 | } 138 | } 139 | 140 | func SetRequestProxy() *cli.StringFlag { 141 | return &cli.StringFlag{ 142 | Name: "request-proxy", 143 | Usage: "all requests connect through defined proxy server.", 144 | Destination: &taskConfig.Proxy, 145 | } 146 | } 147 | 148 | // return &cli.BoolFlag{ 149 | // Name: "bypass", 150 | // Value: false, 151 | // Usage: "whether to encode url with detected charset.", 152 | // Destination: &taskConfig.EncodeURLWithCharset, 153 | //}, 154 | func SetEncodeURL() *cli.BoolFlag { 155 | return &cli.BoolFlag{ 156 | Name: "encode-url", 157 | Value: false, 158 | Usage: "whether to encode url with detected charset.", 159 | Destination: &taskConfig.EncodeURLWithCharset, 160 | } 161 | } 162 | 163 | func SetTabRunTTL() *cli.DurationFlag { 164 | 165 | return &cli.DurationFlag{ 166 | Name: "tab-run-timeout", 167 | Value: config.TabRunTimeout, 168 | Usage: "the `Timeout` of a single tab task.", 169 | Destination: &taskConfig.TabRunTimeout, 170 | } 171 | } 172 | 173 | func SetWaitDomContentLoadedTTL() *cli.DurationFlag { 174 | return &cli.DurationFlag{ 175 | Name: "wait-dom-content-loaded-timeout", 176 | Value: config.DomContentLoadedTimeout, 177 | Usage: "the `Timeout` of waiting for a page dom ready.", 178 | Destination: &taskConfig.DomContentLoadedTimeout, 179 | } 180 | } 181 | 182 | func SetEventTriggerMode() *cli.StringFlag { 183 | return &cli.StringFlag{ 184 | Name: "event-trigger-mode", 185 | Value: config.EventTriggerAsync, 186 | Usage: "this `Value` determines how the crawler automatically triggers events.Allowed mode:\"async\" or \"sync\".", 187 | Destination: &taskConfig.EventTriggerMode, 188 | } 189 | } 190 | 191 | func SetEventTriggerInterval() *cli.DurationFlag { 192 | return &cli.DurationFlag{ 193 | Name: "event-trigger-interval", 194 | Value: config.EventTriggerInterval, 195 | Usage: "the `Interval` of triggering each event.", 196 | Destination: &taskConfig.EventTriggerInterval, 197 | } 198 | } 199 | 200 | func SetBeforeExitDelay() *cli.DurationFlag { 201 | return &cli.DurationFlag{ 202 | Name: "before-exit-delay", 203 | Value: config.BeforeExitDelay, 204 | Usage: "the `Time` of waiting before crawler exit.", 205 | Destination: &taskConfig.BeforeExitDelay, 206 | } 207 | } 208 | 209 | func SetIgnoreUrlKeywords() *cli.StringSliceFlag { 210 | return &cli.StringSliceFlag{ 211 | Name: "ignore-url-keywords", 212 | Aliases: []string{"iuk"}, 213 | Value: cli.NewStringSlice(config.DefaultIgnoreKeywords...), 214 | Usage: "crawlergo will not crawl these URLs matched by `Keywords`. e.g.: -iuk logout -iuk quit -iuk exit", 215 | DefaultText: "Default [logout quit exit]", 216 | Destination: ignoreKeywords, 217 | } 218 | } 219 | 220 | func SetFormValues() *cli.StringSliceFlag { 221 | return &cli.StringSliceFlag{ 222 | Name: "form-values", 223 | Aliases: []string{"fv"}, 224 | Usage: "custom filling text for each form type. 
e.g.: -fv username=crawlergo_nice -fv password=admin123", 225 | Destination: customFormTypeValues, 226 | } 227 | } 228 | 229 | // 根据关键词自行选择填充文本 230 | func SetFormKeywordValue() *cli.StringSliceFlag { 231 | return &cli.StringSliceFlag{ 232 | Name: "form-keyword-values", 233 | Aliases: []string{"fkv"}, 234 | Usage: "custom filling text, fuzzy matched by keyword. e.g.: -fkv user=crawlergo_nice -fkv pass=admin123", 235 | Destination: customFormKeywordValues, 236 | } 237 | } 238 | 239 | func SetPushToProxy() *cli.StringFlag { 240 | return &cli.StringFlag{ 241 | Name: "push-to-proxy", 242 | Usage: "every request in 'req_list' will be pushed to the proxy `Address`. Such as \"http://127.0.0.1:8080/\"", 243 | Destination: &pushAddress, 244 | } 245 | } 246 | 247 | func SetPushPoolMax() *cli.IntFlag { 248 | return &cli.IntFlag{ 249 | Name: "push-pool-max", 250 | Usage: "maximum `Number` of concurrency when pushing results to proxy.", 251 | Value: DefaultMaxPushProxyPoolMax, 252 | Destination: &pushProxyPoolMax, 253 | } 254 | } 255 | 256 | func SetLogLevel() *cli.StringFlag { 257 | return &cli.StringFlag{ 258 | Name: "log-level", 259 | Usage: "log print `Level`, options include debug, info, warn, error and fatal.", 260 | Value: DefaultLogLevel, 261 | Destination: &logLevel, 262 | } 263 | } 264 | 265 | func SetNoHeadless() *cli.BoolFlag { 266 | return &cli.BoolFlag{ 267 | Name: "no-headless", 268 | Value: false, 269 | Usage: "no headless mode", 270 | Destination: &taskConfig.NoHeadless, 271 | } 272 | } 273 | -------------------------------------------------------------------------------- /cmd/crawlergo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "log" 8 | "os" 9 | "os/signal" 10 | "strings" 11 | "sync" 12 | "syscall" 13 | 14 | "github.com/Qianlitp/crawlergo/pkg" 15 | "github.com/Qianlitp/crawlergo/pkg/config" 16 | "github.com/Qianlitp/crawlergo/pkg/logger" 17 | model2 "github.com/Qianlitp/crawlergo/pkg/model" 18 | "github.com/Qianlitp/crawlergo/pkg/tools" 19 | "github.com/Qianlitp/crawlergo/pkg/tools/requests" 20 | "github.com/panjf2000/ants/v2" 21 | "github.com/sirupsen/logrus" 22 | "github.com/urfave/cli/v2" 23 | ) 24 | 25 | /** 26 | 命令行调用适配器 27 | 28 | 用于生成开源的二进制程序 29 | */ 30 | 31 | type Result struct { 32 | ReqList []Request `json:"req_list"` 33 | AllReqList []Request `json:"all_req_list"` 34 | AllDomainList []string `json:"all_domain_list"` 35 | SubDomainList []string `json:"sub_domain_list"` 36 | } 37 | 38 | type Request struct { 39 | Url string `json:"url"` 40 | Method string `json:"method"` 41 | Headers map[string]interface{} `json:"headers"` 42 | Data string `json:"data"` 43 | Source string `json:"source"` 44 | } 45 | 46 | type ProxyTask struct { 47 | req *model2.Request 48 | pushProxy string 49 | } 50 | 51 | const ( 52 | DefaultMaxPushProxyPoolMax = 10 53 | DefaultLogLevel = "Info" 54 | ) 55 | 56 | var ( 57 | taskConfig pkg.TaskConfig 58 | outputMode string 59 | postData string 60 | signalChan chan os.Signal 61 | ignoreKeywords = cli.NewStringSlice(config.DefaultIgnoreKeywords...) 
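// customFormTypeValues and customFormKeywordValues below collect the repeatable -fv / -fkv CLI flags; see SetFormValues and SetFormKeywordValue in flag.go for where they are wired in.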
62 | customFormTypeValues = cli.NewStringSlice() 63 | customFormKeywordValues = cli.NewStringSlice() 64 | pushAddress string 65 | pushProxyPoolMax int 66 | pushProxyWG sync.WaitGroup 67 | outputJsonPath string 68 | logLevel string 69 | Version string 70 | ) 71 | 72 | func main() { 73 | author := cli.Author{ 74 | Name: "9ian1i", 75 | Email: "9ian1itp@gmail.com", 76 | } 77 | 78 | //ignoreKeywords = cli.NewStringSlice(config.DefaultIgnoreKeywords...) 79 | //customFormTypeValues = cli.NewStringSlice() 80 | //customFormKeywordValues = cli.NewStringSlice() 81 | 82 | app := &cli.App{ 83 | Name: "crawlergo", 84 | Usage: "A powerful browser crawler for web vulnerability scanners", 85 | UsageText: "crawlergo [global options] url1 url2 url3 ... (must be same host)", 86 | Version: Version, 87 | Authors: []*cli.Author{&author}, 88 | Flags: cliFlags, 89 | Action: run, 90 | } 91 | 92 | err := app.Run(os.Args) 93 | if err != nil { 94 | logger.Logger.Fatal(err) 95 | } 96 | } 97 | 98 | func run(c *cli.Context) error { 99 | signalChan = make(chan os.Signal, 1) 100 | signal.Notify(signalChan, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGINT) 101 | if c.Args().Len() == 0 { 102 | logger.Logger.Error("url must be set") 103 | return errors.New("url must be set") 104 | } 105 | 106 | // 设置日志输出级别 107 | level, err := logrus.ParseLevel(logLevel) 108 | if err != nil { 109 | logger.Logger.Fatal(err) 110 | } 111 | logger.Logger.SetLevel(level) 112 | 113 | var targets []*model2.Request 114 | for _, _url := range c.Args().Slice() { 115 | var req model2.Request 116 | url, err := model2.GetUrl(_url) 117 | if err != nil { 118 | logger.Logger.Error("parse url failed, ", err) 119 | continue 120 | } 121 | if postData != "" { 122 | req = model2.GetRequest(config.POST, url, getOption()) 123 | } else { 124 | req = model2.GetRequest(config.GET, url, getOption()) 125 | } 126 | req.Proxy = taskConfig.Proxy 127 | targets = append(targets, &req) 128 | } 129 | taskConfig.IgnoreKeywords = ignoreKeywords.Value() 130 | if taskConfig.Proxy != "" { 131 | logger.Logger.Info("request with proxy: ", taskConfig.Proxy) 132 | } 133 | 134 | if len(targets) == 0 { 135 | logger.Logger.Fatal("no validate target.") 136 | } 137 | 138 | // 检查自定义的表单参数配置 139 | taskConfig.CustomFormValues, err = parseCustomFormValues(customFormTypeValues.Value()) 140 | if err != nil { 141 | logger.Logger.Fatal(err) 142 | } 143 | taskConfig.CustomFormKeywordValues, err = keywordStringToMap(customFormKeywordValues.Value()) 144 | if err != nil { 145 | logger.Logger.Fatal(err) 146 | } 147 | 148 | // 开始爬虫任务 149 | task, err := pkg.NewCrawlerTask(targets, taskConfig) 150 | if err != nil { 151 | logger.Logger.Error("create crawler task failed.") 152 | os.Exit(-1) 153 | } 154 | if len(targets) != 0 { 155 | logger.Logger.Info(fmt.Sprintf("Init crawler task, host: %s, max tab count: %d, max crawl count: %d.", 156 | targets[0].URL.Host, taskConfig.MaxTabsCount, taskConfig.MaxCrawlCount)) 157 | logger.Logger.Info("filter mode: ", taskConfig.FilterMode) 158 | } 159 | 160 | // 提示自定义表单填充参数 161 | if len(taskConfig.CustomFormValues) > 0 { 162 | logger.Logger.Info("Custom form values, " + tools.MapStringFormat(taskConfig.CustomFormValues)) 163 | } 164 | // 提示自定义表单填充参数 165 | if len(taskConfig.CustomFormKeywordValues) > 0 { 166 | logger.Logger.Info("Custom form keyword values, " + tools.MapStringFormat(taskConfig.CustomFormKeywordValues)) 167 | } 168 | if _, ok := taskConfig.CustomFormValues["default"]; !ok { 169 | logger.Logger.Info("If no matches, default form input text: " + 
config.DefaultInputText) 170 | taskConfig.CustomFormValues["default"] = config.DefaultInputText 171 | } 172 | 173 | go handleExit(task) 174 | logger.Logger.Info("Start crawling.") 175 | task.Run() 176 | result := task.Result 177 | 178 | logger.Logger.Info(fmt.Sprintf("Task finished, %d results, %d requests, %d subdomains, %d domains found.", 179 | len(result.ReqList), len(result.AllReqList), len(result.SubDomainList), len(result.AllDomainList))) 180 | 181 | // 内置请求代理 182 | if pushAddress != "" { 183 | logger.Logger.Info("pushing results to ", pushAddress, ", max pool number:", pushProxyPoolMax) 184 | Push2Proxy(result.ReqList) 185 | } 186 | 187 | // 输出结果 188 | outputResult(result) 189 | 190 | return nil 191 | } 192 | 193 | func getOption() model2.Options { 194 | var option model2.Options 195 | if postData != "" { 196 | option.PostData = postData 197 | } 198 | if taskConfig.ExtraHeadersString != "" { 199 | err := json.Unmarshal([]byte(taskConfig.ExtraHeadersString), &taskConfig.ExtraHeaders) 200 | if err != nil { 201 | logger.Logger.Fatal("custom headers can't be Unmarshal.") 202 | panic(err) 203 | } 204 | option.Headers = taskConfig.ExtraHeaders 205 | } 206 | return option 207 | } 208 | 209 | func parseCustomFormValues(customData []string) (map[string]string, error) { 210 | parsedData := map[string]string{} 211 | for _, item := range customData { 212 | keyValue := strings.Split(item, "=") 213 | if len(keyValue) < 2 { 214 | return nil, errors.New("invalid form item: " + item) 215 | } 216 | key := keyValue[0] 217 | if !tools.StringSliceContain(config.AllowedFormName, key) { 218 | return nil, errors.New("not allowed form key: " + key) 219 | } 220 | value := keyValue[1] 221 | parsedData[key] = value 222 | } 223 | return parsedData, nil 224 | } 225 | 226 | func keywordStringToMap(data []string) (map[string]string, error) { 227 | parsedData := map[string]string{} 228 | for _, item := range data { 229 | keyValue := strings.Split(item, "=") 230 | if len(keyValue) < 2 { 231 | return nil, errors.New("invalid keyword format: " + item) 232 | } 233 | key := keyValue[0] 234 | value := keyValue[1] 235 | parsedData[key] = value 236 | } 237 | return parsedData, nil 238 | } 239 | 240 | func outputResult(result *pkg.Result) { 241 | // 输出结果 242 | if outputMode == "json" { 243 | fmt.Println("--[Mission Complete]--") 244 | resBytes := getJsonSerialize(result) 245 | fmt.Println(string(resBytes)) 246 | } else if outputMode == "console" { 247 | for _, req := range result.ReqList { 248 | req.FormatPrint() 249 | } 250 | } 251 | if len(outputJsonPath) != 0 { 252 | resBytes := getJsonSerialize(result) 253 | tools.WriteFile(outputJsonPath, resBytes) 254 | } 255 | } 256 | 257 | /** 258 | 原生被动代理推送支持 259 | */ 260 | func Push2Proxy(reqList []*model2.Request) { 261 | pool, _ := ants.NewPool(pushProxyPoolMax) 262 | defer pool.Release() 263 | for _, req := range reqList { 264 | task := ProxyTask{ 265 | req: req, 266 | pushProxy: pushAddress, 267 | } 268 | pushProxyWG.Add(1) 269 | go func() { 270 | err := pool.Submit(task.doRequest) 271 | if err != nil { 272 | logger.Logger.Error("add Push2Proxy task failed: ", err) 273 | pushProxyWG.Done() 274 | } 275 | }() 276 | } 277 | pushProxyWG.Wait() 278 | } 279 | 280 | /** 281 | 协程池请求的任务 282 | */ 283 | func (p *ProxyTask) doRequest() { 284 | defer pushProxyWG.Done() 285 | _, _ = requests.Request(p.req.Method, p.req.URL.String(), tools.ConvertHeaders(p.req.Headers), []byte(p.req.PostData), 286 | &requests.ReqOptions{Timeout: 1, AllowRedirect: false, Proxy: p.pushProxy}) 287 | } 288 | 
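// handleExit blocks until a signal arrives on signalChan (registered in run via signal.Notify), then shrinks the ants pool to a single worker, releases it, closes the browser and exits with a non-zero status.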
289 | func handleExit(t *pkg.CrawlerTask) { 290 | <-signalChan 291 | fmt.Println("exit ...") 292 | t.Pool.Tune(1) 293 | t.Pool.Release() 294 | t.Browser.Close() 295 | os.Exit(-1) 296 | } 297 | 298 | func getJsonSerialize(result *pkg.Result) []byte { 299 | var res Result 300 | var reqList []Request 301 | var allReqList []Request 302 | for _, _req := range result.ReqList { 303 | var req Request 304 | req.Method = _req.Method 305 | req.Url = _req.URL.String() 306 | req.Source = _req.Source 307 | req.Data = _req.PostData 308 | req.Headers = _req.Headers 309 | reqList = append(reqList, req) 310 | } 311 | for _, _req := range result.AllReqList { 312 | var req Request 313 | req.Method = _req.Method 314 | req.Url = _req.URL.String() 315 | req.Source = _req.Source 316 | req.Data = _req.PostData 317 | req.Headers = _req.Headers 318 | allReqList = append(allReqList, req) 319 | } 320 | res.AllReqList = allReqList 321 | res.ReqList = reqList 322 | res.AllDomainList = result.AllDomainList 323 | res.SubDomainList = result.SubDomainList 324 | 325 | resBytes, err := json.Marshal(res) 326 | if err != nil { 327 | log.Fatal("Marshal result error") 328 | } 329 | return resBytes 330 | } 331 | -------------------------------------------------------------------------------- /dockerfile: -------------------------------------------------------------------------------- 1 | ## Build 2 | FROM golang:1.16-buster AS build 3 | 4 | WORKDIR /app 5 | 6 | RUN apt-get update && apt-get install unzip && rm -rf /var/lib/apt/lists/* 7 | COPY ./ ./ 8 | RUN make build 9 | RUN chmod +x ./get_chrome.sh && ./get_chrome.sh 10 | 11 | 12 | ## Deploy 13 | FROM ubuntu:18.04 14 | 15 | WORKDIR / 16 | 17 | COPY --from=build /app/bin/crawlergo /crawlergo 18 | COPY --from=build /app/latest/ /chrome/ 19 | RUN apt-get update && apt-get install -yq --no-install-recommends \ 20 | libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 \ 21 | libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 \ 22 | libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libgbm1 \ 23 | libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 libnss3 \ 24 | && rm -rf /var/lib/apt/lists/* 25 | 26 | ENTRYPOINT ["/crawlergo", "-c", "/chrome/chrome"] -------------------------------------------------------------------------------- /examples/host_binding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | import simplejson 5 | import subprocess 6 | """ 7 | 628235 版本的chrome可用 8 | 9 | 为什么高版本无法Host绑定? 
10 | https://github.com/chromium/chromium/commit/d31383577e0517843c8059dec9b87469bf30900f#diff-d717572478f6a97f889b33917c9d3a5f 11 | 12 | 查找历史版本 13 | https://github.com/macchrome/winchrome/releases?after=v77.0.3865.90-r681094-Win64 14 | 15 | 下载地址 16 | https://storage.googleapis.com/chromium-browser-snapshots/Linux_x64/628235/chrome-linux.zip 17 | """ 18 | 19 | 20 | def main(): 21 | target = "http://176.28.50.165/" 22 | headers = { 23 | "Host": "testphp.vulnweb.com", 24 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 25 | "Chrome/74.0.3945.0 Safari/537.36", 26 | } 27 | cmd = ["./crawlergo_cmd", "-c", "/tmp/chrome-linux-628235/chrome", 28 | "-o", "json", "--custom-headers", simplejson.dumps(headers), target] 29 | rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 30 | output, error = rsp.communicate() 31 | 32 | result = simplejson.loads(output.decode().split("--[Mission Complete]--")[1]) 33 | req_list = result["req_list"] 34 | for each in req_list: 35 | print(each) 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /examples/request_with_cookie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | import simplejson 5 | import subprocess 6 | """ 7 | 添加Cookie扫描示例 8 | 9 | 命令行调用时: 10 | ./crawlergo -c /home/test/chrome-linux/chrome -o json --ignore-url-keywords quit,exit,zhuxiao --custom-headers "{\"Cookie\": \"crawlergo=Cool\"}" 11 | 12 | 使用 --ignore-url-keywords 添加你想要的排除的关键字,避免访问注销请求 13 | """ 14 | 15 | 16 | def main(): 17 | target = "http://testphp.vulnweb.com/" 18 | headers = { 19 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 20 | "Chrome/74.0.3945.0 Safari/537.36", 21 | "Cookie": "crawlergo=Cool" 22 | } 23 | cmd = ["./crawlergo", "-c", "/home/test/chrome-linux/chrome", 24 | "-o", "json", "--ignore-url-keywords", "quit,exit,zhuxiao", "--custom-headers", simplejson.dumps(headers), 25 | target] 26 | 27 | rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 28 | output, error = rsp.communicate() 29 | 30 | result = simplejson.loads(output.decode().split("--[Mission Complete]--")[1]) 31 | req_list = result["req_list"] 32 | for each in req_list: 33 | print(each) 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /examples/subprocess_call.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | import simplejson 5 | import subprocess 6 | 7 | 8 | def main(): 9 | target = "http://testphp.vulnweb.com/" 10 | cmd = ["./crawlergo_cmd", "-c", "/tmp/chrome-linux/chrome", "-o", "json", target] 11 | rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 12 | output, error = rsp.communicate() 13 | 14 | result = simplejson.loads(output.decode().split("--[Mission Complete]--")[1]) 15 | req_list = result["req_list"] 16 | print(req_list[0]) 17 | 18 | 19 | if __name__ == '__main__': 20 | main() 21 | -------------------------------------------------------------------------------- /examples/zombie_clean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding: utf-8 3 | 4 | """ 5 | author: 猪猪侠 
https://github.com/ring04h 6 | 7 | """ 8 | 9 | import logging 10 | import subprocess 11 | 12 | logging.basicConfig(level=logging.DEBUG) 13 | 14 | # 15 | # (crontab -l;echo '0 2 * * * /usr/local/bin/python3 /data/script/zombie_clean.py') | crontab - 16 | # 17 | 18 | def is_timeout(etime): 19 | if '-' in etime: 20 | day, hour = etime.split('-') 21 | return True if int(day) >= 1 else False 22 | else: 23 | return False 24 | 25 | 26 | def cmdprocess(cmdline): 27 | 28 | pipe = subprocess.Popen(cmdline, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 29 | output, stderr = pipe.communicate() 30 | return_code = pipe.returncode 31 | stderr = stderr.decode(errors='replace') 32 | output = output.decode(errors='replace') 33 | return output, stderr, return_code 34 | 35 | 36 | 37 | def main(): 38 | 39 | cmdline = "ps -ef | grep crawlergo | grep -v grep | awk '{print $2}'" 40 | output, stderr, return_code = cmdprocess(cmdline) 41 | 42 | if return_code != 0: 43 | return 44 | 45 | zombie_pids = output.splitlines() 46 | 47 | for zombie_pid in zombie_pids: 48 | 49 | cmdline = f'''ps -eo pid,etime | grep {zombie_pid}''' 50 | ps_output, ps_stderr, ps_return_code = cmdprocess(cmdline) 51 | 52 | if ps_return_code != 0: 53 | continue 54 | 55 | for line in ps_output.splitlines(): 56 | 57 | pid, etime = line.split() 58 | 59 | status = is_timeout(etime) 60 | logging.debug(f"PID: {pid:<8} ETIME: {etime:<15} TIMEOUT: {status}") 61 | 62 | if not status: 63 | continue 64 | 65 | kill_cmdline = f"kill -9 {pid}" 66 | logging.debug(f"call kill : [{kill_cmdline}]") 67 | 68 | cmdprocess(kill_cmdline) 69 | 70 | if __name__ == "__main__": 71 | main() 72 | 73 | -------------------------------------------------------------------------------- /get_chrome.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd $(dirname $0) 3 | 4 | LASTCHANGE_URL="https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2FLAST_CHANGE?alt=media" 5 | 6 | REVISION=$(curl -s -S $LASTCHANGE_URL) 7 | 8 | echo "latest revision is $REVISION" 9 | 10 | if [ -d $REVISION ] ; then 11 | echo "already have latest version" 12 | exit 13 | fi 14 | 15 | ZIP_URL="https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F$REVISION%2Fchrome-linux.zip?alt=media" 16 | 17 | ZIP_FILE="${REVISION}-chrome-linux.zip" 18 | 19 | echo "fetching $ZIP_URL" 20 | 21 | rm -rf $REVISION 22 | mkdir $REVISION 23 | pushd $REVISION 24 | curl -# $ZIP_URL > $ZIP_FILE 25 | echo "unzipping.." 
26 | unzip $ZIP_FILE 27 | popd 28 | rm -f ./latest 29 | ln -s $REVISION/chrome-linux/ ./latest -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Qianlitp/crawlergo 2 | 3 | go 1.16 4 | 5 | replace git.apache.org/thrift.git => github.com/apache/thrift v0.13.0 6 | 7 | require ( 8 | github.com/chromedp/cdproto v0.0.0-20220629234738-4cfc9cdeeb92 9 | github.com/chromedp/chromedp v0.8.2 10 | github.com/deckarep/golang-set v1.7.1 11 | github.com/gogf/gf v1.16.6 12 | github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 // indirect 13 | github.com/panjf2000/ants/v2 v2.2.2 14 | github.com/pkg/errors v0.8.1 15 | github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect 16 | github.com/sirupsen/logrus v1.4.2 17 | github.com/urfave/cli/v2 v2.23.6 18 | golang.org/x/net v0.0.0-20210520170846-37e1c6afe023 19 | golang.org/x/sys v0.0.0-20220627191245-f75cf1eec38b // indirect 20 | ) 21 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= 2 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 3 | github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= 4 | github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= 5 | github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4 h1:QD3KxSJ59L2lxG6MXBjNHxiQO2RmxTQ3XcK+wO44WOg= 6 | github.com/chromedp/cdproto v0.0.0-20191114225735-6626966fbae4/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g= 7 | github.com/chromedp/cdproto v0.0.0-20220515234810-83d799542a04/go.mod h1:5Y4sD/eXpwrChIuxhSr/G20n9CdbCmoerOHnuAf0Zr0= 8 | github.com/chromedp/cdproto v0.0.0-20220629234738-4cfc9cdeeb92 h1:0kiAQSLWZDt4wsmcICou3C6in/OJ58FCqvXcB8Ax1Dk= 9 | github.com/chromedp/cdproto v0.0.0-20220629234738-4cfc9cdeeb92/go.mod h1:5Y4sD/eXpwrChIuxhSr/G20n9CdbCmoerOHnuAf0Zr0= 10 | github.com/chromedp/chromedp v0.5.2 h1:W8xBXQuUnd2dZK0SN/lyVwsQM7KgW+kY5HGnntms194= 11 | github.com/chromedp/chromedp v0.5.2/go.mod h1:rsTo/xRo23KZZwFmWk2Ui79rBaVRRATCjLzNQlOFSiA= 12 | github.com/chromedp/chromedp v0.8.2 h1:EYSsSqWuKYwyHZEJpU00kOGOMz5DE0qDVckelzauMFA= 13 | github.com/chromedp/chromedp v0.8.2/go.mod h1:vpbCNtfYeOUo2q5reuwX6ZmPpbHRf5PZfAqNR2ObB+g= 14 | github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= 15 | github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= 16 | github.com/clbanning/mxj v1.8.5-0.20200714211355-ff02cfb8ea28 h1:LdXxtjzvZYhhUaonAaAKArG3pyC67kGL3YY+6hGG8G4= 17 | github.com/clbanning/mxj v1.8.5-0.20200714211355-ff02cfb8ea28/go.mod h1:BVjHeAH+rl9rs6f+QIpeRl0tfu10SXn1pUSa5PVGJng= 18 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= 19 | github.com/cpuguy83/go-md2man/v2 v2.0.0 h1:EoUDS0afbrsXAZ9YQ9jdu/mZ2sXgT1/2yyNng4PGlyM= 20 | github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= 21 | github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= 22 | github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= 23 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
24 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 25 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 26 | github.com/deckarep/golang-set v1.7.1 h1:SCQV0S6gTtp6itiFrTqI+pfmJ4LN85S1YzhDf9rTHJQ= 27 | github.com/deckarep/golang-set v1.7.1/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ= 28 | github.com/fatih/color v1.12.0 h1:mRhaKNwANqRgUBGKmnI5ZxEk7QXmjQeCcuYFMX2bfcc= 29 | github.com/fatih/color v1.12.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= 30 | github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= 31 | github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= 32 | github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= 33 | github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= 34 | github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0= 35 | github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo= 36 | github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= 37 | github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= 38 | github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8= 39 | github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= 40 | github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= 41 | github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= 42 | github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo= 43 | github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM= 44 | github.com/gobwas/ws v1.1.0 h1:7RFti/xnNkMJnrK7D1yQ/iCIB5OrrY/54/H930kIbHA= 45 | github.com/gobwas/ws v1.1.0/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= 46 | github.com/gogf/gf v1.16.6 h1:Yp5YfwnGz41d1tiVqxcWXiPXyuzjTb7ax4SnPSXxDE8= 47 | github.com/gogf/gf v1.16.6/go.mod h1:4LoHfEBl2jbVmZpVx+qk2La3zWr1V315FtF2PVZuyQ8= 48 | github.com/gomodule/redigo v1.8.5 h1:nRAxCa+SVsyjSBrtZmG/cqb6VbTmuRzpg/PoTFlpumc= 49 | github.com/gomodule/redigo v1.8.5/go.mod h1:P9dn9mFrCBvWhGE1wpxx6fgq7BAeLBk+UUUzlpkBYO0= 50 | github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ= 51 | github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 52 | github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc= 53 | github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= 54 | github.com/grokify/html-strip-tags-go v0.0.0-20190921062105-daaa06bf1aaf h1:wIOAyJMMen0ELGiFzlmqxdcV1yGbkyHBAB6PolcNbLA= 55 | github.com/grokify/html-strip-tags-go v0.0.0-20190921062105-daaa06bf1aaf/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78= 56 | github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= 57 | github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 58 | github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs= 59 | github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0= 60 | github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk= 61 | 
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= 62 | github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM= 63 | github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= 64 | github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= 65 | github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= 66 | github.com/mattn/go-colorable v0.1.8 h1:c1ghPdyEDarC70ftn0y+A/Ee++9zz8ljHG1b13eJ0s8= 67 | github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= 68 | github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= 69 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 70 | github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0= 71 | github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= 72 | github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= 73 | github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= 74 | github.com/orisano/pixelmatch v0.0.0-20210112091706-4fa4c7ba91d5/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= 75 | github.com/panjf2000/ants/v2 v2.2.2 h1:TWzusBjq/IflXhy+/S6u5wmMLCBdJnB9tPIx9Zmhvok= 76 | github.com/panjf2000/ants/v2 v2.2.2/go.mod h1:1GFm8bV8nyCQvU5K4WvBCTG1/YBFOD2VzjffD8fV55A= 77 | github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= 78 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 79 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 80 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 81 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= 82 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 83 | github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= 84 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 85 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= 86 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= 87 | github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4= 88 | github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= 89 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 90 | github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 91 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 92 | github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= 93 | github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= 94 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 95 | github.com/urfave/cli/v2 v2.0.0 h1:+HU9SCbu8GnEUFtIBfuUNXN39ofWViIEJIp6SURMpCg= 96 | github.com/urfave/cli/v2 v2.0.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ= 97 | github.com/urfave/cli/v2 v2.23.6 h1:iWmtKD+prGo1nKUtLO0Wg4z9esfBM4rAV4QRLQiEmJ4= 98 | github.com/urfave/cli/v2 v2.23.6/go.mod 
h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc= 99 | github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU= 100 | github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8= 101 | go.opentelemetry.io/otel v1.0.0-RC2 h1:SHhxSjB+omnGZPgGlKe+QMp3MyazcOHdQ8qwo89oKbg= 102 | go.opentelemetry.io/otel v1.0.0-RC2/go.mod h1:w1thVQ7qbAy8MHb0IFj8a5Q2QU0l2ksf8u/CN8m3NOM= 103 | go.opentelemetry.io/otel/oteltest v1.0.0-RC2 h1:xNKqMhlZYkASSyvF4JwObZFMq0jhFN3c3SP+2rCzVPk= 104 | go.opentelemetry.io/otel/oteltest v1.0.0-RC2/go.mod h1:kiQ4tw5tAL4JLTbcOYwK1CWI1HkT5aiLzHovgOVnz/A= 105 | go.opentelemetry.io/otel/trace v1.0.0-RC2 h1:dunAP0qDULMIT82atj34m5RgvsIK6LcsXf1c/MsYg1w= 106 | go.opentelemetry.io/otel/trace v1.0.0-RC2/go.mod h1:JPQ+z6nNw9mqEGT8o3eoPTdnNI+Aj5JcxEsVGREIAy4= 107 | golang.org/x/net v0.0.0-20210520170846-37e1c6afe023 h1:ADo5wSpq2gqaCGQWzk7S5vd//0iyyLeAratkEoG5dLE= 108 | golang.org/x/net v0.0.0-20210520170846-37e1c6afe023/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 109 | golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 110 | golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 111 | golang.org/x/sys v0.0.0-20191113165036-4c7a9d0fe056/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 112 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 113 | golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 114 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 115 | golang.org/x/sys v0.0.0-20201207223542-d4d67f95c62d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 116 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da h1:b3NXsE2LusjYGGjL5bxEVZZORm/YEFFrWFjR8eFrw/c= 117 | golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 118 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 119 | golang.org/x/sys v0.0.0-20220627191245-f75cf1eec38b h1:2n253B2r0pYSmEV+UNCQoPfU/FiaizQEK5Gu4Bq4JE8= 120 | golang.org/x/sys v0.0.0-20220627191245-f75cf1eec38b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 121 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 122 | golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= 123 | golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 124 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 125 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= 126 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 127 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 128 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 129 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 130 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 131 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b 
h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
132 | gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
133 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
134 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
135 | 
--------------------------------------------------------------------------------
/imgs/bypass.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zer0yu/crawlergo/1ee900d6714b596f83efa5d1b54748eacabdf293/imgs/bypass.png
--------------------------------------------------------------------------------
/imgs/chrome_path.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zer0yu/crawlergo/1ee900d6714b596f83efa5d1b54748eacabdf293/imgs/chrome_path.png
--------------------------------------------------------------------------------
/imgs/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zer0yu/crawlergo/1ee900d6714b596f83efa5d1b54748eacabdf293/imgs/demo.gif
--------------------------------------------------------------------------------
/imgs/skp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zer0yu/crawlergo/1ee900d6714b596f83efa5d1b54748eacabdf293/imgs/skp.png
--------------------------------------------------------------------------------
/pkg/config/config.go:
--------------------------------------------------------------------------------
1 | package config
2 | 
3 | import (
4 |     "time"
5 | 
6 |     mapset "github.com/deckarep/golang-set"
7 | )
8 | 
9 | const (
10 |     DefaultUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36"
11 |     MaxTabsCount = 10
12 |     TabRunTimeout = 20 * time.Second
13 |     DefaultInputText = "Crawlergo"
14 |     FormInputKeyword = "Crawlergo"
15 |     SuspectURLRegex = `(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`
16 |     URLRegex = `((https?|ftp|file):)?//[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]`
17 |     AttrURLRegex = ``
18 |     DomContentLoadedTimeout = 5 * time.Second
19 |     EventTriggerInterval = 100 * time.Millisecond // interval between triggered events
20 |     BeforeExitDelay = 1 * time.Second
21 |     DefaultEventTriggerMode = EventTriggerAsync
22 |     MaxCrawlCount = 200
23 | )
24 | 
25 | // Request methods
26 | const (
27 |     GET = "GET"
28 |     POST = "POST"
29 |     PUT = "PUT"
30 |     DELETE = "DELETE"
31 |     HEAD = "HEAD"
32 |     OPTIONS = "OPTIONS"
33 | )
34 | 
35 | // Filter modes
36 | const (
37 |     SimpleFilterMode = "simple"
38 |     SmartFilterMode = "smart"
39 |     StrictFilterMode = "strict"
40 | )
41 | 
42 | // Event trigger modes
43 | const (
44 |     EventTriggerAsync = "async"
45 |     EventTriggerSync = "sync"
46 | )
47 | 
48 | // Request sources
49 | const (
50 |     FromTarget = "Target" // the initial input target
51 |     FromNavigation = "Navigation" // page navigation request
52 |     FromXHR = "XHR" // asynchronous AJAX request
53 |     FromDOM = "DOM" // request parsed out of the DOM
54 |     FromJSFile = "JavaScript" // parsed from JS file content
55 |     FromFuzz = "PathFuzz" // initial path fuzzing
56 |     FromRobots = "robots.txt" // robots.txt
57 |     FromComment = "Comment" // HTML comments in the page
58 |     FromWebSocket = "WebSocket"
59 |     FromEventSource = "EventSource"
60 |     FromFetch = "Fetch"
61 |     FromHistoryAPI = "HistoryAPI"
62 |     FromOpenWindow = "OpenWindow"
63 |     FromHashChange = "HashChange"
64 |     FromStaticRes = "StaticResource"
65 |     FromStaticRegex = "StaticRegex"
66 | )
67 | 
68 | // Content-Type values
69 | const (
70 |     JSON = "application/json"
71 |     URLENCODED = "application/x-www-form-urlencoded"
72 |     MULTIPART = "multipart/form-data"
73 | )
74 | 
75 | var (
76 |     StaticSuffix = []string{
77 |         "png", "gif", "jpg", "mp4", "mp3", "mng", "pct", "bmp", "jpeg", "pst", "psp", "ttf",
78 |         "tif", "tiff", "ai", "drw", "wma", "ogg", "wav", "ra", "aac", "mid", "au", "aiff",
79 |         "dxf", "eps", "ps", "svg", "3gp", "asf", "asx", "avi", "mov", "mpg", "qt", "rm",
80 |         "wmv", "m4a", "bin", "xls", "xlsx", "ppt", "pptx", "doc", "docx", "odt", "ods", "odg",
81 |         "odp", "exe", "zip", "rar", "tar", "gz", "iso", "rss", "pdf", "txt", "dll", "ico",
82 |         "gz2", "apk", "crt", "woff", "map", "woff2", "webp", "less", "dmg", "bz2", "otf", "swf",
83 |         "flv", "mpeg", "dat", "xsl", "csv", "cab", "exif", "wps", "m4v", "rmvb",
84 |     }
85 |     StaticSuffixSet mapset.Set
86 | )
87 | 
88 | var (
89 |     ScriptSuffix = []string{
90 |         "php", "asp", "jsp", "asa",
91 |     }
92 |     ScriptSuffixSet mapset.Set
93 | )
94 | 
95 | var DefaultIgnoreKeywords = []string{"logout", "quit", "exit"}
96 | var AllowedFormName = []string{"default", "mail", "code", "phone", "username", "password", "qq", "id_card", "url", "date", "number"}
97 | 
98 | type ContinueResourceList []string
99 | 
100 | var InputTextMap = map[string]map[string]interface{}{
101 |     "mail": {
102 |         "keyword": []string{"mail"},
103 |         "value": "crawlergo@gmail.com",
104 |     },
105 |     "code": {
106 |         "keyword": []string{"yanzhengma", "code", "ver", "captcha"},
107 |         "value": "123a",
108 |     },
109 |     "phone": {
110 |         "keyword": []string{"phone", "number", "tel", "shouji"},
111 |         "value": "18812345678",
112 |     },
113 |     "username": {
114 |         "keyword": []string{"name", "user", "id", "login", "account"},
115 |         "value": "crawlergo@gmail.com",
116 |     },
117 |     "password": {
118 |         "keyword": []string{"pass", "pwd"},
119 |         "value": "Crawlergo6.",
120 |     },
121 |     "qq": {
122 |         "keyword": []string{"qq", "wechat", "tencent", "weixin"},
123 |         "value": "123456789",
124 |     },
125 |     "IDCard": {
126 |         "keyword": []string{"card", "shenfen"},
127 |         "value": "511702197409284963",
128 |     },
129 |     "url": {
130 |         "keyword": []string{"url", "site", "web", "blog", "link"},
131 |         "value": "http://crawlergo.nice.cn/",
132 |     },
133 |     "date": {
134 |         "keyword": []string{"date", "time", "year", "now"},
135 |         "value": "2018-01-01",
136 |     },
137 |     "number": {
138 |         "keyword": []string{"day", "age", "num", "count"},
139 |         "value": "10",
140 |     },
141 | }
142 | 
143 | func init() {
144 |     StaticSuffixSet = initSet(StaticSuffix)
145 |     ScriptSuffixSet = initSet(ScriptSuffix)
146 | }
147 | 
148 | func initSet(suffixes []string) mapset.Set {
149 |     set := mapset.NewSet()
150 |     for _, s := range suffixes {
151 |         set.Add(s)
152 |     }
153 |     return set
154 | }
155 | 
--------------------------------------------------------------------------------
/pkg/config/config_test.go:
--------------------------------------------------------------------------------
1 | package config_test
2 | 
3 | import (
4 |     "testing"
5 | 
6 |     "github.com/Qianlitp/crawlergo/pkg/config"
7 |     "github.com/stretchr/testify/assert"
8 | )
9 | 
10 | func TestStaticSuffix(t *testing.T) {
11 |     assert.Equal(t, true, config.StaticSuffixSet.Contains("png"))
12 |     assert.Equal(t, false, config.StaticSuffixSet.Contains("demo"))
13 | 
14 |     assert.Equal(t, true, config.ScriptSuffixSet.Contains("asp"))
15 |     assert.Equal(t, false, config.ScriptSuffixSet.Contains("demo"))
16 | }
17 | 
--------------------------------------------------------------------------------
/pkg/domain_collect.go:
--------------------------------------------------------------------------------
1 | package pkg
2 | 
3 | import (
4 |     "strings"
5 | 
6 |     "github.com/Qianlitp/crawlergo/pkg/model"
7 |     mapset "github.com/deckarep/golang-set"
8 | )
9 | 
10 | func SubDomainCollect(reqList []*model.Request, HostLimit string) []string {
11 |     var subDomainList []string
12 |     uniqueSet := mapset.NewSet()
13 |     for _, req := range reqList {
14 |         domain := req.URL.Hostname()
15 |         if uniqueSet.Contains(domain) {
16 |             continue
17 |         }
18 |         uniqueSet.Add(domain)
19 |         if strings.HasSuffix(domain, "."+HostLimit) {
20 |             subDomainList = append(subDomainList, domain)
21 |         }
22 |     }
23 |     return subDomainList
24 | }
25 | 
26 | func AllDomainCollect(reqList []*model.Request) []string {
27 |     uniqueSet := mapset.NewSet()
28 |     var allDomainList []string
29 |     for _, req := range reqList {
30 |         domain := req.URL.Hostname()
31 |         if uniqueSet.Contains(domain) {
32 |             continue
33 |         }
34 |         uniqueSet.Add(domain)
35 |         allDomainList = append(allDomainList, req.URL.Hostname())
36 |     }
37 |     return allDomainList
38 | }
39 | 
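A short usage sketch for the two collectors. The model.GetUrl and model.GetRequest calls mirror how they are invoked elsewhere in this repository, but their exact signatures live in pkg/model (not shown here) and are assumed; the hand-built result list stands in for real crawl output:

```go
package main

import (
	"fmt"

	"github.com/Qianlitp/crawlergo/pkg"
	"github.com/Qianlitp/crawlergo/pkg/model"
)

func main() {
	// Hand-built result list; in real use this comes from a finished crawl.
	var results []*model.Request
	for _, raw := range []string{
		"http://testphp.vulnweb.com/login.php",
		"http://admin.vulnweb.com/index.php",
	} {
		// Assumption: GetUrl accepts a bare absolute URL without a parent.
		u, err := model.GetUrl(raw)
		if err != nil {
			continue
		}
		req := model.GetRequest("GET", u, model.Options{Headers: map[string]interface{}{}})
		results = append(results, &req)
	}

	fmt.Println(pkg.AllDomainCollect(results))                // every unique hostname
	fmt.Println(pkg.SubDomainCollect(results, "vulnweb.com")) // only *.vulnweb.com
}
```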
--------------------------------------------------------------------------------
/pkg/engine/after_dom_tasks.go:
--------------------------------------------------------------------------------
1 | package engine
2 | 
3 | import (
4 |     "context"
5 |     "os"
6 |     "strings"
7 |     "time"
8 | 
9 |     "github.com/Qianlitp/crawlergo/pkg/config"
10 |     "github.com/Qianlitp/crawlergo/pkg/js"
11 |     "github.com/Qianlitp/crawlergo/pkg/logger"
12 |     "github.com/chromedp/cdproto/cdp"
13 |     "github.com/chromedp/chromedp"
14 | )
15 | 
16 | /**
17 | Runs after DOMContentLoaded has fired.
18 | */
19 | func (tab *Tab) AfterDOMRun() {
20 |     defer tab.WG.Done()
21 | 
22 |     logger.Logger.Debug("afterDOMRun start")
23 | 
24 |     // Fetch the NodeID of the current body node, used for child-node lookups later
25 |     if !tab.getBodyNodeId() {
26 |         logger.Logger.Debug("no body document NodeID, exit.")
27 |         return
28 |     }
29 | 
30 |     tab.domWG.Add(2)
31 |     go tab.fillForm()
32 |     go tab.setObserverJS()
33 |     tab.domWG.Wait()
34 |     logger.Logger.Debug("afterDOMRun end")
35 |     tab.WG.Add(1)
36 |     go tab.AfterLoadedRun()
37 | }
38 | 
39 | /**
40 | Get the body's NodeID so child nodes can later be queried without waiting.
41 | Waits at most 3 seconds; gives up if the DOM still has not finished rendering.
42 | */
43 | func (tab *Tab) getBodyNodeId() bool {
44 |     var docNodeIDs []cdp.NodeID
45 |     ctx := tab.GetExecutor()
46 |     tCtx, cancel := context.WithTimeout(ctx, time.Second*3)
47 |     defer cancel()
48 |     // Get the frame's document root
49 |     err := chromedp.NodeIDs(`body`, &docNodeIDs, chromedp.ByQuery).Do(tCtx)
50 |     if len(docNodeIDs) == 0 || err != nil {
51 |         // not root node yet?
52 |         logger.Logger.Debug("getBodyNodeId failed, maybe DOM not ready?")
53 |         if err != nil {
54 |             logger.Logger.Debug(err)
55 |         }
56 |         return false
57 |     }
58 |     tab.DocBodyNodeId = docNodeIDs[0]
59 |     return true
60 | }
61 | 
62 | /**
63 | Fill forms automatically.
64 | */
65 | func (tab *Tab) fillForm() {
66 |     defer tab.domWG.Done()
67 |     logger.Logger.Debug("fillForm start")
68 |     tab.fillFormWG.Add(3)
69 |     f := FillForm{
70 |         tab: tab,
71 |     }
72 | 
73 |     go f.fillInput()
74 |     go f.fillMultiSelect()
75 |     go f.fillTextarea()
76 | 
77 |     tab.fillFormWG.Wait()
78 |     logger.Logger.Debug("fillForm end")
79 | }
80 | 
81 | /**
82 | Install the observer function that watches DOM node changes.
83 | */
84 | func (tab *Tab) setObserverJS() {
85 |     defer tab.domWG.Done()
86 |     logger.Logger.Debug("setObserverJS start")
87 |     // Install the observer for DOM node changes
88 |     go tab.Evaluate(js.ObserverJS)
89 |     logger.Logger.Debug("setObserverJS end")
90 | }
91 | 
92 | type FillForm struct {
93 |     tab *Tab
94 | }
95 | 
96 | /**
97 | Fill every input element.
98 | */
99 | func (f *FillForm) fillInput() {
100 |     defer f.tab.fillFormWG.Done()
101 |     var nodes []*cdp.Node
102 |     ctx := f.tab.GetExecutor()
103 | 
104 |     tCtx, cancel := context.WithTimeout(ctx, time.Second*2)
105 |     defer cancel()
106 |     // First check whether any input exists at all, so we can exit early and shorten the wait
107 |     inputNodes, inputErr := f.tab.GetNodeIDs(`input`)
108 |     if inputErr != nil || len(inputNodes) == 0 {
109 |         logger.Logger.Debug("fillInput: get form input element err")
110 |         if inputErr != nil {
111 |             logger.Logger.Debug(inputErr)
112 |         }
113 |         return
114 |     }
115 |     // Fetch all input elements
116 |     err := chromedp.Nodes(`input`, &nodes, chromedp.ByQueryAll).Do(tCtx)
117 | 
118 |     if err != nil {
119 |         logger.Logger.Debug("get all input element err")
120 |         logger.Logger.Debug(err)
121 |         return
122 |     }
123 | 
124 |     // Handle inputs whose type is empty or type=text
125 |     for _, node := range nodes {
126 |         // Safety-net timeout
127 |         tCtxN, cancelN := context.WithTimeout(ctx, time.Second*5)
128 |         attrType := node.AttributeValue("type")
129 |         if attrType == "text" || attrType == "" {
130 |             inputName := node.AttributeValue("id") + node.AttributeValue("class") + node.AttributeValue("name")
131 |             value := f.GetMatchInputText(inputName)
132 |             var nodeIds = []cdp.NodeID{node.NodeID}
133 |             // First simulate keystrokes
134 |             _ = chromedp.SendKeys(nodeIds, value, chromedp.ByNodeID).Do(tCtxN)
135 |             // Then assign the value attribute directly
136 |             _ = chromedp.SetAttributeValue(nodeIds, "value", value, chromedp.ByNodeID).Do(tCtxN)
137 |         } else if attrType == "email" || attrType == "password" || attrType == "tel" {
138 |             value := f.GetMatchInputText(attrType)
139 |             var nodeIds = []cdp.NodeID{node.NodeID}
140 |             // First simulate keystrokes
141 |             _ = chromedp.SendKeys(nodeIds, value, chromedp.ByNodeID).Do(tCtxN)
142 |             // Then assign the value attribute directly
143 |             _ = chromedp.SetAttributeValue(nodeIds, "value", value, chromedp.ByNodeID).Do(tCtxN)
144 |         } else if attrType == "radio" || attrType == "checkbox" {
145 |             var nodeIds = []cdp.NodeID{node.NodeID}
146 |             _ = chromedp.SetAttributeValue(nodeIds, "checked", "true", chromedp.ByNodeID).Do(tCtxN)
147 |         } else if attrType == "file" || attrType == "image" {
148 |             var nodeIds = []cdp.NodeID{node.NodeID}
149 |             wd, _ := os.Getwd()
150 |             filePath := wd + "/upload/image.png"
151 |             _ = chromedp.RemoveAttribute(nodeIds, "accept", chromedp.ByNodeID).Do(tCtxN)
152 |             _ = chromedp.RemoveAttribute(nodeIds, "required", chromedp.ByNodeID).Do(tCtxN)
153 |             _ = chromedp.SendKeys(nodeIds, filePath, chromedp.ByNodeID).Do(tCtxN)
154 |         }
155 |         cancelN()
156 |     }
157 | }
158 | 
159 | func (f *FillForm) fillTextarea() {
160 |     defer f.tab.fillFormWG.Done()
161 |     ctx := f.tab.GetExecutor()
162 |     tCtx, cancel := context.WithTimeout(ctx, time.Second*2)
163 |     defer cancel()
164 |     value := f.GetMatchInputText("other")
165 | 
166 |     textareaNodes, textareaErr := f.tab.GetNodeIDs(`textarea`)
167 |     if textareaErr != nil || len(textareaNodes) == 0 {
168 |         logger.Logger.Debug("fillTextarea: get textarea element err")
169 |         if textareaErr != nil {
170 |             logger.Logger.Debug(textareaErr)
171 |         }
172 |         return
173 |     }
174 | 
175 |     _ = chromedp.SendKeys(textareaNodes, value, chromedp.ByNodeID).Do(tCtx)
176 | }
177 | 
178 | func (f *FillForm) fillMultiSelect() {
179 |     defer f.tab.fillFormWG.Done()
180 |     ctx := f.tab.GetExecutor()
181 |     tCtx, cancel := context.WithTimeout(ctx, time.Second*2)
182 |     defer cancel()
183 |     optionNodes, optionErr := f.tab.GetNodeIDs(`select option:first-child`)
184 |     if optionErr != nil || len(optionNodes) == 0 {
185 |         logger.Logger.Debug("fillMultiSelect: get select option element err")
186 |         if optionErr != nil {
187 |             logger.Logger.Debug(optionErr)
188 |         }
189 |         return
190 |     }
191 |     _ = chromedp.SetAttributeValue(optionNodes, "selected", "true", chromedp.ByNodeID).Do(tCtx)
192 |     _ = chromedp.SetJavascriptAttribute(optionNodes, "selected", "true", chromedp.ByNodeID).Do(tCtx)
193 | }
194 | 
195 | func (f *FillForm) GetMatchInputText(name string) string {
196 |     // If custom keywords were configured, fuzzy-match them first
197 |     for key, value := range f.tab.config.CustomFormKeywordValues {
198 |         if strings.Contains(name, key) {
199 |             return value
200 |         }
201 |     }
202 | 
203 |     name = strings.ToLower(name)
204 |     for key, item := range config.InputTextMap {
205 |         for _, keyword := range item["keyword"].([]string) {
206 |             if strings.Contains(name, keyword) {
207 |                 if customValue, ok := f.tab.config.CustomFormValues[key]; ok {
208 |                     return customValue
209 |                 } else {
210 |                     return item["value"].(string)
211 |                 }
212 |             }
213 |         }
214 |     }
215 |     return f.tab.config.CustomFormValues["default"]
216 | }
217 | 
--------------------------------------------------------------------------------
/pkg/engine/after_loaded_tasks.go:
--------------------------------------------------------------------------------
1 | package engine
2 | 
3 | import (
4 |     "context"
5 |     "fmt"
6 |     "time"
7 | 
8 |     "github.com/Qianlitp/crawlergo/pkg/config"
9 |     "github.com/Qianlitp/crawlergo/pkg/js"
10 |     "github.com/Qianlitp/crawlergo/pkg/logger"
11 |     "github.com/Qianlitp/crawlergo/pkg/tools"
12 |     "github.com/chromedp/cdproto/cdp"
13 |     "github.com/chromedp/chromedp"
14 | )
15 | 
16 | /**
17 | Executing JS code against a specific node:
18 | err := EvaluateAsDevTools(snippet(submitJS, cashX(true), sel, nodes[0]), &res).Do(ctx)
19 | 
20 | The concrete setup lives in chromedp's submit function; follow it as a reference.
21 | */
22 | 
23 | /**
24 | Runs after the page Load event has fired.
25 | Also waits for afterDOMRun to finish first.
26 | */
27 | func (tab *Tab) AfterLoadedRun() {
28 |     defer tab.WG.Done()
29 |     logger.Logger.Debug("afterLoadedRun start")
30 |     tab.formSubmitWG.Add(2)
31 |     tab.loadedWG.Add(3)
32 |     tab.removeLis.Add(1)
33 | 
34 |     go tab.formSubmit()
35 |     tab.formSubmitWG.Wait()
36 |     logger.Logger.Debug("formSubmit end")
37 | 
38 |     if tab.config.EventTriggerMode == config.EventTriggerAsync {
39 |         go tab.triggerJavascriptProtocol()
40 |         go tab.triggerInlineEvents()
41 |         go tab.triggerDom2Events()
42 |         tab.loadedWG.Wait()
43 |     } else if tab.config.EventTriggerMode == config.EventTriggerSync {
44 |         tab.triggerInlineEvents()
45 |         time.Sleep(tab.config.EventTriggerInterval)
46 |         tab.triggerDom2Events()
47 |         time.Sleep(tab.config.EventTriggerInterval)
48 |         tab.triggerJavascriptProtocol()
49 |     }
50 | 
51 |     // After triggering events, wait a little so the browser can fire AJAX requests and update the DOM
52 |     time.Sleep(tab.config.BeforeExitDelay)
53 | 
54 |     go tab.RemoveDOMListener()
55 |     tab.removeLis.Wait()
56 |     logger.Logger.Debug("afterLoadedRun end")
57 | }
58 | 
59 | /**
60 | Click-submit forms automatically.
61 | */
62 | func (tab *Tab) formSubmit() {
63 | 
64 |     logger.Logger.Debug("formSubmit start")
65 | 
66 |     // First point every form's target at a dedicated frame
67 |     tab.setFormToFrame()
68 | 
69 |     // Next, try three ways of submitting the forms
70 |     go tab.clickSubmit()
71 |     go tab.clickAllButton()
72 | }
73 | 
74 | /**
75 | Point each form's target attribute at a frame.
76 | */
77 | func (tab *Tab) setFormToFrame() {
78 |     // First create the frame
79 |     nameStr := tools.RandSeq(8)
80 |     tab.Evaluate(fmt.Sprintf(js.NewFrameTemplate, nameStr, nameStr))
81 | 
82 |     // Then point the target of every form node at it
83 |     ctx := tab.GetExecutor()
84 |     formNodes, formErr := tab.GetNodeIDs(`form`)
85 |     if formErr != nil || len(formNodes) == 0 {
86 |         logger.Logger.Debug("setFormToFrame: get form element err")
87 |         if formErr != nil {
88 |             logger.Logger.Debug(formErr)
89 |         }
90 |         return
91 |     }
92 |     tCtx, cancel := context.WithTimeout(ctx, time.Second*2)
93 |     defer cancel()
94 |     _ = chromedp.SetAttributeValue(formNodes, "target", nameStr, chromedp.ByNodeID).Do(tCtx)
95 | }
96 | 
97 | /**
98 | Click buttons with type=submit.
99 | */
100 | func (tab *Tab) clickSubmit() {
101 |     defer tab.formSubmitWG.Done()
102 | 
103 |     // First click the type=submit buttons
104 |     ctx := tab.GetExecutor()
105 | 
106 |     // Fetch all form nodes and call submit on them directly
107 |     formNodes, formErr := tab.GetNodeIDs(`form`)
108 |     if formErr != nil || len(formNodes) == 0 {
109 |         logger.Logger.Debug("clickSubmit: get form element err")
110 |         if formErr != nil {
111 |             logger.Logger.Debug(formErr)
112 |         }
113 |         return
114 |     }
115 |     tCtx1, cancel1 := context.WithTimeout(ctx, time.Second*2)
116 |     defer cancel1()
117 |     _ = chromedp.Submit(formNodes, chromedp.ByNodeID).Do(tCtx1)
118 | 
119 |     // Fetch all submit inputs inside forms
120 |     inputNodes, inputErr := tab.GetNodeIDs(`form input[type=submit]`)
121 |     if inputErr != nil || len(inputNodes) == 0 {
122 |         logger.Logger.Debug("clickSubmit: get form input element err")
123 |         if inputErr != nil {
124 |             logger.Logger.Debug(inputErr)
125 |         }
126 |         return
127 |     }
128 |     tCtx2, cancel2 := context.WithTimeout(ctx, time.Second*2)
129 |     defer cancel2()
130 |     _ = chromedp.Click(inputNodes, chromedp.ByNodeID).Do(tCtx2)
131 | }
132 | 
133 | /**
134 | click all button
135 | */
136 | func (tab *Tab) clickAllButton() {
137 |     defer tab.formSubmitWG.Done()
138 | 
139 |     // Fetch all button nodes inside forms
140 |     ctx := tab.GetExecutor()
141 |     // Fetch all button elements
142 |     btnNodeIDs, bErr := tab.GetNodeIDs(`form button`)
143 |     if bErr != nil || len(btnNodeIDs) == 0 {
144 |         logger.Logger.Debug("clickAllButton: get form button element err")
145 |         if bErr != nil {
146 |             logger.Logger.Debug(bErr)
147 |         }
148 |         return
149 |     }
150 |     tCtx, cancel1 := context.WithTimeout(ctx, time.Second*2)
151 |     defer cancel1()
152 |     _ = chromedp.Click(btnNodeIDs, chromedp.ByNodeID).Do(tCtx)
153 | 
154 |     // Click again using JavaScript's click() method
155 |     var btnNodes []*cdp.Node
156 |     tCtx2, cancel2 := context.WithTimeout(ctx, time.Second*2)
157 |     defer cancel2()
158 |     err := chromedp.Nodes(btnNodeIDs, &btnNodes, chromedp.ByNodeID).Do(tCtx2)
159 |     if err != nil {
160 |         return
161 |     }
162 |     for _, node := range btnNodes {
163 |         _ = tab.EvaluateWithNode(js.FormNodeClickJS, node)
164 |     }
165 | }
166 | 
167 | /**
168 | Trigger inline events.
169 | */
170 | func (tab *Tab) triggerInlineEvents() {
171 |     defer tab.loadedWG.Done()
172 |     logger.Logger.Debug("triggerInlineEvents start")
173 |     tab.Evaluate(fmt.Sprintf(js.TriggerInlineEventJS, tab.config.EventTriggerInterval.Seconds()*1000))
174 |     logger.Logger.Debug("triggerInlineEvents end")
175 | }
176 | 
177 | /**
178 | Trigger DOM2-level events.
179 | */
180 | func (tab *Tab) triggerDom2Events() {
181 |     defer tab.loadedWG.Done()
182 |     logger.Logger.Debug("triggerDom2Events start")
183 |     tab.Evaluate(fmt.Sprintf(js.TriggerDom2EventJS, tab.config.EventTriggerInterval.Seconds()*1000))
184 |     logger.Logger.Debug("triggerDom2Events end")
185 | }
186 | 
187 | /**
188 | Click a elements whose href is a javascript: pseudo-protocol.
189 | */
190 | func (tab *Tab) triggerJavascriptProtocol() {
191 |     defer tab.loadedWG.Done()
192 |     logger.Logger.Debug("clickATagJavascriptProtocol start")
193 |     tab.Evaluate(fmt.Sprintf(js.TriggerJavascriptProtocol, tab.config.EventTriggerInterval.Seconds()*1000,
194 |         tab.config.EventTriggerInterval.Seconds()*1000))
195 |     logger.Logger.Debug("clickATagJavascriptProtocol end")
196 | }
197 | 
198 | /**
199 | Remove the DOM mutation listener.
200 | */
201 | func (tab *Tab) RemoveDOMListener() {
202 |     defer tab.removeLis.Done()
203 |     logger.Logger.Debug("RemoveDOMListener start")
204 |     // Remove the listener watching DOM node changes
205 |     tab.Evaluate(js.RemoveDOMListenerJS)
206 |     logger.Logger.Debug("RemoveDOMListener end")
207 | }
208 | 
--------------------------------------------------------------------------------
/pkg/engine/browser.go:
--------------------------------------------------------------------------------
1 | package engine
2 | 
3 | import (
4 |     "context"
5 |     "log"
6 |     "sync"
7 |     "time"
8 | 
9 |     "github.com/Qianlitp/crawlergo/pkg/logger"
10 | 
11 |     "github.com/chromedp/cdproto/browser"
12 |     "github.com/chromedp/chromedp"
13 | )
14 | 
15 | type Browser struct {
16 |     Ctx *context.Context
17 |     Cancel *context.CancelFunc
18 |     tabs []*context.Context
19 |     tabCancels []context.CancelFunc
20 |     ExtraHeaders map[string]interface{}
21 |     lock sync.Mutex
22 | }
23 | 
24 | func InitBrowser(chromiumPath string, extraHeaders map[string]interface{}, proxy string, noHeadless bool) *Browser {
25 |     var bro Browser
26 |     opts := append(chromedp.DefaultExecAllocatorOptions[:],
27 | 
28 |         // headless mode
29 |         chromedp.Flag("headless", !noHeadless),
30 |         // https://github.com/chromedp/chromedp/issues/997#issuecomment-1030596050
31 |         // incognito mode not used
32 |         // disable the GPU; no GUI is shown
33 |         chromedp.Flag("disable-gpu", true),
34 |         // disable the sandbox
35 |         chromedp.Flag("no-sandbox", true),
36 |         // ignore certificate errors
37 |         chromedp.Flag("ignore-certificate-errors", true),
38 | 
39 |         chromedp.Flag("disable-images", true),
40 |         //
41 |         chromedp.Flag("disable-web-security", true),
42 |         //
43 |         chromedp.Flag("disable-xss-auditor", true),
44 |         //
45 |         chromedp.Flag("disable-setuid-sandbox", true),
46 | 
47 |         chromedp.Flag("allow-running-insecure-content", true),
48 | 
49 |         chromedp.Flag("disable-webgl", true),
50 | 
51 |         chromedp.Flag("disable-popup-blocking", true),
52 | 
53 |         chromedp.WindowSize(1920, 1080),
54 |     )
55 |     // set the browser proxy
56 |     if proxy != "" {
57 |         opts = append(opts, chromedp.ProxyServer(proxy))
58 |     }
59 | 
60 |     if len(chromiumPath) > 0 {
61 | 
62 |         // set the executable path
63 |         opts = append(opts, chromedp.ExecPath(chromiumPath))
64 |     }
65 | 
66 |     allocCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
67 |     bctx, _ := chromedp.NewContext(allocCtx,
68 |         chromedp.WithLogf(log.Printf),
69 |     )
70 |     // https://github.com/chromedp/chromedp/issues/824#issuecomment-845664441
71 |     // To create several tabs in one browser, the browser context must be set up first, i.e. by the Run call below
72 |     err := chromedp.Run(bctx)
73 |     if err != nil {
74 |         // chrome process not found; must exit
75 |         logger.Logger.Fatal("chromedp run error: ", err.Error())
76 |     }
77 |     bro.Cancel = &cancel
78 |     bro.Ctx = &bctx
79 |     bro.ExtraHeaders = extraHeaders
80 |     return &bro
81 | }
82 | 
83 | func (bro *Browser) NewTab(timeout time.Duration) (*context.Context, context.CancelFunc) {
84 |     bro.lock.Lock()
85 |     ctx, cancel := chromedp.NewContext(*bro.Ctx)
86 |     //defer cancel()
87 |     tCtx, _ := context.WithTimeout(ctx, timeout)
88 |     bro.tabs = append(bro.tabs, &tCtx)
89 |     bro.tabCancels = append(bro.tabCancels, cancel)
90 |     //defer cancel2()
91 |     bro.lock.Unlock()
92 | 
93 |     //return bro.Ctx, &cancel
94 |     return &tCtx, cancel
95 | }
96 | 
97 | func (bro *Browser) Close() {
98 |     logger.Logger.Info("closing browser.")
99 |     for _, cancel := range bro.tabCancels {
100 |         cancel()
101 |     }
102 | 
103 |     for _, ctx := range bro.tabs {
104 |         err := browser.Close().Do(*ctx)
105 |         if err != nil {
106 |             logger.Logger.Debug(err)
107 |         }
108 |     }
109 | 
110 |     err := browser.Close().Do(*bro.Ctx)
111 |     if err != nil {
112 |         logger.Logger.Debug(err)
113 |     }
114 |     (*bro.Cancel)()
115 | }
116 | 
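A minimal usage sketch for the Browser wrapper, using only the two constructors defined above; the chromium path and header values are placeholders:

```go
package main

import (
	"time"

	"github.com/Qianlitp/crawlergo/pkg/engine"
)

func main() {
	// Extra headers are copied into every tab the browser later spawns.
	extraHeaders := map[string]interface{}{"User-Agent": "crawlergo-demo"}

	// The chromium path is a placeholder; adjust it for your system.
	browser := engine.InitBrowser("/usr/bin/chromium", extraHeaders, "", false)
	defer browser.Close()

	// Each tab gets its own context bounded by an overall run timeout.
	ctx, cancel := browser.NewTab(20 * time.Second)
	defer cancel()
	_ = ctx // pass the context on to tab-level tasks
}
```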
--------------------------------------------------------------------------------
/pkg/engine/collect_links.go:
--------------------------------------------------------------------------------
1 | package engine
2 | 
3 | import (
4 |     "context"
5 |     "fmt"
6 |     "regexp"
7 |     "time"
8 | 
9 |     "github.com/Qianlitp/crawlergo/pkg/config"
10 |     "github.com/Qianlitp/crawlergo/pkg/logger"
11 |     "github.com/chromedp/cdproto/cdp"
12 |     "github.com/chromedp/chromedp"
13 | )
14 | 
15 | /**
16 | Finally, collect all remaining links.
17 | */
18 | func (tab *Tab) collectLinks() {
19 |     go tab.collectHrefLinks()
20 |     go tab.collectObjectLinks()
21 |     go tab.collectCommentLinks()
22 | }
23 | 
24 | func (tab *Tab) collectHrefLinks() {
25 |     defer tab.collectLinkWG.Done()
26 |     ctx := tab.GetExecutor()
27 |     // Collect src, href, data-url and data-href attribute values
28 |     attrNameList := []string{"src", "href", "data-url", "data-href"}
29 |     for _, attrName := range attrNameList {
30 |         tCtx, cancel := context.WithTimeout(ctx, time.Second*1)
31 |         var attrs []map[string]string
32 |         _ = chromedp.AttributesAll(fmt.Sprintf(`[%s]`, attrName), &attrs, chromedp.ByQueryAll).Do(tCtx)
33 |         cancel()
34 |         for _, attrMap := range attrs {
35 |             tab.AddResultUrl(config.GET, attrMap[attrName], config.FromDOM)
36 |         }
37 |     }
38 | }
39 | 
40 | func (tab *Tab) collectObjectLinks() {
41 |     defer tab.collectLinkWG.Done()
42 |     ctx := tab.GetExecutor()
43 |     // Collect object[data] links
44 |     tCtx, cancel := context.WithTimeout(ctx, time.Second*1)
45 |     defer cancel()
46 |     var attrs []map[string]string
47 |     _ = chromedp.AttributesAll(`object[data]`, &attrs, chromedp.ByQueryAll).Do(tCtx)
48 |     for _, attrMap := range attrs {
49 |         tab.AddResultUrl(config.GET, attrMap["data"], config.FromDOM)
50 |     }
51 | }
52 | 
53 | func (tab *Tab) collectCommentLinks() {
54 |     defer tab.collectLinkWG.Done()
55 |     ctx := tab.GetExecutor()
56 |     // Collect links hidden inside HTML comments
57 |     var nodes []*cdp.Node
58 |     tCtxComment, cancel := context.WithTimeout(ctx, time.Second*1)
59 |     defer cancel()
60 |     commentErr := chromedp.Nodes(`//comment()`, &nodes, chromedp.BySearch).Do(tCtxComment)
61 |     if commentErr != nil {
62 |         logger.Logger.Debug("get comment nodes err")
63 |         logger.Logger.Debug(commentErr)
64 |         return
65 |     }
66 |     urlRegex := regexp.MustCompile(config.URLRegex)
67 |     for _, node := range nodes {
68 |         content := node.NodeValue
69 |         urlList := urlRegex.FindAllString(content, -1)
70 |         for _, url := range urlList {
71 |             tab.AddResultUrl(config.GET, url, config.FromComment)
72 |         }
73 |     }
74 | }
75 | 
--------------------------------------------------------------------------------
/pkg/engine/intercept_request.go:
--------------------------------------------------------------------------------
1 | package engine
2 | 
3 | import (
4 |     "bufio"
5 |     "context"
6 |     "encoding/base64"
7 |     "io"
8 |     "net/textproto"
9 |     "regexp"
10 |     "strconv"
11 |     "strings"
12 |     "time"
13 | 
14 |     "github.com/Qianlitp/crawlergo/pkg/config"
15 |     "github.com/Qianlitp/crawlergo/pkg/logger"
16 |     model2 "github.com/Qianlitp/crawlergo/pkg/model"
17 |     "github.com/Qianlitp/crawlergo/pkg/tools"
18 |     "github.com/Qianlitp/crawlergo/pkg/tools/requests"
19 |     "github.com/chromedp/cdproto/fetch"
20 |     "github.com/chromedp/cdproto/network"
21 | )
22 | 
23 | /**
24 | Handle every intercepted HTTP request.
25 | */
26 | func (tab *Tab) InterceptRequest(v *fetch.EventRequestPaused) {
27 |     defer tab.WG.Done()
28 |     ctx := tab.GetExecutor()
29 |     _req := v.Request
30 |     // Parse the intercepted URL; if that fails, let the request through untouched
31 |     url, err := model2.GetUrl(_req.URL, *tab.NavigateReq.URL)
32 |     if err != nil {
33 |         logger.Logger.Debug("InterceptRequest parse url failed: ", err)
34 |         _ = fetch.ContinueRequest(v.RequestID).Do(ctx)
35 |         return
36 |     }
37 |     _option := model2.Options{
38 |         Headers: _req.Headers,
39 |         PostData: _req.PostData,
40 |     }
41 |     req := model2.GetRequest(_req.Method, url, _option)
42 | 
43 |     if IsIgnoredByKeywordMatch(req, tab.config.IgnoreKeywords) {
44 |         _ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
45 |         req.Source = config.FromXHR
46 |         tab.AddResultRequest(req)
47 |         return
48 |     }
49 | 
50 |     tab.HandleHostBinding(&req)
51 | 
52 |     // Block all static resources
53 |     // https://github.com/Qianlitp/crawlergo/issues/106
54 |     if config.StaticSuffixSet.Contains(url.FileExt()) {
55 |         _ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
56 |         req.Source = config.FromStaticRes
57 |         tab.AddResultRequest(req)
58 |         return
59 |     }
60 | 
61 |     // Handle navigation requests
62 |     if tab.IsNavigatorRequest(v.NetworkID.String()) {
63 |         tab.NavNetworkID = v.NetworkID.String()
64 |         tab.HandleNavigationReq(&req, v)
65 |         req.Source = config.FromNavigation
66 |         tab.AddResultRequest(req)
67 |         return
68 |     }
69 | 
70 |     req.Source = config.FromXHR
71 |     tab.AddResultRequest(req)
72 |     _ = fetch.ContinueRequest(v.RequestID).Do(ctx)
73 | }
74 | 
75 | /**
76 | Report whether this is the navigation request.
77 | */
78 | func (tab *Tab) IsNavigatorRequest(networkID string) bool {
79 |     return networkID == tab.LoaderID
80 | }
81 | 
82 | /**
83 | Handle 401/407 authentication prompts.
84 | */
85 | func (tab *Tab) HandleAuthRequired(req *fetch.EventAuthRequired) {
86 |     defer tab.WG.Done()
87 |     logger.Logger.Debug("auth required found, auto auth.")
88 |     ctx := tab.GetExecutor()
89 |     authRes := fetch.AuthChallengeResponse{
90 |         Response: fetch.AuthChallengeResponseResponseProvideCredentials,
91 |         Username: "Crawlergo",
92 |         Password: "Crawlergo",
93 |     }
94 |     // Answer the challenge with default credentials so the page is not blocked
95 |     _ = fetch.ContinueWithAuth(req.RequestID, &authRes).Do(ctx)
96 | }
97 | 
98 | /**
99 | Handle the navigation request.
100 | */
101 | func (tab *Tab) HandleNavigationReq(req *model2.Request, v *fetch.EventRequestPaused) {
102 |     navReq := tab.NavigateReq
103 |     ctx := tab.GetExecutor()
104 |     tCtx, cancel := context.WithTimeout(ctx, time.Second*5)
105 |     defer cancel()
106 |     overrideReq := fetch.ContinueRequest(v.RequestID).WithURL(req.URL.String())
107 | 
108 |     // Handle server-side redirects
109 |     if tab.FoundRedirection && tab.IsTopFrame(v.FrameID.String()) {
110 |         logger.Logger.Debug("redirect navigation req: " + req.URL.String())
111 |         //_ = fetch.FailRequest(v.RequestID, network.ErrorReasonConnectionAborted).Do(ctx)
112 |         body := base64.StdEncoding.EncodeToString([]byte(`Crawlergo`))
113 |         param := fetch.FulfillRequest(v.RequestID, 200).WithBody(body)
114 |         err := param.Do(ctx)
115 |         if err != nil {
116 |             logger.Logger.Debug(err)
117 |         }
118 |         navReq.RedirectionFlag = true
119 |         navReq.Source = config.FromNavigation
120 |         tab.AddResultRequest(navReq)
121 |         // The redirection flag is set
122 |     } else if navReq.RedirectionFlag && tab.IsTopFrame(v.FrameID.String()) {
123 |         navReq.RedirectionFlag = false
124 |         logger.Logger.Debug("has redirection_flag: " + req.URL.String())
125 |         headers := tools.ConvertHeaders(req.Headers)
126 |         headers["Range"] = "bytes=0-1048576"
127 |         res, err := requests.Request(req.Method, req.URL.String(), headers, []byte(req.PostData), &requests.ReqOptions{
128 |             AllowRedirect: false, Proxy: tab.config.Proxy})
129 |         if err != nil {
130 |             logger.Logger.Debug(err)
131 |             _ = fetch.FailRequest(v.RequestID, network.ErrorReasonConnectionAborted).Do(ctx)
132 |             return
133 |         }
134 |         body := base64.StdEncoding.EncodeToString([]byte(res.Text))
135 |         param := fetch.FulfillRequest(v.RequestID, 200).WithResponseHeaders(ConvertHeadersNoLocation(res.Header)).WithBody(body)
136 |         errR := param.Do(ctx)
137 |         if errR != nil {
138 |             logger.Logger.Debug(errR)
139 |         }
140 |         // The main navigation request
141 |     } else if tab.IsTopFrame(v.FrameID.String()) && req.URL.NavigationUrl() == navReq.URL.NavigationUrl() {
142 |         logger.Logger.Debug("main navigation req: " + navReq.URL.String())
143 |         // Manually attach the POST data
144 |         if navReq.Method == config.POST || navReq.Method == config.PUT {
145 |             overrideReq = overrideReq.WithPostData(navReq.PostData)
146 |         }
147 |         overrideReq = overrideReq.WithMethod(navReq.Method)
148 |         overrideReq = overrideReq.WithHeaders(MergeHeaders(navReq.Headers, req.Headers))
149 |         _ = overrideReq.Do(tCtx)
150 |         // Navigation inside a sub-frame
151 |     } else if !tab.IsTopFrame(v.FrameID.String()) {
152 |         _ = overrideReq.Do(tCtx)
153 |         // Client-side jump: answer with 204
154 |     } else {
155 |         _ = fetch.FulfillRequest(v.RequestID, 204).Do(ctx)
156 |     }
157 | }
158 | 
159 | /**
160 | Handle Host binding.
161 | */
162 | func (tab *Tab) HandleHostBinding(req *model2.Request) {
163 |     url := req.URL
164 |     navUrl := tab.NavigateReq.URL
165 |     // If the navigation hostname differs from the bound Host, and this request's domain equals the Host header of the navigation request, swap in the real domain and bind the Host
166 |     if host, ok := tab.NavigateReq.Headers["Host"]; ok {
167 |         if navUrl.Hostname() != host && url.Host == host {
168 |             urlObj, _ := model2.GetUrl(strings.Replace(req.URL.String(), "://"+url.Hostname(), "://"+navUrl.Hostname(), -1), *navUrl)
169 |             req.URL = urlObj
170 |             req.Headers["Host"] = host
171 | 
172 |         } else if navUrl.Hostname() != host && url.Host == navUrl.Host {
173 |             req.Headers["Host"] = host
174 |         }
175 |         // Fix Origin
176 |         if _, ok := req.Headers["Origin"]; ok {
177 |             req.Headers["Origin"] = strings.Replace(req.Headers["Origin"].(string), navUrl.Host, host.(string), 1)
178 |         }
179 |         // Fix Referer
180 |         if _, ok := req.Headers["Referer"]; ok {
181 |             req.Headers["Referer"] = strings.Replace(req.Headers["Referer"].(string), navUrl.Host, host.(string), 1)
182 |         } else {
183 |             req.Headers["Referer"] = strings.Replace(navUrl.String(), navUrl.Host, host.(string), 1)
184 |         }
185 |     }
186 | }
187 | 
188 | func (tab *Tab) IsTopFrame(FrameID string) bool {
189 |     return FrameID == tab.TopFrameId
190 | }
191 | 
192 | /**
193 | Extract URLs from response bodies using regex matching.
194 | */
195 | func (tab *Tab) ParseResponseURL(v *network.EventResponseReceived) {
196 |     defer tab.WG.Done()
197 |     ctx := tab.GetExecutor()
198 |     res, err := network.GetResponseBody(v.RequestID).Do(ctx)
199 |     if err != nil {
200 |         logger.Logger.Debug("ParseResponseURL ", err)
201 |         return
202 |     }
203 |     resStr := string(res)
204 | 
205 |     urlRegex := regexp.MustCompile(config.SuspectURLRegex)
206 |     urlList := urlRegex.FindAllString(resStr, -1)
207 |     for _, url := range urlList {
208 | 
209 |         url = url[1 : len(url)-1]
210 |         url_lower := strings.ToLower(url)
211 |         if strings.HasPrefix(url_lower, "image/x-icon") || strings.HasPrefix(url_lower, "text/css") || strings.HasPrefix(url_lower, "text/javascript") {
212 |             continue
213 |         }
214 | 
215 |         tab.AddResultUrl(config.GET, url, config.FromJSFile)
216 |     }
217 | }
218 | 
219 | func (tab *Tab) HandleRedirectionResp(v *network.EventResponseReceivedExtraInfo) {
220 |     defer tab.WG.Done()
221 |     statusCode := tab.GetStatusCode(v.HeadersText)
222 |     // The navigation request answered with a redirect
223 |     if 300 <= statusCode && statusCode < 400 {
224 |         logger.Logger.Debug("set redirect flag.")
225 |         tab.FoundRedirection = true
226 |     }
227 | }
228 | 
229 | func (tab *Tab) GetContentCharset(v *network.EventResponseReceived) {
230 |     defer tab.WG.Done()
231 |     var getCharsetRegex = regexp.MustCompile("charset=(.+)$")
232 |     for key, value := range v.Response.Headers {
233 |         if key == "Content-Type" {
234 |             value := value.(string)
235 |             if strings.Contains(value, "charset") {
236 |                 value = getCharsetRegex.FindString(value)
237 |                 value = strings.ToUpper(strings.Replace(value, "charset=", "", -1))
238 |                 tab.PageCharset = value
239 |                 tab.PageCharset = strings.TrimSpace(tab.PageCharset)
240 |             }
241 |         }
242 |     }
243 | }
244 | 
245 | func (tab *Tab) GetStatusCode(headerText string) int {
246 |     rspInput := strings.NewReader(headerText)
247 |     rspBuf := bufio.NewReader(rspInput)
248 |     tp := textproto.NewReader(rspBuf)
249 |     line, err := tp.ReadLine()
250 |     if err != nil {
251 |         if err == io.EOF {
252 |             err = io.ErrUnexpectedEOF
253 |         }
254 |         return 0
255 |     }
256 |     parts := strings.Split(line, " ")
257 |     if len(parts) < 3 {
258 |         return 0
259 |     }
260 |     code, _ := strconv.Atoi(parts[1])
261 |     return code
262 | }
263 | 
264 | func MergeHeaders(navHeaders map[string]interface{}, headers map[string]interface{}) []*fetch.HeaderEntry {
265 |     var mergedHeaders []*fetch.HeaderEntry
266 |     for key, value := range navHeaders {
267 |         if _, ok := headers[key]; !ok {
268 |             var header fetch.HeaderEntry
269 |             header.Name = key
270 |             header.Value = value.(string)
271 |             mergedHeaders = append(mergedHeaders, &header)
272 |         }
273 |     }
274 | 
275 |     for key, value := range headers {
276 |         var header fetch.HeaderEntry
277 |         header.Name = key
278 |         header.Value = value.(string)
279 |         mergedHeaders = append(mergedHeaders, &header)
280 |     }
281 |     return mergedHeaders
282 | }
283 | 
284 | func ConvertHeadersNoLocation(h map[string][]string) []*fetch.HeaderEntry {
285 |     var headers []*fetch.HeaderEntry
286 |     for key, value := range h {
287 |         if key == "Location" {
288 |             continue
289 |         }
290 |         var header fetch.HeaderEntry
291 |         header.Name = key
292 |         header.Value = value[0]
293 |         headers = append(headers, &header)
294 |     }
295 |     return headers
296 | }
297 | 
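GetStatusCode parses the raw status line out of HeadersText with net/textproto. A quick, hypothetical table test for it (not part of the repository) that exercises the three cases — redirect, success, and unparseable input:

```go
package engine_test

import (
	"testing"

	"github.com/Qianlitp/crawlergo/pkg/engine"
)

// GetStatusCode only reads its argument, so Tab's zero value is enough.
func TestGetStatusCode(t *testing.T) {
	var tab engine.Tab
	cases := map[string]int{
		"HTTP/1.1 302 Found\r\nLocation: /next\r\n\r\n": 302,
		"HTTP/1.1 200 OK\r\n\r\n":                       200,
		"garbage":                                       0, // no parseable status line
	}
	for headerText, want := range cases {
		if got := tab.GetStatusCode(headerText); got != want {
			t.Errorf("GetStatusCode(%q) = %d, want %d", headerText, got, want)
		}
	}
}
```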
| "context" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "regexp" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "github.com/Qianlitp/crawlergo/pkg/config" 14 | "github.com/Qianlitp/crawlergo/pkg/js" 15 | "github.com/Qianlitp/crawlergo/pkg/logger" 16 | model2 "github.com/Qianlitp/crawlergo/pkg/model" 17 | "github.com/chromedp/cdproto/cdp" 18 | "github.com/chromedp/cdproto/dom" 19 | "github.com/chromedp/cdproto/fetch" 20 | "github.com/chromedp/cdproto/network" 21 | "github.com/chromedp/cdproto/page" 22 | "github.com/chromedp/cdproto/runtime" 23 | "github.com/chromedp/chromedp" 24 | "github.com/gogf/gf/encoding/gcharset" 25 | ) 26 | 27 | type Tab struct { 28 | Ctx *context.Context 29 | Cancel context.CancelFunc 30 | NavigateReq model2.Request 31 | ExtraHeaders map[string]interface{} 32 | ResultList []*model2.Request 33 | TopFrameId string 34 | LoaderID string 35 | NavNetworkID string 36 | PageCharset string 37 | PageBindings map[string]interface{} 38 | FoundRedirection bool 39 | DocBodyNodeId cdp.NodeID 40 | config TabConfig 41 | 42 | lock sync.Mutex 43 | 44 | WG sync.WaitGroup //当前Tab页的等待同步计数 45 | collectLinkWG sync.WaitGroup 46 | loadedWG sync.WaitGroup //Loaded之后的等待计数 47 | formSubmitWG sync.WaitGroup //表单提交完毕的等待计数 48 | removeLis sync.WaitGroup //移除事件监听 49 | domWG sync.WaitGroup //DOMContentLoaded 的等待计数 50 | fillFormWG sync.WaitGroup //填充表单任务 51 | } 52 | 53 | type TabConfig struct { 54 | TabRunTimeout time.Duration 55 | DomContentLoadedTimeout time.Duration 56 | EventTriggerMode string // 事件触发的调用方式: 异步 或 顺序 57 | EventTriggerInterval time.Duration // 事件触发的间隔 单位毫秒 58 | BeforeExitDelay time.Duration // 退出前的等待时间,等待DOM渲染,等待XHR发出捕获 59 | EncodeURLWithCharset bool 60 | IgnoreKeywords []string // 61 | Proxy string 62 | CustomFormValues map[string]string 63 | CustomFormKeywordValues map[string]string 64 | } 65 | 66 | type bindingCallPayload struct { 67 | Name string `json:"name"` 68 | Seq int `json:"seq"` 69 | Args []string `json:"args"` 70 | } 71 | 72 | func NewTab(browser *Browser, navigateReq model2.Request, config TabConfig) *Tab { 73 | var tab Tab 74 | tab.ExtraHeaders = map[string]interface{}{} 75 | var DOMContentLoadedRun = false 76 | tab.Ctx, tab.Cancel = browser.NewTab(config.TabRunTimeout) 77 | for key, value := range browser.ExtraHeaders { 78 | navigateReq.Headers[key] = value 79 | if key != "Host" { 80 | tab.ExtraHeaders[key] = value 81 | } 82 | } 83 | tab.NavigateReq = navigateReq 84 | tab.config = config 85 | tab.DocBodyNodeId = 0 86 | 87 | // 设置请求拦截监听 88 | chromedp.ListenTarget(*tab.Ctx, func(v interface{}) { 89 | switch v := v.(type) { 90 | // 根据不同的事件 选择执行对应的动作 91 | case *network.EventRequestWillBeSent: 92 | if string(v.RequestID) == string(v.LoaderID) && v.Type == "Document" && tab.TopFrameId == "" { 93 | tab.LoaderID = string(v.LoaderID) 94 | tab.TopFrameId = string(v.FrameID) 95 | } 96 | 97 | // 请求发出时暂停 即 请求拦截 98 | case *fetch.EventRequestPaused: 99 | tab.WG.Add(1) 100 | go tab.InterceptRequest(v) 101 | 102 | // 解析所有JS文件中的URL并添加到结果中 103 | // 解析HTML文档中的URL 104 | // 查找当前页面的编码 105 | case *network.EventResponseReceived: 106 | if v.Response.MimeType == "application/javascript" || v.Response.MimeType == "text/html" || v.Response.MimeType == "application/json" { 107 | tab.WG.Add(1) 108 | go tab.ParseResponseURL(v) 109 | } 110 | if v.RequestID.String() == tab.NavNetworkID { 111 | tab.WG.Add(1) 112 | go tab.GetContentCharset(v) 113 | } 114 | // 处理后端重定向 3XX 115 | case *network.EventResponseReceivedExtraInfo: 116 | if v.RequestID.String() == tab.NavNetworkID { 117 | tab.WG.Add(1) 118 | go 
tab.HandleRedirectionResp(v) 119 | } 120 | //case *network.EventLoadingFailed: 121 | // logger.Logger.Error("EventLoadingFailed ", v.ErrorText) 122 | // 401 407 要求认证 此时会阻塞当前页面 需要处理解决 123 | case *fetch.EventAuthRequired: 124 | tab.WG.Add(1) 125 | go tab.HandleAuthRequired(v) 126 | 127 | // DOMContentLoaded 128 | // 开始执行表单填充 和 执行DOM节点观察函数 129 | // 只执行一次 130 | case *page.EventDomContentEventFired: 131 | if DOMContentLoadedRun { 132 | return 133 | } 134 | DOMContentLoadedRun = true 135 | tab.WG.Add(1) 136 | go tab.AfterDOMRun() 137 | // Loaded 138 | case *page.EventLoadEventFired: 139 | if DOMContentLoadedRun { 140 | return 141 | } 142 | DOMContentLoadedRun = true 143 | tab.WG.Add(1) 144 | go tab.AfterDOMRun() 145 | 146 | // close Dialog 147 | case *page.EventJavascriptDialogOpening: 148 | tab.WG.Add(1) 149 | go tab.dismissDialog() 150 | 151 | // handle expose function 152 | case *runtime.EventBindingCalled: 153 | tab.WG.Add(1) 154 | go tab.HandleBindingCalled(v) 155 | } 156 | }) 157 | 158 | return &tab 159 | } 160 | 161 | func (tab *Tab) Start() { 162 | logger.Logger.Info("Crawling " + tab.NavigateReq.Method + " " + tab.NavigateReq.URL.String()) 163 | defer tab.Cancel() 164 | if err := chromedp.Run(*tab.Ctx, 165 | RunWithTimeOut(tab.Ctx, tab.config.DomContentLoadedTimeout, chromedp.Tasks{ 166 | // 167 | runtime.Enable(), 168 | // 开启网络层API 169 | network.Enable(), 170 | // 开启请求拦截API 171 | fetch.Enable().WithHandleAuthRequests(true), 172 | // 添加回调函数绑定 173 | // XSS-Scan 使用的回调 174 | runtime.AddBinding("addLink"), 175 | runtime.AddBinding("Test"), 176 | // 初始化执行JS 177 | chromedp.ActionFunc(func(ctx context.Context) error { 178 | var err error 179 | _, err = page.AddScriptToEvaluateOnNewDocument(js.TabInitJS).Do(ctx) 180 | if err != nil { 181 | return err 182 | } 183 | return nil 184 | }), 185 | network.SetExtraHTTPHeaders(tab.ExtraHeaders), 186 | // 执行导航 187 | chromedp.Navigate(tab.NavigateReq.URL.String()), 188 | }), 189 | ); err != nil { 190 | if errors.Is(err, context.Canceled) { 191 | logger.Logger.Debug("Crawling Canceled") 192 | return 193 | } 194 | logger.Logger.Warn("navigate timeout ", tab.NavigateReq.URL.String()) 195 | } 196 | 197 | waitDone := func() <-chan struct{} { 198 | tab.WG.Wait() 199 | ch := make(chan struct{}) 200 | defer close(ch) 201 | return ch 202 | } 203 | 204 | select { 205 | case <-waitDone(): 206 | logger.Logger.Debug("all navigation tasks done.") 207 | case <-time.After(tab.config.DomContentLoadedTimeout + time.Second*10): 208 | logger.Logger.Warn("navigation tasks TIMEOUT.") 209 | } 210 | 211 | // 等待收集所有链接 212 | logger.Logger.Debug("collectLinks start.") 213 | tab.collectLinkWG.Add(3) 214 | go tab.collectLinks() 215 | tab.collectLinkWG.Wait() 216 | logger.Logger.Debug("collectLinks end.") 217 | 218 | // 识别页面编码 并编码所有URL 219 | if tab.config.EncodeURLWithCharset { 220 | tab.DetectCharset() 221 | tab.EncodeAllURLWithCharset() 222 | } 223 | 224 | //fmt.Println(tab.NavigateReq.URL.String(), len(tab.ResultList)) 225 | //for _, v := range tab.ResultList { 226 | // v.SimplePrint() 227 | //} 228 | // fmt.Println("Finished " + tab.NavigateReq.Method + " " + tab.NavigateReq.URL.String()) 229 | } 230 | 231 | func RunWithTimeOut(ctx *context.Context, timeout time.Duration, tasks chromedp.Tasks) chromedp.ActionFunc { 232 | return func(ctx context.Context) error { 233 | timeoutContext, _ := context.WithTimeout(ctx, timeout) 234 | //defer cancel() 235 | return tasks.Do(timeoutContext) 236 | } 237 | } 238 | 239 | /** 240 | 添加收集到的URL到结果列表,需要处理Host绑定 241 | */ 242 | func (tab *Tab) 
AddResultUrl(method string, _url string, source string) { 243 | navUrl := tab.NavigateReq.URL 244 | url, err := model2.GetUrl(_url, *navUrl) 245 | if err != nil { 246 | return 247 | } 248 | option := model2.Options{ 249 | Headers: map[string]interface{}{}, 250 | PostData: "", 251 | } 252 | referer := navUrl.String() 253 | 254 | // 处理Host绑定 255 | if host, ok := tab.NavigateReq.Headers["Host"]; ok { 256 | if host != navUrl.Hostname() && url.Hostname() == host { 257 | url, _ = model2.GetUrl(strings.Replace(url.String(), "://"+url.Hostname(), "://"+navUrl.Hostname(), -1), *navUrl) 258 | option.Headers["Host"] = host 259 | referer = strings.Replace(navUrl.String(), navUrl.Host, host.(string), -1) 260 | } 261 | } 262 | // 添加Cookie 263 | if cookie, ok := tab.NavigateReq.Headers["Cookie"]; ok { 264 | option.Headers["Cookie"] = cookie 265 | } 266 | 267 | // 修正Referer 268 | option.Headers["Referer"] = referer 269 | for key, value := range tab.ExtraHeaders { 270 | option.Headers[key] = value 271 | } 272 | req := model2.GetRequest(method, url, option) 273 | req.Source = source 274 | 275 | tab.lock.Lock() 276 | tab.ResultList = append(tab.ResultList, &req) 277 | tab.lock.Unlock() 278 | } 279 | 280 | /** 281 | 添加请求到结果列表,拦截请求时处理了Host绑定,此处无需处理 282 | */ 283 | func (tab *Tab) AddResultRequest(req model2.Request) { 284 | for key, value := range tab.ExtraHeaders { 285 | req.Headers[key] = value 286 | } 287 | tab.lock.Lock() 288 | tab.ResultList = append(tab.ResultList, &req) 289 | tab.lock.Unlock() 290 | } 291 | 292 | /** 293 | 获取当前标签页CDP的执行上下文 294 | */ 295 | func (tab *Tab) GetExecutor() context.Context { 296 | c := chromedp.FromContext(*tab.Ctx) 297 | ctx := cdp.WithExecutor(*tab.Ctx, c.Target) 298 | return ctx 299 | } 300 | 301 | /** 302 | 关闭弹窗 303 | */ 304 | func (tab *Tab) dismissDialog() { 305 | defer tab.WG.Done() 306 | ctx := tab.GetExecutor() 307 | _ = page.HandleJavaScriptDialog(false).Do(ctx) 308 | } 309 | 310 | /** 311 | 处理回调 312 | */ 313 | func (tab *Tab) HandleBindingCalled(event *runtime.EventBindingCalled) { 314 | defer tab.WG.Done() 315 | payload := []byte(event.Payload) 316 | var bcPayload bindingCallPayload 317 | _ = json.Unmarshal(payload, &bcPayload) 318 | if bcPayload.Name == "addLink" && len(bcPayload.Args) > 1 { 319 | tab.AddResultUrl(config.GET, bcPayload.Args[0], bcPayload.Args[1]) 320 | } 321 | if bcPayload.Name == "Test" { 322 | fmt.Println(bcPayload.Args) 323 | } 324 | tab.Evaluate(fmt.Sprintf(js.DeliverResultJS, bcPayload.Name, bcPayload.Seq, "s")) 325 | } 326 | 327 | /** 328 | 执行JS 329 | */ 330 | func (tab *Tab) Evaluate(expression string) { 331 | ctx := tab.GetExecutor() 332 | tCtx, cancel := context.WithTimeout(ctx, time.Second*5) 333 | defer cancel() 334 | _, exception, err := runtime.Evaluate(expression).Do(tCtx) 335 | if exception != nil { 336 | logger.Logger.Debug("tab Evaluate: ", exception.Text) 337 | } 338 | if err != nil { 339 | logger.Logger.Debug("tab Evaluate: ", err) 340 | } 341 | } 342 | 343 | /** 344 | 立即根据条件获取Nodes的ID,不等待 345 | */ 346 | func (tab *Tab) GetNodeIDs(sel string) ([]cdp.NodeID, error) { 347 | ctx := tab.GetExecutor() 348 | return dom.QuerySelectorAll(tab.DocBodyNodeId, sel).Do(ctx) 349 | } 350 | 351 | /** 352 | 根据给的Node执行JS 353 | */ 354 | func (tab *Tab) EvaluateWithNode(expression string, node *cdp.Node) error { 355 | ctx := tab.GetExecutor() 356 | var res bool 357 | err := chromedp.EvaluateAsDevTools(js.Snippet(expression, js.CashX(true), "", node), &res).Do(ctx) 358 | if err != nil { 359 | return err 360 | } 361 | return nil 362 | } 363 | 364 | 
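Putting Browser and Tab together, a compressed sketch of crawling a single URL. It assumes a local chromium binary at a placeholder path, and the model.GetUrl/model.GetRequest signatures are inferred from their call sites above, so treat this as an outline rather than a drop-in program:

```go
package main

import (
	"fmt"

	"github.com/Qianlitp/crawlergo/pkg/config"
	"github.com/Qianlitp/crawlergo/pkg/engine"
	"github.com/Qianlitp/crawlergo/pkg/model"
)

func main() {
	browser := engine.InitBrowser("/usr/bin/chromium", map[string]interface{}{}, "", false)
	defer browser.Close()

	// Build the navigation request; Headers must be non-nil because
	// NewTab and AddResultUrl read from it.
	u, err := model.GetUrl("http://testphp.vulnweb.com/")
	if err != nil {
		panic(err)
	}
	navReq := model.GetRequest(config.GET, u, model.Options{Headers: map[string]interface{}{}})

	tab := engine.NewTab(browser, navReq, engine.TabConfig{
		TabRunTimeout:           config.TabRunTimeout,
		DomContentLoadedTimeout: config.DomContentLoadedTimeout,
		EventTriggerMode:        config.DefaultEventTriggerMode,
		EventTriggerInterval:    config.EventTriggerInterval,
		BeforeExitDelay:         config.BeforeExitDelay,
	})
	tab.Start() // blocks until the tab finishes or times out

	for _, req := range tab.ResultList {
		fmt.Println(req.Method, req.URL.String(), req.Source)
	}
}
```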
--------------------------------------------------------------------------------
/pkg/engine/tab_test.go:
--------------------------------------------------------------------------------
1 | package engine_test
2 | 
3 | import (
4 |     "sync"
5 |     "testing"
6 |     "time"
7 | )
8 | 
9 | func TestChannel(t *testing.T) {
10 |     wg := sync.WaitGroup{}
11 | 
12 |     for range "..." {
13 |         wg.Add(1)
14 |         go func() {
15 |             defer wg.Done()
16 |             t.Log("=====> go func")
17 |             time.Sleep(time.Second * 5)
18 |             t.Log("go func done! <<<<========")
19 |         }()
20 |     }
21 | 
22 |     waitDone := func() <-chan struct{} {
23 |         wg.Wait()
24 |         ch := make(chan struct{})
25 |         defer close(ch)
26 |         return ch
27 |     }
28 | 
29 |     select {
30 |     case <-waitDone():
31 |         t.Log("all goroutine done")
32 |     case <-time.After(time.Second * 10):
33 |         t.Error("timeout")
34 |     }
35 | }
36 | 
--------------------------------------------------------------------------------
/pkg/filter/simple_filter.go:
--------------------------------------------------------------------------------
1 | package filter
2 | 
3 | import (
4 |     "strings"
5 | 
6 |     "github.com/Qianlitp/crawlergo/pkg/config"
7 |     "github.com/Qianlitp/crawlergo/pkg/model"
8 |     mapset "github.com/deckarep/golang-set"
9 | )
10 | 
11 | type SimpleFilter struct {
12 |     UniqueSet mapset.Set
13 |     HostLimit string
14 | }
15 | 
16 | var (
17 |     staticSuffixSet = config.StaticSuffixSet.Clone()
18 | )
19 | 
20 | func init() {
21 |     for _, suffix := range []string{"js", "css", "json"} {
22 |         staticSuffixSet.Add(suffix)
23 |     }
24 | }
25 | 
26 | /**
27 | Returns true when the request should be filtered out.
28 | */
29 | func (s *SimpleFilter) DoFilter(req *model.Request) bool {
30 |     if s.UniqueSet == nil {
31 |         s.UniqueSet = mapset.NewSet()
32 |     }
33 |     // First decide whether the domain has to be filtered
34 |     if s.HostLimit != "" && s.DomainFilter(req) {
35 |         return true
36 |     }
37 |     // Deduplicate
38 |     if s.UniqueFilter(req) {
39 |         return true
40 |     }
41 |     // Filter static resources
42 |     if s.StaticFilter(req) {
43 |         return true
44 |     }
45 |     return false
46 | }
47 | 
48 | /**
49 | Request deduplication.
50 | */
51 | func (s *SimpleFilter) UniqueFilter(req *model.Request) bool {
52 |     if s.UniqueSet == nil {
53 |         s.UniqueSet = mapset.NewSet()
54 |     }
55 |     if s.UniqueSet.Contains(req.UniqueId()) {
56 |         return true
57 |     } else {
58 |         s.UniqueSet.Add(req.UniqueId())
59 |         return false
60 |     }
61 | }
62 | 
63 | /**
64 | Static resource filtering.
65 | */
66 | func (s *SimpleFilter) StaticFilter(req *model.Request) bool {
67 |     if s.UniqueSet == nil {
68 |         s.UniqueSet = mapset.NewSet()
69 |     }
70 |     // The extended suffix set is prepared once in init() above
71 | 
72 |     if req.URL.FileExt() == "" {
73 |         return false
74 |     }
75 |     if staticSuffixSet.Contains(req.URL.FileExt()) {
76 |         return true
77 |     }
78 |     return false
79 | }
80 | 
81 | /**
82 | Keep only links on the configured host.
83 | */
84 | func (s *SimpleFilter) DomainFilter(req *model.Request) bool {
85 |     if s.UniqueSet == nil {
86 |         s.UniqueSet = mapset.NewSet()
87 |     }
88 |     if req.URL.Host == s.HostLimit || req.URL.Hostname() == s.HostLimit {
89 |         return false
90 |     }
91 |     if strings.HasSuffix(s.HostLimit, ":80") && req.URL.Port() == "" && req.URL.Scheme == "http" {
92 |         if req.URL.Hostname()+":80" == s.HostLimit {
93 |             return false
94 |         }
95 |     }
96 |     if strings.HasSuffix(s.HostLimit, ":443") && req.URL.Port() == "" && req.URL.Scheme == "https" {
97 |         if req.URL.Hostname()+":443" == s.HostLimit {
98 |             return false
99 |         }
100 |     }
101 |     return true
102 | }
103 | 
--------------------------------------------------------------------------------
/pkg/filter/smart_filter.go:
--------------------------------------------------------------------------------
package filter

import (
    "regexp"
    "sort"
    "strings"
    "sync"

    "github.com/Qianlitp/crawlergo/pkg/config"
    "github.com/Qianlitp/crawlergo/pkg/logger"
    "github.com/Qianlitp/crawlergo/pkg/model"
    "github.com/Qianlitp/crawlergo/pkg/tools"

    mapset "github.com/deckarep/golang-set"
)

type SmartFilter struct {
    StrictMode                 bool
    SimpleFilter               SimpleFilter
    filterLocationSet          mapset.Set // positions of numeric (non-semantic) parameters, marked and filtered globally
    filterParamKeyRepeatCount  sync.Map   // repeat count of each parameter-key set
    filterParamKeySingleValues sync.Map   // distinct values seen per parameter of a given key set
    filterPathParamKeySymbol   sync.Map   // how often a parameter's value was mark-replaced under a given path
    filterParamKeyAllValues    sync.Map
    filterPathParamEmptyValues sync.Map
    filterParentPathValues     sync.Map
    uniqueMarkedIds            mapset.Set // unique IDs after marking, used for deduplication
}

const (
    MaxParentPathCount         = 32 // correction threshold for the number of sibling path segments under one parent directory
    MaxParamKeySingleCount     = 8  // correction threshold for repeats of a single URL's parameter-key set
    MaxParamKeyAllCount        = 10 // correction threshold for repeats of one parameter name across all URLs in this round
    MaxPathParamEmptyCount     = 10 // correction threshold for the number of empty-valued parameter names under one path
    MaxPathParamKeySymbolCount = 5  // once a parameter under a path is value-marked more often than this, mark it globally
)

const (
    CustomValueMark    = "{{Crawlergo}}"
    FixParamRepeatMark = "{{fix_param}}"
    FixPathMark        = "{{fix_path}}"
    TooLongMark        = "{{long}}"
    NumberMark         = "{{number}}"
    ChineseMark        = "{{chinese}}"
    UpperMark          = "{{upper}}"
    LowerMark          = "{{lower}}"
    UrlEncodeMark      = "{{urlencode}}"
    UnicodeMark        = "{{unicode}}"
    BoolMark           = "{{bool}}"
    ListMark           = "{{list}}"
    TimeMark           = "{{time}}"
    MixAlphaNumMark    = "{{mix_alpha_num}}"
    MixSymbolMark      = "{{mix_symbol}}"
    MixNumMark         = "{{mix_num}}"
    NoLowerAlphaMark   = "{{no_lower}}"
    MixStringMark      = "{{mix_str}}"
)

var chineseRegex = regexp.MustCompile("[\u4e00-\u9fa5]+")
var urlencodeRegex = regexp.MustCompile("(?:%[A-Fa-f0-9]{2,6})+")
var unicodeRegex = regexp.MustCompile(`(?:\\u\w{4})+`)
var onlyAlphaRegex = regexp.MustCompile("^[a-zA-Z]+$")
var onlyAlphaUpperRegex = regexp.MustCompile("^[A-Z]+$")
var alphaUpperRegex = regexp.MustCompile("[A-Z]+")
var alphaLowerRegex = regexp.MustCompile("[a-z]+")
var replaceNumRegex = regexp.MustCompile(`[0-9]+\.[0-9]+|\d+`)
var onlyNumberRegex = regexp.MustCompile(`^[0-9]+$`)
var numberRegex = regexp.MustCompile(`[0-9]+`)
var OneNumberRegex = regexp.MustCompile(`[0-9]`)
var numSymbolRegex = regexp.MustCompile(`\.|_|-`)
var timeSymbolRegex = regexp.MustCompile(`-|:|\s`)
var onlyAlphaNumRegex = regexp.MustCompile(`^[0-9a-zA-Z]+$`)
var markedStringRegex = regexp.MustCompile(`^{{.+}}$`)
var htmlReplaceRegex = regexp.MustCompile(`\.shtml|\.html|\.htm`)

func (s *SmartFilter) Init() {
    s.filterLocationSet = mapset.NewSet()
    s.filterParamKeyRepeatCount = sync.Map{}
    s.filterParamKeySingleValues = sync.Map{}
    s.filterPathParamKeySymbol = sync.Map{}
    s.filterParamKeyAllValues = sync.Map{}
    s.filterPathParamEmptyValues = sync.Map{}
    s.filterParentPathValues = sync.Map{}
    s.uniqueMarkedIds = mapset.NewSet()
}

/**
Smart deduplication, with an optional strict mode.

Returns true if the request should be filtered out.
*/
func (s *SmartFilter) DoFilter(req *model.Request) bool {
    // first drop static resources, apply the basic dedup and filter foreign hosts
    if s.SimpleFilter.DoFilter(req) {
        logger.Logger.Debug("filter req by simplefilter: " + req.URL.RequestURI())
        return true
    }

    req.Filter.FragmentID = s.calcFragmentID(req.URL.Fragment)

    // mark the request
    if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS {
        s.getMark(req)
        s.repeatCountStatistic(req)
    } else if req.Method == config.POST || req.Method == config.PUT {
        s.postMark(req)
    } else {
        logger.Logger.Debug("unsupported method: " + req.Method)
    }

    // deduplicate the marked request
    uniqueId := req.Filter.UniqueId
    if s.uniqueMarkedIds.Contains(uniqueId) {
        logger.Logger.Debug("filter req by uniqueMarkedIds 1: " + req.URL.RequestURI())
        return true
    }

    // apply the global numeric-parameter marks
    s.globalFilterLocationMark(req)

    // then deduplicate marked GET-style requests
    if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS {
        // re-mark requests whose counters exceeded the thresholds
        s.overCountMark(req)

        // recompute QueryMapId
        req.Filter.QueryMapId = getParamMapID(req.Filter.MarkedQueryMap)
        // recompute PathId
        req.Filter.PathId = getPathID(req.Filter.MarkedPath)
    } else {
        // recompute PostDataId
        req.Filter.PostDataId = getParamMapID(req.Filter.MarkedPostDataMap)
    }

    // recompute the request's unique ID
    req.Filter.UniqueId = getMarkedUniqueID(req)

    // deduplicate once more with the new ID
    newUniqueId := req.Filter.UniqueId
    if s.uniqueMarkedIds.Contains(newUniqueId) {
        logger.Logger.Debug("filter req by uniqueMarkedIds 2: " + req.URL.RequestURI())
        return true
    }

    // record the new ID
    s.uniqueMarkedIds.Add(newUniqueId)
    return false
}

/**
The query map decodes values automatically, so pre-mark RawQuery before decoding.
*/
func (s *SmartFilter) preQueryMark(rawQuery string) string {
    if chineseRegex.MatchString(rawQuery) {
        return chineseRegex.ReplaceAllString(rawQuery, ChineseMark)
    } else if urlencodeRegex.MatchString(rawQuery) {
        return urlencodeRegex.ReplaceAllString(rawQuery, UrlEncodeMark)
    } else if unicodeRegex.MatchString(rawQuery) {
        return unicodeRegex.ReplaceAllString(rawQuery, UnicodeMark)
    }
    return rawQuery
}

/**
Mark the parameters and path of a GET-style request.
*/
func (s *SmartFilter) getMark(req *model.Request) {
    // pre-replacement before decoding
    todoURL := *(req.URL)
    todoURL.RawQuery = s.preQueryMark(todoURL.RawQuery)

    // apply the marks in order
    queryMap := todoURL.QueryMap()
    queryMap = markParamName(queryMap)
    queryMap = s.markParamValue(queryMap, *req)
    markedPath := MarkPath(todoURL.Path)

    // compute the unique IDs
    var queryKeyID string
    var queryMapID string
    if len(queryMap) != 0 {
        queryKeyID = getKeysID(queryMap)
        queryMapID = getParamMapID(queryMap)
    } else {
        queryKeyID = ""
        queryMapID = ""
    }
    pathID := getPathID(markedPath)

    req.Filter.MarkedQueryMap = queryMap
    req.Filter.QueryKeysId = queryKeyID
    req.Filter.QueryMapId = queryMapID
    req.Filter.MarkedPath = markedPath
    req.Filter.PathId = pathID

    // finally compute the unique ID of the marked request
    req.Filter.UniqueId = getMarkedUniqueID(req)
}

/**
Mark the parameters and path of a POST-style request.
*/
func (s *SmartFilter) postMark(req *model.Request) {
    postDataMap := req.PostDataMap()

    postDataMap = markParamName(postDataMap)
    postDataMap = s.markParamValue(postDataMap, *req)
    markedPath := MarkPath(req.URL.Path)

    // compute the unique IDs
    var postDataMapID string
    if len(postDataMap) != 0 {
        postDataMapID = getParamMapID(postDataMap)
    } else {
        postDataMapID = ""
    }
    pathID := getPathID(markedPath)

    req.Filter.MarkedPostDataMap = postDataMap
    req.Filter.PostDataId = postDataMapID
    req.Filter.MarkedPath = markedPath
    req.Filter.PathId = pathID

    // finally compute the unique ID of the marked request
    req.Filter.UniqueId = getMarkedUniqueID(req)
}

/**
Mark parameter names.
*/
func markParamName(paramMap map[string]interface{}) map[string]interface{} {
    markedParamMap := map[string]interface{}{}
    for key, value := range paramMap {
        // pure letters: keep as-is
        if onlyAlphaRegex.MatchString(key) {
            markedParamMap[key] = value
            // overly long name
        } else if len(key) >= 32 {
            markedParamMap[TooLongMark] = value
            // otherwise replace any digits
        } else {
            key = replaceNumRegex.ReplaceAllString(key, NumberMark)
            markedParamMap[key] = value
        }
    }
    return markedParamMap
}

/**
Mark parameter values.
*/
func (s *SmartFilter) markParamValue(paramMap map[string]interface{}, req model.Request) map[string]interface{} {
    markedParamMap := map[string]interface{}{}
    for key, value := range paramMap {
        switch value.(type) {
        case bool:
            markedParamMap[key] = BoolMark
            continue
        case []interface{}, []string:
            // JSON arrays decode to []interface{}, repeated query keys to []string
            markedParamMap[key] = ListMark
            continue
        case float64:
            markedParamMap[key] = NumberMark
            continue
        }
        // only string values are handled below
        valueStr, ok := value.(string)
        if !ok {
            continue
        }
        // "Crawlergo" is the sentinel the crawler fills in: this parameter position
        // is numeric rather than semantic, so record it for global filtering
        if strings.Contains(valueStr, "Crawlergo") {
            name := req.URL.Hostname() + req.URL.Path + req.Method + key
            s.filterLocationSet.Add(name)
            markedParamMap[key] = CustomValueMark
            // all uppercase letters
        } else if onlyAlphaUpperRegex.MatchString(valueStr) {
            markedParamMap[key] = UpperMark
            // value of 16 characters or more
        } else if len(valueStr) >= 16 {
            markedParamMap[key] = TooLongMark
            // made up of digits plus a few symbols
        } else if onlyNumberRegex.MatchString(valueStr) || onlyNumberRegex.MatchString(numSymbolRegex.ReplaceAllString(valueStr, "")) {
            markedParamMap[key] = NumberMark
            // contains Chinese characters
        } else if chineseRegex.MatchString(valueStr) {
            markedParamMap[key] = ChineseMark
            // urlencode
        } else if urlencodeRegex.MatchString(valueStr) {
            markedParamMap[key] = UrlEncodeMark
            // unicode
        } else if unicodeRegex.MatchString(valueStr) {
            markedParamMap[key] = UnicodeMark
            // timestamp-like
        } else if onlyNumberRegex.MatchString(timeSymbolRegex.ReplaceAllString(valueStr, "")) {
            markedParamMap[key] = TimeMark
            // letters mixed with digits
        } else if onlyAlphaNumRegex.MatchString(valueStr) && numberRegex.MatchString(valueStr) {
            markedParamMap[key] = MixAlphaNumMark
            // contains special symbols
        } else if hasSpecialSymbol(valueStr) {
            markedParamMap[key] = MixSymbolMark
            // three or more digits: treat as a numeric parameter
        } else if b := OneNumberRegex.ReplaceAllString(valueStr, "0"); strings.Count(b, "0") >= 3 {
            markedParamMap[key] = MixNumMark
            // strict mode
        } else if s.StrictMode {
            // no lowercase letters
            if !alphaLowerRegex.MatchString(valueStr) {
                markedParamMap[key] = NoLowerAlphaMark
                // typical values combine uppercase, lowercase, digits and underscores;
                // three or more of those classes suggest a pseudo-static value
            } else {
                count := 0
                if alphaLowerRegex.MatchString(valueStr) {
                    count += 1
                }
                if alphaUpperRegex.MatchString(valueStr) {
                    count += 1
                }
                if numberRegex.MatchString(valueStr) {
                    count += 1
                }
                if strings.Contains(valueStr, "_") || strings.Contains(valueStr, "-") {
                    count += 1
                }
                if count >= 3 {
                    markedParamMap[key] = MixStringMark
                }
            }
        } else {
            markedParamMap[key] = value
        }
    }
    return markedParamMap
}

/**
Mark the path.
*/
func MarkPath(path string) string {
    pathParts := strings.Split(path, "/")
    for index, part := range pathParts {
        if len(part) >= 32 {
            pathParts[index] = TooLongMark
        } else if onlyNumberRegex.MatchString(numSymbolRegex.ReplaceAllString(part, "")) {
            pathParts[index] = NumberMark
        } else if strings.HasSuffix(part, ".html") || strings.HasSuffix(part, ".htm") || strings.HasSuffix(part, ".shtml") {
            part = htmlReplaceRegex.ReplaceAllString(part, "")
            // mixed uppercase, lowercase and digits
            if numberRegex.MatchString(part) && alphaUpperRegex.MatchString(part) && alphaLowerRegex.MatchString(part) {
                pathParts[index] = MixAlphaNumMark
                // pure digits
            } else if b := numSymbolRegex.ReplaceAllString(part, ""); onlyNumberRegex.MatchString(b) {
                pathParts[index] = NumberMark
            }
            // contains special symbols
        } else if hasSpecialSymbol(part) {
            pathParts[index] = MixSymbolMark
        } else if chineseRegex.MatchString(part) {
            pathParts[index] = ChineseMark
        } else if unicodeRegex.MatchString(part) {
            pathParts[index] = UnicodeMark
        } else if onlyAlphaUpperRegex.MatchString(part) {
            pathParts[index] = UpperMark
            // made up of digits plus a few symbols
        } else if b := numSymbolRegex.ReplaceAllString(part, ""); onlyNumberRegex.MatchString(b) {
            pathParts[index] = NumberMark
            // more than three digits: treat as a pseudo-static path segment
        } else if b := OneNumberRegex.ReplaceAllString(part, "0"); strings.Count(b, "0") > 3 {
            pathParts[index] = MixNumMark
        }
    }
    newPath := strings.Join(pathParts, "/")
    return newPath
}
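// Illustrative sketch (not part of the original source): pseudo-static paths
// collapse onto one marked form, so their hashed PathIds collide and the
// corresponding requests deduplicate, e.g.
//
//	MarkPath("/news/2017/Detail1024.html") // "/news/{{number}}/{{mix_alpha_num}}"
//	MarkPath("/news/2018/Detail2048.html") // same marked form, hence same PathId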
/**
Apply the global marks for parameter positions recorded as numeric.
*/
func (s *SmartFilter) globalFilterLocationMark(req *model.Request) {
    name := req.URL.Hostname() + req.URL.Path + req.Method
    if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS {
        for key := range req.Filter.MarkedQueryMap {
            // check each position independently instead of accumulating keys into name
            if s.filterLocationSet.Contains(name + key) {
                req.Filter.MarkedQueryMap[key] = CustomValueMark
            }
        }
    } else if req.Method == config.POST || req.Method == config.PUT {
        for key := range req.Filter.MarkedPostDataMap {
            if s.filterLocationSet.Contains(name + key) {
                req.Filter.MarkedPostDataMap[key] = CustomValueMark
            }
        }
    }
}

/**
Record global statistics for repeated parameter names, values and paths.
overCountMark later re-marks whatever exceeds the thresholds.
*/
func (s *SmartFilter) repeatCountStatistic(req *model.Request) {
    queryKeyId := req.Filter.QueryKeysId
    pathId := req.Filter.PathId
    if queryKeyId != "" {
        // repeat count of this parameter-key set
        if v, ok := s.filterParamKeyRepeatCount.Load(queryKeyId); ok {
            s.filterParamKeyRepeatCount.Store(queryKeyId, v.(int)+1)
        } else {
            s.filterParamKeyRepeatCount.Store(queryKeyId, 1)
        }

        for key, value := range req.Filter.MarkedQueryMap {
            // distinct values of this parameter under this key set
            paramQueryKey := queryKeyId + key

            if set, ok := s.filterParamKeySingleValues.Load(paramQueryKey); ok {
                set := set.(mapset.Set)
                set.Add(value)
            } else {
                s.filterParamKeySingleValues.Store(paramQueryKey, mapset.NewSet(value))
            }

            // distinct values of this parameter name across all URLs in this round
            if _, ok := s.filterParamKeyAllValues.Load(key); !ok {
                s.filterParamKeyAllValues.Store(key, mapset.NewSet(value))
            } else {
                if v, ok := s.filterParamKeyAllValues.Load(key); ok {
                    set := v.(mapset.Set)
                    if !set.Contains(value) {
                        set.Add(value)
                    }
                }
            }

            // if the value is empty, count the empty-valued parameter names under this path
            if value == "" {
                if _, ok := s.filterPathParamEmptyValues.Load(pathId); !ok {
                    s.filterPathParamEmptyValues.Store(pathId, mapset.NewSet(key))
                } else {
                    if v, ok := s.filterPathParamEmptyValues.Load(pathId); ok {
                        set := v.(mapset.Set)
                        if !set.Contains(key) {
                            set.Add(key)
                        }
                    }
                }
            }

            pathIdKey := pathId + key
            // count how often this parameter's value was replaced by a mark under this path
            if v, ok := s.filterPathParamKeySymbol.Load(pathIdKey); ok {
                if markedStringRegex.MatchString(value.(string)) {
                    s.filterPathParamKeySymbol.Store(pathIdKey, v.(int)+1)
                }
            } else {
                s.filterPathParamKeySymbol.Store(pathIdKey, 1)
            }

        }
    }

    // count sibling path segments under the parent directory; when the path
    // carries a file extension, let common script suffixes pass
    if req.URL.ParentPath() == "" || inCommonScriptSuffix(req.URL.FileExt()) {
        return
    }

    parentPathId := tools.StrMd5(req.URL.ParentPath())
    currentPath := strings.Replace(req.Filter.MarkedPath, req.URL.ParentPath(), "", -1)
    if _, ok := s.filterParentPathValues.Load(parentPathId); !ok {
        s.filterParentPathValues.Store(parentPathId, mapset.NewSet(currentPath))
    } else {
        if v, ok := s.filterParentPathValues.Load(parentPathId); ok {
            set := v.(mapset.Set)
            if !set.Contains(currentPath) {
                set.Add(currentPath)
            }
        }
    }
}

/**
After the repeat statistics, re-mark whatever exceeded the thresholds.
*/
func (s *SmartFilter) overCountMark(req *model.Request) {
    queryKeyId := req.Filter.QueryKeysId
    pathId := req.Filter.PathId
    // the parameter-key set is not empty
    if req.Filter.QueryKeysId != "" {
        // if this key set repeated beyond the threshold and a parameter has
        // more than three distinct values, mark that parameter
        if v, ok := s.filterParamKeyRepeatCount.Load(queryKeyId); ok && v.(int) > MaxParamKeySingleCount {
            for key := range req.Filter.MarkedQueryMap {
                paramQueryKey := queryKeyId + key
                if set, ok := s.filterParamKeySingleValues.Load(paramQueryKey); ok {
                    set := set.(mapset.Set)
                    if set.Cardinality() > 3 {
                        req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
                    }
                }
            }
        }

        for key := range req.Filter.MarkedQueryMap {
            // across all URLs, a parameter whose distinct values exceed the threshold is marked for dedup
            if paramKeySet, ok := s.filterParamKeyAllValues.Load(key); ok {
                paramKeySet := paramKeySet.(mapset.Set)
                if paramKeySet.Cardinality() > MaxParamKeyAllCount {
                    req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
                }
            }

            pathIdKey := pathId + key
            // if a GET parameter under this path was value-marked beyond the
            // threshold, mark it globally for this path
            if v, ok := s.filterPathParamKeySymbol.Load(pathIdKey); ok && v.(int) > MaxPathParamKeySymbolCount {
                req.Filter.MarkedQueryMap[key] = FixParamRepeatMark
            }
        }

        // handle paths with too many empty-valued parameter names,
        // e.g. pseudo-static URLs such as http://bang.360.cn/?chu_xiu
        if v, ok := s.filterPathParamEmptyValues.Load(pathId); ok {
            set := v.(mapset.Set)
            if set.Cardinality() > MaxPathParamEmptyCount {
                newMarkerQueryMap := map[string]interface{}{}
                for key, value := range req.Filter.MarkedQueryMap {
                    if value == "" {
                        newMarkerQueryMap[FixParamRepeatMark] = ""
                    } else {
                        newMarkerQueryMap[key] = value
                    }
                }
                req.Filter.MarkedQueryMap = newMarkerQueryMap
            }
        }
    }

    // handle pseudo-static segments at this path level
    if req.URL.ParentPath() == "" || inCommonScriptSuffix(req.URL.FileExt()) {
        return
    }
    parentPathId := tools.StrMd5(req.URL.ParentPath())
    if set, ok := s.filterParentPathValues.Load(parentPathId); ok {
        set := set.(mapset.Set)
        if set.Cardinality() > MaxParentPathCount {
            if strings.HasSuffix(req.URL.ParentPath(), "/") {
                req.Filter.MarkedPath = req.URL.ParentPath() + FixPathMark
            } else {
                req.Filter.MarkedPath = req.URL.ParentPath() + "/" + FixPathMark
            }
        }
    }
}

// calcFragmentID computes a unique ID for the fragment when it is formatted
// like a URL path.
func (s *SmartFilter) calcFragmentID(fragment string) string {
    if fragment == "" || !strings.HasPrefix(fragment, "/") {
        return ""
    }
    fakeUrl, err := model.GetUrl(fragment)
    if err != nil {
        logger.Logger.Error("cannot calculate url fragment: ", err)
        return ""
    }
    // XXX: discuss https://github.com/Qianlitp/crawlergo/issues/100
    fakeReq := model.GetRequest(config.GET, fakeUrl)
    s.getMark(&fakeReq)
    // s.repeatCountStatistic(&fakeReq)
    return fakeReq.Filter.UniqueId
}

/**
Compute the unique ID of the marked request.
*/
func getMarkedUniqueID(req *model.Request) string {
    var paramId string
    if req.Method == config.GET || req.Method == config.DELETE || req.Method == config.HEAD || req.Method == config.OPTIONS {
        paramId = req.Filter.QueryMapId
    } else {
        paramId = req.Filter.PostDataId
    }

    uniqueStr := req.Method + paramId + req.Filter.PathId + req.URL.Host + req.Filter.FragmentID
    if req.RedirectionFlag {
        uniqueStr += "Redirection"
    }
    if req.URL.Path == "/" && req.URL.RawQuery == "" && req.URL.Scheme == "https" {
        uniqueStr += "https"
    }

    return tools.StrMd5(uniqueStr)
}

/**
Compute the unique ID of the marked parameter keys.
*/
func getKeysID(dataMap map[string]interface{}) string {
    var keys []string
    var idStr string
    for key := range dataMap {
        keys = append(keys, key)
    }
    sort.Strings(keys)
    for _, key := range keys {
        idStr += key
    }
    return tools.StrMd5(idStr)
}

/**
Compute the unique ID of the marked parameter map.
*/
func getParamMapID(dataMap map[string]interface{}) string {
    var keys []string
    var idStr string
    var markReplaceRegex = regexp.MustCompile(`{{.+}}`)
    for key := range dataMap {
        keys = append(keys, key)
    }
    sort.Strings(keys)
    for _, key := range keys {
        value := dataMap[key]
        idStr += key
        if value, ok := value.(string); ok {
            idStr += markReplaceRegex.ReplaceAllString(value, "{{mark}}")
        }
    }
    return tools.StrMd5(idStr)
}

/**
Compute the unique ID of the marked path.
*/
func getPathID(path string) string {
    return tools.StrMd5(path)
}

/**
Report whether the string contains any of the following special symbols.
*/
func hasSpecialSymbol(str string) bool {
    symbolList := []string{"{", "}", " ", "|", "#", "@", "$", "*", ",", "<", ">", "/", "?", "\\", "+", "="}
    for _, sym := range symbolList {
        if strings.Contains(str, sym) {
            return true
        }
    }
    return false
}

func inCommonScriptSuffix(suffix string) bool {
    return config.ScriptSuffixSet.Contains(suffix)
}
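Putting DoFilter to work end to end (a sketch under assumed URLs; the `main` wrapper is hypothetical):

```go
package main

import (
    "fmt"

    "github.com/Qianlitp/crawlergo/pkg/config"
    "github.com/Qianlitp/crawlergo/pkg/filter"
    "github.com/Qianlitp/crawlergo/pkg/model"
)

func main() {
    smart := filter.SmartFilter{}
    smart.Init()
    for _, raw := range []string{
        "http://example.com/news/detail?id=1001", // first of this shape: kept
        "http://example.com/news/detail?id=2002", // same shape after marking: filtered
    } {
        u, _ := model.GetUrl(raw)
        req := model.GetRequest(config.GET, u)
        fmt.Println(raw, "=> filtered:", smart.DoFilter(&req))
    }
}
```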
--------------------------------------------------------------------------------
/pkg/filter/smart_filter_test.go:
--------------------------------------------------------------------------------
package filter

import (
    "testing"

    "github.com/Qianlitp/crawlergo/pkg/config"
    "github.com/Qianlitp/crawlergo/pkg/model"

    "github.com/stretchr/testify/assert"
)

var (
    // queryUrls = []string{
    // 	"http://test.nil.local.com/cctv/abcd?keyword=crawlergocrawlergo&end=1",
    // 	"http://test.nil.local.com/cctv/abcd?keyword=crawlergocrawlergo&end=1",
    // }

    fragmentUrls = []string{
        // baseline group
        "http://testhtml5.vuwm.com/latest#/page/1",
        "http://testhtml5.vuwm.com/latest#/page/search?keyword=Crawlergo&source=2&demo=1423&c=afa",
        // will be marked as {{long}}
        "http://testhtml5.vuwm.com/latest#/page/search/fasdfsdafsdfsdfsdfasfsfasfafdsafssfasdfsd",

        // control group
        "http://testhtml5.vuwm.com/latest#/page/2",
        // must not be marked as {{long}}
        "http://testhtml5.vuwm.com/latest#/page/search?keyword=CrawlergoCrawlergoCrawlergo&source=1&demo=1255&c=afa",
    }

    // completeUrls = []string{
    // 	"https://test.local.com:1234/adfatd/123456/sx14xi?user=crawlergo&pwd=fa1424&end=1#/user/info",
    // }
    smart = SmartFilter{}
)

func TestDoFilter_countFragment(t *testing.T) {
    smart.Init()
    reqs := []model.Request{}
    for _, fu := range fragmentUrls {
        url, err := model.GetUrl(fu)
        assert.Nil(t, err)
        reqs = append(reqs, model.GetRequest(config.GET, url))
    }
    // #/page/1 and #/page/2 are the same shape
    assert.Equal(t, smart.calcFragmentID(reqs[0].URL.Fragment), smart.calcFragmentID(reqs[3].URL.Fragment))
    assert.Equal(t, smart.calcFragmentID(reqs[1].URL.Fragment), smart.calcFragmentID(reqs[4].URL.Fragment))
    for _, rq := range reqs[:2] {
        // the first occurrence must never be filtered
        assert.Equal(t, smart.DoFilter(&rq), false)
    }
    for _, rq := range reqs[3:] {
        // the second occurrence of the same shape must be filtered
        assert.Equal(t, smart.DoFilter(&rq), true)
    }
}
--------------------------------------------------------------------------------
/pkg/js/javascript.go:
--------------------------------------------------------------------------------
package js

import (
    "fmt"
    "github.com/chromedp/cdproto/cdp"
)

const TabInitJS = `
(function addTabInitScript () {

    // Pass the Webdriver Test.
    Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
    });

    // Pass the Plugins Length Test.
    // Overwrite the plugins property to use a custom getter.
    Object.defineProperty(navigator, 'plugins', {
        // This just needs to have length > 0 for the current test,
        // but we could mock the plugins too if necessary.
        get: () => [1, 2, 3, 4, 5],
    });

    // Pass the Chrome Test.
    // We can mock this in as much depth as we need for the test.
    window.chrome = {
        runtime: {},
    };

    // Pass the Permissions Test.
    const originalQuery = window.navigator.permissions.query;
    window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications' ?
            Promise.resolve({ state: Notification.permission }) :
            originalQuery(parameters)
    );

    // Pass the User-Agent Test: spoof navigator.userAgent.
    Object.defineProperty(navigator, 'userAgent', {
        get: () => "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36",
    });

    // Spoof further navigator properties.
    Object.defineProperty(navigator, 'platform', {
        get: function () { return 'win32'; }
    });

    Object.defineProperty(navigator, 'language', {
        get: function () { return 'zh-CN'; }
    });

    Object.defineProperty(navigator, 'languages', {
        get: function () { return ["zh-CN", "zh"]; }
    });

    // history API hook
    window.history.pushState = function(a, b, c) {
        window.addLink(c, "HistoryAPI");
    }
    window.history.replaceState = function(a, b, c) {
        window.addLink(c, "HistoryAPI");
    }
    Object.defineProperty(window.history,"pushState",{"writable": false, "configurable": false});
    Object.defineProperty(window.history,"replaceState",{"writable": false, "configurable": false});
    // listen for hash changes
    window.addEventListener("hashchange", function() {
        window.addLink(document.location.href, "HashChange");
    });

    var oldWebSocket = window.WebSocket;
    window.WebSocket = function(url, arg) {
        window.addLink(url, "WebSocket");
        return new oldWebSocket(url, arg);
    }

    var oldEventSource = window.EventSource;
    window.EventSource = function(url) {
        window.addLink(url, "EventSource");
        return new oldEventSource(url);
    }

    var oldFetch = window.fetch;
    window.fetch = function(url) {
        window.addLink(url, "Fetch");
        return oldFetch(url);
    }

    // lock form resets
    HTMLFormElement.prototype.reset = function() {console.log("cancel reset form")};
    Object.defineProperty(HTMLFormElement.prototype,"reset",{"writable": false, "configurable": false});

    // hook DOM level 2 event listeners
    window.add_even_listener_count_sec_auto = {};
    // record event func , hook addEventListener
    let old_event_handle = Element.prototype.addEventListener;
    Element.prototype.addEventListener = function(event_name, event_func, useCapture) {
        let name = "<" + this.tagName + "> " + this.id + this.name + this.getAttribute("class") + "|" + event_name;
        // console.log(name)
        // cap how many times each event may be registered (max 5) to prevent infinite triggering
        if (!window.add_even_listener_count_sec_auto.hasOwnProperty(name)) {
            window.add_even_listener_count_sec_auto[name] = 1;
        } else if (window.add_even_listener_count_sec_auto[name] == 5) {
            return ;
        } else {
            window.add_even_listener_count_sec_auto[name] += 1;
        }
        if (this.hasAttribute("sec_auto_dom2_event_flag")) {
            let sec_auto_dom2_event_flag = this.getAttribute("sec_auto_dom2_event_flag");
            this.setAttribute("sec_auto_dom2_event_flag", sec_auto_dom2_event_flag + "|" + event_name);
        } else {
            this.setAttribute("sec_auto_dom2_event_flag", event_name);
        }
        old_event_handle.apply(this, arguments);
    };

    function dom0_listener_hook(that, event_name) {
        let name = "<" + that.tagName + "> " + that.id + that.name + that.getAttribute("class") + "|" + event_name;
        // console.log(name);
        // cap how many times each event may be registered (max 5) to prevent infinite triggering
        if (!window.add_even_listener_count_sec_auto.hasOwnProperty(name)) {
            window.add_even_listener_count_sec_auto[name] = 1;
        } else if (window.add_even_listener_count_sec_auto[name] == 5) {
            return ;
        } else {
            window.add_even_listener_count_sec_auto[name] += 1;
        }
        if (that.hasAttribute("sec_auto_dom2_event_flag")) {
            let sec_auto_dom2_event_flag = that.getAttribute("sec_auto_dom2_event_flag");
            that.setAttribute("sec_auto_dom2_event_flag", sec_auto_dom2_event_flag + "|" + event_name);
        } else {
            that.setAttribute("sec_auto_dom2_event_flag", event_name);
        }
    }

    // hook DOM level 0 event listeners
    Object.defineProperties(HTMLElement.prototype, {
        onclick: {set: function(newValue){onclick = newValue;dom0_listener_hook(this, "click");}},
        onchange: {set: function(newValue){onchange = newValue;dom0_listener_hook(this, "change");}},
        onblur: {set: function(newValue){onblur = newValue;dom0_listener_hook(this, "blur");}},
        ondblclick: {set: function(newValue){ondblclick = newValue;dom0_listener_hook(this, "dblclick");}},
        onfocus: {set: function(newValue){onfocus = newValue;dom0_listener_hook(this, "focus");}},
        onkeydown: {set: function(newValue){onkeydown = newValue;dom0_listener_hook(this, "keydown");}},
        onkeypress: {set: function(newValue){onkeypress = newValue;dom0_listener_hook(this, "keypress");}},
        onkeyup: {set: function(newValue){onkeyup = newValue;dom0_listener_hook(this, "keyup");}},
        onload: {set: function(newValue){onload = newValue;dom0_listener_hook(this, "load");}},
        onmousedown: {set: function(newValue){onmousedown = newValue;dom0_listener_hook(this, "mousedown");}},
        onmousemove: {set: function(newValue){onmousemove = newValue;dom0_listener_hook(this, "mousemove");}},
        onmouseout: {set: function(newValue){onmouseout = newValue;dom0_listener_hook(this, "mouseout");}},
        onmouseover: {set: function(newValue){onmouseover = newValue;dom0_listener_hook(this, "mouseover");}},
        onmouseup: {set: function(newValue){onmouseup = newValue;dom0_listener_hook(this, "mouseup");}},
        onreset: {set: function(newValue){onreset = newValue;dom0_listener_hook(this, "reset");}},
        onresize: {set: function(newValue){onresize = newValue;dom0_listener_hook(this, "resize");}},
        onselect: {set: function(newValue){onselect = newValue;dom0_listener_hook(this, "select");}},
        onsubmit: {set: function(newValue){onsubmit = newValue;dom0_listener_hook(this, "submit");}},
        onunload: {set: function(newValue){onunload = newValue;dom0_listener_hook(this, "unload");}},
        onabort: {set: function(newValue){onabort = newValue;dom0_listener_hook(this, "abort");}},
        onerror: {set: function(newValue){onerror = newValue;dom0_listener_hook(this, "error");}},
    })

    // hook window.open
    window.open = function (url) {
        console.log("trying to open window.");
        window.addLink(url, "OpenWindow");
    }
    Object.defineProperty(window,"open",{"writable": false, "configurable": false});

    // hook window close
    window.close = function() {console.log("trying to close page.");};
    Object.defineProperty(window,"close",{"writable": false, "configurable": false});

    // hook setTimeout
    //window.__originalSetTimeout = window.setTimeout;
    //window.setTimeout = function() {
    //    arguments[1] = 0;
    //    return window.__originalSetTimeout.apply(this, arguments);
    //};
    //Object.defineProperty(window,"setTimeout",{"writable": false, "configurable": false});

    // hook setInterval: stretch the interval to 60s to reduce the load on Chrome
    window.__originalSetInterval = window.setInterval;
    window.setInterval = function() {
        arguments[1] = 60000;
        return window.__originalSetInterval.apply(this, arguments);
    };
    Object.defineProperty(window,"setInterval",{"writable": false, "configurable": false});

    // hijack native AJAX and cap the number of requests per method+URL
    window.ajax_req_count_sec_auto = {};
    XMLHttpRequest.prototype.__originalOpen = XMLHttpRequest.prototype.open;
    XMLHttpRequest.prototype.open = function(method, url, async, user, password) {
        // hook code
        this.url = url;
        this.method = method;
        let name = method + url;
        if (!window.ajax_req_count_sec_auto.hasOwnProperty(name)) {
            window.ajax_req_count_sec_auto[name] = 1
        } else {
            window.ajax_req_count_sec_auto[name] += 1
        }

        if (window.ajax_req_count_sec_auto[name] <= 10) {
            return this.__originalOpen(method, url, true, user, password);
        }
    }
    Object.defineProperty(XMLHttpRequest.prototype,"open",{"writable": false, "configurable": false});

    XMLHttpRequest.prototype.__originalSend = XMLHttpRequest.prototype.send;
    XMLHttpRequest.prototype.send = function(data) {
        // hook code
        let name = this.method + this.url;
        if (window.ajax_req_count_sec_auto[name] <= 10) {
            return this.__originalSend(data);
        }
    }
    Object.defineProperty(XMLHttpRequest.prototype,"send",{"writable": false, "configurable": false});

    XMLHttpRequest.prototype.__originalAbort = XMLHttpRequest.prototype.abort;
    XMLHttpRequest.prototype.abort = function() {
        // hook code
    }
    Object.defineProperty(XMLHttpRequest.prototype,"abort",{"writable": false, "configurable": false});

    // shuffle an array in place
    window.randArr = function (arr) {
        for (var i = 0; i < arr.length; i++) {
            var iRand = parseInt(arr.length * Math.random());
            var temp = arr[i];
            arr[i] = arr[iRand];
            arr[iRand] = temp;
        }
        return arr;
    }

    window.sleep = function(time) {
        return new Promise((resolve) => setTimeout(resolve, time));
    }

    Array.prototype.indexOf = function(val) {
        for (var i = 0; i < this.length; i++) {
            if (this[i] == val) return i;
        }
        return -1;
    };

    Array.prototype.remove = function(val) {
        var index = this.indexOf(val);
        if (index > -1) {
            this.splice(index, 1);
        }
    };

    const binding = window["addLink"];
    window["addLink"] = async(...args) => {
        const me = window["addLink"];
        let callbacks = me['callbacks'];
        if (!callbacks) {
            callbacks = new Map();
            me['callbacks'] = callbacks;
        }
        const seq = (me['lastSeq'] || 0) + 1;
        me['lastSeq'] = seq;
        const promise = new Promise(fulfill => callbacks.set(seq, fulfill));
        binding(JSON.stringify({name: "addLink", seq, args}));
        return promise;
    };

    const bindingTest = window["Test"];
    window["Test"] = async(...args) => {
        const me = window["Test"];
        let callbacks = me['callbacks'];
        if (!callbacks) {
            callbacks = new Map();
            me['callbacks'] = callbacks;
        }
        const seq = (me['lastSeq'] || 0) + 1;
        me['lastSeq'] = seq;
        const promise = new Promise(fulfill => callbacks.set(seq, fulfill));
        // call the Test binding saved above, not the addLink one
        bindingTest(JSON.stringify({name: "Test", seq, args}));
        return promise;
    };
})();
`

const DeliverResultJS = `
(function deliverResult(name, seq, result) {
    window[name]['callbacks'].get(seq)(result);
    window[name]['callbacks'].delete(seq);
})("%s", %v, "%s")
`
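// Illustrative sketch (not part of the original source): the Go side resolves
// a pending promise created by the addLink wrapper in TabInitJS by formatting
// DeliverResultJS with the binding name, sequence number and result, e.g.
//
//	script := fmt.Sprintf(DeliverResultJS, "addLink", 3, "ok")
//	// evaluating script in the page then runs:
//	// window["addLink"]['callbacks'].get(3)("ok")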
const ObserverJS = `
(function init_observer_sec_auto_b() {
    window.dom_listener_func_sec_auto = function (e) {
        let node = e.target;
        let nodeListSrc = node.querySelectorAll("[src]");
        for (let each of nodeListSrc) {
            if (each.src) {
                window.addLink(each.src, "DOM");
                let attrValue = each.getAttribute("src");
                if (attrValue.toLocaleLowerCase().startsWith("javascript:")) {
                    try {
                        eval(attrValue.substring(11));
                    }
                    catch {}
                }
            }
        }

        // copy into a real Array first: a NodeList cannot be shuffled in place
        let nodeListHref = Array.from(node.querySelectorAll("[href]"));
        nodeListHref = window.randArr(nodeListHref);
        for (let each of nodeListHref) {
            if (each.href) {
                window.addLink(each.href, "DOM");
                let attrValue = each.getAttribute("href");
                if (attrValue.toLocaleLowerCase().startsWith("javascript:")) {
                    try {
                        eval(attrValue.substring(11));
                    }
                    catch {}
                }
            }
        }
    };
    document.addEventListener('DOMNodeInserted', window.dom_listener_func_sec_auto, true);
    document.addEventListener('DOMSubtreeModified', window.dom_listener_func_sec_auto, true);
    document.addEventListener('DOMNodeInsertedIntoDocument', window.dom_listener_func_sec_auto, true);
    document.addEventListener('DOMAttrModified', window.dom_listener_func_sec_auto, true);
})()
`

const RemoveDOMListenerJS = `
(function remove_dom_listener() {
    document.removeEventListener('DOMNodeInserted', window.dom_listener_func_sec_auto, true);
    document.removeEventListener('DOMSubtreeModified', window.dom_listener_func_sec_auto, true);
    document.removeEventListener('DOMNodeInsertedIntoDocument', window.dom_listener_func_sec_auto, true);
    document.removeEventListener('DOMAttrModified', window.dom_listener_func_sec_auto, true);
})()
`

const NewFrameTemplate = `
(function sec_auto_new_iframe () {
    let frame = document.createElement("iframe");
    frame.setAttribute("name", "%s");
    frame.setAttribute("id", "%s");
    frame.setAttribute("style", "display: none");
    document.body.appendChild(frame);
})()
`

const TriggerInlineEventJS = `
(async function trigger_all_inline_event(){
    let eventNames = ["onabort", "onblur", "onchange", "onclick", "ondblclick", "onerror", "onfocus", "onkeydown", "onkeypress", "onkeyup", "onload", "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onreset", "onresize", "onselect", "onsubmit", "onunload"];
    for (let eventName of eventNames) {
        let event = eventName.replace("on", "");
        // NodeList has no slice(); copy into a real Array first
        let nodeList = Array.from(document.querySelectorAll("[" + eventName + "]"));
        if (nodeList.length > 100) {
            nodeList = nodeList.slice(0, 100);
        }
        nodeList = window.randArr(nodeList);
        for (let node of nodeList) {
            await window.sleep(%f);
            let evt = document.createEvent('CustomEvent');
            evt.initCustomEvent(event, false, true, null);
            try {
                node.dispatchEvent(evt);
            }
            catch {}
        }
    }
})()
`
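// Illustrative sketch (not part of the original source): the %s/%f verbs in
// these templates are filled in with fmt.Sprintf before evaluation, e.g.
//
//	frameJS := fmt.Sprintf(NewFrameTemplate, "sec_auto_frame0", "sec_auto_frame0")
//	inlineJS := fmt.Sprintf(TriggerInlineEventJS, 100.0) // per-node sleep, in ms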
const TriggerDom2EventJS = `
(async function trigger_all_dom2_custom_event() {
    function transmit_child(node, event, loop) {
        let _loop = loop + 1
        if (_loop > 4) {
            return;
        }
        if (node.nodeType === 1) {
            if (node.hasChildNodes) {
                let index = parseInt(Math.random()*node.children.length,10);
                try {
                    node.children[index].dispatchEvent(event);
                } catch(e) {}
                let max = node.children.length>5?5:node.children.length;
                for (let count=0;count<max;count++) {
                    transmit_child(node.children[count], event, _loop);
                }
            }
        }
    }
    // NodeList has no slice(); copy into a real Array first
    let nodes = Array.from(document.querySelectorAll("[sec_auto_dom2_event_flag]"));
    if (nodes.length > 200) {
        nodes = nodes.slice(0, 200);
    }
    nodes = window.randArr(nodes);
    for (let node of nodes) {
        let loop = 0;
        await window.sleep(%f);
        let event_name_list = node.getAttribute("sec_auto_dom2_event_flag").split("|");
        let event_name_set = new Set(event_name_list);
        event_name_list = [...event_name_set];
        for (let event_name of event_name_list) {
            let evt = document.createEvent('CustomEvent');
            evt.initCustomEvent(event_name, true, true, null);

            if (event_name == "click" || event_name == "focus" || event_name == "mouseover" || event_name == "select") {
                transmit_child(node, evt, loop);
            }
            if ( (node.className && node.className.includes("close")) || (node.id && node.id.includes("close"))) {
                continue;
            }

            try {
                node.dispatchEvent(evt);
            } catch(e) {}
        }
    }
})()
`

const TriggerJavascriptProtocol = `
(async function click_all_a_tag_javascript(){
    let nodeListHref = Array.from(document.querySelectorAll("[href]"));
    nodeListHref = window.randArr(nodeListHref);
    for (let node of nodeListHref) {
        let attrValue = node.getAttribute("href");
        if (attrValue.toLocaleLowerCase().startsWith("javascript:")) {
            await window.sleep(%f);
            try {
                eval(attrValue.substring(11));
            }
            catch {}
        }
    }
    let nodeListSrc = Array.from(document.querySelectorAll("[src]"));
    nodeListSrc = window.randArr(nodeListSrc);
    for (let node of nodeListSrc) {
        let attrValue = node.getAttribute("src");
        if (attrValue.toLocaleLowerCase().startsWith("javascript:")) {
            await window.sleep(%f);
            try {
                eval(attrValue.substring(11));
            }
            catch {}
        }
    }
})()
`

const FormNodeClickJS = `
(function(a) {
    try {
        a.click();
        return true;
    } catch(e) {
        return false;
    }
})(%s)
`

func Snippet(js string, f func(n *cdp.Node) string, sel string, n *cdp.Node, v ...interface{}) string {
    //return fmt.Sprintf(js, append([]interface{}{sel}, v...)...)
    return fmt.Sprintf(js, append([]interface{}{f(n)}, v...)...)
}

func CashX(flatten bool) func(*cdp.Node) string {
    return func(n *cdp.Node) string {
        if flatten {
            return fmt.Sprintf(`$x(%q)[0]`, n.FullXPath())
        }
        return fmt.Sprintf(`$x(%q)`, n.FullXPath())
    }
}
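How Snippet and CashX compose (a sketch: the empty cdp.Node stands in for a node that chromedp's DOM queries would return inside pkg/engine):

```go
package main

import (
    "fmt"

    "github.com/Qianlitp/crawlergo/pkg/js"
    "github.com/chromedp/cdproto/cdp"
)

func main() {
    node := &cdp.Node{} // in the crawler this comes from chromedp.Nodes(...)
    // CashX(true) renders $x("<xpath>")[0], addressing the node by XPath;
    // Snippet substitutes it for the %s placeholder in FormNodeClickJS.
    script := js.Snippet(js.FormNodeClickJS, js.CashX(true), "", node)
    fmt.Println(script)
}
```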
--------------------------------------------------------------------------------
/pkg/logger/logger.go:
--------------------------------------------------------------------------------
package logger

import (
    "github.com/sirupsen/logrus"
)

var logLevelMap = map[string]logrus.Level{
    //"Trace": logrus.TraceLevel,
    "Debug": logrus.DebugLevel,
    "Info":  logrus.InfoLevel,
    "Warn":  logrus.WarnLevel,
    "Error": logrus.ErrorLevel,
    "Fatal": logrus.FatalLevel,
    //"Panic": logrus.PanicLevel,
}

var Logger *logrus.Logger

func init() {
    Logger = logrus.New()
    level := "Warn"
    Logger.SetLevel(logLevelMap[level])
}
--------------------------------------------------------------------------------
/pkg/model/request.go:
--------------------------------------------------------------------------------
package model

import (
    "encoding/json"
    "errors"
    "fmt"
    "net/url"
    "strings"

    "github.com/Qianlitp/crawlergo/pkg/config"
    "github.com/Qianlitp/crawlergo/pkg/tools"
)

type Filter struct {
    MarkedQueryMap    map[string]interface{}
    QueryKeysId       string
    QueryMapId        string
    MarkedPostDataMap map[string]interface{}
    PostDataId        string
    MarkedPath        string
    FragmentID        string
    PathId            string
    UniqueId          string
}

type Options struct {
    Headers  map[string]interface{}
    PostData string
}

type Request struct {
    URL             *URL
    Method          string
    Headers         map[string]interface{}
    PostData        string
    Filter          Filter
    Source          string
    RedirectionFlag bool
    Proxy           string
}

var supportContentType = []string{config.JSON, config.URLENCODED}

/**
Build a Request object.

Headers and postData may optionally be provided via Options.
*/
func GetRequest(method string, URL *URL, options ...Options) Request {
    var req Request
    req.URL = URL
    req.Method = strings.ToUpper(method)
    // always start with a non-nil map so callers can safely read and write headers
    req.Headers = map[string]interface{}{}
    if len(options) != 0 {
        option := options[0]
        if option.Headers != nil {
            req.Headers = option.Headers
        }

        if option.PostData != "" {
            req.PostData = option.PostData
        }
    }

    return req
}

/**
Print the request in full HTTP format.
*/
func (req *Request) FormatPrint() {
    var tempStr = req.Method
    tempStr += " " + req.URL.String() + " HTTP/1.1\r\n"
    for k, v := range req.Headers {
        tempStr += k + ": " + v.(string) + "\r\n"
    }
    tempStr += "\r\n"
    if req.Method == config.POST {
        tempStr += req.PostData
    }
    fmt.Println(tempStr)
}

/**
Print a brief summary.
*/
func (req *Request) SimplePrint() {
    var tempStr = req.Method
    tempStr += " " + req.URL.String() + " "
    if req.Method == config.POST {
        tempStr += req.PostData
    }
    fmt.Println(tempStr)
}

func (req *Request) SimpleFormat() string {
    var tempStr = req.Method
    tempStr += " " + req.URL.String() + " "
    if req.Method == config.POST {
        tempStr += req.PostData
    }
    return tempStr
}

/**
Request ID computed without the headers.
*/
func (req *Request) NoHeaderId() string {
    return tools.StrMd5(req.Method + req.URL.String() + req.PostData)
}

func (req *Request) UniqueId() string {
    if req.RedirectionFlag {
        return tools.StrMd5(req.NoHeaderId() + "Redirection")
    } else {
        return req.NoHeaderId()
    }
}

/**
Return the POST body parsed into a map.

Supports application/x-www-form-urlencoded and application/json.

If parsing fails, returns a map of the form {"key": postDataStr}.
*/
func (req *Request) PostDataMap() map[string]interface{} {
    contentType, err := req.getContentType()
    if err != nil {
        return map[string]interface{}{
            "key": req.PostData,
        }
    }

    if strings.HasPrefix(contentType, config.JSON) {
        var result map[string]interface{}
        err = json.Unmarshal([]byte(req.PostData), &result)
        if err != nil {
            return map[string]interface{}{
                "key": req.PostData,
            }
        } else {
            return result
        }
    } else if strings.HasPrefix(contentType, config.URLENCODED) {
        var result = map[string]interface{}{}
        r, err := url.ParseQuery(req.PostData)
        if err != nil {
            return map[string]interface{}{
                "key": req.PostData,
            }
        } else {
            for key, value := range r {
                if len(value) == 1 {
                    result[key] = value[0]
                } else {
                    result[key] = value
                }
            }
            return result
        }
    } else {
        return map[string]interface{}{
            "key": req.PostData,
        }
    }
}

/**
Return the GET query parameters parsed into a map.
*/
func (req *Request) QueryMap() map[string][]string {
    return req.URL.Query()
}

/**
Get the Content-Type header.
*/
func (req *Request) getContentType() (string, error) {
    headers := req.Headers
    var contentType string
    if ct, ok := headers["Content-Type"]; ok {
        contentType = ct.(string)
    } else if ct, ok := headers["Content-type"]; ok {
        contentType = ct.(string)
    } else if ct, ok := headers["content-type"]; ok {
        contentType = ct.(string)
    } else {
        return "", errors.New("no content-type")
    }

    for _, ct := range supportContentType {
        if strings.HasPrefix(contentType, ct) {
            return contentType, nil
        }
    }
    return "", errors.New("unsupported content-type: " + contentType)
}
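A sketch of PostDataMap's parsing and fallback behaviour (hypothetical `main`; the URL and body are made up):

```go
package main

import (
    "fmt"

    "github.com/Qianlitp/crawlergo/pkg/config"
    "github.com/Qianlitp/crawlergo/pkg/model"
)

func main() {
    u, _ := model.GetUrl("http://example.com/login")
    req := model.GetRequest(config.POST, u, model.Options{
        Headers:  map[string]interface{}{"Content-Type": "application/json"},
        PostData: `{"user": "admin", "remember": true}`,
    })
    fmt.Println(req.PostDataMap()) // map[remember:true user:admin]

    req.Headers["Content-Type"] = "text/plain" // unsupported content type
    fmt.Println(req.PostDataMap()) // falls back to map[key:{"user": "admin", "remember": true}]
}
```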
--------------------------------------------------------------------------------
/pkg/model/url.go:
--------------------------------------------------------------------------------
package model

import (
    "errors"
    "fmt"
    "net/url"
    "path"
    "regexp"
    "strings"

    "golang.org/x/net/publicsuffix"

    "github.com/Qianlitp/crawlergo/pkg/tools/requests"
)

type URL struct {
    url.URL
}

func GetUrl(_url string, parentUrls ...URL) (*URL, error) {
    // parse and normalize the URL into a complete form
    var u URL
    _url, err := u.parse(_url, parentUrls...)
    if err != nil {
        return nil, err
    }

    if len(parentUrls) == 0 {
        _u, err := requests.UrlParse(_url)
        if err != nil {
            return nil, err
        }
        u = URL{*_u}
        if u.Path == "" {
            u.Path = "/"
        }
    } else {
        pUrl := parentUrls[0]
        _u, err := pUrl.Parse(_url)
        if err != nil {
            return nil, err
        }
        u = URL{*_u}
        if u.Path == "" {
            u.Path = "/"
        }
        //fmt.Println(_url, pUrl.String(), u.String())
    }

    fixPath := regexp.MustCompile("^/{2,}")

    if fixPath.MatchString(u.Path) {
        u.Path = fixPath.ReplaceAllString(u.Path, "/")
    }

    return &u, nil
}

/**
Fix up an incomplete URL.
*/
func (u *URL) parse(_url string, parentUrls ...URL) (string, error) {
    _url = strings.Trim(_url, " ")

    if len(_url) == 0 {
        return "", errors.New("invalid url, length 0")
    }
    // collapse redundant '#'
    if strings.Count(_url, "#") > 1 {
        _url = regexp.MustCompile(`#+`).ReplaceAllString(_url, "#")
    }

    // no parent URL: return as-is
    if len(parentUrls) == 0 {
        return _url, nil
    }

    if strings.HasPrefix(_url, "http://") || strings.HasPrefix(_url, "https://") {
        return _url, nil
    } else if strings.HasPrefix(_url, "javascript:") {
        return "", errors.New("invalid url, javascript protocol")
    } else if strings.HasPrefix(_url, "mailto:") {
        return "", errors.New("invalid url, mailto protocol")
    }
    return _url, nil
}

func (u *URL) QueryMap() map[string]interface{} {
    queryMap := map[string]interface{}{}
    for key, value := range u.Query() {
        if len(value) == 1 {
            queryMap[key] = value[0]
        } else {
            queryMap[key] = value
        }
    }
    return queryMap
}

/**
Return the URL without its query string.
*/
func (u *URL) NoQueryUrl() string {
    return fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, u.Path)
}

/**
Return the URL without its fragment.
*/
func (u *URL) NoFragmentUrl() string {
    // strip "#fragment" as a whole so the separator does not linger
    return strings.Replace(u.String(), "#"+u.Fragment, "", -1)
}

func (u *URL) NoSchemeFragmentUrl() string {
    return fmt.Sprintf("://%s%s", u.Host, u.Path)
}

func (u *URL) NavigationUrl() string {
    return u.NoSchemeFragmentUrl()
}

/**
Return the root domain.

E.g. a.b.c.360.cn returns 360.cn.
*/
func (u *URL) RootDomain() string {
    domain := u.Hostname()
    suffix, icann := publicsuffix.PublicSuffix(strings.ToLower(domain))
    // not an ICANN-managed suffix: return an empty string
    if !icann {
        return ""
    }
    i := len(domain) - len(suffix) - 1
    // malformed domain
    if i <= 0 {
        return ""
    }
    if domain[i] != '.' {
        return ""
    }
    return domain[1+strings.LastIndex(domain[:i], "."):]
}

/**
File name of the last path segment, or "" if it contains no dot.
*/
func (u *URL) FileName() string {
    parts := strings.Split(u.Path, `/`)
    lastPart := parts[len(parts)-1]
    if strings.Contains(lastPart, ".") {
        return lastPart
    } else {
        return ""
    }
}

/**
File extension of the path.
*/
func (u *URL) FileExt() string {
    parts := path.Ext(u.Path)
    // path.Ext keeps the leading "."
    if len(parts) > 0 {
        return strings.ToLower(parts[1:])
    }
    return parts
}

/**
Return the parent path; if the current path is already the root, return "".
*/
func (u *URL) ParentPath() string {
    if u.Path == "/" {
        return ""
    } else if strings.HasSuffix(u.Path, "/") {
        if strings.Count(u.Path, "/") == 2 {
            return "/"
        }
        parts := strings.Split(u.Path, "/")
        parts = parts[:len(parts)-2]
        return strings.Join(parts, "/")
    } else {
        if strings.Count(u.Path, "/") == 1 {
            return "/"
        }
        parts := strings.Split(u.Path, "/")
        parts = parts[:len(parts)-1]
        return strings.Join(parts, "/")
    }
}
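A quick sketch of the helpers above (hypothetical `main`; the domain is an example):

```go
package main

import (
    "fmt"

    "github.com/Qianlitp/crawlergo/pkg/model"
)

func main() {
    u, _ := model.GetUrl("https://a.b.c.360.cn/news/2017/detail.html?id=1#frag")
    fmt.Println(u.RootDomain()) // 360.cn
    fmt.Println(u.FileExt())    // html
    fmt.Println(u.ParentPath()) // /news/2017
    fmt.Println(u.NoQueryUrl()) // https://a.b.c.360.cn/news/2017/detail.html
}
```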
--------------------------------------------------------------------------------
/pkg/model/url_test.go:
--------------------------------------------------------------------------------
package model

import (
    "net/url"
    "testing"

    "github.com/stretchr/testify/assert"
    "golang.org/x/net/publicsuffix"
)

var (
    rootDomainTestCases = []struct {
        domain     string
        rootDomain string
        wantICANN  bool
    }{
        {"www.amazon.co.uk", "amazon.co.uk", true},
        {"www.baidu.com", "baidu.com", true},
        {"www.baidu.com.cn", "baidu.com.cn", true},
        {"www.pku.edu.cn", "pku.edu.cn", true},
        {"www.example1.debian.org", "debian.org", true},
        {"www.golang.dev", "golang.dev", true},
        // The cases below are special: unusual TLDs and private domains that
        // rarely come up in practice.
        // error domains
        {"com.cn", "", true},
        // not an icann domain
        {"www.example0.debian.net", "", false},
        {"s3.cn-north-1.amazonaws.com.cn", "", false},
        {"www.0emm.com", "", false},
        {"there.is.no.such-tld", "", false},
    }
)

func TestRootDomain(t *testing.T) {
    for _, tc := range rootDomainTestCases {
        u := &URL{url.URL{Host: tc.domain}}
        rootDomain := u.RootDomain()
        _, icann := publicsuffix.PublicSuffix(u.Hostname())
        if rootDomain != tc.rootDomain {
            t.Errorf("%s parse root domain failed", tc.domain)
        }
        if icann != tc.wantICANN {
            t.Errorf("%s not an icann domain", tc.domain)
        }
    }
}

func TestFileExt(t *testing.T) {
    noExtPath := "/user/info"
    hasExtPath := "/user/info.html"
    hasExtPathMoreChar := "/user/info.html%2"
    url, err := GetUrl(noExtPath)
    assert.Nil(t, err)
    assert.NotNil(t, url)
    assert.Equal(t, "", url.FileExt())
    hasExtUrl, err := GetUrl(hasExtPath)
    assert.Nil(t, err)
    assert.Equal(t, "html", hasExtUrl.FileExt())
    hasExtChar, err := GetUrl(hasExtPathMoreChar)
    assert.Nil(t, err)
    assert.Equal(t, "html%2", hasExtChar.FileExt())
}

func TestGetUrl(t *testing.T) {
    testPath := "/user/info"
    testQueryPath := "/user/info?keyword=crawlergocrawlergo&end=1"
    url, err := GetUrl(testPath)
    assert.Nil(t, err)
    assert.NotNil(t, url)
    queryUrl, err := GetUrl(testQueryPath)
    assert.Nil(t, err)
    assert.Equal(t, queryUrl.Path, testPath)
    assert.Equal(t, queryUrl.RawQuery, "keyword=crawlergocrawlergo&end=1")
}
--------------------------------------------------------------------------------
/pkg/path_expansion.go:
--------------------------------------------------------------------------------
package pkg

import (
    "fmt"
    "regexp"
    "strings"
    "sync"

    "github.com/Qianlitp/crawlergo/pkg/config"
    "github.com/Qianlitp/crawlergo/pkg/logger"
    model2 "github.com/Qianlitp/crawlergo/pkg/model"
    "github.com/Qianlitp/crawlergo/pkg/tools"
    "github.com/Qianlitp/crawlergo/pkg/tools/requests"

    mapset "github.com/deckarep/golang-set"
    "github.com/panjf2000/ants/v2"
)

const pathStr = "11/123/2017/2018/message/mis/model/abstract/account/act/action" +
    "/activity/ad/address/ajax/alarm/api/app/ar/attachment/auth/authority/award/back/backup/bak/base" +
    "/bbs/bbs1/cms/bd/gallery/game/gift/gold/bg/bin/blacklist/blog/bootstrap/brand/build/cache/caches" +
    "/caching/cacti/cake/captcha/category/cdn/ch/check/city/class/classes/classic/client/cluster" +
    "/collection/comment/commit/common/commons/components/conf/config/mysite/confs/console/consumer" +
    "/content/control/controllers/core/crontab/crud/css/daily/dashboard/data/database/db/default/demo" +
    "/dev/doc/download/duty/es/eva/examples/excel/export/ext/fe/feature/file/files/finance/flashchart" +
    "/follow/forum/frame/framework/ft/group/gss/hello/helper/helpers/history/home/hr/htdocs/html/hunter" +
    "/image/img11/import/improve/inc/include/includes/index/info/install/interface/item/jobconsume/jobs" +
    "/json/kindeditor/l/languages/lib/libraries/libs/link/lite/local/log/login/logs/mail/main" +
    "/maintenance/manage/manager/manufacturer/menus/models/modules/monitor/movie/mysql/n/nav/network" +
    "/news/notice/nw/oauth/other/page/pages/passport/pay/pcheck/people/person/php/phprpc" +
    "/phptest/picture/pl/platform/pm/portal/post/product/project/protected/proxy/ps/public/qq/question" +
    "/quote/redirect/redisclient/report/resource/resources/s/save/schedule/schema/script/scripts/search" +
    "/security/server/service/shell/show/simple/site/sites/skin/sms/soap/sola/sort/spider/sql/stat" +
    "/static/statistics/stats/submit/subways/survey/sv/syslog/system/tag/task/tasks/tcpdf/template" +
    "/templates/test/tests/ticket/tmp/token/tool/tools/top/tpl/txt/upload/uploadify/uploads/url/user" +
    "/util/v1/v2/vendor/view/views/web/weixin/widgets/wm/wordpress/workspace/ws/www/www2/wwwroot/zone" +
    "/admin/admin_bak/mobile/m/js"

var pathFuzzWG sync.WaitGroup
var validateUrl mapset.Set

/**
Extract path information from the target's robots.txt.
*/
func GetPathsFromRobots(navReq model2.Request) []*model2.Request {
    logger.Logger.Info("starting to get paths from robots.txt.")
    var result []*model2.Request
    var urlFindRegex = regexp.MustCompile(`(?:Disallow|Allow):.*?(/.+)`)
    var urlRegex = regexp.MustCompile(`(/.+)`)

    navReq.URL.Path = "/"
    url := navReq.URL.NoQueryUrl() + "robots.txt"

    resp, err := requests.Get(url, tools.ConvertHeaders(navReq.Headers),
        &requests.ReqOptions{AllowRedirect: false,
            Timeout: 5,
            Proxy:   navReq.Proxy})
    if err != nil {
        //logger.Logger.Error("request to robots.txt error ", err)
        return result
    }

    if resp.StatusCode < 200 || resp.StatusCode >= 300 {
        return result
    }
    urlList := urlFindRegex.FindAllString(resp.Text, -1)
    for _, _url := range urlList {
        _url = strings.TrimSpace(_url)
        _url = urlRegex.FindString(_url)
        url, err := model2.GetUrl(_url, *navReq.URL)
        if err != nil {
            continue
        }
        req := model2.GetRequest(config.GET, url)
        req.Source = config.FromRobots
        result = append(result, &req)
    }
    return result
}
--------------------------------------------------------------------------------
/pkg/path_expansion.go:
--------------------------------------------------------------------------------
package pkg

import (
	"fmt"
	"regexp"
	"strings"
	"sync"

	"github.com/Qianlitp/crawlergo/pkg/config"
	"github.com/Qianlitp/crawlergo/pkg/logger"
	model2 "github.com/Qianlitp/crawlergo/pkg/model"
	"github.com/Qianlitp/crawlergo/pkg/tools"
	"github.com/Qianlitp/crawlergo/pkg/tools/requests"

	mapset "github.com/deckarep/golang-set"
	"github.com/panjf2000/ants/v2"
)

const pathStr = "11/123/2017/2018/message/mis/model/abstract/account/act/action" +
	"/activity/ad/address/ajax/alarm/api/app/ar/attachment/auth/authority/award/back/backup/bak/base" +
	"/bbs/bbs1/cms/bd/gallery/game/gift/gold/bg/bin/blacklist/blog/bootstrap/brand/build/cache/caches" +
	"/caching/cacti/cake/captcha/category/cdn/ch/check/city/class/classes/classic/client/cluster" +
	"/collection/comment/commit/common/commons/components/conf/config/mysite/confs/console/consumer" +
	"/content/control/controllers/core/crontab/crud/css/daily/dashboard/data/database/db/default/demo" +
	"/dev/doc/download/duty/es/eva/examples/excel/export/ext/fe/feature/file/files/finance/flashchart" +
	"/follow/forum/frame/framework/ft/group/gss/hello/helper/helpers/history/home/hr/htdocs/html/hunter" +
	"/image/img11/import/improve/inc/include/includes/index/info/install/interface/item/jobconsume/jobs" +
	"/json/kindeditor/l/languages/lib/libraries/libs/link/lite/local/log/login/logs/mail/main" +
	"/maintenance/manage/manager/manufacturer/menus/models/modules/monitor/movie/mysql/n/nav/network" +
	"/news/notice/nw/oauth/other/page/pages/passport/pay/pcheck/people/person/php/phprpc" +
	"/phptest/picture/pl/platform/pm/portal/post/product/project/protected/proxy/ps/public/qq/question" +
	"/quote/redirect/redisclient/report/resource/resources/s/save/schedule/schema/script/scripts/search" +
	"/security/server/service/shell/show/simple/site/sites/skin/sms/soap/sola/sort/spider/sql/stat" +
	"/static/statistics/stats/submit/subways/survey/sv/syslog/system/tag/task/tasks/tcpdf/template" +
	"/templates/test/tests/ticket/tmp/token/tool/tools/top/tpl/txt/upload/uploadify/uploads/url/user" +
	"/util/v1/v2/vendor/view/views/web/weixin/widgets/wm/wordpress/workspace/ws/www/www2/wwwroot/zone" +
	"/admin/admin_bak/mobile/m/js"

var pathFuzzWG sync.WaitGroup
var validateUrl mapset.Set

/**
Extract path entries from the robots.txt file.
*/
func GetPathsFromRobots(navReq model2.Request) []*model2.Request {
	logger.Logger.Info("starting to get paths from robots.txt.")
	var result []*model2.Request
	var urlFindRegex = regexp.MustCompile(`(?:Disallow|Allow):.*?(/.+)`)
	var urlRegex = regexp.MustCompile(`(/.+)`)

	navReq.URL.Path = "/"
	url := navReq.URL.NoQueryUrl() + "robots.txt"

	resp, err := requests.Get(url, tools.ConvertHeaders(navReq.Headers),
		&requests.ReqOptions{AllowRedirect: false,
			Timeout: 5,
			Proxy:   navReq.Proxy})
	if err != nil {
		// robots.txt is optional, so request errors are silently ignored.
		return result
	}

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return result
	}
	urlList := urlFindRegex.FindAllString(resp.Text, -1)
	for _, _url := range urlList {
		_url = strings.TrimSpace(_url)
		_url = urlRegex.FindString(_url)
		url, err := model2.GetUrl(_url, *navReq.URL)
		if err != nil {
			continue
		}
		req := model2.GetRequest(config.GET, url)
		req.Source = config.FromRobots
		result = append(result, &req)
	}
	return result
}

/**
Fuzz using the built-in list of common paths.
*/
func GetPathsByFuzz(navReq model2.Request) []*model2.Request {
	logger.Logger.Info("starting to get paths by fuzzing.")
	pathList := strings.Split(pathStr, "/")
	return doFuzz(navReq, pathList)
}

/**
Fuzz using a user-supplied dictionary file.
*/
func GetPathsByFuzzDict(navReq model2.Request, dictPath string) []*model2.Request {
	logger.Logger.Infof("starting to get dict path by fuzzing: %s", dictPath)
	pathList := tools.ReadFile(dictPath)
	logger.Logger.Debugf("valid path count: %d", len(pathList))
	return doFuzz(navReq, pathList)
}

type singleFuzz struct {
	navReq model2.Request
	path   string
}

func doFuzz(navReq model2.Request, pathList []string) []*model2.Request {
	validateUrl = mapset.NewSet()
	var result []*model2.Request
	pool, _ := ants.NewPool(20)
	defer pool.Release()
	for _, path := range pathList {
		path = strings.TrimPrefix(path, "/")
		path = strings.TrimSuffix(path, "\n")
		task := singleFuzz{
			navReq: navReq,
			path:   path,
		}
		pathFuzzWG.Add(1)
		go func() {
			err := pool.Submit(task.doRequest)
			if err != nil {
				pathFuzzWG.Done()
			}
		}()
	}

	pathFuzzWG.Wait()
	for _, _url := range validateUrl.ToSlice() {
		_url := _url.(string)
		url, err := model2.GetUrl(_url)
		if err != nil {
			continue
		}
		req := model2.GetRequest(config.GET, url)
		req.Source = config.FromFuzz
		result = append(result, &req)
	}
	return result
}

/**
doRequest probes a single candidate path and records the URL when the
response indicates it exists: a 2xx status, or a 301 that stays on the same host.
*/
func (s singleFuzz) doRequest() {
	defer pathFuzzWG.Done()

	url := fmt.Sprintf(`%s://%s/%s`, s.navReq.URL.Scheme, s.navReq.URL.Host, s.path)
	resp, errs := requests.Get(url, tools.ConvertHeaders(s.navReq.Headers),
		&requests.ReqOptions{Timeout: 2, AllowRedirect: false, Proxy: s.navReq.Proxy})
	if errs != nil {
		return
	}
	if resp.StatusCode >= 200 && resp.StatusCode < 300 {
		validateUrl.Add(url)
	} else if resp.StatusCode == 301 {
		locations := resp.Header["Location"]
		if len(locations) == 0 {
			return
		}
		location := locations[0]
		redirectUrl, err := model2.GetUrl(location)
		if err != nil {
			return
		}
		if redirectUrl.Host == s.navReq.URL.Host {
			validateUrl.Add(url)
		}
	}
}
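As a standalone illustration of the robots.txt extraction above: the first regex grabs whole Allow/Disallow lines that carry a path, and the second trims each match down to the path itself (the sample robots.txt content is made up):

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	robots := "User-agent: *\nDisallow: /admin/\nAllow: /public/index\nDisallow:"
	urlFindRegex := regexp.MustCompile(`(?:Disallow|Allow):.*?(/.+)`)
	urlRegex := regexp.MustCompile(`(/.+)`)
	for _, line := range urlFindRegex.FindAllString(robots, -1) {
		// FindAllString returns the whole "Disallow: /admin/" match,
		// so the second regex cuts it down to the path itself.
		fmt.Println(urlRegex.FindString(strings.TrimSpace(line)))
	}
	// Output:
	// /admin/
	// /public/index
}
```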
--------------------------------------------------------------------------------
/pkg/task_main.go:
--------------------------------------------------------------------------------
package pkg

import (
	"encoding/json"
	"sync"

	"github.com/Qianlitp/crawlergo/pkg/config"
	engine2 "github.com/Qianlitp/crawlergo/pkg/engine"
	filter2 "github.com/Qianlitp/crawlergo/pkg/filter"
	"github.com/Qianlitp/crawlergo/pkg/logger"
	"github.com/Qianlitp/crawlergo/pkg/model"

	"github.com/panjf2000/ants/v2"
)

type CrawlerTask struct {
	Browser       *engine2.Browser    //
	RootDomain    string              // Root domain of the current crawl, used for subdomain collection
	Targets       []*model.Request    // Input targets
	Result        *Result             // Final results
	Config        *TaskConfig         // Task configuration
	smartFilter   filter2.SmartFilter // De-duplication filter
	Pool          *ants.Pool          // Goroutine pool
	taskWG        sync.WaitGroup      // Waits for all pool tasks to finish
	crawledCount  int                 // Number of requests crawled so far
	taskCountLock sync.Mutex          // Guards crawledCount
}

type Result struct {
	ReqList       []*model.Request // Same-domain request results
	AllReqList    []*model.Request // Requests across all domains
	AllDomainList []string         // All domains seen
	SubDomainList []string         // Subdomains seen
	resultLock    sync.Mutex       // Guards result merging
}

type tabTask struct {
	crawlerTask *CrawlerTask
	browser     *engine2.Browser
	req         *model.Request
}

/**
Create a new crawler task.
*/
func NewCrawlerTask(targets []*model.Request, taskConf TaskConfig) (*CrawlerTask, error) {
	crawlerTask := CrawlerTask{
		Result: &Result{},
		Config: &taskConf,
		smartFilter: filter2.SmartFilter{
			SimpleFilter: filter2.SimpleFilter{
				HostLimit: targets[0].URL.Host,
			},
		},
	}

	//if len(targets) == 1 {
	//	_newReq := *targets[0]
	//	newReq := &_newReq
	//	_newURL := *_newReq.URL
	//	newReq.URL = &_newURL
	//	if targets[0].URL.Scheme == "http" {
	//		newReq.URL.Scheme = "https"
	//	} else {
	//		newReq.URL.Scheme = "http"
	//	}
	//	targets = append(targets, newReq)
	//}
	crawlerTask.Targets = targets[:]

	for _, req := range targets {
		req.Source = config.FromTarget
	}

	// Keep business logic separate from configuration setup:
	// fill in the unset fields of taskConf via functional options.
	for _, fn := range []TaskConfigOptFunc{
		WithTabRunTimeout(config.TabRunTimeout),
		WithMaxTabsCount(config.MaxTabsCount),
		WithMaxCrawlCount(config.MaxCrawlCount),
		WithDomContentLoadedTimeout(config.DomContentLoadedTimeout),
		WithEventTriggerInterval(config.EventTriggerInterval),
		WithBeforeExitDelay(config.BeforeExitDelay),
		WithEventTriggerMode(config.DefaultEventTriggerMode),
		WithIgnoreKeywords(config.DefaultIgnoreKeywords),
	} {
		fn(&taskConf)
	}

	if taskConf.ExtraHeadersString != "" {
		err := json.Unmarshal([]byte(taskConf.ExtraHeadersString), &taskConf.ExtraHeaders)
		if err != nil {
			logger.Logger.Error("custom headers can't be unmarshalled.")
			return nil, err
		}
	}

	crawlerTask.Browser = engine2.InitBrowser(taskConf.ChromiumPath, taskConf.ExtraHeaders, taskConf.Proxy, taskConf.NoHeadless)
	crawlerTask.RootDomain = targets[0].URL.RootDomain()

	crawlerTask.smartFilter.Init()

	// Create the goroutine pool
	p, _ := ants.NewPool(taskConf.MaxTabsCount)
	crawlerTask.Pool = p

	return &crawlerTask, nil
}

/**
Build a tabTask worker from a request.
*/
func (t *CrawlerTask) generateTabTask(req *model.Request) *tabTask {
	task := tabTask{
		crawlerTask: t,
		browser:     t.Browser,
		req:         req,
	}
	return &task
}

/**
Start the current task.
*/
func (t *CrawlerTask) Run() {
	defer t.Pool.Release()  // Release the goroutine pool
	defer t.Browser.Close() // Close the browser

	if t.Config.PathFromRobots {
		reqsFromRobots := GetPathsFromRobots(*t.Targets[0])
		logger.Logger.Info("get paths from robots.txt: ", len(reqsFromRobots))
		t.Targets = append(t.Targets, reqsFromRobots...)
	}

	if t.Config.FuzzDictPath != "" {
		if t.Config.PathByFuzz {
			logger.Logger.Warn("`--fuzz-path` is ignored, using `--fuzz-path-dict` instead")
		}
		reqsByFuzz := GetPathsByFuzzDict(*t.Targets[0], t.Config.FuzzDictPath)
		t.Targets = append(t.Targets, reqsByFuzz...)
	} else if t.Config.PathByFuzz {
		reqsByFuzz := GetPathsByFuzz(*t.Targets[0])
		logger.Logger.Info("get paths by fuzzing: ", len(reqsByFuzz))
		t.Targets = append(t.Targets, reqsByFuzz...)
	}

	t.Result.AllReqList = t.Targets[:]

	var initTasks []*model.Request
	for _, req := range t.Targets {
		if t.smartFilter.DoFilter(req) {
			logger.Logger.Debugf("filter req: %s", req.URL.RequestURI())
			continue
		}
		initTasks = append(initTasks, req)
		t.Result.ReqList = append(t.Result.ReqList, req)
	}
	logger.Logger.Info("filter repeat, target count: ", len(initTasks))

	for _, req := range initTasks {
		if !engine2.IsIgnoredByKeywordMatch(*req, t.Config.IgnoreKeywords) {
			t.addTask2Pool(req)
		}
	}

	t.taskWG.Wait()

	// De-duplicate the full request list
	todoFilterAll := make([]*model.Request, len(t.Result.AllReqList))
	copy(todoFilterAll, t.Result.AllReqList)

	t.Result.AllReqList = []*model.Request{}
	var simpleFilter filter2.SimpleFilter
	for _, req := range todoFilterAll {
		if !simpleFilter.UniqueFilter(req) {
			t.Result.AllReqList = append(t.Result.AllReqList, req)
		}
	}

	// All domains
	t.Result.AllDomainList = AllDomainCollect(t.Result.AllReqList)
	// Subdomains
	t.Result.SubDomainList = SubDomainCollect(t.Result.AllReqList, t.RootDomain)
}

/**
Add a task to the goroutine pool, filtering on the fly before it is added.
*/
func (t *CrawlerTask) addTask2Pool(req *model.Request) {
	t.taskCountLock.Lock()
	if t.crawledCount >= t.Config.MaxCrawlCount {
		t.taskCountLock.Unlock()
		return
	} else {
		t.crawledCount += 1
	}
	t.taskCountLock.Unlock()

	t.taskWG.Add(1)
	task := t.generateTabTask(req)
	go func() {
		err := t.Pool.Submit(task.Task)
		if err != nil {
			t.taskWG.Done()
			logger.Logger.Error("addTask2Pool ", err)
		}
	}()
}

/**
A single running browser-tab task; implements the worker-pool interface.
*/
func (t *tabTask) Task() {
	defer t.crawlerTask.taskWG.Done()
	tab := engine2.NewTab(t.browser, *t.req, engine2.TabConfig{
		TabRunTimeout:           t.crawlerTask.Config.TabRunTimeout,
		DomContentLoadedTimeout: t.crawlerTask.Config.DomContentLoadedTimeout,
		EventTriggerMode:        t.crawlerTask.Config.EventTriggerMode,
		EventTriggerInterval:    t.crawlerTask.Config.EventTriggerInterval,
		BeforeExitDelay:         t.crawlerTask.Config.BeforeExitDelay,
		EncodeURLWithCharset:    t.crawlerTask.Config.EncodeURLWithCharset,
		IgnoreKeywords:          t.crawlerTask.Config.IgnoreKeywords,
		CustomFormValues:        t.crawlerTask.Config.CustomFormValues,
		CustomFormKeywordValues: t.crawlerTask.Config.CustomFormKeywordValues,
	})
	tab.Start()

	// Collect results
	t.crawlerTask.Result.resultLock.Lock()
	t.crawlerTask.Result.AllReqList = append(t.crawlerTask.Result.AllReqList, tab.ResultList...)
	t.crawlerTask.Result.resultLock.Unlock()

	for _, req := range tab.ResultList {
		// Pick the filter according to the configured mode, then handle
		// surviving requests the same way in both cases.
		var filtered bool
		if t.crawlerTask.Config.FilterMode == config.SimpleFilterMode {
			filtered = t.crawlerTask.smartFilter.SimpleFilter.DoFilter(req)
		} else {
			filtered = t.crawlerTask.smartFilter.DoFilter(req)
		}
		if filtered {
			continue
		}
		t.crawlerTask.Result.resultLock.Lock()
		t.crawlerTask.Result.ReqList = append(t.crawlerTask.Result.ReqList, req)
		t.crawlerTask.Result.resultLock.Unlock()
		if !engine2.IsIgnoredByKeywordMatch(*req, t.crawlerTask.Config.IgnoreKeywords) {
			t.crawlerTask.addTask2Pool(req)
		}
	}
}
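For orientation, a minimal driver sketch for the task API above. The target URL, Chromium path, and option values are illustrative; the real entry point with full flag handling lives in cmd/crawlergo:

```go
package main

import (
	"fmt"

	"github.com/Qianlitp/crawlergo/pkg"
	"github.com/Qianlitp/crawlergo/pkg/config"
	"github.com/Qianlitp/crawlergo/pkg/model"
)

func main() {
	// Build the initial target list from a URL string.
	u, err := model.GetUrl("http://testphp.vulnweb.com/")
	if err != nil {
		panic(err)
	}
	req := model.GetRequest(config.GET, u)

	// Fields left unset here are filled with defaults inside NewCrawlerTask.
	task, err := pkg.NewCrawlerTask([]*model.Request{&req}, *pkg.NewTaskConfig(
		pkg.WithChromiumPath("/tmp/chromium/chrome"), // assumed local install path
		pkg.WithMaxTabsCount(10),
	))
	if err != nil {
		panic(err)
	}
	task.Run()
	fmt.Println("requests found:", len(task.Result.ReqList))
}
```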
--------------------------------------------------------------------------------
/pkg/taskconfig.go:
--------------------------------------------------------------------------------
package pkg

import "time"

type TaskConfig struct {
	MaxCrawlCount           int    // Maximum number of requests to crawl
	FilterMode              string // simple, smart or strict
	ExtraHeaders            map[string]interface{}
	ExtraHeadersString      string
	AllDomainReturn         bool // Collect requests from all domains
	SubDomainReturn         bool // Collect subdomains
	NoHeadless              bool // Run with a visible browser window (disable headless mode)
	DomContentLoadedTimeout time.Duration
	TabRunTimeout           time.Duration // Timeout for a single tab
	PathByFuzz              bool          // Fuzz paths using the built-in dictionary
	FuzzDictPath            string        // Custom fuzzing dictionary path
	PathFromRobots          bool          // Parse robots.txt to discover paths
	MaxTabsCount            int           // Maximum number of open tabs, i.e. the crawl concurrency
	ChromiumPath            string        // Path to the Chromium binary, e.g. `/home/zhusiyu1/chrome-linux/chrome`
	EventTriggerMode        string        // How events are triggered: async or sequential
	EventTriggerInterval    time.Duration // Interval between event triggers
	BeforeExitDelay         time.Duration // Delay before exit, letting the DOM settle and pending XHRs be captured
	EncodeURLWithCharset    bool          // Encode URLs using the detected charset
	IgnoreKeywords          []string      // Keywords to ignore; matching requests are neither crawled nor sent
	Proxy                   string        // Request proxy
	CustomFormValues        map[string]string // Custom form field values
	CustomFormKeywordValues map[string]string // Custom form values matched by keyword
}

type TaskConfigOptFunc func(*TaskConfig)

func NewTaskConfig(optFuncs ...TaskConfigOptFunc) *TaskConfig {
	conf := &TaskConfig{}
	for _, fn := range optFuncs {
		fn(conf)
	}
	return conf
}

func WithMaxCrawlCount(maxCrawlCount int) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.MaxCrawlCount == 0 {
			tc.MaxCrawlCount = maxCrawlCount
		}
	}
}

func WithFilterMode(gen string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.FilterMode == "" {
			tc.FilterMode = gen
		}
	}
}

func WithExtraHeaders(gen map[string]interface{}) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.ExtraHeaders == nil {
			tc.ExtraHeaders = gen
		}
	}
}

func WithExtraHeadersString(gen string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.ExtraHeadersString == "" {
			tc.ExtraHeadersString = gen
		}
	}
}

func WithAllDomainReturn(gen bool) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if !tc.AllDomainReturn {
			tc.AllDomainReturn = gen
		}
	}
}
func WithSubDomainReturn(gen bool) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if !tc.SubDomainReturn {
			tc.SubDomainReturn = gen
		}
	}
}
func WithNoHeadless(gen bool) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if !tc.NoHeadless {
			tc.NoHeadless = gen
		}
	}
}

func WithDomContentLoadedTimeout(gen time.Duration) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.DomContentLoadedTimeout == 0 {
			tc.DomContentLoadedTimeout = gen
		}
	}
}

func WithTabRunTimeout(gen time.Duration) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.TabRunTimeout == 0 {
			tc.TabRunTimeout = gen
		}
	}
}
func WithPathByFuzz(gen bool) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if !tc.PathByFuzz {
			tc.PathByFuzz = gen
		}
	}
}
func WithFuzzDictPath(gen string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.FuzzDictPath == "" {
			tc.FuzzDictPath = gen
		}
	}
}
func WithPathFromRobots(gen bool) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if !tc.PathFromRobots {
			tc.PathFromRobots = gen
		}
	}
}
func WithMaxTabsCount(gen int) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.MaxTabsCount == 0 {
			tc.MaxTabsCount = gen
		}
	}
}
func WithChromiumPath(gen string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.ChromiumPath == "" {
			tc.ChromiumPath = gen
		}
	}
}
func WithEventTriggerMode(gen string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.EventTriggerMode == "" {
			tc.EventTriggerMode = gen
		}
	}
}
func WithEventTriggerInterval(gen time.Duration) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.EventTriggerInterval == 0 {
			tc.EventTriggerInterval = gen
		}
	}
}
func WithBeforeExitDelay(gen time.Duration) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.BeforeExitDelay == 0 {
			tc.BeforeExitDelay = gen
		}
	}
}
func WithEncodeURLWithCharset(gen bool) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if !tc.EncodeURLWithCharset {
			tc.EncodeURLWithCharset = gen
		}
	}
}
func WithIgnoreKeywords(gen []string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if len(tc.IgnoreKeywords) == 0 {
			tc.IgnoreKeywords = gen
		}
	}
}
func WithProxy(gen string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if tc.Proxy == "" {
			tc.Proxy = gen
		}
	}
}
func WithCustomFormValues(gen map[string]string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if len(tc.CustomFormValues) == 0 {
			tc.CustomFormValues = gen
		}
	}
}
func WithCustomFormKeywordValues(gen map[string]string) TaskConfigOptFunc {
	return func(tc *TaskConfig) {
		if len(tc.CustomFormKeywordValues) == 0 {
			tc.CustomFormKeywordValues = gen
		}
	}
}
--------------------------------------------------------------------------------
/pkg/taskconfig_test.go:
--------------------------------------------------------------------------------
package pkg_test

import (
	"testing"
	"time"

	"github.com/Qianlitp/crawlergo/pkg"
	"github.com/Qianlitp/crawlergo/pkg/config"
	"github.com/stretchr/testify/assert"
)
func TestTaskConfigOptFunc(t *testing.T) {
	// Tests the changes introduced in https://github.com/Qianlitp/crawlergo/pull/101
	var taskConf pkg.TaskConfig
	for _, fn := range []pkg.TaskConfigOptFunc{
		pkg.WithTabRunTimeout(config.TabRunTimeout),
		pkg.WithMaxTabsCount(config.MaxTabsCount),
		pkg.WithMaxCrawlCount(config.MaxCrawlCount),
		pkg.WithDomContentLoadedTimeout(config.DomContentLoadedTimeout),
		pkg.WithEventTriggerInterval(config.EventTriggerInterval),
		pkg.WithBeforeExitDelay(config.BeforeExitDelay),
		pkg.WithEventTriggerMode(config.DefaultEventTriggerMode),
		pkg.WithIgnoreKeywords(config.DefaultIgnoreKeywords),
	} {
		fn(&taskConf)
	}

	// Every field should now equal the default configuration.
	assert.Equal(t, taskConf.TabRunTimeout, config.TabRunTimeout)
	assert.Equal(t, taskConf.MaxTabsCount, config.MaxTabsCount)
	assert.Equal(t, taskConf.MaxCrawlCount, config.MaxCrawlCount)
	assert.Equal(t, taskConf.DomContentLoadedTimeout, config.DomContentLoadedTimeout)
	assert.Equal(t, taskConf.EventTriggerInterval, config.EventTriggerInterval)
	assert.Equal(t, taskConf.BeforeExitDelay, config.BeforeExitDelay)
	assert.Equal(t, taskConf.EventTriggerMode, config.DefaultEventTriggerMode)
	assert.Equal(t, taskConf.IgnoreKeywords, config.DefaultIgnoreKeywords)

	// Override the timeout.
	taskConf.TabRunTimeout = time.Minute * 5

	// Trying to overwrite a value that is already set must not succeed:
	// an option may only fill a field once, and user-supplied values win.
	pkg.WithTabRunTimeout(time.Second * 5)(&taskConf)
	assert.NotEqual(t, taskConf.TabRunTimeout, time.Second*5)
	assert.NotEqual(t, taskConf.TabRunTimeout, config.TabRunTimeout)
}
--------------------------------------------------------------------------------
/pkg/tools/common.go:
--------------------------------------------------------------------------------
package tools

import (
	"bufio"
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"os"
	"strings"

	"github.com/Qianlitp/crawlergo/pkg/logger"
)

func StrMd5(str string) string {
	h := md5.New()
	h.Write([]byte(str))
	return hex.EncodeToString(h.Sum(nil))
}

func ConvertHeaders(h map[string]interface{}) map[string]string {
	a := map[string]string{}
	for key, value := range h {
		a[key] = value.(string)
	}
	return a
}

func WriteFile(fileName string, content []byte) {
	// O_TRUNC ensures an existing, longer file does not keep stale bytes past the new content.
	f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		fmt.Println(err.Error())
	} else {
		defer f.Close()
		_, err = f.Write(content)
		if err != nil {
			logger.Logger.Error("write to file error ", err)
		}
	}
}

func ReadFile(filePath string) []string {
	var filePaths []string
	f, err := os.OpenFile(filePath, os.O_RDONLY, 0644)
	if err != nil {
		fmt.Println(err.Error())
	} else {
		defer f.Close()
		rd := bufio.NewReader(f)
		for {
			line, err := rd.ReadString('\n')
			// Keep whatever was read, so a final line without a trailing
			// newline is not silently dropped.
			if len(line) > 0 {
				filePaths = append(filePaths, line)
			}
			if err != nil { // io.EOF or a real read error: stop reading
				break
			}
		}
	}
	return filePaths
}

func StringSliceContain(data []string, item string) bool {
	for _, value := range data {
		if value == item {
			return true
		}
	}
	return false
}

func MapStringFormat(data map[string]string) string {
	str := ""
	for key, value := range data {
		str += fmt.Sprintf("%s=%s,", key, value)
	}
	str = strings.Trim(str, ",")
	return str
}
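A tiny usage sketch for the helpers above (all values illustrative):

```go
package main

import (
	"fmt"

	"github.com/Qianlitp/crawlergo/pkg/tools"
)

func main() {
	fmt.Println(tools.StrMd5("crawlergo")) // hex-encoded md5 digest
	fmt.Println(tools.MapStringFormat(map[string]string{
		"a": "1",
	})) // "a=1" (pairs joined by commas, trailing comma trimmed)
	fmt.Println(tools.StringSliceContain([]string{"x"}, "x")) // true
}
```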
--------------------------------------------------------------------------------
/pkg/tools/random.go:
--------------------------------------------------------------------------------
// Random value helpers
package tools

import (
	"math/rand"
	"strings"
	"time"
)

const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
const (
	letterIdxBits = 6                    // 6 bits to represent a letter index
	letterIdxMask = 1<<letterIdxBits - 1 // All 1-bits, as many as letterIdxBits
	letterIdxMax  = 63 / letterIdxBits   // Number of letter indices fitting in 63 bits
)

var src = rand.NewSource(time.Now().UnixNano())

// RandSeq returns a random alphanumeric string of length n. It slices
// several 6-bit letter indices out of each 63-bit random value, skipping
// indices that fall outside the 62-character alphabet.
func RandSeq(n int) string {
	sb := strings.Builder{}
	sb.Grow(n)
	for i, cache, remain := n-1, src.Int63(), letterIdxMax; i >= 0; {
		if remain == 0 {
			cache, remain = src.Int63(), letterIdxMax
		}
		if idx := int(cache & letterIdxMask); idx < len(letterBytes) {
			sb.WriteByte(letterBytes[idx])
			i--
		}
		cache >>= letterIdxBits
		remain--
	}

	return sb.String()
}
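For comparison, the masked sampling above is functionally equivalent to the naive loop below; the masked variant just extracts several 6-bit indices from each 63-bit random value instead of calling the source once per character (a standalone sketch, not used by the project):

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

// naiveRandStr draws one random index per character, which is simpler
// but costs one call into the random source for every byte produced.
func naiveRandStr(n int) string {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	b := make([]byte, n)
	for i := range b {
		b[i] = letters[r.Intn(len(letters))]
	}
	return string(b)
}

func main() {
	fmt.Println(naiveRandStr(8))
}
```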
--------------------------------------------------------------------------------
/pkg/tools/requests/requests.go:
--------------------------------------------------------------------------------
package requests

import (
	"bytes"
	"crypto/tls"
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/Qianlitp/crawlergo/pkg/logger"
	"github.com/pkg/errors"
)

const DefaultUa = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" +
	" Chrome/76.0.3809.132 Safari/537.36 C845D9D38B3A68F4F74057DB542AD252 tx/2.0"

const defaultTimeout int = 15

// Fetch roughly the first 10 KB of each response, which is enough for the vast majority of cases
const defaultResponseLength = 10240
const defaultRetry = 0

var ContentTypes = map[string]string{
	"json":      "application/json",
	"xml":       "application/xml",
	"soap":      "application/soap+xml",
	"multipart": "multipart/form-data",
	"form":      "application/x-www-form-urlencoded; charset=utf-8",
}

// ReqInfo wraps the elements of an HTTP request so that simple requests can be fired quickly
type ReqInfo struct {
	Verb    string
	Url     string
	Headers map[string]string
	Body    []byte
}

type ReqOptions struct {
	Timeout       int    // in seconds
	Retry         int    // 0 means the default; -1 disables retrying
	VerifySSL     bool   // default false
	AllowRedirect bool   // default false
	Proxy         string // proxy settings, support http/https proxy only, e.g. http://127.0.0.1:8080
}

type session struct {
	ReqOptions
	client *http.Client
}

// getSessionByOptions builds a session from the given options
func getSessionByOptions(options *ReqOptions) *session {
	if options == nil {
		options = &ReqOptions{}
	}
	// Configure the client timeout and SSL verification
	timeout := time.Duration(options.Timeout) * time.Second
	if options.Timeout == 0 {
		timeout = time.Duration(defaultTimeout) * time.Second
	}
	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: !options.VerifySSL},
	}
	if options.Proxy != "" {
		proxyUrl, err := url.Parse(options.Proxy)
		if err == nil {
			tr.Proxy = http.ProxyURL(proxyUrl)
		}
	}
	client := &http.Client{
		Timeout:   timeout,
		Transport: tr,
	}
	// Configure whether redirects are followed
	if !options.AllowRedirect {
		client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		}
	}
	// Copy the options into the session
	return &session{
		ReqOptions: ReqOptions{
			Timeout:       options.Timeout,
			Retry:         options.Retry,
			VerifySSL:     options.VerifySSL,
			AllowRedirect: options.AllowRedirect,
			Proxy:         options.Proxy,
		},
		client: client,
	}
}

// Get issues a GET request
func Get(url string, headers map[string]string, options *ReqOptions) (*Response, error) {
	sess := getSessionByOptions(options)
	return sess.doRequest("GET", url, headers, nil)
}

// Request issues a request with a custom verb
func Request(verb string, url string, headers map[string]string, body []byte, options *ReqOptions) (*Response, error) {
	sess := getSessionByOptions(options)
	return sess.doRequest(verb, url, headers, body)
}

// session functions

// Get issues a GET request within the session
func (sess *session) Get(url string, headers map[string]string) (*Response, error) {
	return sess.doRequest("GET", url, headers, nil)
}

// Post issues a POST request within the session
func (sess *session) Post(url string, headers map[string]string, body []byte) (*Response, error) {
	return sess.doRequest("POST", url, headers, body)
}

// Request issues a custom-verb request within the session
func (sess *session) Request(verb string, url string, headers map[string]string, body []byte) (*Response, error) {
	return sess.doRequest(verb, url, headers, body)
}

// Request is the quick one-shot call for a ReqInfo
func (r *ReqInfo) Request() (*Response, error) {
	return Request(r.Verb, r.Url, r.Headers, r.Body, nil)
}

func (r *ReqInfo) RequestWithOptions(options *ReqOptions) (*Response, error) {
	return Request(r.Verb, r.Url, r.Headers, r.Body, options)
}

func (r *ReqInfo) Clone() *ReqInfo {
	return &ReqInfo{
		Verb:    r.Verb,
		Url:     r.Url,
		Headers: r.Headers,
		Body:    r.Body,
	}
}

func (r *ReqInfo) SetHeader(name, value string) {
	if r.Headers == nil {
		r.Headers = make(map[string]string)
	}
	r.Headers[name] = value
}

// doRequest performs the actual request
func (sess *session) doRequest(verb string, url string, headers map[string]string, body []byte) (*Response, error) {
	logger.Logger.Debug("do request to ", url)
	verb = strings.ToUpper(verb)
	bodyReader := bytes.NewReader(body)
	req, err := http.NewRequest(verb, url, bodyReader)
	if err != nil {
		// Most of the time this is caused by a bare % in the URL
		url = escapePercentSign(url)
		req, err = http.NewRequest(verb, url, bodyReader)
	}
	if err != nil {
		return nil, errors.Wrap(err, "build request error")
	}

	// Set the caller's headers
	for key, value := range headers {
		req.Header.Set(key, value)
	}
	// Set the default headers
	defaultHeaders := map[string]string{
		"User-Agent": DefaultUa,
		"Range":      fmt.Sprintf("bytes=0-%d", defaultResponseLength),
		"Connection": "close",
	}
	for key, value := range defaultHeaders {
		if _, ok := headers[key]; !ok {
			req.Header.Set(key, value)
		}
	}
	// Set the Host header
	if host, ok := headers["Host"]; ok {
		req.Host = host
	}
	// Set a default Content-Type header
	if verb == "POST" && headers["Content-Type"] == "" {
		req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
		// Referer, Origin and X-Requested-With should be set manually by the caller
	}
	// Force the Connection header
	req.Header.Set("Connection", "close")

	// Determine the retry count
	retry := sess.Retry
	if retry == 0 {
		retry = defaultRetry
	} else if retry == -1 {
		retry = 0
	}

	// Send the request
	var resp *http.Response
	for i := 0; i <= retry; i++ {
		resp, err = sess.client.Do(req)
		if err != nil {
			// sleep 0.1s before retrying
			time.Sleep(100 * time.Millisecond)
			continue
		} else {
			break
		}
	}

	if err != nil {
		return nil, errors.Wrap(err, "error occurred during request")
	}
	// With a Range header most web servers answer 206 Partial Content; normalize it to 200 OK
	if resp.StatusCode == 206 {
		resp.StatusCode = 200
		resp.Status = "200 OK"
	}

	return NewResponse(resp), nil
}
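A short usage sketch for the requests helper above (target and header values are illustrative):

```go
package main

import (
	"fmt"

	"github.com/Qianlitp/crawlergo/pkg/tools/requests"
)

func main() {
	// AllowRedirect=false means a 30x response is returned as-is
	// instead of being followed.
	resp, err := requests.Get("http://testphp.vulnweb.com/robots.txt",
		map[string]string{"User-Agent": "crawlergo-example"},
		&requests.ReqOptions{Timeout: 5, AllowRedirect: false})
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	fmt.Println(resp.StatusCode, len(resp.Text))
}
```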
--------------------------------------------------------------------------------
/pkg/tools/requests/response.go:
--------------------------------------------------------------------------------
package requests

import (
	"io/ioutil"
	"net/http"

	"github.com/Qianlitp/crawlergo/pkg/logger"
)

// Response extends http.Response with a few convenience fields
type Response struct {
	http.Response
	// Raw response body as text
	Text string
}

func getTextFromResp(r *http.Response) string {
	// TODO: charset conversion
	if r.ContentLength == 0 {
		return ""
	}
	b, err := ioutil.ReadAll(r.Body)
	if err != nil {
		logger.Logger.Debug("get response body err ", err)
	}
	_ = r.Body.Close()
	return string(b)
}

func NewResponse(r *http.Response) *Response {
	return &Response{
		Response: *r,
		Text:     getTextFromResp(r),
	}
}
--------------------------------------------------------------------------------
/pkg/tools/requests/utils.go:
--------------------------------------------------------------------------------
package requests

import (
	"net/url"
	"strings"

	"github.com/pkg/errors"
)

// UrlParse calls url.Parse, with extra handling for bare % signs
func UrlParse(sourceUrl string) (*url.URL, error) {
	u, err := url.Parse(sourceUrl)
	if err != nil {
		u, err = url.Parse(escapePercentSign(sourceUrl))
	}
	if err != nil {
		return nil, errors.Wrap(err, "parse url error")
	}
	return u, nil
}

// escapePercentSign replaces every % in the URL with %25
func escapePercentSign(raw string) string {
	return strings.ReplaceAll(raw, "%", "%25")
}
--------------------------------------------------------------------------------
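A closing illustration of the percent-sign handling in UrlParse and escapePercentSign: a bare % makes url.Parse fail, and re-parsing with % escaped as %25 succeeds (standalone sketch):

```go
package main

import (
	"fmt"
	"net/url"
	"strings"
)

func main() {
	raw := "http://example.com/info.html%2"
	u, err := url.Parse(raw) // fails: invalid URL escape "%2"
	if err != nil {
		// Same trick as escapePercentSign: encode the bare % as %25 and retry.
		u, err = url.Parse(strings.ReplaceAll(raw, "%", "%25"))
	}
	fmt.Println(u, err) // the retry parses cleanly, err is nil
}
```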