├── .gitignore
├── .vscode
└── launch.json
├── LICENSE
├── README.md
├── package.json
├── snapshot
└── run.gif
└── src
├── config.js
├── index.js
├── lib
├── depth.js
├── downloader.js
└── regular.js
└── resource
├── index.js
└── url
└── api.js
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 |
8 | # Runtime data
9 | pids
10 | *.pid
11 | *.seed
12 | *.pid.lock
13 |
14 | # Directory for instrumented libs generated by jscoverage/JSCover
15 | lib-cov
16 |
17 | # Coverage directory used by tools like istanbul
18 | coverage
19 |
20 | # nyc test coverage
21 | .nyc_output
22 |
23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
24 | .grunt
25 |
26 | # Bower dependency directory (https://bower.io/)
27 | bower_components
28 |
29 | # node-waf configuration
30 | .lock-wscript
31 |
32 | # Compiled binary addons (https://nodejs.org/api/addons.html)
33 | build/Release
34 |
35 | # Dependency directories
36 | node_modules/
37 | jspm_packages/
38 |
39 | # TypeScript v1 declaration files
40 | typings/
41 |
42 | # Optional npm cache directory
43 | .npm
44 |
45 | # Optional eslint cache
46 | .eslintcache
47 |
48 | # Optional REPL history
49 | .node_repl_history
50 |
51 | # Output of 'npm pack'
52 | *.tgz
53 |
54 | # Yarn Integrity file
55 | .yarn-integrity
56 |
57 | # dotenv environment variables file
58 | .env
59 |
60 | # next.js build output
61 | .next
62 |
63 | # file
64 | dist
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // 使用 IntelliSense 了解相关属性。
3 | // 悬停以查看现有属性的描述。
4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "type": "node",
9 | "request": "launch",
10 | "name": "Debug task",
11 | "program": "${workspaceRoot}/src/index.js",
12 | "stopOnEntry": true,
13 | "args": [""],
14 | "cwd": "${workspaceRoot}/",
15 | "outFiles": [],
16 | "sourceMaps": true,
17 | "runtimeExecutable": null,
18 | "env": {}
19 | }
20 | ]
21 | }
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # QuickDownload
2 |
3 | `通过 URL 抓取内容(HTML/API-JSON)快速下载自定义的资源`
4 | Quickly download custom resources (such as images) by scraping the content (HTML / API JSON) behind a URL.
5 |
6 | ### 支持情况
7 | - 自定义配置
8 | - 深度抓取模式
9 | - 自动多级目录
10 | - 正向代理池(待开发)
11 | - 队列多线程任务(待开发)
12 |
13 | ## Install
14 |
15 | [Install Node.js (version >= 8.11.0 recommended)](https://nodejs.org/zh-cn/)
16 |
17 | ## Usage
18 |
19 | - API Config [配置项](./src/resource/url/api.js)
20 | ```js
21 | src:[{
22 | // 其它节点需按此格式进行配置
23 | // 检索的远程端地址
24 | url: '',
25 | // 是否启用(非必须)
26 | enable: true,
27 | // 深度爬取级别(非必须,不建议超过3级)
28 | depth: 1
29 | }]
30 | ```
31 |
32 | - Run
33 | ```bash
34 | npm start
35 | ```
36 |
37 | ### 演示
38 |
39 | ![run](./snapshot/run.gif)
40 | -------------------
41 |
42 | `禁止商业用途 ❤ 研究学习范畴 ❤ 作者保留解释权`
43 | Commercial use is forbidden; for research and learning only. The author reserves the right of interpretation.
44 |
45 | [✶ MIT ✶](./LICENSE)
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "quickdownload",
3 | "version": "1.0.0",
4 | "description": "Quickly download images from the remote",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1",
8 | "start": "node src/index.js",
9 | "debug": "node --debug-brk --inspect ./src/index.js"
10 | },
11 | "engines": {
12 | "node": ">=8.11.0"
13 | },
14 | "author": "itenl",
15 | "license": "MIT"
16 | }
17 |
--------------------------------------------------------------------------------
/snapshot/run.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itenl/QuickDownload/3d6c7640ac4289ad0fe4221de2a1733bddac2bc4/snapshot/run.gif
--------------------------------------------------------------------------------
/src/config.js:
--------------------------------------------------------------------------------
// Runtime settings exported to the rest of the tool.
const settings = {
  // Selects the `${index}-${basename}` file naming scheme in the downloader
  // when true; when false the `filter` function below names the file
  randomName: false,
  // Output directory
  distPath: './dist',
  // Create one sub-directory per crawled address (depth > 0)
  autoChildPath: true,
  // Also save the fetched remote content itself (HTML / API JSON, ...)
  autoSaveRemoteContent: false,
  // File suffixes that are allowed to be downloaded
  mime: ['jpg', 'jpeg', 'gif', 'png'],
  // HTTP proxy
  proxy: {
    hostname: '127.0.0.1',
    port: '1087',
    enable: false
  },
  // Content matching used to offer an option for ignoring the http:/https:
  // protocol; it is now always ignored, so the switch stays disabled
  // ignoreProtocol: true,
  // File-name filter applied to a resource address before it is written
  filter: fileName => encodeURIComponent(fileName)
};

module.exports = settings;
26 |
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
// Program entry: hand every configured source to the downloader.
const { start } = require('./lib/downloader');
const { api } = require('./resource');

start(api.src);
5 |
--------------------------------------------------------------------------------
/src/lib/depth.js:
--------------------------------------------------------------------------------
1 | const regular = require('./regular');
2 |
// Normalises one address found in a page and decides whether it should be
// crawled.
//   url       - raw href value taken from the page
//   _protocol - result of regular.url.exec() on the source site:
//               [1] 'scheme:', [3] host, [4] ':port' (when present)
// Returns { url, enable, depth } where `enable` is true only when the address
// passes every filter below.
const processAddr = (url, _protocol) => {
  // Root-relative anchors such as '/s02/index.html' are rebased onto the
  // source site; the explicit port (if any) is kept so the link stays reachable
  const address = regular
    .process_url(url)
    .replace(/^\/.*\.html?$/, match => `${_protocol[1]}//${_protocol[3]}${_protocol[4] || ''}${match}`);

  const baseDomain = _protocol[3].replace(/^www\./i, '').toLowerCase();
  const parsed = regular.url.exec(address);
  const host = parsed && parsed[3] ? parsed[3].toLowerCase() : '';

  const filters = [
    // Skip resource links with meaningless suffixes (could become user supplied)
    () => !/\.(ico|css|js)$/i.test(address),
    // The address must live under the source site's domain; compare hosts
    // instead of searching the whole URL so a query string cannot match
    () => host === baseDomain || host.endsWith(`.${baseDomain}`),
    // The address must be a well-formed URL
    () => regular.url.test(address)
  ];

  return {
    url: address,
    enable: filters.every(check => check()),
    depth: 0
  };
};
34 |
// Collects the links found in `content` and queues the crawlable ones.
//   content   - body of the page that was just fetched
//   surplus   - crawl depth still remaining after the current page
//   dist      - directory of the current page; children are stored below it
//   _protocol - regular.url.exec() result for the source site ([3] is the host)
const startDepth = (content, surplus, dist, _protocol) => {
  if (!_protocol || !_protocol[3]) {
    console.log('请提供源站地址,方可进行深度查询(避免内外联的深度广度过于宽泛)');
    return;
  }
  const addrs = regular.website(content);
  if (!addrs) return;

  // Required lazily: downloader.js requires this module at load time
  const downloader = require('./downloader');
  console.log(`剩余深度 ${surplus} 已获取到 ${addrs.length} 个横向地址`);

  // Hand the remaining depth to the children; a fixed depth of 0 would stop
  // the crawl after one level whatever depth was configured
  const remaining = Number(surplus) > 0 ? Number(surplus) : 0;
  const items = [];
  addrs.forEach(url => {
    const item = processAddr(url, _protocol);
    if (item.enable) items.push(Object.assign({}, item, { depth: remaining }));
  });
  downloader.start(items, dist);
};
53 |
54 | module.exports = {
55 | startDepth
56 | };
57 |
--------------------------------------------------------------------------------
/src/lib/downloader.js:
--------------------------------------------------------------------------------
1 | const config = require('../config');
2 | const regular = require('./regular');
3 | const depth = require('./depth');
4 | const protocol = {
5 | http: require('http'),
6 | https: require('https')
7 | };
8 | const URL = require('url');
9 | const fs = require('fs');
10 | const path = require('path');
// Progress figures printed while downloading: `total` is written by
// readyDownloadTask and `complete` is incremented by requestCallBack.
const counter = {
  total: 0,
  complete: 0
};
// NOTE(review): this turns off TLS certificate verification for the whole
// process, so https downloads accept any certificate. Confirm it is intended.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
// Sources already passed to start(), keyed by their encoded URL.
const depth_domain = {};
17 |
// Builds the first argument for http(s).get/request.
// Without an enabled proxy the URL string itself is returned; with one, the
// request is addressed to the proxy and carries the absolute URL as its path.
const getURI = url => {
  const parsed = URL.parse(url);
  const proxy = config.proxy;
  const useProxy = Boolean(proxy && proxy.enable);
  if (!useProxy) return url;

  const headers = {
    Referer: parsed.href,
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
  };
  return {
    path: url,
    hostname: proxy.hostname,
    port: proxy.port,
    headers
  };
};
34 |
// Creates the response handler for one resource download.
//   resourceURI - address of the resource being downloaded
//   index       - position of the resource in its task list (used for naming)
//   dist        - directory the file is written to
//   _protocol   - 'http' or 'https', reused when the download is retried
// Returns a function(res) suitable as the callback of http(s).request.
const requestCallBack = (resourceURI, index, dist, _protocol) => {
  const fileName = config.randomName ? `${index}-${path.basename(resourceURI)}` : config.filter(resourceURI);

  return function(res) {
    if (res.statusCode == 302 || res.statusCode == 301) {
      // Redirects are not followed. The body still has to be drained,
      // otherwise the socket is never released.
      res.resume();
      console.error('302 fail, maybe need config headers');
      return;
    }

    const contentLength = parseInt(res.headers['content-length'], 10);
    const fileBuff = [];
    // Chunks already arrive as Buffers; no copy is needed
    res.on('data', function(chunk) {
      fileBuff.push(chunk);
    });
    res.on('end', function() {
      if (isNaN(contentLength)) {
        console.log(resourceURI + ' content length error');
        return;
      }
      const totalBuff = Buffer.concat(fileBuff);
      if (totalBuff.length < contentLength) {
        // Truncated body: fetch the resource again
        console.log(resourceURI + ' download error, try again');
        downloadRequirement(resourceURI, index, dist, _protocol);
        return;
      }
      // Overwrite rather than append, so running the tool twice does not
      // concatenate the same file onto itself
      fs.writeFile(dist + '/' + fileName, totalBuff, function(err) {
        if (err) {
          console.error(err);
          return;
        }
        // Count a download only once it is actually on disk
        counter.complete++;
        console.log('total ' + counter.total + ' complete ' + counter.complete);
      });
    });
  };
};
75 |
// Issues the request that downloads one resource.
//   resourceURI - address of the resource
//   index       - position of the resource in its task list
//   dist        - directory the file is written to
//   _protocol   - 'http' or 'https'; selects the request module
//   retriesLeft - how many more attempts are made after a request error
//                 (optional, defaults to 3)
// Rethrows whatever building the request throws, e.g. an unsupported protocol.
const downloadRequirement = (resourceURI, index, dist, _protocol, retriesLeft = 3) => {
  try {
    const req = protocol[_protocol].request(getURI(resourceURI), requestCallBack(resourceURI, index, dist, _protocol));
    req.on('error', function(e) {
      // A permanently failing address (DNS failure, refused connection, ...)
      // must not be retried forever
      if (retriesLeft <= 0) {
        console.log('request ' + resourceURI + ' error, give up: ' + (e && e.message));
        return;
      }
      console.log('request ' + resourceURI + ' error, try again');
      downloadRequirement(resourceURI, index, dist, _protocol, retriesLeft - 1);
    });
    req.end();
  } catch (error) {
    console.log(_protocol, error);
    throw error;
  }
};
89 |
// Fetches `url` and passes the complete body, decoded as a string, to
// callback(body, res). Request and setup errors are logged, not thrown.
//   _protocol - 'http' or 'https'; selects the request module
const requestRemote = (url, callback, _protocol) => {
  const logFailure = err => {
    console.log(err, url);
  };

  const onResponse = res => {
    const parts = [];
    let byteCount = 0;
    res.on('data', part => {
      parts.push(part);
      byteCount += part.length;
    });
    res.on('end', () => {
      const body = Buffer.concat(parts, byteCount).toString();
      if (callback) callback(body, res);
    });
  };

  try {
    const request = protocol[_protocol].get(getURI(url), onResponse);
    request.on('error', logFailure);
  } catch (err) {
    logFailure(err);
  }
};
113 |
// Starts one download per address found in a page.
//   srcAddrs - resource addresses (absolute URLs) to download
//   dist     - directory the files are written to
const readyDownloadTask = (srcAddrs, dist) => {
  if (!srcAddrs) return;
  // `complete` accumulates over every page, so `total` has to accumulate too;
  // overwriting it makes the progress line read e.g. "total 3 complete 40"
  counter.total += srcAddrs.length;
  console.log('total ' + counter.total);
  srcAddrs.forEach(function(item, index) {
    if (!item) return;
    const url_array = regular.url.exec(item);
    if (!url_array || !url_array[2]) {
      console.log(`所匹配到资源 ${item} 不符合规范,缺少协议【http 或 https】 readyDownloadTask`);
      return;
    }
    // url_array[2] is the scheme without the colon ('http' / 'https')
    downloadRequirement(item, index, dist, url_array[2]);
  });
};
128 |
// Extracts the wanted resource addresses from fetched content.
//   content - page body (string), or an object that is stringified first
//   url     - address the content came from; base for relative addresses
// Returns an array of addresses, false when there is nothing to extract, or
// undefined when extraction throws (the error is logged).
const getRequirementAddr = (content, url) => {
  if (!content) return false;
  let text = content;
  try {
    if (typeof text == 'object') text = JSON.stringify(text);
    const matches = text.match(regular.userRequirement);
    if (!matches) return false;

    const base = regular.url.exec(url);
    const resolved = [];
    for (const raw of matches) {
      // Drop the leading `src="` and any quote left over
      let addr = raw.replace(/src=\"?/gi, '');
      if (addr.startsWith('"')) addr = addr.slice(1);

      let parts = regular.url.exec(addr);
      if (!parts) {
        // Relative address: resolve against the page address
        addr = URL.resolve(base[0], addr);
        parts = regular.url.exec(addr);
      }
      // Protocol-relative address ('//host/...'): borrow the page's protocol
      if (parts && !parts[1]) addr = URL.resolve(base[0], addr);
      resolved.push(addr);
    }
    return resolved;
  } catch (error) {
    console.log('[getRequirementAddr]', text, error);
  }
};
150 |
// Creates `dist` together with any missing parent directories.
//   dist - directory path, relative to the working directory or absolute;
//          both '/' and the platform separator are accepted
// Returns true when the directory exists afterwards, false when creation
// failed (the error is logged).
const mkdirFolder = dist => {
  try {
    if (fs.existsSync(dist)) return true;

    // Callers build the path with path.join, so it may use the platform
    // separator; walk it with the path module rather than splitting on '/'
    const missing = [];
    let current = path.resolve(dist);
    while (!fs.existsSync(current)) {
      missing.push(current);
      const parent = path.dirname(current);
      if (parent === current) break; // reached the file-system root
      current = parent;
    }

    // Create from the outermost missing directory inwards; mkdirSync throws
    // on failure, which the catch below reports
    while (missing.length) fs.mkdirSync(missing.pop());
    return true;
  } catch (error) {
    console.log(error);
    return false;
  }
};
175 |
// Writes fetched remote content to disk, adding a file extension derived
// from the response's content-type.
//   content - body to write
//   path    - target file path without extension (this parameter shadows the
//             `path` module, which is not needed here)
//   res     - response the content came from; may be missing
// Write errors are logged.
const saveRemoteContent = (content, path, res) => {
  // Subtypes that are not usable as an extension as they are
  const extensionAliases = {
    'x-javascript': 'js',
    plain: 'txt',
    'x-www-form-urlencoded': 'json'
  };

  let target = path;
  const contentType = res && res.headers && res.headers['content-type'];
  if (contentType) {
    // 'text/plain; charset=utf-8' -> 'plain': drop the parameters first,
    // otherwise '; charset=...' ends up in the file name
    const subtype = String(contentType)
      .split(';')[0]
      .trim()
      .toLowerCase()
      .replace(/.*\//, '');
    const extension = Object.prototype.hasOwnProperty.call(extensionAliases, subtype) ? extensionAliases[subtype] : subtype;
    if (extension) target = [path, extension].join('.');
  }

  fs.writeFile(target, content, function(err) {
    if (err) console.log(err);
  });
};
203 |
// Entry point and recursion step of the crawl.
//   srcs     - array of { url, enable?, depth? } sources; entries with no url
//              or with enable set to a falsy value are skipped
//   prevDist - directory of the parent page (set on recursive calls); the
//              configured distPath is used when empty
// Recursion ends when a source has no remaining depth.
const start = (srcs, prevDist = '') => {
  // `!srcs instanceof Array` parses as `(!srcs) instanceof Array` and is
  // always false, so test the type explicitly
  if (!Array.isArray(srcs)) return;
  srcs.forEach(src => {
    if (!src || !src.url || (src.enable != undefined && !src.enable)) return;
    const url_array = regular.url.exec(src.url);
    if (!url_array || !url_array[2]) {
      console.log(`所需检索的地址 ${src.url} 不符合规范,缺少协议【http 或 https】`);
      return;
    }
    const _protocol = url_array[2];

    // De-duplicate on the full encoded URL. Checking the full key while
    // storing a truncated one lets long URLs be crawled again and again.
    const visitedKey = encodeURIComponent(src.url);
    if (depth_domain[visitedKey]) return;
    depth_domain[visitedKey] = src;

    // Only the directory / file name is shortened
    const encode_url = visitedKey.slice(0, 100);
    const currentDist = config.autoChildPath ? path.join(prevDist ? prevDist : config.distPath, encode_url) : config.distPath;
    src.url = regular.process_url(src.url);
    if (!mkdirFolder(currentDist)) {
      console.log('目录创建异常,请排查');
      return;
    }
    requestRemote(
      src.url,
      (content, res) => {
        if (!content) return;
        if (config.autoSaveRemoteContent) saveRemoteContent(content, path.join(currentDist, encode_url), res);
        // Follow the page's links while depth remains
        src.depth && depth.startDepth(content, --src.depth, currentDist, url_array);
        const srcAddrs = getRequirementAddr(content, src.url);
        if (srcAddrs) {
          console.log(`${src.url} 中含有 所需资源 ${srcAddrs.length}`);
          readyDownloadTask(srcAddrs, currentDist);
        }
      },
      _protocol
    );
  });
};
242 |
243 | module.exports = {
244 | start
245 | };
246 |
--------------------------------------------------------------------------------
/src/lib/regular.js:
--------------------------------------------------------------------------------
1 | const config = require('../config');
2 | module.exports = {
3 | url: /((\w+):)?\/\/([^\:|\/]+)(\:\d*)?(.*\/?)([^#|\?|\n]+)?(#.*)?(\?.*)?/i,
4 | userRequirement: (() => {
5 | console.log(config.mime.join('|'));
6 | // let reg = /((http:|https:)?\/\/)+(\w+\.)+(\w+)[\w\/\.\-]*(jpg|gif|png)/gi
7 | // return new RegExp(`(http:\/\/|https:\/\/|\/\/)([\\w.]+\/?)\\S*\\.(${config.mime.join('|')})`, 'gi');
8 | // 宽泛模式 不匹配http|https
9 | // return new RegExp(`\"([\\w.]+\/?)\\S*\\.(${config.mime.join('|')})`, 'gi');
10 | // 需取消贪婪模式
11 | return new RegExp(`src=\".+?\.(${['jpg', 'png'].join('|')})`, 'gi');
12 | })(),
13 | website: (content, distinct = true) => {
14 | let addrs = [];
15 | // 获取a标签href地址
16 | content.replace(/ {
17 | if (group) addrs.push(group);
18 | return source;
19 | });
20 | return distinct ? Array.from(new Set(addrs)) : addrs;
21 | },
22 | process_url: (url, protocol = 'https') => {
23 | return url.replace(/^\/\/.*?(.*)?/gi, (source, group, index) => {
24 | return `${protocol}://${group}`;
25 | });
26 | }
27 | };
28 |
--------------------------------------------------------------------------------
/src/resource/index.js:
--------------------------------------------------------------------------------
1 | const api = require('./url/api');
2 |
3 | module.exports = {
4 | api
5 | };
6 |
--------------------------------------------------------------------------------
/src/resource/url/api.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | src: [
3 | {
4 | // 其它节点需按此格式进行配置
5 | // 检索的远程端地址
6 | url: '',
7 | // 是否启用(非必须)
8 | enable: !true,
9 | // 深度爬取级别(非必须,不建议超过3级)
10 | depth: 1
11 | },
12 | {
13 | url: 'https://www.plmm.com.cn/',
14 | enable: !true,
15 | depth: 1
16 | },
17 | {
18 | url:
19 | 'https://mapi.vip.com/vips-mobile/rest/layout/h5/channel/data?f=www&width=640&height=460&net=wifi&changeResolution=2&channel_name=%E4%BB%8A%E6%97%A5%E6%8E%A8%E8%8D%90&app_name=shop_wap&app_version=4.0&mars_cid=1557162476000_5567b4018f1578cffadcea06b6982af7&warehouse=VIP_BJ&api_key=8cec5243ade04ed3a02c5972bcda0d3f&fdc_area_id=101101101&province_id=101101&city_id=101101101&saturn=&wap_consumer=A1&standby_id=www&source_app=yd_wap&mobile_platform=2&platform=2&client=wap&lightart_version=1&mobile_channel=mobiles-%7C%7C&menu_code=20181203001&load_more_token=eyJjaGFubmVsX2lkIjoiNDkiLCJ0c2lmdCI6IjEiLCJicmFuZF9vZmZzZXQiOiIzMCIsImJyYW5kX3JlZmVyX2luZGV4IjoiOSJ9&_=1557162495946',
20 | enable: !true,
21 | depth: 1
22 | }
23 | ]
24 | };
25 |
--------------------------------------------------------------------------------