├── .gitignore ├── .vscode └── launch.json ├── LICENSE ├── README.md ├── package.json ├── snapshot └── run.gif └── src ├── config.js ├── index.js ├── lib ├── depth.js ├── downloader.js └── regular.js └── resource ├── index.js └── url └── api.js /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (https://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # TypeScript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | # next.js build output 61 | .next 62 | 63 | # file 64 | dist -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "node", 9 | "request": "launch", 10 | "name": "Debug task", 11 | "program": "${workspaceRoot}/src/index.js", 12 | "stopOnEntry": true, 13 | "args": [""], 14 | "cwd": "${workspaceRoot}/", 15 | "outFiles": [], 16 | "sourceMaps": true, 17 | "runtimeExecutable": null, 18 | "env": {} 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QuickDownload 2 | 3 | `通过 URL 抓取内容(HTML/API-JSON)快速下载自定义的资源` 4 | Quickly download images from the remote 5 | 6 | ### 支持情况 7 | - 自定义配置 8 | - 深度抓取模式 9 | - 自动多级目录 10 | - 正向代理池(待开发) 11 | - 队列多线程任务(待开发) 12 | 13 | ## Install 14 | 15 | [Install NodeJS and suggest >= 8.11.0](https://nodejs.org/zh-cn/) 16 | 17 | ## Usage 18 | 19 | - API Config [配置项](./src/resource/url/api.js) 20 | ```Node 21 | src:[{ 22 | // 其它节点需按此格式进行配置 23 | // 检索的远程端地址 24 | url: '', 25 | // 是否启用(非必须) 26 | enable: true, 27 | // 深度爬取级别(非必须,不建议超过3级) 28 | depth: 1 29 | }] 30 | ``` 31 | 32 | - Run 33 | ```bash 34 | npm start 35 | ``` 36 | 37 | ### 演示 38 | 39 | 40 | ------------------- 41 | 42 | `禁止商业用途 ❤ 研究学习范畴 ❤ 作者保留解释权` 43 | Commercial use is forbidden and The author reserves the right of interpretion 44 | 45 | [✶ MIT ✶](./LICENSE) -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "quickdownload", 3 | "version": "1.0.0", 4 | "description": "Quickly download images from the remote", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "start": "node src/index.js", 9 | "debug": "node --debug-brk --inspect ./src/index.js" 10 | }, 11 | "engines": { 12 | "node": ">=8.11.0" 13 | }, 14 | "author": "itenl", 15 | "license": "MIT" 16 | } 17 | -------------------------------------------------------------------------------- /snapshot/run.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itenl/QuickDownload/3d6c7640ac4289ad0fe4221de2a1733bddac2bc4/snapshot/run.gif -------------------------------------------------------------------------------- /src/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // 是否随机文件名 3 | randomName: false, 4 | // 输出路径 5 | distPath: './dist', 6 | // 自动子路径 7 | // depth > 0 8 | autoChildPath: true, 9 | // 是否保存远程响应的内容(HTML/API-JSON等) 10 | autoSaveRemoteContent: false, 11 | // 允许下载文件后缀 12 | mime: ['jpg', 'jpeg', 'gif', 'png'], 13 | // http Proxy 14 | proxy: { 15 | hostname: '127.0.0.1', 16 | port: '1087', 17 | enable: false 18 | }, 19 | // 正则匹配内容时忽略 http:或https:协议 已强制忽略 20 | // ignoreProtocol: true, 21 | // 文件名过滤规则 22 | filter: fileName => { 23 | return encodeURIComponent(fileName); 24 | } 25 | }; 26 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | const downloader = require('./lib/downloader'); 2 | const resource = require('./resource'); 3 | 4 | downloader.start(resource.api.src); 5 | -------------------------------------------------------------------------------- /src/lib/depth.js: -------------------------------------------------------------------------------- 1 | const regular = require('./regular'); 2 | 3 | // 处理所匹配到的地址 4 | const processAddr = (url, _protocol) => { 5 | url = regular.process_url(url); 6 | if (true) { 7 | // A标签 存在相对路径的情况需要特殊处理 '/s02/index.html' 8 | url = url.replace(/^\/.*\.html?$/, (a, b, c) => { 9 | return `${_protocol[1]}//${_protocol[3]}${a}`; 10 | }); 11 | } 12 | 13 | return { 14 | url: url, 15 | enable: [ 16 | () => { 17 | // 过滤无意义的后缀资源链接(可修改成用户传入) 18 | return !new RegExp(`.*?\\.(${['ico', 'css', 'js'].join('|')})$`, 'ig').test(url); 19 | }, 20 | () => { 21 | // 当前地址属于所传入的泛域下 22 | return !!(url.indexOf(_protocol[3].replace('www.', '')) > -1); 23 | }, 24 | () => { 25 | // 属于合法域名 26 | return !!regular.url.test(url); 27 | } 28 | ].every(func => { 29 | return func && func(); 30 | }), 31 | depth: 0 32 | }; 33 | }; 34 | 35 | // 匹配子页可访问的链接 36 | const startDepth = (content, surplus, dist, _protocol) => { 37 | if (!_protocol || !_protocol[3]) { 38 | console.log('请提供源站地址,方可进行深度查询(避免内外联的深度广度过于宽泛)'); 39 | return; 40 | } 41 | const addrs = regular.website(content); 42 | if (addrs) { 43 | const downloader = require('./downloader'); 44 | console.log(`剩余深度 ${surplus} 已获取到 ${addrs.length} 个横向地址`); 45 | let items = []; 46 | addrs.forEach(url => { 47 | const item = processAddr(url, _protocol); 48 | if (item.enable) items.push(item); 49 | }); 50 | downloader.start(items, dist); 51 | } 52 | }; 53 | 54 | module.exports = { 55 | startDepth 56 | }; 57 | -------------------------------------------------------------------------------- /src/lib/downloader.js: -------------------------------------------------------------------------------- 1 | const config = require('../config'); 2 | const regular = require('./regular'); 3 | const depth = require('./depth'); 4 | const protocol = { 5 | http: require('http'), 6 | https: require('https') 7 | }; 8 | const URL = require('url'); 9 | const fs = require('fs'); 10 | const path = require('path'); 11 | const counter = { 12 | total: 0, 13 | complete: 0 14 | }; 15 | process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0'; 16 | const depth_domain = {}; 17 | 18 | const getURI = url => { 19 | const urlObj = URL.parse(url); 20 | if (config.proxy && config.proxy.enable) { 21 | return { 22 | path: url, 23 | hostname: config.proxy.hostname, 24 | port: config.proxy.port, 25 | headers: { 26 | Referer: urlObj.href, 27 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1' 28 | } 29 | }; 30 | } else { 31 | return url; 32 | } 33 | }; 34 | 35 | const requestCallBack = (resourceURI, index, dist, _protocol) => { 36 | const fileName = config.randomName ? `${index}-${path.basename(resourceURI)}` : config.filter(resourceURI); 37 | const callback = function(res) { 38 | // console.log('request: ' + resourceURI + ' return status: ' + res.statusCode); 39 | if (res.statusCode == 302 || res.statusCode == 301) { 40 | const location = URL.resolve(resourceURI, res.headers['location']); 41 | // downloadRequirement(location, index, dist, _protocol); 42 | console.error('302 fail, maybe need config headers'); 43 | } else { 44 | // 返回 response 45 | const contentLength = parseInt(res.headers['content-length']); 46 | const fileBuff = []; 47 | res.on('data', function(chunk) { 48 | const buffer = new Buffer(chunk); 49 | fileBuff.push(buffer); 50 | }); 51 | res.on('end', function() { 52 | // console.log('end downloading ' + resourceURI); 53 | if (isNaN(contentLength)) { 54 | console.log(resourceURI + ' content length error'); 55 | return; 56 | } 57 | const totalBuff = Buffer.concat(fileBuff); 58 | // console.log('totalBuff.length = ' + totalBuff.length + ' ' + 'contentLength = ' + contentLength); 59 | if (totalBuff.length < contentLength) { 60 | console.log(resourceURI + ' download error, try again'); 61 | downloadRequirement(resourceURI, index, dist, _protocol); 62 | return; 63 | } 64 | fs.appendFile(dist + '/' + fileName, totalBuff, function(err) { 65 | if (err) console.error(err); 66 | }); 67 | counter.complete++; 68 | console.log('total ' + counter.total + ' complete ' + counter.complete); 69 | }); 70 | } 71 | }; 72 | 73 | return callback; 74 | }; 75 | 76 | const downloadRequirement = (resourceURI, index, dist, _protocol) => { 77 | try { 78 | const req = protocol[_protocol].request(getURI(resourceURI), requestCallBack(resourceURI, index, dist, _protocol)); 79 | req.on('error', function(e) { 80 | console.log('request ' + resourceURI + ' error, try again'); 81 | downloadRequirement(resourceURI, index, dist, _protocol); 82 | }); 83 | req.end(); 84 | } catch (error) { 85 | console.log(_protocol, error); 86 | throw error; 87 | } 88 | }; 89 | 90 | const requestRemote = (url, callback, _protocol) => { 91 | try { 92 | protocol[_protocol] 93 | .get(getURI(url), function(res) { 94 | var chunks = []; 95 | var size = 0; 96 | res.on('data', function(chunk) { 97 | chunks.push(chunk); 98 | size += chunk.length; 99 | }); 100 | res.on('end', function() { 101 | var data = Buffer.concat(chunks, size); 102 | var html = data.toString(); 103 | callback && callback(html, res); 104 | }); 105 | }) 106 | .on('error', function(err) { 107 | console.log(err, url); 108 | }); 109 | } catch (err) { 110 | console.log(err, url); 111 | } 112 | }; 113 | 114 | const readyDownloadTask = (srcAddrs, dist) => { 115 | if (!srcAddrs) return; 116 | counter.total = srcAddrs.length; 117 | console.log('total ' + counter.total); 118 | srcAddrs.forEach(function(item, index, array) { 119 | if (!item) return; 120 | const url_array = regular.url.exec(item); 121 | if (!url_array || !url_array[2]) { 122 | console.log(`所匹配到资源 ${item} 不符合规范,缺少协议【http 或 https】 readyDownloadTask`); 123 | return; 124 | } 125 | downloadRequirement(item, index, dist, url_array[2]); 126 | }); 127 | }; 128 | 129 | const getRequirementAddr = (content, url) => { 130 | if (!content) return false; 131 | try { 132 | if (typeof content == 'object') content = JSON.stringify(content); 133 | const addrs = content.match(regular.userRequirement); 134 | if (!addrs) return false; 135 | const URI = regular.url.exec(url); 136 | return addrs.map((item, index, all) => { 137 | item = item.replace(/src=\"?/gi, (source, group, index) => { 138 | return ''; 139 | }); 140 | if (item.indexOf('"') == 0) item = item.slice(1); 141 | let result = regular.url.exec(item); 142 | if (!result) (item = URL.resolve(URI[0], item)), (result = regular.url.exec(item)); 143 | if (result && !result[1]) item = URL.resolve(URI[0], item); 144 | return item; 145 | }); 146 | } catch (error) { 147 | console.log('[getRequirementAddr]', content, error); 148 | } 149 | }; 150 | 151 | // 创建多级目录 152 | const mkdirFolder = dist => { 153 | try { 154 | if (!fs.existsSync(dist)) { 155 | let pathtmp; 156 | dist.split('/').forEach(dir => { 157 | if (pathtmp) { 158 | pathtmp = path.join(pathtmp, dir); 159 | } else { 160 | dir ? (pathtmp = dir) : (pathtmp = '/'); 161 | } 162 | if (!fs.existsSync(pathtmp)) { 163 | if (!fs.mkdirSync(pathtmp)) { 164 | return false; 165 | } 166 | } 167 | }); 168 | } 169 | return true; 170 | } catch (error) { 171 | console.log(error); 172 | return false; 173 | } 174 | }; 175 | 176 | const saveRemoteContent = (content, path, res) => { 177 | let contentType = res && res.headers['content-type']; 178 | if (contentType) { 179 | contentType = contentType 180 | .toLowerCase() 181 | .replace(/.*\//gi, '') 182 | .replace(new RegExp(`${['x-javascript', 'plain ', 'x-www-form-urlencoded'].join('|')}`, 'gi'), function(item) { 183 | let mime = ''; 184 | switch (item) { 185 | case 'x-javascript': 186 | mime = 'js'; 187 | break; 188 | case 'plain': 189 | mime = 'txt'; 190 | break; 191 | case 'x-www-form-urlencoded': 192 | mime = 'json'; 193 | break; 194 | } 195 | return mime; 196 | }); 197 | path = [path, contentType].join('.'); 198 | } 199 | fs.writeFile(path, content, function(err) { 200 | if (err) console.log(err); 201 | }); 202 | }; 203 | 204 | // 启动/递归深度 结束条件 !depth 205 | const start = (srcs, prevDist = '') => { 206 | if (!srcs || !srcs instanceof Array) return; 207 | srcs.forEach(src => { 208 | if (!src || !src.url || (src.enable != undefined && !src.enable)) return; 209 | const url_array = regular.url.exec(src.url); 210 | if (!url_array || !url_array[2]) { 211 | console.log(`所需检索的地址 ${src.url} 不符合规范,缺少协议【http 或 https】`); 212 | return; 213 | } 214 | let encode_url = encodeURIComponent(src.url), 215 | _protocol = url_array[2]; 216 | if (depth_domain[encode_url]) return; 217 | if (encode_url.length > 100) encode_url = encode_url.slice(0, 100); 218 | depth_domain[encode_url] = src; 219 | const currentDist = config.autoChildPath ? path.join(prevDist ? prevDist : config.distPath, encode_url) : config.distPath; 220 | src.url = regular.process_url(src.url); 221 | if (mkdirFolder(currentDist)) { 222 | requestRemote( 223 | src.url, 224 | (content, res) => { 225 | if (content) { 226 | if (config.autoSaveRemoteContent) saveRemoteContent(content, path.join(currentDist, encode_url), res); 227 | src.depth && depth.startDepth(content, --src.depth, currentDist, url_array); 228 | const srcAddrs = getRequirementAddr(content, src.url); 229 | if (srcAddrs) { 230 | console.log(`${src.url} 中含有 所需资源 ${srcAddrs.length}`); 231 | readyDownloadTask(srcAddrs, currentDist); 232 | } 233 | } 234 | }, 235 | _protocol 236 | ); 237 | } else { 238 | console.log('目录创建异常,请排查'); 239 | } 240 | }); 241 | }; 242 | 243 | module.exports = { 244 | start 245 | }; 246 | -------------------------------------------------------------------------------- /src/lib/regular.js: -------------------------------------------------------------------------------- 1 | const config = require('../config'); 2 | module.exports = { 3 | url: /((\w+):)?\/\/([^\:|\/]+)(\:\d*)?(.*\/?)([^#|\?|\n]+)?(#.*)?(\?.*)?/i, 4 | userRequirement: (() => { 5 | console.log(config.mime.join('|')); 6 | // let reg = /((http:|https:)?\/\/)+(\w+\.)+(\w+)[\w\/\.\-]*(jpg|gif|png)/gi 7 | // return new RegExp(`(http:\/\/|https:\/\/|\/\/)([\\w.]+\/?)\\S*\\.(${config.mime.join('|')})`, 'gi'); 8 | // 宽泛模式 不匹配http|https 9 | // return new RegExp(`\"([\\w.]+\/?)\\S*\\.(${config.mime.join('|')})`, 'gi'); 10 | // 需取消贪婪模式 11 | return new RegExp(`src=\".+?\.(${['jpg', 'png'].join('|')})`, 'gi'); 12 | })(), 13 | website: (content, distinct = true) => { 14 | let addrs = []; 15 | // 获取a标签href地址 16 | content.replace(/ { 17 | if (group) addrs.push(group); 18 | return source; 19 | }); 20 | return distinct ? Array.from(new Set(addrs)) : addrs; 21 | }, 22 | process_url: (url, protocol = 'https') => { 23 | return url.replace(/^\/\/.*?(.*)?/gi, (source, group, index) => { 24 | return `${protocol}://${group}`; 25 | }); 26 | } 27 | }; 28 | -------------------------------------------------------------------------------- /src/resource/index.js: -------------------------------------------------------------------------------- 1 | const api = require('./url/api'); 2 | 3 | module.exports = { 4 | api 5 | }; 6 | -------------------------------------------------------------------------------- /src/resource/url/api.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | src: [ 3 | { 4 | // 其它节点需按此格式进行配置 5 | // 检索的远程端地址 6 | url: '', 7 | // 是否启用(非必须) 8 | enable: !true, 9 | // 深度爬取级别(非必须,不建议超过3级) 10 | depth: 1 11 | }, 12 | { 13 | url: 'https://www.plmm.com.cn/', 14 | enable: !true, 15 | depth: 1 16 | }, 17 | { 18 | url: 19 | 'https://mapi.vip.com/vips-mobile/rest/layout/h5/channel/data?f=www&width=640&height=460&net=wifi&changeResolution=2&channel_name=%E4%BB%8A%E6%97%A5%E6%8E%A8%E8%8D%90&app_name=shop_wap&app_version=4.0&mars_cid=1557162476000_5567b4018f1578cffadcea06b6982af7&warehouse=VIP_BJ&api_key=8cec5243ade04ed3a02c5972bcda0d3f&fdc_area_id=101101101&province_id=101101&city_id=101101101&saturn=&wap_consumer=A1&standby_id=www&source_app=yd_wap&mobile_platform=2&platform=2&client=wap&lightart_version=1&mobile_channel=mobiles-%7C%7C&menu_code=20181203001&load_more_token=eyJjaGFubmVsX2lkIjoiNDkiLCJ0c2lmdCI6IjEiLCJicmFuZF9vZmZzZXQiOiIzMCIsImJyYW5kX3JlZmVyX2luZGV4IjoiOSJ9&_=1557162495946', 20 | enable: !true, 21 | depth: 1 22 | } 23 | ] 24 | }; 25 | --------------------------------------------------------------------------------