├── filter.py ├── spider.js └── README.md /filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def file_path(path): 4 | for (root, dirs, files) in os.walk(path): 5 | for file in files: 6 | del_small_file(root + '/' + file) 7 | 8 | def del_small_file(file_name): 9 | size = os.path.getsize(file_name) 10 | file_size = 2 * 1024 11 | if size < file_size: 12 | os.remove(file_name) 13 | 14 | if __name__ == '__main__': 15 | path = r'./CNVD' 16 | file_path(path) -------------------------------------------------------------------------------- /spider.js: -------------------------------------------------------------------------------- 1 | /* FileSaver.js 2 | * A saveAs() FileSaver implementation. 3 | * 1.3.2 4 | * 2016-06-16 18:25:19 5 | * 6 | * By Eli Grey, http://eligrey.com 7 | * License: MIT 8 | * See https://github.com/eligrey/FileSaver.js/blob/master/LICENSE.md 9 | */ 10 | 11 | /*global self */ 12 | /*jslint bitwise: true, indent: 4, laxbreak: true, laxcomma: true, smarttabs: true, plusplus: true */ 13 | 14 | /*! 
@source http://purl.eligrey.com/github/FileSaver.js/blob/master/FileSaver.js */ 15 | 16 | var saveAs = saveAs || (function(view) { 17 | "use strict"; 18 | // IE <10 is explicitly unsupported 19 | if (typeof view === "undefined" || typeof navigator !== "undefined" && /MSIE [1-9]\./.test(navigator.userAgent)) { 20 | return; 21 | } 22 | var 23 | doc = view.document 24 | // only get URL when necessary in case Blob.js hasn't overridden it yet 25 | , get_URL = function() { 26 | return view.URL || view.webkitURL || view; 27 | } 28 | , save_link = doc.createElementNS("http://www.w3.org/1999/xhtml", "a") 29 | , can_use_save_link = "download" in save_link 30 | , click = function(node) { 31 | var event = new MouseEvent("click"); 32 | node.dispatchEvent(event); 33 | } 34 | , is_safari = /constructor/i.test(view.HTMLElement) || view.safari 35 | , is_chrome_ios =/CriOS\/[\d]+/.test(navigator.userAgent) 36 | , throw_outside = function(ex) { 37 | (view.setImmediate || view.setTimeout)(function() { 38 | throw ex; 39 | }, 0); 40 | } 41 | , force_saveable_type = "application/octet-stream" 42 | // the Blob API is fundamentally broken as there is no "downloadfinished" event to subscribe to 43 | , arbitrary_revoke_timeout = 1000 * 40 // in ms 44 | , revoke = function(file) { 45 | var revoker = function() { 46 | if (typeof file === "string") { // file is an object URL 47 | get_URL().revokeObjectURL(file); 48 | } else { // file is a File 49 | file.remove(); 50 | } 51 | }; 52 | setTimeout(revoker, arbitrary_revoke_timeout); 53 | } 54 | , dispatch = function(filesaver, event_types, event) { 55 | event_types = [].concat(event_types); 56 | var i = event_types.length; 57 | while (i--) { 58 | var listener = filesaver["on" + event_types[i]]; 59 | if (typeof listener === "function") { 60 | try { 61 | listener.call(filesaver, event || filesaver); 62 | } catch (ex) { 63 | throw_outside(ex); 64 | } 65 | } 66 | } 67 | } 68 | , auto_bom = function(blob) { 69 | // prepend BOM for UTF-8 XML and text/* 
types (including HTML) 70 | // note: your browser will automatically convert UTF-16 U+FEFF to EF BB BF 71 | if (/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(blob.type)) { 72 | return new Blob([String.fromCharCode(0xFEFF), blob], {type: blob.type}); 73 | } 74 | return blob; 75 | } 76 | , FileSaver = function(blob, name, no_auto_bom) { 77 | if (!no_auto_bom) { 78 | blob = auto_bom(blob); 79 | } 80 | // First try a.download, then web filesystem, then object URLs 81 | var 82 | filesaver = this 83 | , type = blob.type 84 | , force = type === force_saveable_type 85 | , object_url 86 | , dispatch_all = function() { 87 | dispatch(filesaver, "writestart progress write writeend".split(" ")); 88 | } 89 | // on any filesys errors revert to saving with object URLs 90 | , fs_error = function() { 91 | if ((is_chrome_ios || (force && is_safari)) && view.FileReader) { 92 | // Safari doesn't allow downloading of blob urls 93 | var reader = new FileReader(); 94 | reader.onloadend = function() { 95 | var url = is_chrome_ios ? 
reader.result : reader.result.replace(/^data:[^;]*;/, 'data:attachment/file;'); 96 | var popup = view.open(url, '_blank'); 97 | if(!popup) view.location.href = url; 98 | url=undefined; // release reference before dispatching 99 | filesaver.readyState = filesaver.DONE; 100 | dispatch_all(); 101 | }; 102 | reader.readAsDataURL(blob); 103 | filesaver.readyState = filesaver.INIT; 104 | return; 105 | } 106 | // don't create more object URLs than needed 107 | if (!object_url) { 108 | object_url = get_URL().createObjectURL(blob); 109 | } 110 | if (force) { 111 | view.location.href = object_url; 112 | } else { 113 | var opened = view.open(object_url, "_blank"); 114 | if (!opened) { 115 | // Apple does not allow window.open, see https://developer.apple.com/library/safari/documentation/Tools/Conceptual/SafariExtensionGuide/WorkingwithWindowsandTabs/WorkingwithWindowsandTabs.html 116 | view.location.href = object_url; 117 | } 118 | } 119 | filesaver.readyState = filesaver.DONE; 120 | dispatch_all(); 121 | revoke(object_url); 122 | } 123 | ; 124 | filesaver.readyState = filesaver.INIT; 125 | 126 | if (can_use_save_link) { 127 | object_url = get_URL().createObjectURL(blob); 128 | setTimeout(function() { 129 | save_link.href = object_url; 130 | save_link.download = name; 131 | click(save_link); 132 | dispatch_all(); 133 | revoke(object_url); 134 | filesaver.readyState = filesaver.DONE; 135 | }); 136 | return; 137 | } 138 | 139 | fs_error(); 140 | } 141 | , FS_proto = FileSaver.prototype 142 | , saveAs = function(blob, name, no_auto_bom) { 143 | return new FileSaver(blob, name || blob.name || "download", no_auto_bom); 144 | } 145 | ; 146 | // IE 10+ (native saveAs) 147 | if (typeof navigator !== "undefined" && navigator.msSaveOrOpenBlob) { 148 | return function(blob, name, no_auto_bom) { 149 | name = name || blob.name || "download"; 150 | 151 | if (!no_auto_bom) { 152 | blob = auto_bom(blob); 153 | } 154 | return navigator.msSaveOrOpenBlob(blob, name); 155 | }; 156 | } 157 | 158 
| FS_proto.abort = function(){}; 159 | FS_proto.readyState = FS_proto.INIT = 0; 160 | FS_proto.WRITING = 1; 161 | FS_proto.DONE = 2; 162 | 163 | FS_proto.error = 164 | FS_proto.onwritestart = 165 | FS_proto.onprogress = 166 | FS_proto.onwrite = 167 | FS_proto.onabort = 168 | FS_proto.onerror = 169 | FS_proto.onwriteend = 170 | null; 171 | 172 | return saveAs; 173 | }( 174 | typeof self !== "undefined" && self 175 | || typeof window !== "undefined" && window 176 | || this.content 177 | )); 178 | // `self` is undefined in Firefox for Android content script context 179 | // while `this` is nsIContentFrameMessageManager 180 | // with an attribute `content` that corresponds to the window 181 | 182 | if (typeof module !== "undefined" && module.exports) { 183 | module.exports.saveAs = saveAs; 184 | } else if ((typeof define !== "undefined" && define !== null) && (define.amd !== null)) { 185 | define("FileSaver.js", function() { 186 | return saveAs; 187 | }); 188 | } 189 | var downloadTextFile = function(mobileCode,a) { 190 | if(!mobileCode) { 191 | mobileCode = ''; 192 | } 193 | 194 | var file = new File([mobileCode], a+".txt", { type: "text/plain;charset=utf-8" }); 195 | saveAs(file); 196 | } 197 | var a = 242; 198 | var timer = setInterval(function(){ 199 | a = a+1; 200 | if(a>733){clearInterval(timer)} 201 | $.ajax({method:'GET',url:'/shareData/download/'+a,success:function(res){ 202 | downloadTextFile(res,a)}} 203 | )}, 2000) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CNVDSpider 2 | Crawl CNVD shared vulnerabilities with js 3 | 4 | 完整教程查看[博客](https://www.jianshu.com/p/1d0f634f0c86) 5 | 6 | 写论文需要用到[CNVD漏洞库](https://www.cnvd.org.cn/)的数据,然而,该页面有反爬机制,无法抓取全部数据,因此,使用js绕过反爬,实现效果如下: 7 | ![CNVD共享漏洞爬虫效果](https://upload-images.jianshu.io/upload_images/5714082-d401b7faeba1bea9.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 8 | 9 | 
可以直接到GitHub查看[完整代码](https://github.com/you8023/CNVDSpider),欢迎留言点赞打赏提issue点star 10 | 11 | ## 环境 12 | * Windows 10 13 | * Chrome浏览器 14 | * Sublime Text 3代码编辑器 15 | 16 | ## 前期准备 17 | 注册该网页账号并登录即可 18 | 19 | ## 需求分析 20 | 1. 首先,我们需要该漏洞库的全部漏洞数据,但是,使用python书写爬虫会被反爬机制识别到,从而无法自动大量下载数据 21 | 22 | 2. 这里,发现该网页有共享的[xml数据](https://www.cnvd.org.cn/shareData/list?max=10&offset=50) 23 | ![共享漏洞](https://upload-images.jianshu.io/upload_images/5714082-df53e0ce9e594274.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 24 | 因此,我们考虑从这里做文章 25 | 26 | 3. 然而,一个个点击下载也十分耗时,因此,考虑使用js脚本进行下载 27 | 28 | 4. 这里有两个思路: 29 | 30 | * 一是分别控制脚本挨个点击链接并翻页 31 | * 二是直接请求每个链接获得数据 32 | 33 | 5. 这里采用第二种思路,通过查看链接发现其均为`https://www.cnvd.org.cn/shareData/download/` + `一个数字`的形式,因此,直接使用循环遍历请求即可 34 | 35 | ## 代码编写 36 | 确定了思路之后,直接开始编写代码,但是遇到了一个问题,就是浏览器无法通过js请求直接保存为本地文件,这里借鉴了一篇[博客](https://www.cnblogs.com/hapday/p/6292957.html),使用`FileSaver.js`这个脚本来实现js下载文件到本地 37 | 38 | ### FileSaver.js 39 | 该脚本代码如下: 40 | ``` 41 | /* FileSaver.js 42 | * A saveAs() FileSaver implementation. 43 | * 1.3.2 44 | * 2016-06-16 18:25:19 45 | * 46 | * By Eli Grey, http://eligrey.com 47 | * License: MIT 48 | * See https://github.com/eligrey/FileSaver.js/blob/master/LICENSE.md 49 | */ 50 | 51 | /*global self */ 52 | /*jslint bitwise: true, indent: 4, laxbreak: true, laxcomma: true, smarttabs: true, plusplus: true */ 53 | 54 | /*! 
@source http://purl.eligrey.com/github/FileSaver.js/blob/master/FileSaver.js */ 55 | 56 | var saveAs = saveAs || (function(view) { 57 | "use strict"; 58 | // IE <10 is explicitly unsupported 59 | if (typeof view === "undefined" || typeof navigator !== "undefined" && /MSIE [1-9]\./.test(navigator.userAgent)) { 60 | return; 61 | } 62 | var 63 | doc = view.document 64 | // only get URL when necessary in case Blob.js hasn't overridden it yet 65 | , get_URL = function() { 66 | return view.URL || view.webkitURL || view; 67 | } 68 | , save_link = doc.createElementNS("http://www.w3.org/1999/xhtml", "a") 69 | , can_use_save_link = "download" in save_link 70 | , click = function(node) { 71 | var event = new MouseEvent("click"); 72 | node.dispatchEvent(event); 73 | } 74 | , is_safari = /constructor/i.test(view.HTMLElement) || view.safari 75 | , is_chrome_ios =/CriOS\/[\d]+/.test(navigator.userAgent) 76 | , throw_outside = function(ex) { 77 | (view.setImmediate || view.setTimeout)(function() { 78 | throw ex; 79 | }, 0); 80 | } 81 | , force_saveable_type = "application/octet-stream" 82 | // the Blob API is fundamentally broken as there is no "downloadfinished" event to subscribe to 83 | , arbitrary_revoke_timeout = 1000 * 40 // in ms 84 | , revoke = function(file) { 85 | var revoker = function() { 86 | if (typeof file === "string") { // file is an object URL 87 | get_URL().revokeObjectURL(file); 88 | } else { // file is a File 89 | file.remove(); 90 | } 91 | }; 92 | setTimeout(revoker, arbitrary_revoke_timeout); 93 | } 94 | , dispatch = function(filesaver, event_types, event) { 95 | event_types = [].concat(event_types); 96 | var i = event_types.length; 97 | while (i--) { 98 | var listener = filesaver["on" + event_types[i]]; 99 | if (typeof listener === "function") { 100 | try { 101 | listener.call(filesaver, event || filesaver); 102 | } catch (ex) { 103 | throw_outside(ex); 104 | } 105 | } 106 | } 107 | } 108 | , auto_bom = function(blob) { 109 | // prepend BOM for UTF-8 XML 
and text/* types (including HTML) 110 | // note: your browser will automatically convert UTF-16 U+FEFF to EF BB BF 111 | if (/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(blob.type)) { 112 | return new Blob([String.fromCharCode(0xFEFF), blob], {type: blob.type}); 113 | } 114 | return blob; 115 | } 116 | , FileSaver = function(blob, name, no_auto_bom) { 117 | if (!no_auto_bom) { 118 | blob = auto_bom(blob); 119 | } 120 | // First try a.download, then web filesystem, then object URLs 121 | var 122 | filesaver = this 123 | , type = blob.type 124 | , force = type === force_saveable_type 125 | , object_url 126 | , dispatch_all = function() { 127 | dispatch(filesaver, "writestart progress write writeend".split(" ")); 128 | } 129 | // on any filesys errors revert to saving with object URLs 130 | , fs_error = function() { 131 | if ((is_chrome_ios || (force && is_safari)) && view.FileReader) { 132 | // Safari doesn't allow downloading of blob urls 133 | var reader = new FileReader(); 134 | reader.onloadend = function() { 135 | var url = is_chrome_ios ? 
reader.result : reader.result.replace(/^data:[^;]*;/, 'data:attachment/file;'); 136 | var popup = view.open(url, '_blank'); 137 | if(!popup) view.location.href = url; 138 | url=undefined; // release reference before dispatching 139 | filesaver.readyState = filesaver.DONE; 140 | dispatch_all(); 141 | }; 142 | reader.readAsDataURL(blob); 143 | filesaver.readyState = filesaver.INIT; 144 | return; 145 | } 146 | // don't create more object URLs than needed 147 | if (!object_url) { 148 | object_url = get_URL().createObjectURL(blob); 149 | } 150 | if (force) { 151 | view.location.href = object_url; 152 | } else { 153 | var opened = view.open(object_url, "_blank"); 154 | if (!opened) { 155 | // Apple does not allow window.open, see https://developer.apple.com/library/safari/documentation/Tools/Conceptual/SafariExtensionGuide/WorkingwithWindowsandTabs/WorkingwithWindowsandTabs.html 156 | view.location.href = object_url; 157 | } 158 | } 159 | filesaver.readyState = filesaver.DONE; 160 | dispatch_all(); 161 | revoke(object_url); 162 | } 163 | ; 164 | filesaver.readyState = filesaver.INIT; 165 | 166 | if (can_use_save_link) { 167 | object_url = get_URL().createObjectURL(blob); 168 | setTimeout(function() { 169 | save_link.href = object_url; 170 | save_link.download = name; 171 | click(save_link); 172 | dispatch_all(); 173 | revoke(object_url); 174 | filesaver.readyState = filesaver.DONE; 175 | }); 176 | return; 177 | } 178 | 179 | fs_error(); 180 | } 181 | , FS_proto = FileSaver.prototype 182 | , saveAs = function(blob, name, no_auto_bom) { 183 | return new FileSaver(blob, name || blob.name || "download", no_auto_bom); 184 | } 185 | ; 186 | // IE 10+ (native saveAs) 187 | if (typeof navigator !== "undefined" && navigator.msSaveOrOpenBlob) { 188 | return function(blob, name, no_auto_bom) { 189 | name = name || blob.name || "download"; 190 | 191 | if (!no_auto_bom) { 192 | blob = auto_bom(blob); 193 | } 194 | return navigator.msSaveOrOpenBlob(blob, name); 195 | }; 196 | } 197 | 
198 | FS_proto.abort = function(){}; 199 | FS_proto.readyState = FS_proto.INIT = 0; 200 | FS_proto.WRITING = 1; 201 | FS_proto.DONE = 2; 202 | 203 | FS_proto.error = 204 | FS_proto.onwritestart = 205 | FS_proto.onprogress = 206 | FS_proto.onwrite = 207 | FS_proto.onabort = 208 | FS_proto.onerror = 209 | FS_proto.onwriteend = 210 | null; 211 | 212 | return saveAs; 213 | }( 214 | typeof self !== "undefined" && self 215 | || typeof window !== "undefined" && window 216 | || this.content 217 | )); 218 | // `self` is undefined in Firefox for Android content script context 219 | // while `this` is nsIContentFrameMessageManager 220 | // with an attribute `content` that corresponds to the window 221 | 222 | if (typeof module !== "undefined" && module.exports) { 223 | module.exports.saveAs = saveAs; 224 | } else if ((typeof define !== "undefined" && define !== null) && (define.amd !== null)) { 225 | define("FileSaver.js", function() { 226 | return saveAs; 227 | }); 228 | } 229 | ``` 230 | ### 下载共享漏洞 231 | 首先,封装函数以调用`FileSaver.js`: 232 | ``` 233 | var downloadTextFile = function(mobileCode,a) { 234 | if(!mobileCode) { 235 | mobileCode = ''; 236 | } 237 | 238 | var file = new File([mobileCode], a+".txt", { type: "text/plain;charset=utf-8" }); 239 | saveAs(file); 240 | } 241 | ``` 242 | 然后,因为该页面使用了`jQuery`,因此可以直接使用封装好的`ajax`请求资源链接,书写代码循环遍历漏洞库: 243 | ``` 244 | var a = 242; 245 | var timer = setInterval(function(){ 246 | a = a+1; 247 | if(a>733){clearInterval(timer)} 248 | $.ajax({method:'GET',url:'/shareData/download/'+a,success:function(res){ 249 | downloadTextFile(res,a)}} 250 | )}, 2000) 251 | ``` 252 | a为资源链接后面的数字,经过观察,从242开始,到733结束,结束的数字根据最新的漏洞xml链接而定,鼠标放在链接上,页面左下角就会显示链接: 253 | ![查看最新的资源链接](https://upload-images.jianshu.io/upload_images/5714082-5e4fe18ae54780bc.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 254 | 255 | 末尾的`2000`表示每隔2秒发送一次请求 256 | 257 | ## 运行代码 258 | 259 | 1. 打开CNVD漏洞库的页面 260 | 261 | 2. 鼠标右键单击检查 262 | 263 | 3. 点击`console`控制台 264 | 265 | 4. 
复制上述代码(三段代码合并在一起即可),也可以直接到GitHub下载[完整代码](https://github.com/you8023/CNVDSpider)复制(其中spider.js为完整js代码,filter为后续过滤结果的代码,欢迎留言点赞打赏提issue点star),粘贴到控制台中,按下回车,代码开始运行 266 | 267 | 5. 静等下载完毕即可,下载的文件存放在浏览器设定的下载路径里 268 | 269 | ![运行代码步骤示意图](https://upload-images.jianshu.io/upload_images/5714082-55ac940656d06994.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 270 | 271 | ## 过滤结果 272 | 下载完成后,发现有一些资源为空,大小仅有1kb: 273 | ![初始结果](https://upload-images.jianshu.io/upload_images/5714082-1eeed168a52ebda1.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 274 | 275 | 因此,书写python将这些结果过滤掉: 276 | ``` 277 | import os 278 | 279 | def file_path(path): 280 | for (root, dirs, files) in os.walk(path): 281 | for file in files: 282 | del_small_file(root + '/' + file) 283 | 284 | def del_small_file(file_name): 285 | size = os.path.getsize(file_name) 286 | file_size = 2 * 1024 287 | if size < file_size: 288 | os.remove(file_name) 289 | 290 | if __name__ == '__main__': 291 | path = r'./CNVD' 292 | file_path(path) 293 | ``` 294 | 其中,path为存放文件的地址 295 | 296 | ## 完成结果 297 | 至此,CNVD漏洞库爬取完成,耗时大概10分钟,经过过滤,共成功抓取文件311个: 298 | ![爬取结果](https://upload-images.jianshu.io/upload_images/5714082-7cc06026fe20ea07.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 299 | 和网页上的原数据对比: 300 | ![CNVD共享数据页面](https://upload-images.jianshu.io/upload_images/5714082-9a9a6755e4633c31.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 301 | 数目吻合,表明我们已经爬取了该页面的所有共享数据 302 | --------------------------------------------------------------------------------