├── README.md
├── config.py
├── images
│   └── cookie_arg.png
├── main.py
├── patten.json
├── yunsee_verify_5ec1f0444e22c.txt
└── yunsee_verify_5ec74dfb508af.txt

/README.md:
--------------------------------------------------------------------------------
# jsspider
A JavaScript information digging tool.

#### Details for reference
An introduction to this tool has been published on the blog linked below. If you are interested, you can leave suggestions for improvements, content, or features in the comments under that post.
Note: the comment system is DISQUS, which is hosted abroad and not reachable from mainland China, so readers need a proxy to post or read comments.

[Link](http://qclover.cn/2019/05/14/%E8%B0%88js%E9%9D%99%E6%80%81%E6%96%87%E4%BB%B6%E5%9C%A8%E6%BC%8F%E6%B4%9E%E6%8C%96%E6%8E%98%E4%B8%AD%E7%9A%84%E5%88%A9%E7%94%A8.html)

PS: Due to limited time recently, some features are still unpolished. For example, the crawler does not yet generate a separate result file per crawled site; updates will follow when time allows.

Update (June 10)
A cookie argument was added so that content behind a login can be crawled.

`python3 main.py -u http://target.com -d 1 -c cookie`

![img](https://raw.githubusercontent.com/Qclover/jsspider/master/images/cookie_arg.png)

--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
Addcode = '''
function(){
    _addEventListener = Element.prototype.addEventListener;
    var EVENT_LIST = new Array();
    Element.prototype.addEventListener = function(a, b, c){
        // "a" is the event type passed to addEventListener
        EVENT_LIST.push({"event": a, "element": this});
        _addEventListener.apply(this, arguments);
    };
    for (var i in EVENT_LIST){
        var evt = document.createEvent('CustomEvent');
        evt.initCustomEvent(EVENT_LIST[i]["event"], true, true, null);
        EVENT_LIST[i]["element"].dispatchEvent(evt);
    }
    console.log(EVENT_LIST);
    return EVENT_LIST;
}
'''

Domcode = '''
function(){
    var links = '';

    function trigger_inline() {
        var nodes = document.all;
        for (var i = 0; i < nodes.length; i++) {
            var attrs = nodes[i].attributes;
            for (var j = 0; j < attrs.length; j++) {
                attr_name = attrs[j].nodeName;
                attr_value = attrs[j].nodeValue;
                if (attr_name.substr(0, 2) == "on") {
                    console.log(attr_name + ' : ' + attr_value);
                    //eval(attr_value.split('return')[0] + ';');
                }
                if (attr_name in {
                        "src": 1,
                        "href": 1
                    } && attrs[j].nodeValue.substr(0, 11) == "javascript:") {
                    console.log(attr_name + ' : ' + attr_value);
                    //eval(attr_value.substr(11).split('return')[0] + ';');
                }
            }
        }
    }
    trigger_inline();
    var getAbsoluteUrl = (function () {
        var a;
        return function (url) {
            if (!a) {
                a = document.createElement('a');
            }
            a.href = url;
            return a.href;
        };
    })();
    atags = document.getElementsByTagName("a");

    for (var i = 0; i < atags.length; i++) {
        if (atags[i].getAttribute("href")) {
            links += getAbsoluteUrl(atags[i].getAttribute("href")) + '***';
        }
    }
    iframetag = document.getElementsByTagName("iframe");
    for (var i = 0; i < iframetag.length; i++) {
        if (iframetag[i].getAttribute("src")) {
            links += getAbsoluteUrl(iframetag[i].getAttribute("src")) + '***';
        }
    }
    ftags = document.getElementsByTagName("form");
    for (var i = 0; i < ftags.length; i++) {
        var link = '';
        var action = getAbsoluteUrl(ftags[i].getAttribute("action"));
        if (action) {
            if (action.substr(action.length - 1, 1) == '#') {
                link = action.substr(0, action.length - 1);
            } else {
                link = action + '?';
            }
            for (var j = 0; j < ftags[i].elements.length; j++) {
                if (ftags[i].elements[j].tagName == 'INPUT') {
                    link = link + ftags[i].elements[j].name + '=';
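                    // If the input element has no preset value, a dummy value ("Abc123456!")
                    // is filled in below so the generated form URL still carries a parameter.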
                    if (ftags[i].elements[j].value == "" || ftags[i].elements[j].value == null) {
                        link = link + 'Abc123456!' + '&';
                    } else {
                        link = link + ftags[i].elements[j].value + '&';
                    }
                }
            }
        }
        links += link.substr(0, link.length - 1) + '***';
    }
    document.addEventListener('DOMNodeInserted', function(e) {
        var node = e.target;
        if (node.src || node.href) {
            links += (node.src || node.href) + '***';
        }
    }, true);
    return links;
}
'''

ajaxcode = '''
function(){
    var ajax_LIST = new Array();
    XMLHttpRequest.prototype.__originalOpen = XMLHttpRequest.prototype.open;
    XMLHttpRequest.prototype.open = function(method, url, async, user, password) {
        // hook code: record every requested URL
        ajax_LIST.push({"url": url});
        return this.__originalOpen(method, url, async, user, password);
    };
    XMLHttpRequest.prototype.__originalSend = XMLHttpRequest.prototype.send;
    XMLHttpRequest.prototype.send = function(data) {
        // hook code
        return this.__originalSend(data);
    };
    return ajax_LIST;
}
'''

--------------------------------------------------------------------------------
/images/cookie_arg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Qclover/jsspider/c90238365a9a291a664fea2f314afc0f32e8c5ff/images/cookie_arg.png

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import asyncio
from config import Addcode, Domcode, ajaxcode
from pyppeteer import launch
from datetime import datetime
from threadpool import makeRequests, ThreadPool
from queue import Queue
from threading import Thread
from threading import Condition
from threading import Lock
from urllib import parse
import sys
import time
import os
import argparse
import re
import json
import hashlib


class Topspider(object):
    """Crawler class."""

    def __init__(self, urllist, depth, threadNum, file, cookie):
        """Initialization parameters"""
        # Operating status
        self.status = False
        # Condition used to wake/wait the scheduler as worker batches finish
        self.processcondition = Condition()
        # Number of currently running crawl tasks
        self.currentrun = 0
        # Queue of URLs waiting to be crawled
        self.urlQue = Queue()
        # MD5 hashes of URLs that have already been requested
        self.visitedurl = []
        # Number of worker threads
        self.threadNum = threadNum
        # Thread pool sized to threadNum
        self.pool = ThreadPool(self.threadNum)
        # Target crawl depth
        self.depth = depth
        # Browser instance (created per batch)
        self.browser = ''
        self.tasksall = []
        # Load the regex pattern file (customizable)
        self.re_json = json.load(open('patten.json', 'r', encoding='utf-8'))
        self.resfile = 'result.txt'
        # File that collected URLs are written to (-f option)
        self.file = file
        # Cookie header sent with every request (-c option)
        self.cookie = {'Cookie': cookie}
        # Seed the queue with the start URLs
        for url in urllist:
            self.urlQue.put({'url': url, "depth": int(depth)})
        # Domain currently being crawled
        self.spiderdomain = 'start'

    def urlparser(self, url):
        """Return an absolute URL; relative URLs are prefixed with the current domain."""
        if url.startswith('http://') or url.startswith('https://'):
            url = url
        else:
            url = self.spiderdomain + '/' + url
        return url

    def Deduplication(self, u):
        """Store the results according to the type of the returned data."""
        if type(u) == dict:
            urls = u['urls']
            depth = u['depth']
            for url in urls:
                if url != '':
                    url2 = self.urlparser(url)
                    url1 = url2.encode(encoding='utf-8')
                    url_md5 = hashlib.md5(url1).hexdigest()
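                    # Only write out and enqueue URLs whose MD5 has not been seen yet.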
                    if url_md5 not in self.visitedurl:
                        with open(self.file, 'a') as f:
                            f.write(str(url) + '\n')
                        self.urlQue.put({'url': url, "depth": depth})
        if type(u) == list:
            for t in u:
                with open(self.resfile, 'a') as f:
                    f.write(str(t) + '\n')

    def start(self):
        """Start crawling."""
        self.status = True
        print('\n[-] Spider Starting ...........Domain is %s' % self.spiderdomain)
        self.urlmanagement()
        self.stop()

    def spider(self, urls, loop):
        """Crawl one batch of URLs in its own event loop (runs in a pool thread)."""
        print(urls)
        try:
            tasks = []
            asyncio.set_event_loop(loop)
            loop.run_until_complete(self.get_browser())
            for x in urls:
                url = x['url']
                depth = x['depth']
                if depth == 0:
                    # Depth exhausted: only extract page content
                    tasks.append(asyncio.ensure_future(
                        self.Identify_content(url)))
                else:
                    # Depth remaining: collect further URLs
                    tasks.append(asyncio.ensure_future(
                        self.get_url(url, depth)))
            future = asyncio.wait(tasks)
            loop.run_until_complete(future)
            for task in tasks:
                self.Deduplication(task.result())
            loop.run_until_complete(self.close_browser())
        except Exception as e:
            # On failure, release this batch's slots and wake the scheduler
            self.processcondition.acquire()
            self.currentrun = self.currentrun - len(urls)
            self.processcondition.notify()
            self.processcondition.release()
            return None
        self.processcondition.acquire()
        self.currentrun = self.currentrun - len(urls)
        self.processcondition.notify()
        self.processcondition.release()

    def urlmanagement(self):
        """Schedule queued URLs in batches on the thread pool."""
        self.processcondition.acquire()
        while True:
            if not (self.urlQue.empty() and self.currentrun == 0):
                urls = []
                spiderlist = []
                while not self.urlQue.empty():
                    item = self.urlQue.get()
                    murl = item['url'].encode(encoding='utf-8')
                    url_md = hashlib.md5(murl).hexdigest()
                    if url_md not in self.visitedurl:
                        urls.append(item)
                        self.visitedurl.append(url_md)
                threadn = len(urls)
                step = 5
                for i in range(0, len(urls), step):
                    ulist = urls[i:i + step]
                    batch_loop = asyncio.new_event_loop()
                    spiderlist.append(([ulist, batch_loop], None))
                request = makeRequests(self.spider, spiderlist)
                self.currentrun = self.currentrun + threadn
                [self.pool.putRequest(g) for g in request]
                self.processcondition.wait()
            else:
                break

    def stop(self):
        """Stop hook; currently only flips the status flag."""
        # self.urlfile.close()
        self.status = False

    async def get_browser(self):
        """Launch the headless browser."""
        self.browser = await launch({'headless': True, 'handleSIGINT': False,
                                     'handleSIGTERM': False, 'handleSIGHUP': False})

    async def close_browser(self):
        """Close the browser."""
        await self.browser.close()

    async def get_url(self, url, depth):
        """Collect URLs from a page while depth remains; returns a dict of URLs and the next depth."""
        if depth >= int(self.depth):
            dep = 0
        else:
            dep = depth + 1
        page = await self.browser.newPage()
        await page.setExtraHTTPHeaders(self.cookie)
        await page.evaluate(ajaxcode)
        await page.evaluate(Addcode)
        await page.goto(url)
        await page.waitFor(5000)
        Domcode1 = await page.evaluate(Domcode)
        ls = Domcode1.split('***')
        elementsj = await page.querySelectorAll('script')
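        # Also collect the src attribute of every <script> tag as a candidate URL.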
        for elementj in elementsj:
            url2 = await page.evaluate('(elementj) => elementj.src', elementj)
            ls.append(url2)
        lls = {'depth': dep, 'urls': set(ls)}
        return lls

    async def Identify_content(self, url):
        """Run when depth reaches 0: extract page content with regexes (customizable). Returns a list."""
        re_list = []

        page = await self.browser.newPage()
        await page.goto(url)
        await page.waitFor(5000)
        if parse.urlparse(url).path.split('.')[-1].startswith('js'):
            mm = []
            html = await page.content()
            # print(html)
            # for keys in self.re_json.keys():
            #     pattern = r'([a-z]+(/[0-9a-z]+)+/?)+'
            #     pattern2 = r'\"[^"]*\"'
            #     m = re.compile(pattern2, re.S).findall(html)
            #     # print(m)
            #     text = []
            #     for l in m:
            #         m2 = re.search(pattern, l.strip('\"'), re.I)
            #         if m2:
            #             text.append(m2.group(1))
            #             print(m2.group(1))
            regex_str = r"""
            (?:"|')                                 # Start newline delimiter
            (
                ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
                [^"'/]{1,}\.                        # Match a domainname (any character + dot)
                [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
                |
                ((?:/|\.\./|\./)                    # Start with /,../,./
                [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
                [^"'><,;|()]{1,})                   # Rest of the characters can't be
                |
                ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
                [a-zA-Z0-9_\-/]{1,}                 # Resource name
                \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
                (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
                |
                ([a-zA-Z0-9_\-]{1,}                 # filename
                \.(?:php|asp|aspx|jsp|json|
                     action|html|js|txt|xml)        # . + extension
                (?:\?[^"|']{0,}|))                  # ? mark with parameters
            )
            (?:"|')                                 # End newline delimiter
            """
            regex = re.compile(regex_str, re.VERBOSE)
            for m in re.finditer(regex, html):
                mm.append(m.group(1))
        else:
            mm = await page.title()
        re_list = [{url: mm}]
        await asyncio.sleep(3)
        return re_list


class printInfo(Thread):
    """Progress-printing thread."""

    def __init__(self, Topspider):
        Thread.__init__(self)
        # Start time
        self.startTime = datetime.now()
        # Run as a daemon thread
        self.daemon = True
        # The crawler instance to report on
        self.topspider = Topspider
        # Start the thread (invokes run)
        self.start()

    def printEnd(self):
        """Print the final summary."""
        self.endTime = datetime.now()
        print('Crawl Depth:%d, Totally visited %d links.\n' % (self.topspider.depth, len(self.topspider.visitedurl)))
        print('Start at: ' + self.startTime.strftime("%Y-%m-%d %H:%M:%S"))
        print('End at: ' + self.endTime.strftime("%Y-%m-%d %H:%M:%S"))
        print('Spend time: ' + str((self.endTime - self.startTime).seconds) + 's')
        print('[-] Finished......')

    def run(self):
        while True:
            if self.topspider.status is True:
                time.sleep(10)
                print('[+] Now totally visited %s links , %s Coroutines is running .\n'
                      % (int(len(self.topspider.visitedurl)), int(self.topspider.currentrun)))


if __name__ == '__main__':
    print('[+] Run current process pid:%s...' % os.getpid())
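    # Illustrative invocations (the list file name and cookie value are placeholders):
    #   python3 main.py -u http://target.com -d 1 -c "JSESSIONID=xxxx"
    #   python3 main.py -us urls.txt -d 2 -t 3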
    parser = argparse.ArgumentParser(description="Spider URL information")
    group = parser.add_mutually_exclusive_group()
    parser.add_argument(
        "-d", "--depth", help="set spider depth, default 1", type=int, default=1)
    group.add_argument(
        "-u", "--url", help="set spider url")
    group.add_argument(
        "-us", "--urls", help="set a file of spider urls, one per line")
    parser.add_argument(
        "-c", "--cookie", help="set cookie", default='')
    parser.add_argument(
        "-t", "--threadNum", help="set threads num, default 1", type=int, default=1)
    parser.add_argument(
        '-f', '--file', help='set save path', default='file.txt')
    parser.add_argument(
        "-v", '--version', action='version', version='version 1.0')

    args = parser.parse_args()
    urllist = []

    if args.urls:
        with open(args.urls, 'r') as f:
            for i in f.read().split('\n'):
                if i.strip() == '':
                    continue
                print(i)
                if i.startswith('http://') or i.startswith('https://'):
                    urllist.append(i)
                else:
                    urllist.append('http://' + i)
        print(urllist)
    if args.url is None and not urllist:
        parser.print_help()
    else:
        if args.url:
            urllist.append(args.url)

        spider = Topspider(urllist, args.depth, args.threadNum, args.file, args.cookie)
        # info = printInfo(spider)
        spider.start()
        # info.printEnd()

--------------------------------------------------------------------------------
/patten.json:
--------------------------------------------------------------------------------
{
    "jsurl": "\\/.*\\/[a-z|A-Z]+"
}

--------------------------------------------------------------------------------
/yunsee_verify_5ec1f0444e22c.txt:
--------------------------------------------------------------------------------
f041e0b824d29122c0e0190f24ffda785ec1f0444e233

--------------------------------------------------------------------------------
/yunsee_verify_5ec74dfb508af.txt:
--------------------------------------------------------------------------------
7d962b992973e9670163eb220c8d17305ec74dfb508b6

--------------------------------------------------------------------------------