├── JSFinder.py └── README.md /JSFinder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python" 2 | # coding: utf-8 3 | # By Threezh1 4 | # https://threezh1.github.io/ 5 | 6 | import requests, argparse, sys, re 7 | from requests.packages import urllib3 8 | from urllib.parse import urlparse 9 | from bs4 import BeautifulSoup 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser(epilog='\tExample: \r\npython ' + sys.argv[0] + " -u http://www.baidu.com") 13 | parser.add_argument("-u", "--url", help="The website") 14 | parser.add_argument("-c", "--cookie", help="The website cookie") 15 | parser.add_argument("-f", "--file", help="The file contains url or js") 16 | parser.add_argument("-ou", "--outputurl", help="Output file name. ") 17 | parser.add_argument("-os", "--outputsubdomain", help="Output file name. ") 18 | parser.add_argument("-j", "--js", help="Find in js file", action="store_true") 19 | parser.add_argument("-d", "--deep",help="Deep find", action="store_true") 20 | return parser.parse_args() 21 | 22 | # Regular expression comes from https://github.com/GerbenJavado/LinkFinder 23 | def extract_URL(JS): 24 | pattern_raw = r""" 25 | (?:"|') # Start newline delimiter 26 | ( 27 | ((?:[a-zA-Z]{1,10}://|//) # Match a scheme [a-Z]*1-10 or // 28 | [^"'/]{1,}\. # Match a domainname (any character + dot) 29 | [a-zA-Z]{2,}[^"']{0,}) # The domainextension and/or path 30 | | 31 | ((?:/|\.\./|\./) # Start with /,../,./ 32 | [^"'><,;| *()(%%$^/\\\[\]] # Next character can't be... 33 | [^"'><,;|()]{1,}) # Rest of the characters can't be 34 | | 35 | ([a-zA-Z0-9_\-/]{1,}/ # Relative endpoint with / 36 | [a-zA-Z0-9_\-/]{1,} # Resource name 37 | \.(?:[a-zA-Z]{1,4}|action) # Rest + extension (length 1-4 or action) 38 | (?:[\?|/][^"|']{0,}|)) # ? mark with parameters 39 | | 40 | ([a-zA-Z0-9_\-]{1,} # filename 41 | \.(?:php|asp|aspx|jsp|json| 42 | action|html|js|txt|xml) # . + extension 43 | (?:\?[^"|']{0,}|)) # ? mark with parameters 44 | ) 45 | (?:"|') # End newline delimiter 46 | """ 47 | pattern = re.compile(pattern_raw, re.VERBOSE) 48 | result = re.finditer(pattern, str(JS)) 49 | if result == None: 50 | return None 51 | js_url = [] 52 | return [match.group().strip('"').strip("'") for match in result 53 | if match.group() not in js_url] 54 | 55 | # Get the page source 56 | def Extract_html(URL): 57 | header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36", 58 | "Cookie": args.cookie} 59 | try: 60 | raw = requests.get(URL, headers = header, timeout=3, verify=False) 61 | raw = raw.content.decode("utf-8", "ignore") 62 | return raw 63 | except: 64 | return None 65 | 66 | # Handling relative URLs 67 | def process_url(URL, re_URL): 68 | black_url = ["javascript:"] # Add some keyword for filter url. 69 | URL_raw = urlparse(URL) 70 | ab_URL = URL_raw.netloc 71 | host_URL = URL_raw.scheme 72 | if re_URL[0:2] == "//": 73 | result = host_URL + ":" + re_URL 74 | elif re_URL[0:4] == "http": 75 | result = re_URL 76 | elif re_URL[0:2] != "//" and re_URL not in black_url: 77 | if re_URL[0:1] == "/": 78 | result = host_URL + "://" + ab_URL + re_URL 79 | else: 80 | if re_URL[0:1] == ".": 81 | if re_URL[0:2] == "..": 82 | result = host_URL + "://" + ab_URL + re_URL[2:] 83 | else: 84 | result = host_URL + "://" + ab_URL + re_URL[1:] 85 | else: 86 | result = host_URL + "://" + ab_URL + "/" + re_URL 87 | else: 88 | result = URL 89 | return result 90 | 91 | def find_last(string,str): 92 | positions = [] 93 | last_position=-1 94 | while True: 95 | position = string.find(str,last_position+1) 96 | if position == -1:break 97 | last_position = position 98 | positions.append(position) 99 | return positions 100 | 101 | def find_by_url(url, js = False): 102 | if js == False: 103 | try: 104 | print("url:" + url) 105 | except: 106 | print("Please specify a URL like https://www.baidu.com") 107 | html_raw = Extract_html(url) 108 | if html_raw == None: 109 | print("Fail to access " + url) 110 | return None 111 | #print(html_raw) 112 | html = BeautifulSoup(html_raw, "html.parser") 113 | html_scripts = html.findAll("script") 114 | script_array = {} 115 | script_temp = "" 116 | for html_script in html_scripts: 117 | script_src = html_script.get("src") 118 | if script_src == None: 119 | script_temp += html_script.get_text() + "\n" 120 | else: 121 | purl = process_url(url, script_src) 122 | script_array[purl] = Extract_html(purl) 123 | script_array[url] = script_temp 124 | allurls = [] 125 | for script in script_array: 126 | #print(script) 127 | temp_urls = extract_URL(script_array[script]) 128 | if len(temp_urls) == 0: continue 129 | for temp_url in temp_urls: 130 | allurls.append(process_url(script, temp_url)) 131 | result = [] 132 | for singerurl in allurls: 133 | url_raw = urlparse(url) 134 | domain = url_raw.netloc 135 | positions = find_last(domain, ".") 136 | miandomain = domain 137 | if len(positions) > 1:miandomain = domain[positions[-2] + 1:] 138 | #print(miandomain) 139 | suburl = urlparse(singerurl) 140 | subdomain = suburl.netloc 141 | #print(singerurl) 142 | if miandomain in subdomain or subdomain.strip() == "": 143 | if singerurl.strip() not in result: 144 | result.append(singerurl) 145 | return result 146 | return sorted(set(extract_URL(Extract_html(url)))) or None 147 | 148 | 149 | def find_subdomain(urls, mainurl): 150 | url_raw = urlparse(mainurl) 151 | domain = url_raw.netloc 152 | miandomain = domain 153 | positions = find_last(domain, ".") 154 | if len(positions) > 1:miandomain = domain[positions[-2] + 1:] 155 | subdomains = [] 156 | for url in urls: 157 | suburl = urlparse(url) 158 | subdomain = suburl.netloc 159 | #print(subdomain) 160 | if subdomain.strip() == "": continue 161 | if miandomain in subdomain: 162 | if subdomain not in subdomains: 163 | subdomains.append(subdomain) 164 | return subdomains 165 | 166 | def find_by_url_deep(url): 167 | html_raw = Extract_html(url) 168 | if html_raw == None: 169 | print("Fail to access " + url) 170 | return None 171 | html = BeautifulSoup(html_raw, "html.parser") 172 | html_as = html.findAll("a") 173 | links = [] 174 | for html_a in html_as: 175 | src = html_a.get("href") 176 | if src == "" or src == None: continue 177 | link = process_url(url, src) 178 | if link not in links: 179 | links.append(link) 180 | if links == []: return None 181 | print("ALL Find " + str(len(links)) + " links") 182 | urls = [] 183 | i = len(links) 184 | for link in links: 185 | temp_urls = find_by_url(link) 186 | if temp_urls == None: continue 187 | print("Remaining " + str(i) + " | Find " + str(len(temp_urls)) + " URL in " + link) 188 | for temp_url in temp_urls: 189 | if temp_url not in urls: 190 | urls.append(temp_url) 191 | i -= 1 192 | return urls 193 | 194 | 195 | def find_by_file(file_path, js=False): 196 | with open(file_path, "r") as fobject: 197 | links = fobject.read().split("\n") 198 | if links == []: return None 199 | print("ALL Find " + str(len(links)) + " links") 200 | urls = [] 201 | i = len(links) 202 | for link in links: 203 | if js == False: 204 | temp_urls = find_by_url(link) 205 | else: 206 | temp_urls = find_by_url(link, js=True) 207 | if temp_urls == None: continue 208 | print(str(i) + " Find " + str(len(temp_urls)) + " URL in " + link) 209 | for temp_url in temp_urls: 210 | if temp_url not in urls: 211 | urls.append(temp_url) 212 | i -= 1 213 | return urls 214 | 215 | def giveresult(urls, domian): 216 | if urls == None: 217 | return None 218 | print("Find " + str(len(urls)) + " URL:") 219 | content_url = "" 220 | content_subdomain = "" 221 | for url in urls: 222 | content_url += url + "\n" 223 | print(url) 224 | subdomains = find_subdomain(urls, domian) 225 | print("\nFind " + str(len(subdomains)) + " Subdomain:") 226 | for subdomain in subdomains: 227 | content_subdomain += subdomain + "\n" 228 | print(subdomain) 229 | if args.outputurl != None: 230 | with open(args.outputurl, "a", encoding='utf-8') as fobject: 231 | fobject.write(content_url) 232 | print("\nOutput " + str(len(urls)) + " urls") 233 | print("Path:" + args.outputurl) 234 | if args.outputsubdomain != None: 235 | with open(args.outputsubdomain, "a", encoding='utf-8') as fobject: 236 | fobject.write(content_subdomain) 237 | print("\nOutput " + str(len(subdomains)) + " subdomains") 238 | print("Path:" + args.outputsubdomain) 239 | 240 | if __name__ == "__main__": 241 | urllib3.disable_warnings() 242 | args = parse_args() 243 | if args.file == None: 244 | if args.deep is not True: 245 | urls = find_by_url(args.url) 246 | giveresult(urls, args.url) 247 | else: 248 | urls = find_by_url_deep(args.url) 249 | giveresult(urls, args.url) 250 | else: 251 | if args.js is not True: 252 | urls = find_by_file(args.file) 253 | giveresult(urls, urls[0]) 254 | else: 255 | urls = find_by_file(args.file, js = True) 256 | giveresult(urls, urls[0]) 257 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JSFinder 2 | 3 | JSFinder is a tool for quickly extracting URLs and subdomains from JS files on a website. 4 | 5 | JSFinder是一款用作快速在网站的js文件中提取URL,子域名的工具。 6 | 7 | 提取URL的正则部分使用的是[LinkFinder](https://github.com/GerbenJavado/LinkFinder) 8 | 9 | JSFinder获取URL和子域名的方式: 10 | 11 | ![image](https://i.loli.net/2020/05/24/R2fImgNZHPkvhEj.png) 12 | 13 | Blog: https://threezh1.com/ 14 | 15 | ## 更新说明 16 | 17 | - 增加油猴脚本用于在浏览器上访问页面时获取域名与接口,具体可见:https://github.com/Threezh1/Deconstruct/tree/main/DevTools_JSFinder 18 | 19 | ## 用法 20 | 21 | - **简单爬取** 22 | 23 | ``` 24 | python JSFinder.py -u http://www.mi.com 25 | ``` 26 | 27 | 这个命令会爬取 http://www.mi.com 这单个页面的所有的js链接,并在其中发现url和子域名 28 | 29 | 返回示例: 30 | 31 | ``` 32 | url:http://www.mi.com 33 | Find 50 URL: 34 | http://api-order.test.mi.com 35 | http://api.order.mi.com 36 | http://userid.xiaomi.com/userId 37 | http://order.mi.com/site/login?redirectUrl= 38 | ...已省略 39 | 40 | Find 26 Subdomain: 41 | api-order.test.mi.com 42 | api.order.mi.com 43 | userid.xiaomi.com 44 | order.mi.com 45 | ...已省略 46 | 47 | ``` 48 | 49 | - **深度爬取** 50 | 51 | ``` 52 | python JSFinder.py -u http://www.mi.com -d 53 | ``` 54 | 55 | 深入一层页面爬取JS,时间会消耗的更长。 56 | 57 | 建议使用-ou 和 -os来指定保存URL和子域名的文件名。 例如: 58 | 59 | ``` 60 | python JSFinder.py -u http://www.mi.com -d -ou mi_url.txt -os mi_subdomain.txt 61 | ``` 62 | 63 | - **批量指定URL/指定JS** 64 | 65 | 指定URL: 66 | 67 | ``` 68 | python JSFinder.py -f text.txt 69 | ``` 70 | 71 | 指定JS: 72 | 73 | ``` 74 | python JSFinder.py -f text.txt -j 75 | ``` 76 | 77 | 可以用brupsuite爬取网站后提取出URL或者JS链接,保存到txt文件中,一行一个。 78 | 79 | 指定URL或JS就不需要加深度爬取,单个页面即可。 80 | 81 | - **其他** 82 | 83 | -c 指定cookie来爬取页面 例: 84 | 85 | ``` 86 | python JSFinder.py -u http://www.mi.com -c "session=xxx" 87 | ``` 88 | 89 | -ou 指定文件名保存URL链接 例: 90 | 91 | ``` 92 | python JSFinder.py -u http://www.mi.com -ou mi_url.txt 93 | ``` 94 | 95 | -os 指定文件名保存子域名 例: 96 | 97 | ``` 98 | python JSFinder.py -u http://www.mi.com -os mi_subdomain.txt 99 | ``` 100 | 101 | - **注意** 102 | 103 | url 不用加引号 104 | 105 | url 需要http:// 或 https:// 106 | 107 | 指定JS文件爬取时,返回的URL为相对URL 108 | 109 | 指定URL文件爬取时,返回的相对URL都会以指定的第一个链接的域名作为其域名来转化为绝对URL。 110 | 111 | - **截图** 112 | 113 | 实测简单爬取: 114 | 115 | ``` 116 | python3 JSFinder.py -u https://www.jd.com/ 117 | ``` 118 | 119 | URL: 120 | 121 | ![02.jpg](https://i.loli.net/2020/05/24/aROFI5fC3UyK8EP.jpg) 122 | 123 | ![03.jpg](https://i.loli.net/2020/05/24/rXC4Bba7oMw8AHW.jpg) 124 | 125 | Subdomain: 126 | 127 | ![01.jpg](https://i.loli.net/2020/05/24/69WvDmy7al4hQfd.jpg) 128 | 129 | 实测深度爬取: 130 | 131 | ``` 132 | python3 JSFinder.py -u https://www.jd.com/ -d -ou jd_url.txt -os jd_domain.txt 133 | ``` 134 | 135 | ![05.jpg](https://i.loli.net/2020/05/24/dhxTQnaW4ef9Vzu.jpg) 136 | 137 | ![06.jpg](https://i.loli.net/2020/05/24/NAX9PnLaW6melVk.jpg) 138 | 139 | 实际测试: 140 | ``` 141 | http://www.oppo.com 142 | URL:4426 个 143 | 子域名:24 个 144 | 145 | http://www.mi.com 146 | URL:1043 个 147 | 子域名:111 个 148 | 149 | http://www.jd.com 150 | URL:3627 个 151 | 子域名:306 个 152 | ``` 153 | --------------------------------------------------------------------------------