├── JSFinder.py
└── README.md


/JSFinder.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python"
  2 | # coding: utf-8
  3 | # By Threezh1
  4 | # https://threezh1.github.io/
  5 | 
  6 | import requests, argparse, sys, re
  7 | from requests.packages import urllib3
  8 | from urllib.parse import urlparse
  9 | from bs4 import BeautifulSoup
 10 | 
 11 | def parse_args():
 12 |     parser = argparse.ArgumentParser(epilog='\tExample: \r\npython ' + sys.argv[0] + " -u http://www.baidu.com")
 13 |     parser.add_argument("-u", "--url", help="The website")
 14 |     parser.add_argument("-c", "--cookie", help="The website cookie")
 15 |     parser.add_argument("-f", "--file", help="The file contains url or js")
 16 |     parser.add_argument("-ou", "--outputurl", help="Output file name. ")
 17 |     parser.add_argument("-os", "--outputsubdomain", help="Output file name. ")
 18 |     parser.add_argument("-j", "--js", help="Find in js file", action="store_true")
 19 |     parser.add_argument("-d", "--deep",help="Deep find", action="store_true")
 20 |     return parser.parse_args()
 21 | 
 22 | # Regular expression comes from https://github.com/GerbenJavado/LinkFinder
 23 | def extract_URL(JS):
 24 | 	pattern_raw = r"""
 25 | 	  (?:"|')                               # Start newline delimiter
 26 | 	  (
 27 | 	    ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
 28 | 	    [^"'/]{1,}\.                        # Match a domainname (any character + dot)
 29 | 	    [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
 30 | 	    |
 31 | 	    ((?:/|\.\./|\./)                    # Start with /,../,./
 32 | 	    [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
 33 | 	    [^"'><,;|()]{1,})                   # Rest of the characters can't be
 34 | 	    |
 35 | 	    ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
 36 | 	    [a-zA-Z0-9_\-/]{1,}                 # Resource name
 37 | 	    \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
 38 | 	    (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
 39 | 	    |
 40 | 	    ([a-zA-Z0-9_\-]{1,}                 # filename
 41 | 	    \.(?:php|asp|aspx|jsp|json|
 42 | 	         action|html|js|txt|xml)             # . + extension
 43 | 	    (?:\?[^"|']{0,}|))                  # ? mark with parameters
 44 | 	  )
 45 | 	  (?:"|')                               # End newline delimiter
 46 | 	"""
 47 | 	pattern = re.compile(pattern_raw, re.VERBOSE)
 48 | 	result = re.finditer(pattern, str(JS))
 49 | 	if result == None:
 50 | 		return None
 51 | 	js_url = []
 52 | 	return [match.group().strip('"').strip("'") for match in result
 53 | 		if match.group() not in js_url]
 54 | 
 55 | # Get the page source
 56 | def Extract_html(URL):
 57 | 	header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
 58 | 	"Cookie": args.cookie}
 59 | 	try:
 60 | 		raw = requests.get(URL, headers = header, timeout=3, verify=False)
 61 | 		raw = raw.content.decode("utf-8", "ignore")
 62 | 		return raw
 63 | 	except:
 64 | 		return None
 65 | 
 66 | # Handling relative URLs
 67 | def process_url(URL, re_URL):
 68 | 	black_url = ["javascript:"]	# Add some keyword for filter url.
 69 | 	URL_raw = urlparse(URL)
 70 | 	ab_URL = URL_raw.netloc
 71 | 	host_URL = URL_raw.scheme
 72 | 	if re_URL[0:2] == "//":
 73 | 		result = host_URL  + ":" + re_URL
 74 | 	elif re_URL[0:4] == "http":
 75 | 		result = re_URL
 76 | 	elif re_URL[0:2] != "//" and re_URL not in black_url:
 77 | 		if re_URL[0:1] == "/":
 78 | 			result = host_URL + "://" + ab_URL + re_URL
 79 | 		else:
 80 | 			if re_URL[0:1] == ".":
 81 | 				if re_URL[0:2] == "..":
 82 | 					result = host_URL + "://" + ab_URL + re_URL[2:]
 83 | 				else:
 84 | 					result = host_URL + "://" + ab_URL + re_URL[1:]
 85 | 			else:
 86 | 				result = host_URL + "://" + ab_URL + "/" + re_URL
 87 | 	else:
 88 | 		result = URL
 89 | 	return result
 90 | 
 91 | def find_last(string,str):
 92 | 	positions = []
 93 | 	last_position=-1
 94 | 	while True:
 95 | 		position = string.find(str,last_position+1)
 96 | 		if position == -1:break
 97 | 		last_position = position
 98 | 		positions.append(position)
 99 | 	return positions
100 | 
101 | def find_by_url(url, js = False):
102 | 	if js == False:
103 | 		try:
104 | 			print("url:" + url)
105 | 		except:
106 | 			print("Please specify a URL like https://www.baidu.com")
107 | 		html_raw = Extract_html(url)
108 | 		if html_raw == None: 
109 | 			print("Fail to access " + url)
110 | 			return None
111 | 		#print(html_raw)
112 | 		html = BeautifulSoup(html_raw, "html.parser")
113 | 		html_scripts = html.findAll("script")
114 | 		script_array = {}
115 | 		script_temp = ""
116 | 		for html_script in html_scripts:
117 | 			script_src = html_script.get("src")
118 | 			if script_src == None:
119 | 				script_temp += html_script.get_text() + "\n"
120 | 			else:
121 | 				purl = process_url(url, script_src)
122 | 				script_array[purl] = Extract_html(purl)
123 | 		script_array[url] = script_temp
124 | 		allurls = []
125 | 		for script in script_array:
126 | 			#print(script)
127 | 			temp_urls = extract_URL(script_array[script])
128 | 			if len(temp_urls) == 0: continue
129 | 			for temp_url in temp_urls:
130 | 				allurls.append(process_url(script, temp_url)) 
131 | 		result = []
132 | 		for singerurl in allurls:
133 | 			url_raw = urlparse(url)
134 | 			domain = url_raw.netloc
135 | 			positions = find_last(domain, ".")
136 | 			miandomain = domain
137 | 			if len(positions) > 1:miandomain = domain[positions[-2] + 1:]
138 | 			#print(miandomain)
139 | 			suburl = urlparse(singerurl)
140 | 			subdomain = suburl.netloc
141 | 			#print(singerurl)
142 | 			if miandomain in subdomain or subdomain.strip() == "":
143 | 				if singerurl.strip() not in result:
144 | 					result.append(singerurl)
145 | 		return result
146 | 	return sorted(set(extract_URL(Extract_html(url)))) or None
147 | 
148 | 
149 | def find_subdomain(urls, mainurl):
150 | 	url_raw = urlparse(mainurl)
151 | 	domain = url_raw.netloc
152 | 	miandomain = domain
153 | 	positions = find_last(domain, ".")
154 | 	if len(positions) > 1:miandomain = domain[positions[-2] + 1:]
155 | 	subdomains = []
156 | 	for url in urls:
157 | 		suburl = urlparse(url)
158 | 		subdomain = suburl.netloc
159 | 		#print(subdomain)
160 | 		if subdomain.strip() == "": continue
161 | 		if miandomain in subdomain:
162 | 			if subdomain not in subdomains:
163 | 				subdomains.append(subdomain)
164 | 	return subdomains
165 | 
166 | def find_by_url_deep(url):
167 | 	html_raw = Extract_html(url)
168 | 	if html_raw == None: 
169 | 		print("Fail to access " + url)
170 | 		return None
171 | 	html = BeautifulSoup(html_raw, "html.parser")
172 | 	html_as = html.findAll("a")
173 | 	links = []
174 | 	for html_a in html_as:
175 | 		src = html_a.get("href")
176 | 		if src == "" or src == None: continue
177 | 		link = process_url(url, src)
178 | 		if link not in links:
179 | 			links.append(link)
180 | 	if links == []: return None
181 | 	print("ALL Find " + str(len(links)) + " links")
182 | 	urls = []
183 | 	i = len(links)
184 | 	for link in links:
185 | 		temp_urls = find_by_url(link)
186 | 		if temp_urls == None: continue
187 | 		print("Remaining " + str(i) + " | Find " + str(len(temp_urls)) + " URL in " + link)
188 | 		for temp_url in temp_urls:
189 | 			if temp_url not in urls:
190 | 				urls.append(temp_url)
191 | 		i -= 1
192 | 	return urls
193 | 
194 | 	
195 | def find_by_file(file_path, js=False):
196 | 	with open(file_path, "r") as fobject:
197 | 		links = fobject.read().split("\n")
198 | 	if links == []: return None
199 | 	print("ALL Find " + str(len(links)) + " links")
200 | 	urls = []
201 | 	i = len(links)
202 | 	for link in links:
203 | 		if js == False:
204 | 			temp_urls = find_by_url(link)
205 | 		else:
206 | 			temp_urls = find_by_url(link, js=True)
207 | 		if temp_urls == None: continue
208 | 		print(str(i) + " Find " + str(len(temp_urls)) + " URL in " + link)
209 | 		for temp_url in temp_urls:
210 | 			if temp_url not in urls:
211 | 				urls.append(temp_url)
212 | 		i -= 1
213 | 	return urls
214 | 
215 | def giveresult(urls, domian):
216 | 	if urls == None:
217 | 		return None
218 | 	print("Find " + str(len(urls)) + " URL:")
219 | 	content_url = ""
220 | 	content_subdomain = ""
221 | 	for url in urls:
222 | 		content_url += url + "\n"
223 | 		print(url)
224 | 	subdomains = find_subdomain(urls, domian)
225 | 	print("\nFind " + str(len(subdomains)) + " Subdomain:")
226 | 	for subdomain in subdomains:
227 | 		content_subdomain += subdomain + "\n"
228 | 		print(subdomain)
229 | 	if args.outputurl != None:
230 | 		with open(args.outputurl, "a", encoding='utf-8') as fobject:
231 | 			fobject.write(content_url)
232 | 		print("\nOutput " + str(len(urls)) + " urls")
233 | 		print("Path:" + args.outputurl)
234 | 	if args.outputsubdomain != None:
235 | 		with open(args.outputsubdomain, "a", encoding='utf-8') as fobject:
236 | 			fobject.write(content_subdomain)
237 | 		print("\nOutput " + str(len(subdomains)) + " subdomains")
238 | 		print("Path:" + args.outputsubdomain)
239 | 
240 | if __name__ == "__main__":
241 | 	urllib3.disable_warnings()
242 | 	args = parse_args()
243 | 	if args.file == None:
244 | 		if args.deep is not True:
245 | 			urls = find_by_url(args.url)
246 | 			giveresult(urls, args.url)
247 | 		else:
248 | 			urls = find_by_url_deep(args.url)
249 | 			giveresult(urls, args.url)
250 | 	else:
251 | 		if args.js is not True:
252 | 			urls = find_by_file(args.file)
253 | 			giveresult(urls, urls[0])
254 | 		else:
255 | 			urls = find_by_file(args.file, js = True)
256 | 			giveresult(urls, urls[0])
257 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # JSFinder
  2 | 
  3 | JSFinder is a tool for quickly extracting URLs and subdomains from JS files on a website.
  4 | 
  5 | JSFinder是一款用作快速在网站的js文件中提取URL，子域名的工具。
  6 | 
  7 | 提取URL的正则部分使用的是[LinkFinder](https://github.com/GerbenJavado/LinkFinder) 
  8 | 
  9 | JSFinder获取URL和子域名的方式：
 10 | 
 11 | ![image](https://i.loli.net/2020/05/24/R2fImgNZHPkvhEj.png)
 12 | 
 13 | Blog: https://threezh1.com/
 14 | 
 15 | ## 更新说明
 16 | 
 17 | - 增加油猴脚本用于在浏览器上访问页面时获取域名与接口，具体可见：https://github.com/Threezh1/Deconstruct/tree/main/DevTools_JSFinder
 18 | 
 19 | ## 用法
 20 | 
 21 | - **简单爬取**
 22 | 
 23 | ```
 24 | python JSFinder.py -u http://www.mi.com
 25 | ```
 26 | 
 27 | 这个命令会爬取 http://www.mi.com 这单个页面的所有的js链接，并在其中发现url和子域名
 28 | 
 29 | 返回示例：
 30 | 
 31 | ```
 32 | url:http://www.mi.com                                         
 33 | Find 50 URL:                                                  
 34 | http://api-order.test.mi.com                                  
 35 | http://api.order.mi.com                                       
 36 | http://userid.xiaomi.com/userId                               
 37 | http://order.mi.com/site/login?redirectUrl=                                                   
 38 | ...已省略                            
 39 |                                                               
 40 | Find 26 Subdomain:                                            
 41 | api-order.test.mi.com                                         
 42 | api.order.mi.com                                              
 43 | userid.xiaomi.com                                             
 44 | order.mi.com                                                                                              
 45 | ...已省略
 46 | 
 47 | ```
 48 | 
 49 | - **深度爬取**
 50 | 
 51 | ```
 52 | python JSFinder.py -u http://www.mi.com -d
 53 | ```
 54 | 
 55 | 深入一层页面爬取JS，时间会消耗的更长。
 56 | 
 57 | 建议使用-ou 和 -os来指定保存URL和子域名的文件名。 例如：
 58 | 
 59 | ```
 60 | python JSFinder.py -u http://www.mi.com -d -ou mi_url.txt -os mi_subdomain.txt
 61 | ```
 62 | 
 63 | - **批量指定URL/指定JS**
 64 | 
 65 | 指定URL：
 66 | 
 67 | ```
 68 | python JSFinder.py -f text.txt
 69 | ```
 70 | 
 71 | 指定JS：
 72 | 
 73 | ```
 74 | python JSFinder.py -f text.txt -j
 75 | ```
 76 | 
 77 | 可以用brupsuite爬取网站后提取出URL或者JS链接，保存到txt文件中，一行一个。
 78 | 
 79 | 指定URL或JS就不需要加深度爬取，单个页面即可。
 80 | 
 81 | - **其他**
 82 | 
 83 | -c 指定cookie来爬取页面 例：
 84 | 
 85 | ```
 86 | python JSFinder.py -u http://www.mi.com -c "session=xxx"
 87 | ```
 88 | 
 89 | -ou 指定文件名保存URL链接 例：
 90 | 
 91 | ```
 92 | python JSFinder.py -u http://www.mi.com -ou mi_url.txt
 93 | ```
 94 | 
 95 | -os 指定文件名保存子域名 例：
 96 | 
 97 | ```
 98 | python JSFinder.py -u http://www.mi.com -os mi_subdomain.txt
 99 | ```
100 | 
101 | - **注意**
102 | 
103 | url 不用加引号
104 | 
105 | url 需要http:// 或 https://
106 | 
107 | 指定JS文件爬取时，返回的URL为相对URL
108 | 
109 | 指定URL文件爬取时，返回的相对URL都会以指定的第一个链接的域名作为其域名来转化为绝对URL。
110 | 
111 | - **截图**
112 | 
113 | 实测简单爬取：
114 | 
115 | ```
116 | python3 JSFinder.py -u https://www.jd.com/
117 | ```
118 | 
119 | URL:
120 | 
121 | ![02.jpg](https://i.loli.net/2020/05/24/aROFI5fC3UyK8EP.jpg)
122 | 
123 | ![03.jpg](https://i.loli.net/2020/05/24/rXC4Bba7oMw8AHW.jpg)
124 | 
125 | Subdomain:
126 | 
127 | ![01.jpg](https://i.loli.net/2020/05/24/69WvDmy7al4hQfd.jpg)
128 | 
129 | 实测深度爬取：
130 | 
131 | ```
132 | python3 JSFinder.py -u https://www.jd.com/ -d -ou jd_url.txt -os jd_domain.txt
133 | ```
134 | 
135 | ![05.jpg](https://i.loli.net/2020/05/24/dhxTQnaW4ef9Vzu.jpg)
136 | 
137 | ![06.jpg](https://i.loli.net/2020/05/24/NAX9PnLaW6melVk.jpg)
138 | 
139 | 实际测试：
140 | ```
141 | http://www.oppo.com
142 | URL:4426 个
143 | 子域名：24 个
144 | 
145 | http://www.mi.com
146 | URL:1043 个
147 | 子域名：111 个
148 | 
149 | http://www.jd.com
150 | URL:3627 个
151 | 子域名：306 个
152 | ```
153 | 


--------------------------------------------------------------------------------