├── requirements.txt
└── pyin.py

/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.6.2
loguru==0.4.1
tldextract==2.2.2
--------------------------------------------------------------------------------
/pyin.py:
--------------------------------------------------------------------------------
import argparse
import asyncio
import os
import re
import sys
import threading
from asyncio import CancelledError
from queue import Queue
from urllib.parse import urlparse
import json
import aiohttp
from loguru import logger
from tldextract import extract
import socket
from html import unescape
import time

socket.setdefaulttimeout(20)


class PYINFO:
    def argparser(self):
        parser = argparse.ArgumentParser(description='JSINFO can help you find the information hidden in JS and '
                                                     'expand the scope of your assets.',
                                         epilog='\tUsage:\npython ' + sys.argv[0] +
                                                " --target www.baidu.com --keywords baidu")
        parser.add_argument('--target', help='A target like www.example.com or a file of domains such as subdomains.txt',
                            required=True)
        parser.add_argument('--keywords', help='Comma-separated keywords used to filter extracted domains')
        parser.add_argument('--black_keywords', help='Comma-separated keywords; responses containing any of them are skipped')
        args = parser.parse_args()
        return args

    def __init__(self):
        self.banner()
        args = self.argparser()

        self.queue = Queue()
        self.root_domains = []
        target = args.target
        if not target.startswith(('http://', 'https://')) and not os.path.isfile(target):
            target = 'http://' + target
        elif os.path.isfile(target):
            # A file target is treated as a list of root domains, one per line.
            with open(target, 'r+', encoding='utf-8') as f:
                for domain in f:
                    domain = domain.strip()
                    if not domain.startswith(('http://', 'https://')):
                        self.root_domains.append(domain)
                        domain = 'http://www.' + domain
                    self.queue.put(domain)
        if args.keywords is None:
            keyword = extract(target).domain
        else:
            keyword = args.keywords
        self.keywords = keyword.split(',')
        if args.black_keywords is not None:
            self.black_keywords = args.black_keywords.split(',')
        else:
            self.black_keywords = []

        # File extensions that are never crawled or reported as APIs.
        self.black_extend_list = ['png', 'jpg', 'gif', 'jpeg', 'ico', 'svg', 'bmp', 'mp3', 'mp4', 'avi', 'mpeg', 'mpg',
                                  'mov', 'zip', 'rar', 'tar', 'gz', 'mkv', 'rmvb', 'iso', 'css', 'txt', 'ppt',
                                  'dmg', 'app', 'exe', 'pem', 'doc', 'docx', 'pkg', 'pdf', 'xml', 'eml', 'ini', 'so',
                                  'vbs', 'json', 'webp', 'woff', 'ttf', 'otf', 'log', 'image', 'map', 'woff2', 'mem',
                                  'wasm', 'pexe', 'nmf']
        self.black_filename_list = ['jquery', 'bootstrap', 'react', 'vue', 'google-analytics']
        self.extract_urls = []
        self._value_lock = threading.Lock()
        self.leak_infos = []
        self.leak_infos_match = []

        if not os.path.isfile(target):
            self.queue.put(target)

        self.apis = []
        self.sub_domains = []

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/79.0.3945.130 Safari/537.36'}

        # Link-extraction regex (LinkFinder style), compiled with re.VERBOSE below.
        link_pattern = r"""
          (?:"|')                               # Start newline delimiter
          (
            ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
            [^"'/]{1,}\.                        # Match a domainname (any character + dot)
            [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
            |
            ((?:/|\.\./|\./)                    # Start with /,../,./
            [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
            [^"'><,;|()]{1,})                   # Rest of the characters can't be
            |
            ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
            [a-zA-Z0-9_\-/]{1,}                 # Resource name
            \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
            (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
            |
            ([a-zA-Z0-9_\-]{1,}                 # filename
            \.(?:php|asp|aspx|jsp|json|
                 action|html|js|txt|xml)        # . + extension
            (?:\?[^"|']{0,}|))                  # ? mark with parameters
          )
          (?:"|')                               # End newline delimiter
        """
        self.link_pattern = re.compile(link_pattern, re.VERBOSE)
        self.js_pattern = 'src=["\'](.*?)["\']'
        self.href_pattern = 'href=["\'](.*?)["\']'
        # Regexes for secrets and other sensitive strings; the 'mail' pattern is
        # rendered per root domain in find_leak_info().
        self.leak_info_patterns = {'mail': r'([-_a-zA-Z0-9\.]{1,64}@%s)', 'author': '@author[: ]+(.*?) ',
                                   'accesskey_id': 'accesskeyid.*?["\'](.*?)["\']',
                                   'accesskey_secret': 'accesskeysecret.*?["\'](.*?)["\']',
                                   'access_key': 'access_key.*?["\'](.*?)["\']',
                                   'google_api': r'AIza[0-9A-Za-z-_]{35}',
                                   'google_captcha': r'6L[0-9A-Za-z-_]{38}|^6[0-9a-zA-Z_-]{39}$',
                                   'google_oauth': r'ya29\.[0-9A-Za-z\-_]+',
                                   'amazon_aws_access_key_id': r'AKIA[0-9A-Z]{16}',
                                   'amazon_mws_auth_toke': r'amzn\\.mws\\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
                                   'amazon_aws_url': r's3\.amazonaws.com[/]+|[a-zA-Z0-9_-]*\.s3\.amazonaws.com',
                                   'amazon_aws_url2': r"([a-zA-Z0-9-\.\_]+\.s3\.amazonaws\.com"
                                                      r"|s3://[a-zA-Z0-9-\.\_]+"
                                                      r"|s3-[a-zA-Z0-9-\.\_\/]+"
                                                      r"|s3.amazonaws.com/[a-zA-Z0-9-\.\_]+"
                                                      r"|s3.console.aws.amazon.com/s3/buckets/[a-zA-Z0-9-\.\_]+)",
                                   'facebook_access_token': r'EAACEdEose0cBA[0-9A-Za-z]+',
                                   'authorization_basic': r'basic [a-zA-Z0-9=:_\+\/-]{5,100}',
                                   'authorization_bearer': r'bearer [a-zA-Z0-9_\-\.=:_\+\/]{5,100}',
                                   'authorization_api': r'api[key|_key|\s+]+[a-zA-Z0-9_\-]{5,100}',
                                   'mailgun_api_key': r'key-[0-9a-zA-Z]{32}',
                                   'twilio_api_key': r'SK[0-9a-fA-F]{32}',
                                   'twilio_account_sid': r'AC[a-zA-Z0-9_\-]{32}',
                                   'twilio_app_sid': r'AP[a-zA-Z0-9_\-]{32}',
                                   'paypal_braintree_access_token': r'access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}',
                                   'square_oauth_secret': r'sq0csp-[ 0-9A-Za-z\-_]{43}|sq0[a-z]{3}-[0-9A-Za-z\-_]{22,43}',
                                   'square_access_token': r'sqOatp-[0-9A-Za-z\-_]{22}|EAAA[a-zA-Z0-9]{60}',
                                   'stripe_standard_api': r'sk_live_[0-9a-zA-Z]{24}',
                                   'stripe_restricted_api': r'rk_live_[0-9a-zA-Z]{24}',
                                   'github_access_token': r'[a-zA-Z0-9_-]*:[a-zA-Z0-9_\-]+@github\.com*',
                                   'rsa_private_key': r'-----BEGIN RSA PRIVATE KEY-----',
                                   'ssh_dsa_private_key': r'-----BEGIN DSA PRIVATE KEY-----',
                                   'ssh_dc_private_key': r'-----BEGIN EC PRIVATE KEY-----',
                                   'pgp_private_block': r'-----BEGIN PGP PRIVATE KEY BLOCK-----',
                                   'json_web_token': r'ey[A-Za-z0-9-_=]+\.[A-Za-z0-9-_=]+\.?[A-Za-z0-9-_.+/=]*$',
                                   'slack_token': r"\"api_token\":\"(xox[a-zA-Z]-[a-zA-Z0-9-]+)\"",
                                   'SSH_privKey': r"([-]+BEGIN [^\s]+ PRIVATE KEY[-]+[\s]*[^-]*[-]+END [^\s]+ PRIVATE KEY[-]+)",
                                   'possible_Creds': r"(?i)("
                                                     r"password\s*[`=:\"]+\s*[^\s]+|"
                                                     r"password is\s*[`=:\"]*\s*[^\s]+|"
                                                     r"pwd\s*[`=:\"]*\s*[^\s]+|"
                                                     r"passwd\s*[`=:\"]+\s*[^\s]+)"}

        if not os.path.isfile(target):
            logger.info('[+]Target ==> {}'.format(target))
        else:
            logger.info('[+]Target ==> {}'.format(self.root_domains))
        logger.info('[+]Keywords ==> {}'.format(self.keywords))
        logger.info('[+]Black Keywords ==> {}'.format(self.black_keywords))

    def banner(self):
        banner = r""" _____ ___ _ _ _ ___ _____

        (_)( ) ( )( _`\ ( _ )
        | `\| || (_(_)| ( ) |
        | || , ` || _) | | | |
        | || |`\ || | | (_) |
        (_)(_) (_)(_) (_____)

        Author: SUBUX
        """
        print(banner)

    def start(self):
        # Drain the queue in batches of up to 50 URLs and crawl each batch concurrently.
        loop = asyncio.get_event_loop()
        while self.queue.qsize() > 0:
            try:
                while not self.queue.empty():
                    tasks = []
                    i = 0
                    while i < 50 and not self.queue.empty():
                        url = self.queue.get()
                        filename = os.path.basename(url)
                        file_extend = self.get_file_extend(filename)
                        if file_extend == 'js':
                            tasks.append(asyncio.ensure_future(self.FindLinkInJs(url)))
                        else:
                            tasks.append(asyncio.ensure_future(self.FindLinkInPage(url)))
                        i += 1

                    if tasks:
                        loop.run_until_complete(asyncio.wait(tasks))
                    logger.info('-' * 20)
                    logger.info('[+]root domain count ==> {}'.format(len(self.root_domains)))
                    logger.info('[+]sub domain count ==> {}'.format(len(self.sub_domains)))
                    logger.info('[+]api count ==> {}'.format(len(self.apis)))
                    logger.info('[+]leakinfos count ==> {}'.format(len(self.leak_infos)))
                    logger.info('-' * 20)
            except KeyboardInterrupt:
                logger.info('[+]Break From Queue.')
                break
            except CancelledError:
                pass

        logger.info('[+]All root domain count ==> {}'.format(len(self.root_domains)))
        logger.info('[+]All sub domain count ==> {}'.format(len(self.sub_domains)))
        logger.info('[+]All api count ==> {}'.format(len(self.apis)))
        logger.info('[+]All leakinfos count ==> {}'.format(len(self.leak_infos)))

        # Write everything that was collected to timestamped result files.
        now_time = str(int(time.time()))
        with open(now_time + '_rootdomain', 'a+', encoding='utf-8') as f:
            for i in self.root_domains:
                f.write(i.strip() + '\n')

        with open(now_time + '_subdomain', 'a+', encoding='utf-8') as f:
            for i in self.sub_domains:
                f.write(i.strip() + '\n')

        with open(now_time + '_apis', 'a+', encoding='utf-8') as f:
            for i in self.apis:
                f.write(i.strip() + '\n')

        with open(now_time + '_leakinfos', 'a+', encoding='utf-8') as f:
            for i in self.leak_infos:
                f.write(str(i).strip() + '\n')

        logger.info('[+]Root domains ==> {}'.format(now_time + '_rootdomain'))
        logger.info('[+]Sub domains ==> {}'.format(now_time + '_subdomain'))
        logger.info('[+]Apis ==> {}'.format(now_time + '_apis'))
        logger.info('[+]LeakInfos ==> {}'.format(now_time + '_leakinfos'))

    async def FindLinkInPage(self, url):
        # Fetch an HTML page, then harvest href targets, script sources and inline scripts.
        try:
            resp = await self.send_request(url)
        except ConnectionResetError:
            return None
        if not resp:
            return None
        if self.black_keywords:
            for black_keyword in self.black_keywords:
                if black_keyword in resp:
                    return False
        self.find_leak_info(url, resp)

        try:
            hrefs = re.findall(self.href_pattern, resp)
        except TypeError:
            hrefs = []
        try:
            js_urls = re.findall(self.js_pattern, resp)
        except TypeError:
            js_urls = []
        try:
            # Inline <script> bodies; this pattern is an assumption (the original regex was
            # unrecoverable), chosen so the matches can be scanned by FindLinkInJsText below.
            js_texts = re.findall(r'<script[^>]*>(.*?)</script>', resp, re.S)
        except TypeError:
            js_texts = []

        parse_url = urlparse(url)
        for href in hrefs:
            full_href_url = self.extract_link(parse_url, href)
            if full_href_url is False:
                continue
        for js_url in js_urls:
            full_js_url = self.extract_link(parse_url, js_url)
            if full_js_url is False:
                continue
        for js_text in js_texts:
            self.FindLinkInJsText(url, js_text)

    async def FindLinkInJs(self, url):
        # Fetch a JS file and run the link-extraction regex over its body.
        resp = await self.send_request(url)
        if not resp:
            return False
        if self.black_keywords:
            for black_keyword in self.black_keywords:
                if black_keyword in resp:
                    return False
        self.find_leak_info(url, resp)
        try:
            link_finder_matchs = re.finditer(self.link_pattern, str(resp))
        except:
            return None
        for match in link_finder_matchs:
            match = match.group().strip('"').strip("'")
            full_api_url = self.extract_link(urlparse(url), match)
            if full_api_url is False:
                continue

    def FindLinkInJsText(self, url, text):
        # Same as FindLinkInJs, but for inline <script> content that is already in hand.
        try:
            link_finder_matchs = re.finditer(self.link_pattern, str(text))
        except:
            return None
        self.find_leak_info(url, text)
        for match in link_finder_matchs:
            match = match.group().strip('"').strip("'")
            full_api_url = self.extract_link(urlparse(url), match)
            if full_api_url is False:
                continue

    async def send_request(self, url):
        # Note: the semaphore is created per call, so it does not actually limit global concurrency.
        sem = asyncio.Semaphore(1024)
        try:
            async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
                async with sem:
                    async with session.get(url, timeout=20, headers=self.headers) as req:
                        await asyncio.sleep(1)
                        response = await req.text('utf-8', 'ignore')
                        req.close()
                        return response
        except CancelledError:
            pass
        except ConnectionResetError:
            pass
        except Exception as e:
            logger.warning('[-]Resolve {} fail'.format(url))
        return False

    def filter_black_extend(self, file_extend):
        if file_extend in self.black_extend_list:
            return True

    def get_file_extend(self, filename):
        return filename.split('/')[-1].split('?')[0].split('.')[-1].lower()

    def get_format_url(self, parse_link, filename, file_extend):
        # Replace purely numeric filename segments with '-int' so URLs that differ
        # only by a number share one de-duplication key.
        if '-' in filename:
            split_filename = filename.split('-')
        elif '_' in filename:
            split_filename = filename.split('_')
        else:
            split_filename = filename.split('-')

        format_filename = ''
        for split_name in split_filename:
            try:
                load_json = json.loads(split_name)
                if isinstance(load_json, int) or isinstance(load_json, float):
                    format_filename += '-int'
            except:
                format_filename += split_name
        return parse_link.scheme + '://' + parse_link.netloc + parse_link.path.replace(filename, format_filename)

    def extract_link(self, parse_url, link):
        # Normalize a discovered link against the page it came from, then record
        # any new root domains, subdomains and API endpoints.
        link = unescape(link)

        filename = os.path.basename(link)
        file_extend = self.get_file_extend(filename)
        is_link = False
        if link.startswith(('http://', 'https://')) and file_extend not in self.black_extend_list:
            full_url = link
        elif link.startswith('javascript:'):
            return False
        elif link.startswith('////') and len(link) > 4:
            full_url = 'http://' + link[2:]
        elif link.startswith('//') and len(link) > 2:
            full_url = 'http:' + link
        elif link.startswith('/'):
            full_url = parse_url.scheme + '://' + parse_url.netloc + link
        elif link.startswith('./'):
            full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + link[1:]
        else:
            full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + '/' + link

        extract_full_url_domain = extract(full_url)
        root_domain = extract_full_url_domain.domain + '.' + extract_full_url_domain.suffix
        sub_domain = urlparse(full_url).netloc

        in_keyword = False
        for keyword in self.keywords:
            if keyword in root_domain:
                in_keyword = True
        if not in_keyword:
            return False

        try:
            self._value_lock.acquire()
            if root_domain not in self.root_domains:
                self.root_domains.append(root_domain)
                logger.info('[+]Find a new root domain ==> {}'.format(root_domain))
                if root_domain not in self.extract_urls:
                    self.extract_urls.append(root_domain)
                    self.queue.put('http://' + root_domain)
        finally:
            self._value_lock.release()

        try:
            self._value_lock.acquire()
            if sub_domain not in self.sub_domains and sub_domain != root_domain:
                self.sub_domains.append(sub_domain)
                logger.info('[+]Find a new subdomain ==> {}'.format(sub_domain))
                if sub_domain not in self.extract_urls:
                    self.extract_urls.append(sub_domain)
                    self.queue.put('http://' + sub_domain)
        finally:
            self._value_lock.release()

        if file_extend in self.black_extend_list:
            return False
        if is_link is True:
            return link

        try:
            self._value_lock.acquire()
            if full_url not in self.apis and file_extend != 'html' and file_extend != 'js':
                self.apis.append(full_url)
                # logger.info('[+]Find a new api in {}'.format(parse_url.netloc))
        finally:
            self._value_lock.release()

        format_url = self.get_format_url(urlparse(full_url), filename, file_extend)

        try:
            self._value_lock.acquire()
            if format_url not in self.extract_urls:
                self.extract_urls.append(format_url)
                self.queue.put(full_url)
        finally:
            self._value_lock.release()

    def find_leak_info(self, url, text):
        for k in self.leak_info_patterns.keys():
            pattern = self.leak_info_patterns[k]
            if k == 'mail':
                for netloc in self.root_domains:
                    mail_pattern = r'([-_a-zA-Z0-9\.]{1,64}@%s)' % netloc
                    self.process_pattern(k, mail_pattern, text, url)
            else:
                self.process_pattern(k, pattern, text, url)

    def process_pattern(self, key, pattern, text, url):
        try:
            self._value_lock.acquire()
            matchs = re.findall(pattern, text, re.IGNORECASE)
            for match in matchs:
                match_tuple = (key, match, url)
                if match not in self.leak_infos_match:
                    self.leak_infos.append(match_tuple)
                    self.leak_infos_match.append(match)
                    # logger.info('[+]Find a leak info ==> {}'.format(match_tuple))
        except Exception as e:
            logger.warning(e)
        finally:
            self._value_lock.release()


if __name__ == '__main__':
    PYINFO().start()
--------------------------------------------------------------------------------
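
Usage sketch, based on the argparse definition in pyin.py; the domain, keyword and black-keyword values below are placeholders, and result files are written to the working directory with a timestamp prefix:

    pip install -r requirements.txt
    python pyin.py --target www.example.com --keywords example
    python pyin.py --target subdomains.txt --keywords example --black_keywords "cdn,static"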