├── requirements.txt
└── pyin.py

/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.6.2
loguru==0.4.1
tldextract==2.2.2
--------------------------------------------------------------------------------
/pyin.py:
--------------------------------------------------------------------------------
import argparse
import asyncio
import os
import re
import sys
import threading
from asyncio import CancelledError
from queue import Queue
from urllib.parse import urlparse
import json
import aiohttp
from loguru import logger
from tldextract import extract
import socket
from html import unescape
import time

socket.setdefaulttimeout(20)


class PYINFO:
    def argparser(self):
        parser = argparse.ArgumentParser(description='JSINFO can help you find the information hidden in JS and '
                                                     'expand the scope of your assets.',
                                         epilog='\tUsage:\npython ' + sys.argv[0] +
                                                " --target www.baidu.com --keywords baidu")
        parser.add_argument('--target', help='A target like www.example.com or a file of domains such as subdomains.txt',
                            required=True)
        parser.add_argument('--keywords', help='Comma-separated keywords used to filter extracted domains')
        parser.add_argument('--black_keywords', help='Comma-separated keywords; responses containing any of them are skipped')
        args = parser.parse_args()
        return args

    def __init__(self):
        self.banner()
        args = self.argparser()

        self.queue = Queue()
        self.root_domains = []
        target = args.target
        if not target.startswith(('http://', 'https://')) and not os.path.isfile(target):
            target = 'http://' + target
        elif os.path.isfile(target):
            # A file target is treated as a list of root domains, one per line.
            with open(target, 'r+', encoding='utf-8') as f:
                for domain in f:
                    domain = domain.strip()
                    if not domain.startswith(('http://', 'https://')):
                        self.root_domains.append(domain)
                        domain = 'http://www.' + domain
                    self.queue.put(domain)
        if args.keywords is None:
            keyword = extract(target).domain
        else:
            keyword = args.keywords
        self.keywords = keyword.split(',')
        if args.black_keywords is not None:
            self.black_keywords = args.black_keywords.split(',')
        else:
            self.black_keywords = []

        # File extensions that are never crawled or reported as APIs.
        self.black_extend_list = ['png', 'jpg', 'gif', 'jpeg', 'ico', 'svg', 'bmp', 'mp3', 'mp4', 'avi', 'mpeg', 'mpg',
                                  'mov', 'zip', 'rar', 'tar', 'gz', 'mkv', 'rmvb', 'iso', 'css', 'txt', 'ppt',
                                  'dmg', 'app', 'exe', 'pem', 'doc', 'docx', 'pkg', 'pdf', 'xml', 'eml', 'ini', 'so',
                                  'vbs', 'json', 'webp', 'woff', 'ttf', 'otf', 'log', 'image', 'map', 'woff2', 'mem',
                                  'wasm', 'pexe', 'nmf']
        self.black_filename_list = ['jquery', 'bootstrap', 'react', 'vue', 'google-analytics']
        self.extract_urls = []
        self._value_lock = threading.Lock()
        self.leak_infos = []
        self.leak_infos_match = []

        if not os.path.isfile(target):
            self.queue.put(target)

        self.apis = []
        self.sub_domains = []

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/79.0.3945.130 Safari/537.36'}

        # Link-extraction regex (LinkFinder style), compiled with re.VERBOSE below.
        link_pattern = r"""
          (?:"|')                               # Start newline delimiter
          (
            ((?:[a-zA-Z]{1,10}://|//)           # Match a scheme [a-Z]*1-10 or //
            [^"'/]{1,}\.                        # Match a domainname (any character + dot)
            [a-zA-Z]{2,}[^"']{0,})              # The domainextension and/or path
            |
            ((?:/|\.\./|\./)                    # Start with /,../,./
            [^"'><,;| *()(%%$^/\\\[\]]          # Next character can't be...
            [^"'><,;|()]{1,})                   # Rest of the characters can't be
            |
            ([a-zA-Z0-9_\-/]{1,}/               # Relative endpoint with /
            [a-zA-Z0-9_\-/]{1,}                 # Resource name
            \.(?:[a-zA-Z]{1,4}|action)          # Rest + extension (length 1-4 or action)
            (?:[\?|/][^"|']{0,}|))              # ? mark with parameters
            |
            ([a-zA-Z0-9_\-]{1,}                 # filename
            \.(?:php|asp|aspx|jsp|json|
                 action|html|js|txt|xml)        # . + extension
            (?:\?[^"|']{0,}|))                  # ? mark with parameters
          )
          (?:"|')                               # End newline delimiter
        """
        self.link_pattern = re.compile(link_pattern, re.VERBOSE)
        self.js_pattern = 'src=["\'](.*?)["\']'
        self.href_pattern = 'href=["\'](.*?)["\']'
        # Regexes for secrets and other sensitive strings; the 'mail' pattern is
        # rendered per root domain in find_leak_info().
        self.leak_info_patterns = {'mail': r'([-_a-zA-Z0-9\.]{1,64}@%s)', 'author': '@author[: ]+(.*?) ',
                                   'accesskey_id': 'accesskeyid.*?["\'](.*?)["\']',
                                   'accesskey_secret': 'accesskeysecret.*?["\'](.*?)["\']',
                                   'access_key': 'access_key.*?["\'](.*?)["\']',
                                   'google_api': r'AIza[0-9A-Za-z-_]{35}',
                                   'google_captcha': r'6L[0-9A-Za-z-_]{38}|^6[0-9a-zA-Z_-]{39}$',
                                   'google_oauth': r'ya29\.[0-9A-Za-z\-_]+',
                                   'amazon_aws_access_key_id': r'AKIA[0-9A-Z]{16}',
                                   'amazon_mws_auth_toke': r'amzn\\.mws\\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
                                   'amazon_aws_url': r's3\.amazonaws.com[/]+|[a-zA-Z0-9_-]*\.s3\.amazonaws.com',
                                   'amazon_aws_url2': r"([a-zA-Z0-9-\.\_]+\.s3\.amazonaws\.com"
                                                      r"|s3://[a-zA-Z0-9-\.\_]+"
                                                      r"|s3-[a-zA-Z0-9-\.\_\/]+"
                                                      r"|s3.amazonaws.com/[a-zA-Z0-9-\.\_]+"
                                                      r"|s3.console.aws.amazon.com/s3/buckets/[a-zA-Z0-9-\.\_]+)",
                                   'facebook_access_token': r'EAACEdEose0cBA[0-9A-Za-z]+',
                                   'authorization_basic': r'basic [a-zA-Z0-9=:_\+\/-]{5,100}',
                                   'authorization_bearer': r'bearer [a-zA-Z0-9_\-\.=:_\+\/]{5,100}',
                                   'authorization_api': r'api[key|_key|\s+]+[a-zA-Z0-9_\-]{5,100}',
                                   'mailgun_api_key': r'key-[0-9a-zA-Z]{32}',
                                   'twilio_api_key': r'SK[0-9a-fA-F]{32}',
                                   'twilio_account_sid': r'AC[a-zA-Z0-9_\-]{32}',
                                   'twilio_app_sid': r'AP[a-zA-Z0-9_\-]{32}',
                                   'paypal_braintree_access_token': r'access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}',
                                   'square_oauth_secret': r'sq0csp-[ 0-9A-Za-z\-_]{43}|sq0[a-z]{3}-[0-9A-Za-z\-_]{22,43}',
                                   'square_access_token': r'sqOatp-[0-9A-Za-z\-_]{22}|EAAA[a-zA-Z0-9]{60}',
                                   'stripe_standard_api': r'sk_live_[0-9a-zA-Z]{24}',
                                   'stripe_restricted_api': r'rk_live_[0-9a-zA-Z]{24}',
                                   'github_access_token': r'[a-zA-Z0-9_-]*:[a-zA-Z0-9_\-]+@github\.com*',
                                   'rsa_private_key': r'-----BEGIN RSA PRIVATE KEY-----',
                                   'ssh_dsa_private_key': r'-----BEGIN DSA PRIVATE KEY-----',
                                   'ssh_dc_private_key': r'-----BEGIN EC PRIVATE KEY-----',
                                   'pgp_private_block': r'-----BEGIN PGP PRIVATE KEY BLOCK-----',
                                   'json_web_token': r'ey[A-Za-z0-9-_=]+\.[A-Za-z0-9-_=]+\.?[A-Za-z0-9-_.+/=]*$',
                                   'slack_token': r"\"api_token\":\"(xox[a-zA-Z]-[a-zA-Z0-9-]+)\"",
                                   'SSH_privKey': r"([-]+BEGIN [^\s]+ PRIVATE KEY[-]+[\s]*[^-]*[-]+END [^\s]+ PRIVATE KEY[-]+)",
                                   'possible_Creds': r"(?i)("
                                                     r"password\s*[`=:\"]+\s*[^\s]+|"
                                                     r"password is\s*[`=:\"]*\s*[^\s]+|"
                                                     r"pwd\s*[`=:\"]*\s*[^\s]+|"
                                                     r"passwd\s*[`=:\"]+\s*[^\s]+)"}

        if not os.path.isfile(target):
            logger.info('[+]Target ==> {}'.format(target))
        else:
            logger.info('[+]Target ==> {}'.format(self.root_domains))
        logger.info('[+]Keywords ==> {}'.format(self.keywords))
        logger.info('[+]Black Keywords ==> {}'.format(self.black_keywords))

    def banner(self):
        banner = r""" _____ ___ _ _ _ ___ _____

        (_)( ) ( )( _`\ ( _ )
        | `\| || (_(_)| ( ) |
        | || , ` || _) | | | |
        | || |`\ || | | (_) |
        (_)(_) (_)(_) (_____)

        Author: SUBUX
        """
        print(banner)

    def start(self):
        # Drain the queue in batches of up to 50 URLs and crawl each batch concurrently.
        loop = asyncio.get_event_loop()
        while self.queue.qsize() > 0:
            try:
                while not self.queue.empty():
                    tasks = []
                    i = 0
                    while i < 50 and not self.queue.empty():
                        url = self.queue.get()
                        filename = os.path.basename(url)
                        file_extend = self.get_file_extend(filename)
                        if file_extend == 'js':
                            tasks.append(asyncio.ensure_future(self.FindLinkInJs(url)))
                        else:
                            tasks.append(asyncio.ensure_future(self.FindLinkInPage(url)))
                        i += 1

                    if tasks:
                        loop.run_until_complete(asyncio.wait(tasks))
                    logger.info('-' * 20)
                    logger.info('[+]root domain count ==> {}'.format(len(self.root_domains)))
                    logger.info('[+]sub domain count ==> {}'.format(len(self.sub_domains)))
                    logger.info('[+]api count ==> {}'.format(len(self.apis)))
                    logger.info('[+]leakinfos count ==> {}'.format(len(self.leak_infos)))
                    logger.info('-' * 20)
            except KeyboardInterrupt:
                logger.info('[+]Break From Queue.')
                break
            except CancelledError:
                pass

        logger.info('[+]All root domain count ==> {}'.format(len(self.root_domains)))
        logger.info('[+]All sub domain count ==> {}'.format(len(self.sub_domains)))
        logger.info('[+]All api count ==> {}'.format(len(self.apis)))
        logger.info('[+]All leakinfos count ==> {}'.format(len(self.leak_infos)))

        # Write everything that was collected to timestamped result files.
        now_time = str(int(time.time()))
        with open(now_time + '_rootdomain', 'a+', encoding='utf-8') as f:
            for i in self.root_domains:
                f.write(i.strip() + '\n')

        with open(now_time + '_subdomain', 'a+', encoding='utf-8') as f:
            for i in self.sub_domains:
                f.write(i.strip() + '\n')

        with open(now_time + '_apis', 'a+', encoding='utf-8') as f:
            for i in self.apis:
                f.write(i.strip() + '\n')

        with open(now_time + '_leakinfos', 'a+', encoding='utf-8') as f:
            for i in self.leak_infos:
                f.write(str(i).strip() + '\n')

        logger.info('[+]Root domains ==> {}'.format(now_time + '_rootdomain'))
        logger.info('[+]Sub domains ==> {}'.format(now_time + '_subdomain'))
        logger.info('[+]Apis ==> {}'.format(now_time + '_apis'))
        logger.info('[+]LeakInfos ==> {}'.format(now_time + '_leakinfos'))

    async def FindLinkInPage(self, url):
        # Fetch an HTML page, then harvest href targets, script sources and inline scripts.
        try:
            resp = await self.send_request(url)
        except ConnectionResetError:
            return None
        if not resp:
            return None
        if self.black_keywords:
            for black_keyword in self.black_keywords:
                if black_keyword in resp:
                    return False
        self.find_leak_info(url, resp)

        try:
            hrefs = re.findall(self.href_pattern, resp)
        except TypeError:
            hrefs = []
        try:
            js_urls = re.findall(self.js_pattern, resp)
        except TypeError:
            js_urls = []
        try:
            # Inline <script> bodies; this pattern is an assumption (the original regex was
            # unrecoverable), chosen so the matches can be scanned by FindLinkInJsText below.
            js_texts = re.findall(r'<script[^>]*>(.*?)</script>', resp, re.S)
        except TypeError:
            js_texts = []

        parse_url = urlparse(url)
        for href in hrefs:
            full_href_url = self.extract_link(parse_url, href)
            if full_href_url is False:
                continue
        for js_url in js_urls:
            full_js_url = self.extract_link(parse_url, js_url)
            if full_js_url is False:
                continue
        for js_text in js_texts:
            self.FindLinkInJsText(url, js_text)

    async def FindLinkInJs(self, url):
        # Fetch a JS file and run the link-extraction regex over its body.
        resp = await self.send_request(url)
        if not resp:
            return False
        if self.black_keywords:
            for black_keyword in self.black_keywords:
                if black_keyword in resp:
                    return False
        self.find_leak_info(url, resp)
        try:
            link_finder_matchs = re.finditer(self.link_pattern, str(resp))
        except:
            return None
        for match in link_finder_matchs:
            match = match.group().strip('"').strip("'")
            full_api_url = self.extract_link(urlparse(url), match)
            if full_api_url is False:
                continue

    def FindLinkInJsText(self, url, text):
        # Same as FindLinkInJs, but for inline <script> content that is already in hand.
        try:
            link_finder_matchs = re.finditer(self.link_pattern, str(text))
        except:
            return None
        self.find_leak_info(url, text)
        for match in link_finder_matchs:
            match = match.group().strip('"').strip("'")
            full_api_url = self.extract_link(urlparse(url), match)
            if full_api_url is False:
                continue

    async def send_request(self, url):
        # Note: the semaphore is created per call, so it does not actually limit global concurrency.
        sem = asyncio.Semaphore(1024)
        try:
            async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
                async with sem:
                    async with session.get(url, timeout=20, headers=self.headers) as req:
                        await asyncio.sleep(1)
                        response = await req.text('utf-8', 'ignore')
                        req.close()
                        return response
        except CancelledError:
            pass
        except ConnectionResetError:
            pass
        except Exception as e:
            logger.warning('[-]Resolve {} fail'.format(url))
        return False

    def filter_black_extend(self, file_extend):
        if file_extend in self.black_extend_list:
            return True

    def get_file_extend(self, filename):
        return filename.split('/')[-1].split('?')[0].split('.')[-1].lower()

    def get_format_url(self, parse_link, filename, file_extend):
        # Replace purely numeric filename segments with '-int' so URLs that differ
        # only by a number share one de-duplication key.
        if '-' in filename:
            split_filename = filename.split('-')
        elif '_' in filename:
            split_filename = filename.split('_')
        else:
            split_filename = filename.split('-')

        format_filename = ''
        for split_name in split_filename:
            try:
                load_json = json.loads(split_name)
                if isinstance(load_json, int) or isinstance(load_json, float):
                    format_filename += '-int'
            except:
                format_filename += split_name
        return parse_link.scheme + '://' + parse_link.netloc + parse_link.path.replace(filename, format_filename)

    def extract_link(self, parse_url, link):
        # Normalize a discovered link against the page it came from, then record
        # any new root domains, subdomains and API endpoints.
        link = unescape(link)

        filename = os.path.basename(link)
        file_extend = self.get_file_extend(filename)
        is_link = False
        if link.startswith(('http://', 'https://')) and file_extend not in self.black_extend_list:
            full_url = link
        elif link.startswith('javascript:'):
            return False
        elif link.startswith('////') and len(link) > 4:
            full_url = 'http://' + link[2:]
        elif link.startswith('//') and len(link) > 2:
            full_url = 'http:' + link
        elif link.startswith('/'):
            full_url = parse_url.scheme + '://' + parse_url.netloc + link
        elif link.startswith('./'):
            full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + link[1:]
        else:
            full_url = parse_url.scheme + '://' + parse_url.netloc + parse_url.path + '/' + link

        extract_full_url_domain = extract(full_url)
        root_domain = extract_full_url_domain.domain + '.' + extract_full_url_domain.suffix
        sub_domain = urlparse(full_url).netloc

        in_keyword = False
        for keyword in self.keywords:
            if keyword in root_domain:
                in_keyword = True
        if not in_keyword:
            return False

        try:
            self._value_lock.acquire()
            if root_domain not in self.root_domains:
                self.root_domains.append(root_domain)
                logger.info('[+]Find a new root domain ==> {}'.format(root_domain))
                if root_domain not in self.extract_urls:
                    self.extract_urls.append(root_domain)
                    self.queue.put('http://' + root_domain)
        finally:
            self._value_lock.release()

        try:
            self._value_lock.acquire()
            if sub_domain not in self.sub_domains and sub_domain != root_domain:
                self.sub_domains.append(sub_domain)
                logger.info('[+]Find a new subdomain ==> {}'.format(sub_domain))
                if sub_domain not in self.extract_urls:
                    self.extract_urls.append(sub_domain)
                    self.queue.put('http://' + sub_domain)
        finally:
            self._value_lock.release()

        if file_extend in self.black_extend_list:
            return False
        if is_link is True:
            return link

        try:
            self._value_lock.acquire()
            if full_url not in self.apis and file_extend != 'html' and file_extend != 'js':
                self.apis.append(full_url)
                # logger.info('[+]Find a new api in {}'.format(parse_url.netloc))
        finally:
            self._value_lock.release()

        format_url = self.get_format_url(urlparse(full_url), filename, file_extend)

        try:
            self._value_lock.acquire()
            if format_url not in self.extract_urls:
                self.extract_urls.append(format_url)
                self.queue.put(full_url)
        finally:
            self._value_lock.release()

    def find_leak_info(self, url, text):
        for k in self.leak_info_patterns.keys():
            pattern = self.leak_info_patterns[k]
            if k == 'mail':
                for netloc in self.root_domains:
                    mail_pattern = r'([-_a-zA-Z0-9\.]{1,64}@%s)' % netloc
                    self.process_pattern(k, mail_pattern, text, url)
            else:
                self.process_pattern(k, pattern, text, url)

    def process_pattern(self, key, pattern, text, url):
        try:
            self._value_lock.acquire()
            matchs = re.findall(pattern, text, re.IGNORECASE)
            for match in matchs:
                match_tuple = (key, match, url)
                if match not in self.leak_infos_match:
                    self.leak_infos.append(match_tuple)
                    self.leak_infos_match.append(match)
                    # logger.info('[+]Find a leak info ==> {}'.format(match_tuple))
        except Exception as e:
            logger.warning(e)
        finally:
            self._value_lock.release()


if __name__ == '__main__':
    PYINFO().start()
--------------------------------------------------------------------------------
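
Usage sketch, based on the argparse definition in pyin.py; the domain, keyword and black-keyword values below are placeholders, and result files are written to the working directory with a timestamp prefix:

    pip install -r requirements.txt
    python pyin.py --target www.example.com --keywords example
    python pyin.py --target subdomains.txt --keywords example --black_keywords "cdn,static"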