├── .DS_Store ├── .gitignore ├── README.md ├── app.py ├── extractorFunctions.py ├── featureExtractor.py ├── main.py ├── model ├── pca_model.pkl └── phishingdetection.pkl ├── requirements.txt └── templates └── index.html /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sannjayy/python-phishing-url-detection/3ee3ea4d54fc698d0c67e7088a8a42a2bf723e9c/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | zenv/ 2 | __pycache__/ 3 | 4 | logs.log 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Phishing URL Detection 2 | --- 3 | 4 | ### Demo Video: [https://youtu.be/9t4gbbrkfks](https://youtu.be/9t4gbbrkfks) 5 | 6 | **Python 3.11.9 _(Currently Using)_** 7 | 8 | 9 | ## How to Run? 10 | 11 | - Clone or download [python-phishing-url-detection](https://github.com/sannjayy/python-phishing-url-detection) 12 | 13 | `git clone git@github.com:sannjayy/python-phishing-url-detection.git` 14 | 15 | 16 | - Create a virtual environment 17 | ```bash 18 | python -m venv zenv 19 | source zenv/Scripts/activate # Windows 20 | source zenv/bin/activate # Mac 21 | ``` 22 | 23 | 24 | - Install basic requirements 25 | ```bash 26 | pip install -r requirements.txt 27 | 28 | # OR INITIAL INSTALLATION 29 | pip install --upgrade pip 30 | pip install --upgrade setuptools 31 | 32 | pip install pandas whois httpx 33 | pip install pycaret # It will take some time. 
34 | ``` 35 | 36 | ### Replace Domains 37 | 38 | ```python 39 | if __name__ == "__main__": 40 | phishing_url_1 = 'https://bafybeifqd2yktzvwjw5g42l2ghvxsxn76khhsgqpkaqfdhnqf3kiuiegw4.ipfs.dweb.link/' 41 | phishing_url_2 = 'http://about-ads-microsoft-com.o365.frc.skyfencenet.com' 42 | real_url_1 = 'https://chat.openai.com' 43 | real_url_2 = 'https://github.com/' 44 | 45 | 46 | print(predict(phishing_url_1)) 47 | print(predict(phishing_url_2)) 48 | print(predict(real_url_1)) 49 | print(predict(real_url_2)) 50 | ``` 51 | 52 | ### To Run 53 | 54 | ```bash 55 | python main.py 56 | 57 | 58 | # OUTPUT: {'prediction_label': 0, 'prediction_score': 68.39} 59 | 60 | # 0 = False | 1 = True 61 | ``` 62 | 63 | --- 64 | 65 | ### To Run GUI 66 | 67 | ```bash 68 | pip install flask 69 | 70 | python app.py 71 | ``` 72 | 73 | Open http://127.0.0.1:5000 in your browser! 74 | 75 | 76 | 77 | --- 78 | --- 79 | 80 | - 🌏 [GitHub Repo](https://github.com/sannjayy/python-phishing-url-detection) 81 | - 🌏 [Website](https://www.sanjaysikdar.dev) 82 | - 📫 83 | - 📖 [read.sanjaysikdar.dev](https://read.sanjaysikdar.dev) 84 | - 📦 [pypi releases](https://pypi.org/user/sannjayy/) | [npm releases](https://www.npmjs.com/~sannjayy) 85 | 86 | --- 87 | 88 | [![](https://img.shields.io/github/followers/sannjayy?style=social)](https://github.com/sannjayy) 89 | Developed with ❤️ by *[sanjaysikdar.dev](https://www.sanjaysikdar.dev)*. 
# --------------------------------------------------------------------------------
# /app.py
# --------------------------------------------------------------------------------
from flask import Flask, request, render_template
from featureExtractor import featureExtraction
from pycaret.classification import load_model, predict_model

# Loaded once at import time so every request reuses the same model object.
model = load_model('model/phishingdetection')


def predict(url):
    """Run the loaded model on ``url`` and summarise the first result row.

    The URL is turned into model input by ``featureExtraction`` and scored
    with ``predict_model``.

    Returns:
        dict with ``prediction_label`` (taken as-is from the result) and
        ``prediction_score`` (the result's score multiplied by 100).
    """
    data = featureExtraction(url)
    result = predict_model(model, data=data)
    prediction_score = result['prediction_score'][0]
    prediction_label = result['prediction_label'][0]

    return {
        'prediction_label': prediction_label,
        'prediction_score': prediction_score * 100,
    }


app = Flask(__name__)


@app.route("/", methods=["GET", "POST"])
def index():
    """Render the form; on POST, also render the prediction for the URL.

    A missing, empty or whitespace-only ``url`` field re-renders the form
    without a prediction instead of being passed to the model.
    """
    if request.method != "POST":
        return render_template("index.html", data=None)

    # .get() avoids an automatic 400 when the field is absent; strip() drops
    # stray whitespace that users commonly paste around a URL.
    url = request.form.get("url", "").strip()
    data = predict(url) if url else None
    return render_template('index.html', url=url, data=data)


if __name__ == "__main__":
    # NOTE(review): debug=True enables the interactive debugger and reloader;
    # keep this for local development only.
    app.run(debug=True)


# --------------------------------------------------------------------------------
# /extractorFunctions.py
# --------------------------------------------------------------------------------
# importing required packages for Address Bar Based feature Extraction
from urllib.parse import urlparse, urlencode, unquote
import re
# importing required packages for Domain Based Feature Extraction
from datetime import datetime


# 2.Checks for IP address in URL (Have_IP)
def havingIP(url):
    """Return 1 if ``url`` contains a dotted quad of 1-3 digit groups, else 0.

    The pattern is searched anywhere in the string and does not check that
    each group is <= 255.
    """
    ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    return 1 if re.search(ip_pattern, url) else 0


# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
    """Return 1 if ``url`` contains an '@' character, else 0."""
    return 1 if "@" in url else 0


# 4.Finding the length of URL and categorizing (URL_Length)
def getLength(url):
    """Return the number of characters in ``url``."""
    return len(url)


# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(url):
    """Return the number of non-empty '/'-separated segments in the URL path.

    Empty segments (leading, trailing or doubled slashes) are not counted,
    so ``https://host/`` has depth 0 and ``https://host/a//b/`` has depth 2.
    """
    return sum(1 for segment in urlparse(url).path.split('/') if segment)


#listing shortening services
# Regex alternation of URL-shortener names, used with re.search by tinyURL.
# Several entries are repeated; duplicates do not change what matches.
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

# 8.
# Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Return 1 if ``url`` matches the ``shortening_services`` pattern, else 0.

    The pattern is searched anywhere in the URL string, not only in the host.
    """
    return 1 if re.search(shortening_services, url) else 0


# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    """Return 1 (phishing) if the URL's netloc contains '-', else 0 (legitimate)."""
    return 1 if '-' in urlparse(url).netloc else 0


def no_of_dots(url):
    """Return the number of '.' characters in the whole URL string."""
    return url.count('.')


# Keywords looked for inside the host part of a URL by sensitive_word().
# "notice"/"myaccount" and "member"/"personal" used to be fused into single
# strings by missing commas, so none of the four could ever match.
# NOTE(review): if the shipped model was trained on features produced by the
# old list, confirm feature parity (or retrain) after this fix.
# NOTE(review): "ebyisapi" may be a typo for "ebayisapi" - left unchanged.
sensitiveWords = [
    "account", "confirm", "banking", "secure", "ebyisapi", "webscr", "signin", "mail",
    "install", "toolbar", "backup", "paypal", "password", "username", "verify", "update",
    "login", "support", "billing", "transaction", "security", "payment", "online",
    "customer", "service", "accountupdate", "verification", "important", "confidential",
    "limited", "access", "securitycheck", "verifyaccount", "information", "change", "notice",
    "myaccount", "updateinfo", "loginsecure", "protect", "identity", "member",
    "personal", "actionrequired", "loginverify", "validate", "paymentupdate", "urgent",
]


def sensitive_word(url):
    """Return 1 if the URL's netloc contains any entry of ``sensitiveWords``, else 0.

    Only the netloc is inspected (path and query are ignored). The netloc is
    lower-cased first because host names are case-insensitive.
    """
    domain = urlparse(url).netloc.lower()
    return 1 if any(word in domain for word in sensitiveWords) else 0


def has_unicode(url):
    """Return 1 if the URL's netloc is, or encodes, non-plain-ASCII text; else 0.

    A netloc is flagged when it:
      * contains non-ASCII characters,
      * contains punycode that cannot be IDNA-decoded, or
      * differs from itself after IDNA-decoding and percent-unquoting.
    """
    netloc = urlparse(url).netloc

    try:
        # encode('ascii') raises for raw non-ASCII hosts; decode('idna')
        # raises for malformed "xn--" labels. Both are UnicodeError subclasses
        # and both are the suspicious inputs this feature is meant to flag.
        decoded_netloc = netloc.encode('ascii').decode('idna')
    except UnicodeError:
        return 1

    return 1 if unquote(decoded_netloc) != netloc else 0


# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
def domainAge(domain_name):
    """Return 1 if the registration span is under 6 months or unknown, else 0.

    Args:
        domain_name: object exposing ``creation_date`` and ``expiration_date``
            attributes (datetime, 'YYYY-MM-DD' string, list or None).

    Returns:
        0 when ``abs(expiration - creation)`` is at least 6 * 30 days;
        1 when it is shorter or when either date is missing, a list,
        unparseable, or the two cannot be subtracted.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            # Both values go through strptime, so a str/non-str mix lands in
            # the TypeError branch and is reported as unknown.
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            return 1

    if creation_date is None or expiration_date is None:
        return 1
    if isinstance(creation_date, list) or isinstance(expiration_date, list):
        return 1

    try:
        age_in_days = abs((expiration_date - creation_date).days)
    except (TypeError, AttributeError):
        # e.g. naive vs. timezone-aware datetimes, or unexpected value types
        return 1

    return 1 if (age_in_days / 30) < 6 else 0


# 14.End time of domain: The difference between termination time and current time (Domain_End)
def domainEnd(domain_name):
    """Return 0 if the expiration date is within 6 months of now, else 1.

    Args:
        domain_name: object exposing an ``expiration_date`` attribute
            (datetime, 'YYYY-MM-DD' string, list or None).

    Returns:
        0 when ``abs(expiration - now)`` is under 6 * 30 days;
        1 when it is longer or when the date is missing, a list,
        unparseable, or cannot be compared with the current time.
    """
    expiration_date = domain_name.expiration_date

    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1

    if expiration_date is None or isinstance(expiration_date, list):
        return 1

    try:
        # Use the expiration date's own tzinfo so aware values can be
        # subtracted; for naive values this is plain datetime.now().
        today = datetime.now(tz=getattr(expiration_date, 'tzinfo', None))
        days_to_end = abs((expiration_date - today).days)
    except TypeError:
        return 1

    return 0 if (days_to_end / 30) < 6 else 1