├── .DS_Store
├── .gitignore
├── README.md
├── app.py
├── extractorFunctions.py
├── featureExtractor.py
├── main.py
├── model
    ├── pca_model.pkl
    └── phishingdetection.pkl
├── requirements.txt
└── templates
    └── index.html


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sannjayy/python-phishing-url-detection/3ee3ea4d54fc698d0c67e7088a8a42a2bf723e9c/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | zenv/
2 | __pycache__/
3 | 
4 | logs.log
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Python Phishing URL Detection
 2 | ---
 3 | 
 4 | ### Demo Video: [https://youtu.be/9t4gbbrkfks](https://youtu.be/9t4gbbrkfks)
 5 | 
 6 | **Python 3.11.9 _(Currently Using)_**
 7 | 
 8 | 
 9 | ## How to Run?
10 | 
11 | - Clone or download [python-phishing-url-detection](https://github.com/sannjayy/python-phishing-url-detection) 
12 | 
13 | `git clone git@github.com:sannjayy/python-phishing-url-detection.git`
14 | 
15 | 
16 | - Create a virtual environment
17 | ```bash
18 | python -m venv zenv
19 | source zenv/Scripts/activate # Windows (Git Bash)
20 | source zenv/bin/activate # macOS / Linux
21 | ```
22 | 
23 | 
24 | - Install basic requirements
25 | ```bash
26 | pip install -r requirements.txt
27 | 
28 | # OR INITIAL INSTALLATION 
29 | pip install --upgrade pip
30 | pip install --upgrade setuptools
31 | 
32 | pip install pandas whois httpx
33 | pip install pycaret # This may take some time.
34 | ```
35 | 
36 | ### Replace the sample URLs (in `main.py`)
37 | 
38 | ```python
39 | if __name__ == "__main__": 
40 |     phishing_url_1 = 'https://bafybeifqd2yktzvwjw5g42l2ghvxsxn76khhsgqpkaqfdhnqf3kiuiegw4.ipfs.dweb.link/'
41 |     phishing_url_2 = 'http://about-ads-microsoft-com.o365.frc.skyfencenet.com'
42 |     real_url_1 = 'https://chat.openai.com'
43 |     real_url_2 = 'https://github.com/'
44 |     
45 |     
46 |     print(predict(phishing_url_1))
47 |     print(predict(phishing_url_2))
48 |     print(predict(real_url_1))
49 |     print(predict(real_url_2))
50 | ```
51 | 
52 | ### To Run
53 | 
54 | ```bash
55 | python main.py
56 | 
57 | 
58 | # OUTPUT: {'prediction_label': 0, 'prediction_score': 68.39} 
59 | 
60 | # prediction_label: 0 = not phishing | 1 = phishing
61 | ```
62 | 
63 | ---
64 | 
65 | ### To Run GUI
66 | 
67 | ```bash
68 | pip install flask
69 | 
70 | python app.py
71 | ```
72 | 
73 | Open http://127.0.0.1:5000 in your browser!
74 | 
75 | 
76 | 
77 | --- 
78 | ---
79 | 
80 | - 🌏 [GitHub Repo](https://github.com/sannjayy/python-phishing-url-detection) 
81 | - 🌏 [Website](https://www.sanjaysikdar.dev) 
82 | - 📫 <me@sanjaysikdar.dev>
83 | - 📖 [read.sanjaysikdar.dev](https://read.sanjaysikdar.dev)
84 | - 📦 [pypi releases](https://pypi.org/user/sannjayy/) | [npm releases](https://www.npmjs.com/~sannjayy)
85 | 
86 | ---
87 | 
88 | [![](https://img.shields.io/github/followers/sannjayy?style=social)](https://github.com/sannjayy)  
89 | Developed with ❤️ by *[sanjaysikdar.dev](https://www.sanjaysikdar.dev)*.
90 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, request, render_template
 2 | from featureExtractor import featureExtraction
 3 | from pycaret.classification import load_model, predict_model
 4 | 
 5 | model = load_model('model/phishingdetection')
 6 | 
 7 | def predict(url):
 8 |     data = featureExtraction(url)
 9 |     result = predict_model(model, data=data)
10 |     prediction_score = result['prediction_score'][0]  
11 |     prediction_label = result['prediction_label'][0] 
12 |     
13 |     return {
14 |         'prediction_label': prediction_label,
15 |         'prediction_score': prediction_score * 100,
16 |     }
17 |     
18 |     
19 |     
app = Flask(__name__)


@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the URL form and, on POST, the prediction for the submitted URL."""
    if request.method != "POST":
        # Plain page load: render the form with no prediction attached.
        return render_template("index.html", data=None)

    submitted_url = request.form["url"]
    verdict = predict(submitted_url)
    return render_template('index.html', url=submitted_url, data=verdict)
30 | 
if __name__ == "__main__":
    import os

    # The Werkzeug interactive debugger allows arbitrary code execution for
    # anyone who can reach the server, so it is opt-in (FLASK_DEBUG=1) instead
    # of being switched on unconditionally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)


--------------------------------------------------------------------------------
/extractorFunctions.py:
--------------------------------------------------------------------------------
  1 | # importing required packages for Address Bar Based feature Extraction
  2 | from urllib.parse import urlparse, urlencode, unquote
  3 | import re
  4 | # importing required packages for Domain Based Feature Extraction
  5 | from datetime import datetime
  6 | 
  7 | 
  8 | # 2.Checks for IP address in URL (Have_IP)
  9 | def havingIP(url):
 10 |     ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
 11 |     match = re.search(ip_pattern, url)
 12 |     if match:
 13 |         return 1
 14 |     return 0
 15 | 
 16 | # 3.Checks the presence of @ in URL (Have_At)
 17 | def haveAtSign(url):
 18 |   if "@" in url:
 19 |     at = 1
 20 |   else:
 21 |     at = 0
 22 |   return at
 23 | 
 24 | # 4.Finding the length of URL and categorizing (URL_Length)
 25 | def getLength(url):
 26 |   return len(url)
 27 | 
 28 | # 5.Gives number of '/' in URL (URL_Depth)
 29 | def getDepth(url):
 30 |   s = urlparse(url).path.split('/')
 31 |   depth = 0
 32 |   for j in range(len(s)):
 33 |     if len(s[j]) != 0:
 34 |       depth = depth+1
 35 |   return depth
 36 | 
 37 | #listing shortening services
 38 | shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
 39 |                       r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
 40 |                       r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
 41 |                       r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
 42 |                       r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
 43 |                       r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
 44 |                       r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
 45 |                       r"tr\.im|link\.zip\.net"
 46 | 
 47 | # 8. Checking for Shortening Services in URL (Tiny_URL)
 48 | def tinyURL(url):
 49 |     match=re.search(shortening_services,url)
 50 |     if match:
 51 |         return 1
 52 |     else:
 53 |         return 0
 54 | 
 55 | # 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
 56 | def prefixSuffix(url):
 57 |     if '-' in urlparse(url).netloc:
 58 |         return 1            # phishing
 59 |     else:
 60 |         return 0            # legitimate
 61 | 
 62 | def no_of_dots(url):
 63 |   return url.count('.')
 64 | 
 65 | sensitiveWords = ["account", "confirm", "banking", "secure", "ebyisapi", "webscr", "signin", "mail",
 66 |                   "install", "toolbar", "backup", "paypal", "password", "username", "verify", "update",
 67 |                   "login", "support", "billing", "transaction", "security", "payment", "verify", "online",
 68 |                   "customer", "service", "accountupdate", "verification", "important", "confidential",
 69 |                   "limited", "access", "securitycheck", "verifyaccount", "information", "change", "notice"
 70 |                   "myaccount", "updateinfo", "loginsecure", "protect", "transaction", "identity", "member"
 71 |                   "personal", "actionrequired", "loginverify", "validate", "paymentupdate", "urgent"]
 72 | 
 73 | def sensitive_word(url):
 74 |   domain = urlparse(url).netloc
 75 |   for i in sensitiveWords:
 76 |     if i in domain:
 77 |       return 1
 78 |   return 0
 79 | 
 80 | 
 81 | def has_unicode(url):
 82 |     # Parse the URL
 83 |     parsed_url = urlparse(url)
 84 | 
 85 |     # Get the netloc part of the URL
 86 |     netloc = parsed_url.netloc
 87 | 
 88 |     # Decode the netloc using IDNA encoding
 89 |     decoded_netloc = netloc.encode('latin1').decode('idna')
 90 | 
 91 |     # Unquote the decoded netloc
 92 |     unquoted_netloc = unquote(decoded_netloc)
 93 | 
 94 |     # Compare the unquoted netloc with the original netloc
 95 |     if unquoted_netloc != netloc:
 96 |         return 1
 97 | 
 98 |     return 0
 99 | 
100 | # 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
def domainAge(domain_name):
    """Score the registration span of a WHOIS record: 1 = suspicious, 0 = not.

    Args:
        domain_name: object exposing ``creation_date`` and ``expiration_date``
            attributes; each may be a ``datetime``, a ``'%Y-%m-%d'`` string,
            a list, or ``None``.

    Returns:
        1 if either date is missing, a list, or an unparseable string, or if
        the span between creation and expiration is under 6 months (counted
        as 30-day months); otherwise 0.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    # Parse each date on its own. Parsing both whenever either one was a
    # string made a (str, datetime) pair fail inside strptime and be scored
    # as suspicious regardless of the real span.
    try:
        if isinstance(creation_date, str):
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
        if isinstance(expiration_date, str):
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
    except ValueError:
        return 1

    if creation_date is None or expiration_date is None:
        return 1
    if isinstance(creation_date, list) or isinstance(expiration_date, list):
        return 1

    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0
121 | 
122 | # 14.End time of domain: The difference between termination time and current time (Domain_End)
def domainEnd(domain_name):
    """Score the time between now and a WHOIS record's expiration date.

    Args:
        domain_name: object exposing an ``expiration_date`` attribute that may
            be a ``datetime``, a ``'%Y-%m-%d'`` string, a list, or ``None``.

    Returns:
        1 if the date is missing, a list, or an unparseable string. Otherwise
        0 when the absolute distance between the expiration date and now is
        under 6 months (counted as 30-day months), and 1 when it is longer.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1

    if expiration_date is None or isinstance(expiration_date, list):
        return 1

    # Take "now" in the expiration date's own timezone (None for naive values)
    # so aware and naive datetimes are never mixed in the subtraction.
    today = datetime.now(getattr(expiration_date, "tzinfo", None))
    remaining_days = abs((expiration_date - today).days)
    return 0 if (remaining_days / 30) < 6 else 1
142 | 
143 | # 15. IFrame Redirection (iFrame)
def iframe(response):
    """Return 1 for an empty-string response or when the pattern finds nothing, else 0.

    ``response`` is either ``""`` or an object with a ``text`` attribute.
    """
    if response == "":
        return 1

    # NOTE(review): the pattern is a bracketed character class, so it matches
    # any single one of the characters < > | i f r a m e B o d rather than the
    # tags "<iframe>" / "<frameBorder>"; almost any text therefore yields 0.
    # Left as-is because changing it changes the feature value fed to the
    # saved model - presumably trained with this behaviour; confirm first.
    hits = re.findall(r"[<iframe>|<frameBorder>]", response.text)
    return 0 if hits else 1
152 | 
153 | # 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response):
    """Return 1 if the page has an inline script using ``onmouseover``, else 0.

    ``response`` is either ``""`` or an object with a ``text`` attribute.
    An empty-string response, or any error while reading or searching the
    text, is scored as 1.
    """
    if response == "":
        return 1
    try:
        found = re.findall("<script>.+onmouseover.+</script>", response.text)
    except Exception:
        # Narrower than a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate; ordinary failures stay scored as suspicious.
        return 1
    return 1 if found else 0
165 | 
166 | # 18.Checks the number of forwardings (Web_Forwards)
def forwarding(response):
    """Return 1 for an empty-string response or more than two ``history`` entries, else 0."""
    if response == "":
        return 1
    return 1 if len(response.history) > 2 else 0
175 | 


--------------------------------------------------------------------------------
/featureExtractor.py:
--------------------------------------------------------------------------------
 1 | import whois
 2 | from urllib.parse import urlparse
 3 | import httpx
 4 | import pickle as pk
 5 | import pandas as pd
 6 | import extractorFunctions as ef
 7 | 
 8 | #Function to extract features
 9 | def featureExtraction(url):
10 | 
11 |   features = []
12 |   #Address bar based features (12)
13 |   features.append(ef.getLength(url))
14 |   features.append(ef.getDepth(url))
15 |   features.append(ef.tinyURL(url))
16 |   features.append(ef.prefixSuffix(url))
17 |   features.append(ef.no_of_dots(url))
18 |   features.append(ef.sensitive_word(url))
19 | 
20 | 
21 |   domain_name = ''
22 |   #Domain based features (4)
23 |   dns = 0
24 |   try:
25 |     domain_name = whois.whois(urlparse(url).netloc)
26 |   except:
27 |     dns = 1
28 | 
29 |   features.append(1 if dns == 1 else ef.domainAge(domain_name))
30 |   features.append(1 if dns == 1 else ef.domainEnd(domain_name))
31 | 
32 |   # HTML & Javascript based features (4)
33 |   dom = []
34 |   try:
35 |     response = httpx.get(url)
36 |   except:
37 |     response = ""
38 | 
39 |   dom.append(ef.iframe(response))
40 |   dom.append(ef.mouseOver(response))
41 |   dom.append(ef.forwarding(response))
42 | 
43 |   features.append(ef.has_unicode(url)+ef.haveAtSign(url)+ef.havingIP(url))
44 | 
45 |   with open('model/pca_model.pkl', 'rb') as file:
46 |     pca = pk.load(file)
47 | 
48 |   #converting the list to dataframe
49 |   feature_names = ['URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix', 'No_Of_Dots', 'Sensitive_Words',
50 |                        'Domain_Age', 'Domain_End', 'Have_Symbol','domain_att']
51 |   dom_pd = pd.DataFrame([dom], columns = ['iFrame','Web_Forwards','Mouse_Over'])
52 |   features.append(pca.transform(dom_pd)[0][0])
53 | 
54 |   row = pd.DataFrame([features], columns= feature_names)
55 | 
56 |   return row


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from featureExtractor import featureExtraction
 2 | from pycaret.classification import load_model, predict_model
 3 | 
 4 | model = load_model('model/phishingdetection')
 5 | 
 6 | 
 7 | def predict(url):
 8 |     data = featureExtraction(url)
 9 |     result = predict_model(model, data=data)
10 |     
11 |     # Get the prediction score for the positive class (Phishing)
12 |     prediction_score = result['prediction_score'][0]  
13 |     prediction_label = result['prediction_label'][0]  
14 |     # domain_age = result['Domain_Age'][0]  
15 |     # print('Result -> ', url)
16 |     
17 |     return {
18 |         'prediction_label': prediction_label,
19 |         'prediction_score': prediction_score * 100,
20 |     }
21 | 
if __name__ == "__main__":
    # Sample URLs: the two phishing examples first, then the two real sites.
    sample_urls = (
        'https://bafybeifqd2yktzvwjw5g42l2ghvxsxn76khhsgqpkaqfdhnqf3kiuiegw4.ipfs.dweb.link/',
        'http://about-ads-microsoft-com.o365.frc.skyfencenet.com',
        'https://chat.openai.com',
        'https://github.com/',
    )

    for sample in sample_urls:
        print(predict(sample))


--------------------------------------------------------------------------------
/model/pca_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sannjayy/python-phishing-url-detection/3ee3ea4d54fc698d0c67e7088a8a42a2bf723e9c/model/pca_model.pkl


--------------------------------------------------------------------------------
/model/phishingdetection.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sannjayy/python-phishing-url-detection/3ee3ea4d54fc698d0c67e7088a8a42a2bf723e9c/model/phishingdetection.pkl


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | anyio==4.3.0
  2 | asttokens==2.4.1
  3 | attrs==23.2.0
  4 | blinker==1.7.0
  5 | category-encoders==2.6.3
  6 | certifi==2024.2.2
  7 | charset-normalizer==3.3.2
  8 | click==8.1.7
  9 | cloudpickle==3.0.0
 10 | colorama==0.4.6
 11 | comm==0.2.2
 12 | contourpy==1.2.1
 13 | cycler==0.12.1
 14 | Cython==3.0.10
 15 | dash==2.16.1
 16 | dash-core-components==2.0.0
 17 | dash-html-components==2.0.0
 18 | dash-table==5.0.0
 19 | decorator==5.1.1
 20 | deprecation==2.1.0
 21 | executing==2.0.1
 22 | fastjsonschema==2.19.1
 23 | Flask==3.0.3
 24 | fonttools==4.51.0
 25 | h11==0.14.0
 26 | httpcore==1.0.5
 27 | httpx==0.27.0
 28 | idna==3.7
 29 | imbalanced-learn==0.12.2
 30 | importlib_metadata==7.1.0
 31 | ipython==8.23.0
 32 | ipywidgets==8.1.2
 33 | itsdangerous==2.2.0
 34 | jedi==0.19.1
 35 | Jinja2==3.1.3
 36 | joblib==1.3.2
 37 | jsonschema==4.21.1
 38 | jsonschema-specifications==2023.12.1
 39 | jupyter_core==5.7.2
 40 | jupyterlab_widgets==3.0.10
 41 | kaleido==0.2.1
 42 | kiwisolver==1.4.5
 43 | lightgbm==4.3.0
 44 | llvmlite==0.42.0
 45 | MarkupSafe==2.1.5
 46 | matplotlib==3.7.5
 47 | matplotlib-inline==0.1.7
 48 | nbformat==5.10.4
 49 | nest-asyncio==1.6.0
 50 | numba==0.59.1
 51 | numpy==1.26.4
 52 | orjson==3.10.1
 53 | packaging==24.0
 54 | pandas==2.1.4
 55 | parso==0.8.4
 56 | patsy==0.5.6
 57 | pillow==10.3.0
 58 | platformdirs==4.2.1
 59 | plotly==5.21.0
 60 | plotly-resampler==0.10.0
 61 | pmdarima==2.0.4
 62 | prompt-toolkit==3.0.43
 63 | psutil==5.9.8
 64 | pure-eval==0.2.2
 65 | pycaret==3.3.1
 66 | Pygments==2.17.2
 67 | pyod==1.1.3
 68 | pyparsing==3.1.2
 69 | python-dateutil==2.9.0.post0
 70 | pytz==2024.1
 71 | pywin32==306
 72 | referencing==0.35.0
 73 | requests==2.31.0
 74 | retrying==1.3.4
 75 | rpds-py==0.18.0
 76 | schemdraw==0.15
 77 | scikit-base==0.7.7
 78 | scikit-learn==1.4.2
 79 | scikit-plot==0.3.7
 80 | scipy==1.11.4
 81 | six==1.16.0
 82 | sktime==0.26.0
 83 | sniffio==1.3.1
 84 | stack-data==0.6.3
 85 | statsmodels==0.14.2
 86 | tbats==1.1.3
 87 | tenacity==8.2.3
 88 | threadpoolctl==3.4.0
 89 | tqdm==4.66.2
 90 | traitlets==5.14.3
 91 | tsdownsample==0.1.3
 92 | typing_extensions==4.11.0
 93 | tzdata==2024.1
 94 | urllib3==2.2.1
 95 | wcwidth==0.2.13
 96 | Werkzeug==3.0.2
 97 | whois==1.20240129.2
 98 | widgetsnbextension==4.0.10
 99 | xxhash==3.4.1
100 | yellowbrick==1.5
101 | zipp==3.18.1
102 | 


--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <meta charset="UTF-8" />
 5 |     <meta http-equiv="X-UA-Compatible" content="IE=edge" />
 6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
 7 |     <meta name="description" content="URL Safety Checker." />
 8 |     <meta name="author" content="SANJAY SIKDAR" />
 9 |     <link rel="icon" href="https://znas.in/favicon.png" type="image/png" />
10 |     <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous" />
11 |     <link rel="preconnect" href="https://fonts.googleapis.com" />
12 |     <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
13 |     <link href="https://fonts.googleapis.com/css2?family=Ubuntu:ital,wght@0,300;0,400;0,500;0,700;1,300;1,400;1,500;1,700&display=swap" rel="stylesheet" />
14 |     <title>PHISHING URL DETECTION</title>
15 | 
16 |     <style>
17 |       body {
18 |         font-family: 'Ubuntu', sans-serif;
19 |         font-weight: 400;
20 |         font-style: normal;
21 |         background-color: #d0d0d05c;
22 |       }
23 |     </style>
24 |   </head>
25 | 
26 |   <body class="bg-">
27 |     <div class="container mt-5">
28 |       <div class="row justify-content-center">
29 |         <div class="col-12 text-center border-bottom-1">
30 |           <h1 class="fw-bold display-2 text-primary mb-5" style="font-weight: 700;">PHISHING URL DETECTION</h1>
31 |         </div>
32 |       </div>
33 |       <div class="row justify-content-center">
34 |         <div class="col-md-8 col-12 border shadow-sm bg-light p-3">
35 |           <form method="post" action="/">
36 |             <div class="mb-3">
37 |               <label for="url" class="form-label">URL:</label>
38 |               <input type="text" class="form-control form-control-lg" id="url" name="url" required placeholder="Paste URL"/>
39 |             </div>
40 |             <button type="submit" class="btn btn-success">Check now</button>
41 |           </form>
42 |         </div>
43 |       </div>
44 |       {% if url %}
45 |       <div class="row justify-content-center mt-3">
46 |         
47 |         <div class="col-md-8 col-12 mb-3 mt-3 border shadow-sm bg-light rounded p-3">
48 |             
49 |             {% if data.prediction_label == 1 %}            
50 |                 <h1 class='display-6 text-danger fw-bold'>URL does not look secure!</h1>
51 |             {% else %}
52 |                 <h1 class='display-6 text-success fw-bold'>URL Looks Secure!</h1>
53 |             {% endif %}
54 |             <p class="text-secondary fw-bold"> <small>{{url}}</small></p>
55 |         </div>
56 |       </div>
57 |       {% endif %}
58 |     </div>
59 | 
60 |     <!-- JavaScript -->
61 |     <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-YvpcrYf0tY3lHB60NNkmXc5s9fDVZLESaAA55NDzOxhy9GkcIdslK1eN7N6jIeHz" crossorigin="anonymous"></script>
62 | 
63 |    
64 |   </body>
65 | </html>
66 | 


--------------------------------------------------------------------------------