├── README.md
└── my_email2.py


/README.md:
--------------------------------------------------------------------------------
 1 | HOW TO RUN THE PROJECT 
 2 | 1. Initialization and URL Parsing
 3 | 2. Feature Extraction
 4 | UsingIp:
 5 | Checks if the URL contains an IP address.
 6 | Phishing (-1) if IP address is used, Legitimate (1) otherwise.
 7 | longUrl: Measures the length of the URL. 
 8 | Legitimate (1) for short URLs (<54 characters). Suspicious (0) for medium-length (54-75). Phishing (-1) for long URLs (>75). 
 9 | shortUrl: Checks for known URL shorteners (e.g., bit.ly, goo.gl). Phishing (-1) if detected, Legitimate (1) otherwise. 
10 | symbol: Verifies the presence of @ symbol in the URL. Phishing (-1) if present, Legitimate (1) otherwise.
11 | redirecting: Checks for multiple // after the protocol. Phishing (-1) if more than one, Legitimate (1) otherwise. 
12 | prefixSuffix: Detects hyphens (-) in the domain. Phishing (-1) if present, Legitimate (1) otherwise. 
13 | SubDomains: Counts the number of subdomains. Legitimate (1) for <= 2 subdomains. Suspicious (0) for 3 subdomains. Phishing (-1) for more than 3 subdomains.
14 | Hppts: Verifies if the URL uses HTTPS. Legitimate (1) for HTTPS, Phishing (-1) otherwise. 
15 | DomainRegLen: Checks the domain registration duration via WHOIS. Legitimate (1) for registrations >= 1 year. Phishing (-1) for shorter durations. 
16 | RequestURL: Examines external content (images, media) hosted outside the domain. Legitimate (1) if <22%, Suspicious (0) if 22-61%, Phishing (-1) if >61%.
17 | AnchorURL: Checks if anchor tags (`<a>`) contain unsafe links (e.g., #, javascript, mailto). Legitimate (1) if <31%, Suspicious (0) for 31-67%, Phishing (-1) for >67%.
18 | 3. Feature Weighting and Scoring: Each feature has a predefined weight based on its importance.
19 | The overall score is calculated by multiplying each feature's value by its weight and summing the results. 4. Classification: If the final score is negative, the URL is classified as Phishing.
20 | Otherwise, it is classified as Not Phishing.
21 | 


--------------------------------------------------------------------------------
/my_email2.py:
--------------------------------------------------------------------------------
  1 | import ipaddress
  2 | import requests
  3 | from bs4 import BeautifulSoup
  4 | from urllib.parse import urlparse
  5 | import whois
  6 | from datetime import date
  7 | 
  8 | 
  9 | class FeatureExtraction:
 10 |     def __init__(self, url):
 11 |         self.url = url
 12 |         self.domain = ""
 13 |         self.whois_response = None
 14 |         self.urlparse = urlparse(url)
 15 |         self.response = None
 16 |         self.soup = None
 17 | 
 18 |         try:
 19 |             self.response = requests.get(url, timeout=10)
 20 |             self.soup = BeautifulSoup(self.response.text, 'html.parser')
 21 |         except requests.RequestException:
 22 |             self.response = None
 23 |             self.soup = None
 24 | 
 25 |         try:
 26 |             self.domain = self.urlparse.netloc
 27 |             self.whois_response = whois.whois(self.domain)
 28 |         except:
 29 |             self.whois_response = None
 30 | 
 31 |     def UsingIp(self):
 32 |         try:
 33 |             ipaddress.ip_address(self.url)
 34 |             return -1
 35 |         except ValueError:
 36 |             return 1
 37 | 
 38 |     def longUrl(self):
 39 |         if len(self.url) < 54:
 40 |             return 1
 41 |         elif 54 <= len(self.url) <= 75:
 42 |             return 0
 43 |         return -1
 44 | 
 45 |     def shortUrl(self):
 46 |         shorteners = (
 47 |             "bit.ly", "goo.gl", "tinyurl.com", "ow.ly", "t.co", "bit.do", "cutt.ly", "is.gd", "v.gd", "shorte.st"
 48 |         )
 49 |         if any(shortener in self.url for shortener in shorteners):
 50 |             return -1
 51 |         return 1
 52 | 
 53 |     def symbol(self):
 54 |         return -1 if "@" in self.url else 1
 55 | 
 56 |     def redirecting(self):
 57 |         return -1 if self.url.find("//", 7) != -1 else 1
 58 | 
 59 |     def prefixSuffix(self):
 60 |         return -1 if "-" in self.domain else 1
 61 | 
 62 |     def SubDomains(self):
 63 |         subdomains = self.domain.split(".")
 64 |         if len(subdomains) <= 2:
 65 |             return 1
 66 |         elif len(subdomains) == 3:
 67 |             return 0
 68 |         return -1
 69 | 
 70 |     def Hppts(self):
 71 |         return 1 if self.urlparse.scheme == "https" else -1
 72 | 
 73 |     def DomainRegLen(self):
 74 |         try:
 75 |             if self.whois_response and self.whois_response.expiration_date:
 76 |                 expiration_date = (
 77 |                     self.whois_response.expiration_date[0]
 78 |                     if isinstance(self.whois_response.expiration_date, list)
 79 |                     else self.whois_response.expiration_date
 80 |                 )
 81 |                 if expiration_date:
 82 |                     reg_length = (expiration_date - date.today()).days
 83 |                     return 1 if reg_length >= 365 else -1
 84 |         except:
 85 |             pass
 86 |         return 0  # Treat missing registration length as neutral
 87 | 
 88 |     def Favicon(self):
 89 |         try:
 90 |             for link in self.soup.find_all('link', rel="icon", href=True):
 91 |                 if self.domain in link['href']:
 92 |                     return 1
 93 |             return -1
 94 |         except:
 95 |             return -1
 96 | 
 97 |     def RequestURL(self):
 98 |         try:
 99 |             total = len(self.soup.find_all(["img", "audio", "embed", "iframe"], src=True))
100 |             external = sum(1 for tag in self.soup.find_all(["img", "audio", "embed", "iframe"], src=True)
101 |                            if self.domain not in tag['src'])
102 |             percentage = (external / total) * 100 if total else 0
103 |             if percentage < 22:
104 |                 return 1
105 |             elif percentage <= 61:
106 |                 return 0
107 |             return -1
108 |         except:
109 |             return -1
110 | 
111 |     def AnchorURL(self):
112 |         try:
113 |             total = len(self.soup.find_all('a', href=True))
114 |             unsafe = sum(1 for a in self.soup.find_all('a', href=True)
115 |                          if "#" in a['href'] or "javascript" in a['href'].lower() or "mailto" in a['href'].lower())
116 |             percentage = (unsafe / total) * 100 if total else 0
117 |             if percentage < 31:
118 |                 return 1
119 |             elif percentage <= 67:
120 |                 return 0
121 |             return -1
122 |         except:
123 |             return -1
124 | 
125 |     def getFeaturesList(self):
126 |         return [
127 |             self.UsingIp(),
128 |             self.longUrl(),
129 |             self.shortUrl(),
130 |             self.symbol(),
131 |             self.redirecting(),
132 |             self.prefixSuffix(),
133 |             self.SubDomains(),
134 |             self.Hppts(),
135 |             self.DomainRegLen(),
136 |             self.Favicon(),
137 |             self.RequestURL(),
138 |             self.AnchorURL(),
139 |         ]
140 | 
141 |     def classify(self):
142 |         # Feature weights based on importance
143 |         feature_weights = [2, 1, 3, 1, 1, 1, 1, 3, 2, 2, 2, 1]
144 |         features = self.getFeaturesList()
145 | 
146 |         # Replace None values with neutral score (0)
147 |         features = [0 if f is None else f for f in features]
148 | 
149 |         # Calculate weighted score
150 |         score = sum(f * w for f, w in zip(features, feature_weights))
151 | 
152 |         # Threshold for classification
153 |         return "Phishing" if score < 0 else "Not Phishing"
154 | 
155 | 
if __name__ == "__main__":
    # Demo run: extract features for a handful of sample URLs and print the
    # resulting feature vector and verdict for each one.
    sample_urls = (
        "http://bit.ly/fake-google-login",     # expected: Phishing
        "https://mail.google.com",             # expected: Legitimate
        "http://bit.ly/fake-facebook-login",
        "http://bit.ly/fake-instagram-login",
    )
    separator = "-" * 50

    for target in sample_urls:
        # Constructing the extractor performs the page fetch and WHOIS lookup.
        extractor = FeatureExtraction(target)
        print(f"URL: {target}")
        print(f"Features: {extractor.getFeaturesList()}")
        print(f"Classification: {extractor.classify()}")
        print(separator)
171 | 


--------------------------------------------------------------------------------