├── README.md └── my_email2.py /README.md: -------------------------------------------------------------------------------- 1 | HOW TO RUN THE PROJECT 2 | 1. Initialization and URL Parsing 3 | 2. Feature Extraction 4 | UsingIp: 5 | Checks if the URL contains an IP address. 6 | Phishing (-1) if IP address is used, Legitimate (1) otherwise. 7 | longUrl: 8 | Measures the length of the URL. 9 | Legitimate (1) for short URLs (<54 characters). 10 | Suspicious (0) for medium-length (54-75). 11 | Phishing (-1) for long URLs (>75). 12 | shortUrl: 13 | Checks for known URL shorteners (e.g., bit.ly, goo.gl). 14 | Phishing (-1) if detected, Legitimate (1) otherwise. 15 | symbol: 16 | Verifies the presence of @ symbol in the URL. 17 | Phishing (-1) if present, Legitimate (1) otherwise. 18 | redirecting: 19 | Checks for multiple // after the protocol. 20 | Phishing (-1) if more than one, Legitimate (1) otherwise. 21 | prefixSuffix: 22 | Detects hyphens (-) in the domain. 23 | Phishing (-1) if present, Legitimate (1) otherwise. 24 | SubDomains: 25 | Counts the number of subdomains. 26 | Legitimate (1) for <= 2 subdomains. 27 | Suspicious (0) for 3 subdomains. 28 | Phishing (-1) for more than 3 subdomains. 29 | Hppts: 30 | Verifies if the URL uses HTTPS. 31 | Legitimate (1) for HTTPS, Phishing (-1) otherwise. 32 | DomainRegLen: 33 | Checks the domain registration duration via WHOIS. 34 | Legitimate (1) for registrations >= 1 year. 35 | Phishing (-1) for shorter durations. 36 | RequestURL: 37 | Examines external content (images, media) hosted outside the domain. 38 | Legitimate (1) if <22%, Suspicious (0) if 22-61%, Phishing (-1) if >61%. 39 | AnchorURL: 40 | Checks if anchor tags (<a>) contain unsafe links (e.g., #, javascript, mailto). 41 | Legitimate (1) if <31%, Suspicious (0) for 31-67%, Phishing (-1) for >67%. 42 | 3. Feature Weighting and Scoring 43 | Each feature has a predefined weight based on its importance. 
"""Heuristic phishing-URL classifier (my_email2.py).

Every feature method returns 1 (legitimate), 0 (suspicious or unknown) or
-1 (phishing).  The overall score is calculated by multiplying each
feature's value by its weight and summing them up.  If the final score is
negative, the URL is classified as Phishing.  Otherwise, it is classified
as Not Phishing.
"""

import ipaddress
from datetime import date, datetime
from urllib.parse import urlparse


class FeatureExtraction:
    """Extract phishing-related features from a URL and classify it.

    Construction fetches the page (``requests``) and the WHOIS record
    (``whois``) on a best-effort basis; when either lookup fails the
    corresponding attribute stays ``None`` and the features that depend on
    it fall back to a fixed score.

    Attributes:
        url: The URL exactly as given by the caller.
        urlparse: Result of ``urllib.parse.urlparse(url)``.
        domain: The network location (``netloc``) of the URL.
        response: The HTTP response, or ``None`` if the request failed.
        soup: Parsed HTML of the response, or ``None`` if the request failed.
        whois_response: WHOIS record of the domain, or ``None`` on failure.
    """

    # Host names of well-known URL shortening services.
    SHORTENERS = (
        "bit.ly", "goo.gl", "tinyurl.com", "ow.ly", "t.co", "bit.do",
        "cutt.ly", "is.gd", "v.gd", "shorte.st",
    )

    # Feature weights based on importance, in getFeaturesList() order.
    FEATURE_WEIGHTS = (2, 1, 3, 1, 1, 1, 1, 3, 2, 2, 2, 1)

    def __init__(self, url):
        """Parse *url*, then try to download the page and its WHOIS record.

        Args:
            url: The URL to analyse.
        """
        # The network stack is imported here, where it is used, so that the
        # URL-only heuristics stay importable without these packages.
        import requests
        import whois
        from bs4 import BeautifulSoup

        self.url = url
        self.urlparse = urlparse(url)
        self.domain = self.urlparse.netloc
        self.whois_response = None
        self.response = None
        self.soup = None

        try:
            self.response = requests.get(url, timeout=10)
            self.soup = BeautifulSoup(self.response.text, 'html.parser')
        except requests.RequestException:
            self.response = None
            self.soup = None

        try:
            self.whois_response = whois.whois(self.domain)
        except Exception:
            # WHOIS lookups fail in many library-specific ways; a missing
            # record is simply scored as neutral by DomainRegLen().
            self.whois_response = None

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _hostname(self):
        """Return the lower-cased host of the URL, or "" if there is none."""
        host = self.urlparse.hostname
        if not host and "://" not in self.url:
            # Scheme-less input such as "bit.ly/abc" parses as a bare path;
            # re-parse it as a network location.
            try:
                host = urlparse("//" + self.url).hostname
            except ValueError:
                host = None
        return host or ""

    def _is_internal(self, link):
        """Return True when *link* is relative or points at this domain."""
        try:
            target = urlparse(link).netloc
        except ValueError:
            return False
        return not target or self.domain.lower() in target.lower()

    def _attr_values(self, names, attr, **filters):
        """Return *attr* of every matching tag, or None if no page was fetched."""
        if self.soup is None:
            return None
        filters[attr] = True
        return [tag[attr] for tag in self.soup.find_all(names, **filters)]

    @staticmethod
    def _band(share, low, high):
        """Score a percentage: 1 below *low*, 0 up to *high*, otherwise -1."""
        if share < low:
            return 1
        if share <= high:
            return 0
        return -1

    # ------------------------------------------------------------------
    # Features
    # ------------------------------------------------------------------
    def UsingIp(self):
        """Return -1 if the URL's host is a literal IP address, else 1."""
        # The raw URL is also tried so a bare address ("10.0.0.1") still counts.
        for candidate in (self._hostname(), self.url):
            if not candidate:
                continue
            try:
                ipaddress.ip_address(candidate)
            except ValueError:
                continue
            return -1
        return 1

    def longUrl(self):
        """Return 1 for URLs under 54 chars, 0 for 54-75, -1 for longer."""
        length = len(self.url)
        if length < 54:
            return 1
        if length <= 75:
            return 0
        return -1

    def shortUrl(self):
        """Return -1 if the host is a known URL shortener, else 1."""
        host = self._hostname()
        # Compare whole host names: a substring test would make "t.co"
        # match unrelated hosts such as "microsoft.com".
        for shortener in self.SHORTENERS:
            if host == shortener or host.endswith("." + shortener):
                return -1
        return 1

    def symbol(self):
        """Return -1 if the URL contains an "@", else 1."""
        return -1 if "@" in self.url else 1

    def redirecting(self):
        """Return -1 if "//" occurs again after the protocol prefix, else 1."""
        # Index 7 is just past the "//" of both "http://" and "https://".
        return -1 if self.url.rfind("//") >= 7 else 1

    def prefixSuffix(self):
        """Return -1 if the domain contains a hyphen, else 1."""
        return -1 if "-" in self.domain else 1

    def SubDomains(self):
        """Return 1 for up to two dot-separated labels, 0 for three, else -1."""
        labels = len(self.domain.split("."))
        if labels <= 2:
            return 1
        if labels == 3:
            return 0
        return -1

    def Hppts(self):
        """Return 1 if the URL scheme is HTTPS, else -1."""
        return 1 if self.urlparse.scheme == "https" else -1

    def DomainRegLen(self):
        """Score the time left until the domain registration expires.

        Returns:
            1 if at least 365 days remain, -1 if fewer, and 0 (neutral) when
            no usable expiration date is available.
        """
        record = self.whois_response
        expiry = getattr(record, "expiration_date", None) if record else None
        if isinstance(expiry, (list, tuple)):
            expiry = expiry[0] if expiry else None
        # WHOIS dates are datetimes; subtracting date.today() from a datetime
        # raises TypeError, so reduce to a plain date first.
        if isinstance(expiry, datetime):
            expiry = expiry.date()
        if not isinstance(expiry, date):
            return 0  # Treat missing registration length as neutral
        remaining_days = (expiry - date.today()).days
        return 1 if remaining_days >= 365 else -1

    def Favicon(self):
        """Return 1 if an icon link is served from this site, else -1."""
        hrefs = self._attr_values('link', 'href', rel="icon")
        if not hrefs:
            return -1
        return 1 if any(self._is_internal(href) for href in hrefs) else -1

    def RequestURL(self):
        """Score the share of embedded media hosted outside the domain.

        Returns:
            1 below 22%, 0 up to 61%, -1 above; -1 if no page was fetched.
        """
        sources = self._attr_values(["img", "audio", "embed", "iframe"], 'src')
        if sources is None:
            return -1
        if not sources:
            return 1
        external = sum(1 for src in sources if not self._is_internal(src))
        return self._band(external / len(sources) * 100, 22, 61)

    def AnchorURL(self):
        """Score the share of anchors with "#", javascript or mailto targets.

        Returns:
            1 below 31%, 0 up to 67%, -1 above; -1 if no page was fetched.
        """
        hrefs = self._attr_values('a', 'href')
        if hrefs is None:
            return -1
        if not hrefs:
            return 1
        unsafe = sum(
            1 for href in hrefs
            if "#" in href
            or "javascript" in href.lower()
            or "mailto" in href.lower()
        )
        return self._band(unsafe / len(hrefs) * 100, 31, 67)

    def getFeaturesList(self):
        """Return all feature scores in the order FEATURE_WEIGHTS expects."""
        return [
            self.UsingIp(),
            self.longUrl(),
            self.shortUrl(),
            self.symbol(),
            self.redirecting(),
            self.prefixSuffix(),
            self.SubDomains(),
            self.Hppts(),
            self.DomainRegLen(),
            self.Favicon(),
            self.RequestURL(),
            self.AnchorURL(),
        ]

    def classify(self):
        """Return "Phishing" if the weighted feature sum is negative.

        Returns:
            "Phishing" or "Not Phishing".
        """
        # Replace None values with neutral score (0)
        features = [0 if f is None else f for f in self.getFeaturesList()]

        # Calculate weighted score
        score = sum(f * w for f, w in zip(features, self.FEATURE_WEIGHTS))

        # Threshold for classification
        return "Phishing" if score < 0 else "Not Phishing"


def main():
    """Classify a few sample URLs and print their features."""
    # Test URLs
    urls = [
        "http://bit.ly/fake-google-login",  # Phishing
        "https://mail.google.com",  # Legitimate
        "http://bit.ly/fake-facebook-login",
        "http://bit.ly/fake-instagram-login",
    ]

    for url in urls:
        features = FeatureExtraction(url)
        print(f"URL: {url}")
        print(f"Features: {features.getFeaturesList()}")
        print(f"Classification: {features.classify()}")
        print("-" * 50)


if __name__ == "__main__":
    main()