├── README.md
└── my_email2.py

/README.md:
--------------------------------------------------------------------------------
 1 | HOW TO RUN THE PROJECT
 2 | 1. Initialization and URL Parsing
 3 | 2. Feature Extraction
 4 | UsingIp:
 5 | Checks if the URL contains an IP address.
 6 | Phishing (-1) if an IP address is used, Legitimate (1) otherwise.
 7 | longUrl: Measures the length of the URL.
 8 | Legitimate (1) for short URLs (<54 characters). Suspicious (0) for medium-length URLs (54-75 characters). Phishing (-1) for long URLs (>75 characters).
 9 | shortUrl: Checks for known URL shorteners (e.g., bit.ly, goo.gl). Phishing (-1) if detected, Legitimate (1) otherwise.
10 | symbol: Checks for the presence of the @ symbol in the URL. Phishing (-1) if present, Legitimate (1) otherwise.
11 | redirecting: Checks for an additional // after the protocol. Phishing (-1) if more than one is found, Legitimate (1) otherwise.
12 | prefixSuffix: Detects hyphens (-) in the domain. Phishing (-1) if present, Legitimate (1) otherwise.
13 | SubDomains: Counts the number of subdomains. Legitimate (1) for <= 2 subdomains. Suspicious (0) for 3 subdomains. Phishing (-1) for more than 3 subdomains.
14 | Hppts: Verifies that the URL uses HTTPS. Legitimate (1) for HTTPS, Phishing (-1) otherwise.
15 | DomainRegLen: Checks the domain registration duration via WHOIS. Legitimate (1) for registrations >= 1 year. Phishing (-1) for shorter durations.
16 | RequestURL: Examines external content (images, media) hosted outside the domain. Legitimate (1) if <22%, Suspicious (0) if 22-61%, Phishing (-1) if >61%.
17 | AnchorURL: Checks whether anchor tags (<a>) contain unsafe links (e.g., #, javascript, mailto). Legitimate (1) if <31%, Suspicious (0) for 31-67%, Phishing (-1) for >67%.
18 | 3. Feature Weighting and Scoring: Each feature has a predefined weight based on its importance.
19 | The overall score is calculated by multiplying each feature's value by its weight and summing the results. 4. Classification: If the final score is negative, the URL is classified as Phishing.
# (README.md, final line) Otherwise, it is classified as Not Phishing.
#
# ------------------------------------------------------------------------------
# /my_email2.py
# ------------------------------------------------------------------------------
"""Heuristic phishing-URL classifier.

Every feature method of :class:`FeatureExtraction` returns ``1``
(legitimate), ``0`` (suspicious / unknown) or ``-1`` (phishing).  The
features are multiplied by fixed weights and summed; a negative total is
classified as "Phishing", anything else as "Not Phishing".
"""

import ipaddress
from datetime import date, datetime
from urllib.parse import urljoin, urlparse


class FeatureExtraction:
    """Extract phishing heuristics from a URL and classify it.

    Attributes:
        url: The URL exactly as passed in.
        urlparse: Result of ``urllib.parse.urlparse(url)``.
        domain: The network location (``netloc``) of the URL.
        response: The ``requests`` response for the URL, or ``None`` if the
            request failed.
        soup: Parsed HTML of the response, or ``None`` if the request failed.
        whois_response: WHOIS record for the host, or ``None`` if the lookup
            failed.
    """

    # Host names of known URL-shortening services.
    _SHORTENERS = (
        "bit.ly", "goo.gl", "tinyurl.com", "ow.ly", "t.co", "bit.do",
        "cutt.ly", "is.gd", "v.gd", "shorte.st",
    )

    # One weight per entry of getFeaturesList(), in the same order.
    _FEATURE_WEIGHTS = (2, 1, 3, 1, 1, 1, 1, 3, 2, 2, 2, 1)

    # Tags whose ``src`` attribute is inspected by RequestURL().
    _EMBEDDED_TAGS = ["img", "audio", "embed", "iframe"]

    def __init__(self, url):
        """Fetch the page and the WHOIS record for *url* (both best-effort).

        Args:
            url: The URL to analyse.
        """
        # Imported here rather than at module level so that the purely
        # URL-based features stay usable where these third-party packages
        # are not installed.
        import requests
        import whois
        from bs4 import BeautifulSoup

        self.url = url
        self.urlparse = urlparse(url)
        self.domain = self.urlparse.netloc
        self.whois_response = None
        self.response = None
        self.soup = None

        try:
            self.response = requests.get(url, timeout=10)
            self.soup = BeautifulSoup(self.response.text, 'html.parser')
        except requests.RequestException:
            # Unreachable page: the content-based features fall back to -1.
            self.response = None
            self.soup = None

        try:
            # Query by bare host name: the netloc may carry a port or
            # credentials, which a WHOIS lookup cannot resolve.
            self.whois_response = whois.whois(self._hostname() or self.domain)
        except Exception:
            # WHOIS is best-effort; any failure is treated as "no record".
            self.whois_response = None

    def _hostname(self):
        """Return the lower-cased host of the URL, or ``""`` if none is found."""
        host = self.urlparse.hostname
        if not host and "://" not in self.url:
            # Scheme-less input such as "bit.ly/abc" or "10.0.0.1" is parsed
            # as a path; re-parse it as a network-path reference instead.
            try:
                host = urlparse("//" + self.url).hostname
            except ValueError:
                host = None
        return (host or "").lower()

    def _is_internal(self, link):
        """Return True if *link*, resolved against the page URL, is on its host."""
        own_host = self._hostname()
        try:
            target_host = urlparse(urljoin(self.url, link)).hostname
        except ValueError:
            return False
        if not own_host or not target_host:
            # Links without a host (data:, javascript:, ...) are not internal.
            return False
        return target_host == own_host or target_host.endswith("." + own_host)

    @staticmethod
    def _score_percentage(count, total, low, high):
        """Map ``count / total`` (as a percentage) onto 1 / 0 / -1.

        Returns 1 below *low*, 0 up to and including *high*, -1 above it.
        An empty page (``total == 0``) counts as 0 %.
        """
        percentage = (count / total) * 100 if total else 0
        if percentage < low:
            return 1
        if percentage <= high:
            return 0
        return -1

    def UsingIp(self):
        """Return -1 if the URL's host is a literal IP address, else 1."""
        # Test the host only: a full URL such as "http://1.2.3.4/" never
        # parses as an IP address.  Fall back to the raw string for input
        # from which no host could be extracted.
        candidate = self._hostname() or self.url
        try:
            ipaddress.ip_address(candidate)
        except ValueError:
            return 1
        return -1

    def longUrl(self):
        """Return 1 for URLs under 54 chars, 0 for 54-75 chars, -1 beyond."""
        length = len(self.url)
        if length < 54:
            return 1
        if length <= 75:
            return 0
        return -1

    def shortUrl(self):
        """Return -1 if the host belongs to a known URL shortener, else 1."""
        # Compare host names instead of searching the whole URL: a substring
        # test flags e.g. "microsoft.com" because it contains "t.co".
        host = self._hostname()
        for shortener in self._SHORTENERS:
            if host == shortener or host.endswith("." + shortener):
                return -1
        return 1

    def symbol(self):
        """Return -1 if the URL contains an ``@`` sign, else 1."""
        return -1 if "@" in self.url else 1

    def redirecting(self):
        """Return -1 if ``//`` occurs again after the scheme separator, else 1."""
        # Starting at index 7 skips the "//" of "http://" and "https://".
        return -1 if self.url.find("//", 7) != -1 else 1

    def prefixSuffix(self):
        """Return -1 if the domain contains a hyphen, else 1."""
        return -1 if "-" in self.domain else 1

    def SubDomains(self):
        """Score the number of dot-separated labels in the domain.

        Returns 1 for at most two labels, 0 for exactly three, -1 for more.
        """
        labels = self.domain.split(".")
        if len(labels) <= 2:
            return 1
        if len(labels) == 3:
            return 0
        return -1

    def Hppts(self):
        """Return 1 if the URL scheme is ``https``, else -1."""
        return 1 if self.urlparse.scheme == "https" else -1

    def DomainRegLen(self):
        """Score the time remaining until the domain registration expires.

        Returns 1 when the expiration date is at least 365 days away, -1 when
        it is closer, and 0 (neutral) when no usable date is available.
        """
        if not self.whois_response:
            return 0
        expiration = getattr(self.whois_response, "expiration_date", None)
        if isinstance(expiration, (list, tuple)):
            # Some registries report several dates; use the first one.
            expiration = expiration[0] if expiration else None
        if isinstance(expiration, datetime):
            # ``datetime - date`` raises TypeError, so compare calendar dates.
            expiration = expiration.date()
        if not isinstance(expiration, date):
            return 0  # Treat missing registration length as neutral
        remaining_days = (expiration - date.today()).days
        return 1 if remaining_days >= 365 else -1

    def Favicon(self):
        """Return 1 if a favicon is served from the page's own host, else -1.

        Also returns -1 when the page could not be fetched or parsed.
        """
        if self.soup is None:
            return -1
        try:
            for link in self.soup.find_all('link', rel="icon", href=True):
                # Relative hrefs resolve to the page's host and count as own.
                if self._is_internal(link['href']):
                    return 1
        except Exception:
            # Malformed markup: keep the pessimistic default.
            return -1
        return -1

    def RequestURL(self):
        """Score the share of embedded media loaded from other hosts.

        Returns 1 below 22 %, 0 up to 61 %, -1 above, and -1 when the page
        could not be fetched or parsed.
        """
        if self.soup is None:
            return -1
        try:
            sources = [
                tag['src']
                for tag in self.soup.find_all(self._EMBEDDED_TAGS, src=True)
            ]
            external = sum(1 for src in sources if not self._is_internal(src))
        except Exception:
            return -1
        return self._score_percentage(external, len(sources), 22, 61)

    def AnchorURL(self):
        """Score the share of anchors with unsafe targets.

        An anchor is unsafe when its href contains ``#``, ``javascript`` or
        ``mailto``.  Returns 1 below 31 %, 0 up to 67 %, -1 above, and -1
        when the page could not be fetched or parsed.
        """
        if self.soup is None:
            return -1
        try:
            hrefs = [a['href'] for a in self.soup.find_all('a', href=True)]
            unsafe = sum(
                1 for href in hrefs
                if "#" in href
                or "javascript" in href.lower()
                or "mailto" in href.lower()
            )
        except Exception:
            return -1
        return self._score_percentage(unsafe, len(hrefs), 31, 67)

    def getFeaturesList(self):
        """Return all feature scores in the order expected by classify()."""
        return [
            self.UsingIp(),
            self.longUrl(),
            self.shortUrl(),
            self.symbol(),
            self.redirecting(),
            self.prefixSuffix(),
            self.SubDomains(),
            self.Hppts(),
            self.DomainRegLen(),
            self.Favicon(),
            self.RequestURL(),
            self.AnchorURL(),
        ]

    def classify(self):
        """Return "Phishing" if the weighted feature sum is negative.

        Returns:
            ``"Phishing"`` or ``"Not Phishing"``.
        """
        # Replace None values with neutral score (0)
        features = [0 if f is None else f for f in self.getFeaturesList()]

        # Calculate weighted score
        score = sum(f * w for f, w in zip(features, self._FEATURE_WEIGHTS))

        # Threshold for classification
        return "Phishing" if score < 0 else "Not Phishing"


if __name__ == "__main__":
    # Test URLs
    urls = [
        "http://bit.ly/fake-google-login",  # Phishing
        "https://mail.google.com",  # Legitimate
        "http://bit.ly/fake-facebook-login",
        "http://bit.ly/fake-instagram-login",
    ]

    for url in urls:
        features = FeatureExtraction(url)
        print(f"URL: {url}")
        print(f"Features: {features.getFeaturesList()}")
        print(f"Classification: {features.classify()}")
        print("-" * 50)