├── requirements.txt ├── LICENSE.md ├── install.py ├── README.md └── webextractor.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | beautifulsoup4 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Sreeraj S Kurup 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | def is_termux(): 5 | return 'com.termux' in os.environ.get('PREFIX', '') 6 | 7 | choice = input('[+] To install press (Y) to uninstall press (N) >> ') 8 | run = os.system 9 | 10 | if choice.lower() == 'y': 11 | if is_termux(): 12 | prefix = os.environ.get('PREFIX') 13 | bin_path = os.path.join(prefix, 'bin', 'webextractor') 14 | share_path = os.path.join(prefix, 'share', 'webextractor') 15 | 16 | run('chmod 755 webextractor.py') 17 | run(f'mkdir -p {share_path}') 18 | run(f'cp webextractor.py {share_path}/webextractor.py') 19 | 20 | termux_launcher = f'#! /data/data/com.termux/files/usr/bin/sh\nexec python3 {share_path}/webextractor.py "$@"' 21 | with open(bin_path, 'w') as file: 22 | file.write(termux_launcher) 23 | 24 | run(f'chmod +x {bin_path} && chmod +x {share_path}/webextractor.py') 25 | print('''\n\n[+] WebExtractor installed successfully in Termux 26 | [+] Now just type \x1b[6;30;42mwebextractor\x1b[0m in terminal''') 27 | 28 | else: 29 | if os.geteuid() != 0: 30 | print("Please run as root or with sudo") 31 | sys.exit(1) 32 | 33 | run('chmod 755 webextractor.py') 34 | run('mkdir -p /usr/share/webextractor') 35 | run('cp webextractor.py /usr/share/webextractor/webextractor.py') 36 | 37 | linux_launcher = '#! 
/bin/sh\nexec python3 /usr/share/webextractor/webextractor.py "$@"'
38 |         with open('/usr/bin/webextractor', 'w') as file:
39 |             file.write(linux_launcher)
40 | 
41 |         run('chmod +x /usr/bin/webextractor && chmod +x /usr/share/webextractor/webextractor.py')
42 |         print('''\n\n[+] WebExtractor installed successfully on Linux
43 | [+] Now just type \x1b[6;30;42mwebextractor\x1b[0m in terminal''')
44 | 
45 | elif choice.lower() == 'n':
46 |     if is_termux():
47 |         prefix = os.environ.get('PREFIX')
48 |         run(f'rm -rf {prefix}/share/webextractor')
49 |         run(f'rm -f {prefix}/bin/webextractor')
50 |         print('[!] WebExtractor removed from Termux successfully')
51 |     else:
52 |         if os.geteuid() != 0:
53 |             print("Please run as root or with sudo")
54 |             sys.exit(1)
55 |         run('rm -rf /usr/share/webextractor')
56 |         run('rm -f /usr/bin/webextractor')
57 |         print('[!] WebExtractor removed from Linux successfully')
58 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## WebExtractor
2 | WebExtractor is a powerful **OSINT** and **ethical hacking tool developed in Python**. It is used to extract **email addresses**, **phone numbers**, and **links** (including visible, hidden, and social media links) from a target website. Designed for cybersecurity professionals, bug bounty hunters, and ethical hackers, it helps gather critical intelligence from web pages.
3 | 
4 | The extracted links can also assist in identifying potential vulnerabilities in the website, such as SQL injection (SQLi) points, open directories, exposed admin panels, or unvalidated input fields. These links serve as entry points for further vulnerability assessments and exploitation attempts during ethical hacking or penetration testing.
5 | 
6 | ## Features
7 | - **Extracts:**
8 |   - Emails
9 |   - Phone Numbers
10 |   - All Links (including visible, hidden, and social media links)
11 | 
12 | - Saves the extracted information for further analysis
13 | 
14 | - Clean and organized output
15 | 
16 | - Works on Linux and Termux
17 | 
18 | - Simple CLI interface
19 | 
20 | - Lightweight and fast
21 | 
22 | ## Compatibility
23 | - Linux (Debian, RHEL, Arch, etc.)
24 | - Termux (Android)
25 | 
26 | The tool automatically detects the environment and installs itself accordingly.
27 | 
28 | ## Disclaimer
29 | This tool is intended for educational and ethical OSINT purposes only. Use it only on websites you own or have explicit permission to analyze. The developer is not responsible for any misuse of this tool.
30 | 
31 | ## Installation
32 | **Step 1: Clone the Repository**
33 | ```bash
34 | git clone https://github.com/s-r-e-e-r-a-j/WebExtractor.git
35 | ```
36 | **Step 2: Navigate to the WebExtractor directory**
37 | ```bash
38 | cd WebExtractor
39 | ```
40 | **Step 3: Install Dependencies**
41 | ```bash
42 | pip3 install -r requirements.txt
43 | ```
44 | **Note for Kali, Parrot, Ubuntu 23.04+ users:**
45 | 
46 | If you see an error like:
47 | ```text
48 | error: externally-managed-environment
49 | ```
50 | then use:
51 | ```bash
52 | pip3 install -r requirements.txt --break-system-packages
53 | ```
54 | 
55 | **Step 4: Run Installer (Linux or Termux)**
56 | ```bash
57 | python3 install.py
58 | ```
59 | **Then type `y` to install**
60 | 
61 | **Step 5: Run the Tool**
62 | ```bash
63 | webextractor
64 | ```
65 | 
66 | ## Usage
67 | **Just run the tool:**
68 | ```bash
69 | webextractor
70 | ```
71 | 1. Provide a valid URL when prompted.
72 | 
73 | 2. Choose whether to extract email addresses, phone numbers, links (including visible, hidden, and social media links), or all three.
74 | 
75 | 3. The tool displays the **extracted emails, phone numbers, and links** in a clean format.
76 | 
77 | 4. Optionally save the extracted data to a folder.
78 | 
79 | ## Uninstallation
80 | **Run the install.py script**
81 | ```bash
82 | python3 install.py
83 | ```
84 | Then type `n` to uninstall
85 | ## License
86 | This project is licensed under the MIT License
87 | 
88 | 
89 | 
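90 | ## Using WebExtractor from Python
91 | The extraction helpers in `webextractor.py` (`clean_text`, `scrape_emails`, `scrape_phone_numbers`, and `scrape_links`) can also be imported into your own scripts. The snippet below is an illustrative sketch rather than an official API: it assumes the dependencies from `requirements.txt` are installed, that you run it from the cloned repository directory so `webextractor.py` is importable, and that `https://example.com` is only a placeholder for a site you are permitted to analyze.
92 | ```python
93 | import requests
94 | from bs4 import BeautifulSoup
95 | 
96 | # Reuse the extraction helpers defined in webextractor.py
97 | from webextractor import clean_text, scrape_emails, scrape_phone_numbers, scrape_links
98 | 
99 | url = "https://example.com"  # placeholder: scan only sites you have permission to analyze
100 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
101 | response = requests.get(url, headers=headers, timeout=10)
102 | response.raise_for_status()
103 | 
104 | # Plain text is used for emails and phone numbers, and raw HTML for links,
105 | # mirroring what scrape_website() does internally.
106 | soup = BeautifulSoup(response.text, 'html.parser')
107 | text = clean_text(soup.get_text())
108 | html = clean_text(response.text)
109 | 
110 | print("Emails:", scrape_emails(text, html))
111 | print("Phone numbers:", scrape_phone_numbers(text))
112 | print("Links:", scrape_links(html))
113 | ```
114 | This mirrors what the `webextractor` command does when you answer `y` to all three prompts.
115 | 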
--------------------------------------------------------------------------------
/webextractor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import re
4 | import os
5 | import sys
6 | import requests
7 | from bs4 import BeautifulSoup
8 | from urllib.parse import urlparse
9 | import unicodedata
10 | 
11 | # Terminal color codes
12 | class Colors:
13 |     BrightRed = "\033[1;91m"
14 |     BrightGreen = "\033[1;92m"
15 |     BrightYellow = "\033[1;93m"
16 |     BrightWhite = "\033[1;97m"
17 |     Cyan = "\033[1;96m"
18 |     Reset = "\033[0m"
19 | 
20 | # Display tool banner
21 | def display_banner():
22 |     os.system("clear")
23 |     print(f"{Colors.BrightGreen}")
24 |     print(r"""
25 | __          __  _     ______      _                  _
26 | \ \        / / | |   |  ____|    | |                | |
27 |  \ \  /\  / /__| |__ | |__  __  _| |_ _ __ __ _  ___| |_ ___  _ __
28 |   \ \/  \/ / _ \ '_ \|  __| \ \/ / __| '__/ _` |/ __| __/ _ \| '__|
29 |    \  /\  /  __/ |_) | |____ >  <| |_| | | (_| | (__| || (_) | |
30 |     \/  \/ \___|_.__/|______/_/\_\\__|_| \__,_|\___|\__\___/|_|
31 | 
32 |                       Developer: Sreeraj
33 | 
34 | """)
35 |     print(f"{Colors.Cyan}* Email, Phone Number, and Link Scraper Tool {Colors.Reset}")
36 |     print(f"{Colors.BrightYellow}* GitHub: https://github.com/s-r-e-e-r-a-j{Colors.Reset}\n")
37 | 
38 | # Check internet connectivity
39 | def check_connection():
40 |     print(f"{Colors.BrightWhite}[{Colors.BrightRed}!{Colors.BrightWhite}] {Colors.BrightRed}Checking internet connection...{Colors.Reset}")
41 |     try:
42 |         requests.get("http://google.com", timeout=10)
43 |         print(f"{Colors.BrightWhite}[{Colors.BrightYellow}*{Colors.BrightWhite}] {Colors.BrightYellow}Connected to the internet.{Colors.Reset}")
44 |     except requests.RequestException:
45 |         print(f"{Colors.BrightWhite}[{Colors.BrightRed}!{Colors.BrightWhite}] {Colors.BrightRed}No internet connection detected. 
Try again later.{Colors.Reset}") 46 | sys.exit(1) 47 | 48 | # URL format validation 49 | def is_valid_url(url): 50 | try: 51 | result = urlparse(url) 52 | return all([result.scheme, result.netloc]) 53 | except ValueError: 54 | return False 55 | 56 | def clean_text(text): 57 | text = re.sub(r'[\u200b\u200c\u200d\u200e\u200f\uFEFF]', '', text) 58 | text = text.replace('\u2024', '.').replace('\u2027', '.') 59 | text = unicodedata.normalize("NFKC", text) 60 | return text 61 | 62 | # Email extraction using regex 63 | def scrape_emails(text, html): 64 | text = clean_text(text) 65 | email_pattern = re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}', re.IGNORECASE) 66 | emails = set(email_pattern.findall(text)) | set(email_pattern.findall(html)) 67 | blocked_ext = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.ico') 68 | emails = {e for e in emails if not e.lower().endswith(blocked_ext)} 69 | return list(emails) 70 | 71 | # phone number extraction 72 | def scrape_phone_numbers(text): 73 | phone_pattern = re.compile(r'(\+?\d{1,3})?[\s\-\.]?\(?\d{2,4}\)?[\s\-\.]?\d{3,5}[\s\-\.]?\d{3,5}') 74 | phone_numbers = [match.group().strip() for match in re.finditer(phone_pattern, text) if len(match.group().strip()) >= 7] 75 | return list(set(phone_numbers)) # Remove duplicates 76 | 77 | # Link extraction using regex (links with and without query parameters) 78 | def scrape_links(text): 79 | link_pattern = re.compile(r'https?://[^\s"\']+', re.IGNORECASE) 80 | return list(set(link_pattern.findall(text))) 81 | 82 | # Main scraping logic 83 | def scrape_website(url, scrape_em, scrape_ph, scrape_ln): 84 | try: 85 | headers = { 86 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' 87 | } 88 | res = requests.get(url, headers=headers, timeout=10) 89 | res.raise_for_status() 90 | 91 | soup = BeautifulSoup(res.text, 'html.parser') 92 | text = clean_text(soup.get_text()) 93 | html = clean_text(res.text) 94 | results = {} 95 | 96 | if scrape_em: 97 | emails = scrape_emails(text,html) 98 | results['emails'] = emails 99 | print(f"\n{Colors.BrightYellow}[+] Emails Found:{Colors.Reset}") 100 | print("\n".join(emails) if emails else "None") 101 | 102 | if scrape_ph: 103 | phones = scrape_phone_numbers(text) 104 | results['phones'] = phones 105 | print(f"\n{Colors.BrightYellow}[+] Phone Numbers Found:{Colors.Reset}") 106 | print("\n".join(phones) if phones else "None") 107 | 108 | if scrape_ln: 109 | links = scrape_links(html) 110 | results['links'] = links 111 | print(f"\n{Colors.BrightYellow}[+] Links Found:{Colors.Reset}") 112 | print("\n".join(links) if links else "None") 113 | 114 | return results 115 | 116 | except requests.exceptions.RequestException as err: 117 | print(f"{Colors.BrightRed}[!] Error: {err}{Colors.Reset}") 118 | return {} 119 | 120 | # Save extracted results 121 | def save_results(results, folder): 122 | try: 123 | os.makedirs(folder, exist_ok=True) 124 | if results.get('emails'): 125 | with open(os.path.join(folder, 'emails.txt'), 'w') as f: 126 | f.write("\n".join(results['emails'])) 127 | if results.get('phones'): 128 | with open(os.path.join(folder, 'phones.txt'), 'w') as f: 129 | f.write("\n".join(results['phones'])) 130 | if results.get('links'): 131 | with open(os.path.join(folder, 'links.txt'), 'w') as f: 132 | f.write("\n".join(results['links'])) 133 | print(f"{Colors.BrightGreen}[+] Results saved in '{folder}'{Colors.Reset}") 134 | except Exception as e: 135 | print(f"{Colors.BrightRed}[!] 
Failed to save results: {e}{Colors.Reset}") 136 | 137 | # Main function with user interaction 138 | def main(): 139 | display_banner() 140 | check_connection() 141 | 142 | while True: 143 | url = input(f"{Colors.BrightGreen}[+] Enter a valid URL: {Colors.Reset}").strip() 144 | if is_valid_url(url): 145 | break 146 | print(f"{Colors.BrightRed}[!] Invalid URL. Try again.{Colors.Reset}") 147 | 148 | scrape_em = input(f"{Colors.BrightYellow}[?] Scrape emails? (y/n): {Colors.Reset}").lower() == 'y' 149 | scrape_ph = input(f"{Colors.BrightYellow}[?] Scrape phone numbers? (y/n): {Colors.Reset}").lower() == 'y' 150 | scrape_ln = input(f"{Colors.BrightYellow}[?] Scrape links? (y/n): {Colors.Reset}").lower() == 'y' 151 | 152 | if not any([scrape_em, scrape_ph, scrape_ln]): 153 | print(f"{Colors.BrightRed}[!] No options selected. Exiting...{Colors.Reset}") 154 | sys.exit(0) 155 | 156 | results = scrape_website(url, scrape_em, scrape_ph, scrape_ln) 157 | 158 | if any(results.values()): 159 | if input(f"{Colors.BrightGreen}[?] Save results to folder? (y/n): {Colors.Reset}").lower() == 'y': 160 | while True: 161 | folder = input(f"{Colors.BrightGreen}[+] Enter folder name: {Colors.Reset}").strip() 162 | if folder: 163 | save_results(results, folder) 164 | break 165 | print(f"{Colors.BrightRed}[!] Folder name cannot be empty.{Colors.Reset}") 166 | 167 | print(f"{Colors.BrightRed}[*] Exiting...{Colors.Reset}") 168 | 169 | if __name__ == "__main__": 170 | try: 171 | main() 172 | except KeyboardInterrupt: 173 | print(f"{Colors.BrightRed} User Aborted {Colors.Reset}"); 174 | sys.exit() 175 | --------------------------------------------------------------------------------