├── requirements.txt ├── LICENSE.md ├── install.py ├── README.md └── webextractor.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | beautifulsoup4 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Sreeraj S Kurup 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | def is_termux(): 5 | return 'com.termux' in os.environ.get('PREFIX', '') 6 | 7 | choice = input('[+] To install press (Y) to uninstall press (N) >> ') 8 | run = os.system 9 | 10 | if choice.lower() == 'y': 11 | if is_termux(): 12 | prefix = os.environ.get('PREFIX') 13 | bin_path = os.path.join(prefix, 'bin', 'webextractor') 14 | share_path = os.path.join(prefix, 'share', 'webextractor') 15 | 16 | run('chmod 755 webextractor.py') 17 | run(f'mkdir -p {share_path}') 18 | run(f'cp webextractor.py {share_path}/webextractor.py') 19 | 20 | termux_launcher = f'#! /data/data/com.termux/files/usr/bin/sh\nexec python3 {share_path}/webextractor.py "$@"' 21 | with open(bin_path, 'w') as file: 22 | file.write(termux_launcher) 23 | 24 | run(f'chmod +x {bin_path} && chmod +x {share_path}/webextractor.py') 25 | print('''\n\n[+] WebExtractor installed successfully in Termux 26 | [+] Now just type \x1b[6;30;42mwebextractor\x1b[0m in terminal''') 27 | 28 | else: 29 | if os.geteuid() != 0: 30 | print("Please run as root or with sudo") 31 | sys.exit(1) 32 | 33 | run('chmod 755 webextractor.py') 34 | run('mkdir -p /usr/share/webextractor') 35 | run('cp webextractor.py /usr/share/webextractor/webextractor.py') 36 | 37 | linux_launcher = '#! 
/bin/sh\nexec python3 /usr/share/webextractor/webextractor.py "$@"'
38 |         with open('/usr/bin/webextractor', 'w') as file:
39 |             file.write(linux_launcher)
40 | 
41 |         run('chmod +x /usr/bin/webextractor && chmod +x /usr/share/webextractor/webextractor.py')
42 |         print('''\n\n[+] WebExtractor installed successfully on Linux
43 | [+] Now just type \x1b[6;30;42mwebextractor\x1b[0m in terminal''')
44 | 
45 | elif choice.lower() == 'n':
46 |     if is_termux():
47 |         prefix = os.environ.get('PREFIX')
48 |         run(f'rm -rf {prefix}/share/webextractor')
49 |         run(f'rm -f {prefix}/bin/webextractor')
50 |         print('[!] WebExtractor removed from Termux successfully')
51 |     else:
52 |         if os.geteuid() != 0:
53 |             print("Please run as root or with sudo")
54 |             sys.exit(1)
55 |         run('rm -rf /usr/share/webextractor')
56 |         run('rm -f /usr/bin/webextractor')
57 |         print('[!] WebExtractor removed from Linux successfully')
58 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## WebExtractor
2 | WebExtractor is a powerful **OSINT** and **ethical hacking tool developed in Python**. It is used to extract **email addresses**, **phone numbers**, and **links** (including visible, hidden, and social media links) from a target website. Designed for cybersecurity professionals, bug bounty hunters, and ethical hackers, it helps gather critical intelligence from web pages.
3 | 
4 | The extracted links can also assist in identifying potential vulnerabilities in the website, such as SQL injection (SQLi) points, open directories, exposed admin panels, or unvalidated input fields. These links serve as entry points for further vulnerability assessments and exploitation attempts during ethical hacking or penetration testing.
5 | 
6 | ## Features
7 | - **Extracts:**
8 |   - Emails
9 |   - Phone Numbers
10 |   - All Links (including visible, hidden, and social media links)
11 | 
12 | - Saves the extracted information for further analysis
13 | 
14 | - Clean and organized output
15 | 
16 | - Works on Linux and Termux
17 | 
18 | - Simple CLI interface
19 | 
20 | - Lightweight and fast
21 | 
22 | ## Compatibility
23 | - Linux (Debian, RHEL, Arch, etc.)
24 | - Termux (Android)
25 | 
26 | The tool automatically detects the environment and installs itself accordingly.
27 | 
28 | ## Disclaimer
29 | This tool is intended for educational and ethical OSINT purposes only. Use it only on websites you own or have explicit permission to analyze. The developer is not responsible for any misuse of this tool.
30 | 
31 | ## Installation
32 | **Step 1: Clone the Repository**
33 | ```bash
34 | git clone https://github.com/s-r-e-e-r-a-j/WebExtractor.git
35 | ```
36 | **Step 2: Navigate to the WebExtractor directory**
37 | ```bash
38 | cd WebExtractor
39 | ```
40 | **Step 3: Install Dependencies**
41 | ```bash
42 | pip3 install -r requirements.txt
43 | ```
44 | **Note for Kali, Parrot, Ubuntu 23.04+ users:**
45 | 
46 | If you see an error like:
47 | ```text
48 | error: externally-managed-environment
49 | ```
50 | then use:
51 | ```bash
52 | pip3 install -r requirements.txt --break-system-packages
53 | ```
54 | 
55 | **Step 4: Run Installer (Linux or Termux)**
56 | ```bash
57 | python3 install.py
58 | ```
59 | **Then type `y` to install**
60 | 
61 | **Step 5: Run the Tool**
62 | ```bash
63 | webextractor
64 | ```
65 | 
66 | ## Usage
67 | **Just run the tool:**
68 | ```bash
69 | webextractor
70 | ```
71 | 1. Provide a valid URL when prompted.
72 | 
73 | 2. Choose whether to extract email addresses, phone numbers, links (including visible, hidden, and social media links), or all three.
74 | 
75 | 3. The tool displays the **extracted emails, phone numbers, and links** in a clean format.
76 | 
77 | 4. Optionally save the extracted data to a folder.
78 | 
79 | ## Uninstallation
80 | **Run the install.py script**
81 | ```bash
82 | python3 install.py
83 | ```
84 | Then type `n` to uninstall
85 | ## License
86 | This project is licensed under the MIT License
87 | 
88 | 
89 | 
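90 | ## Using WebExtractor from Python
91 | The extraction helpers in `webextractor.py` (`clean_text`, `scrape_emails`, `scrape_phone_numbers`, and `scrape_links`) can also be imported into your own scripts. The snippet below is an illustrative sketch rather than an official API: it assumes the dependencies from `requirements.txt` are installed, that you run it from the cloned repository directory so `webextractor.py` is importable, and that `https://example.com` is only a placeholder for a site you are permitted to analyze.
92 | ```python
93 | import requests
94 | from bs4 import BeautifulSoup
95 | 
96 | # Reuse the extraction helpers defined in webextractor.py
97 | from webextractor import clean_text, scrape_emails, scrape_phone_numbers, scrape_links
98 | 
99 | url = "https://example.com"  # placeholder: scan only sites you have permission to analyze
100 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
101 | response = requests.get(url, headers=headers, timeout=10)
102 | response.raise_for_status()
103 | 
104 | # Plain text is used for emails and phone numbers, and raw HTML for links,
105 | # mirroring what scrape_website() does internally.
106 | soup = BeautifulSoup(response.text, 'html.parser')
107 | text = clean_text(soup.get_text())
108 | html = clean_text(response.text)
109 | 
110 | print("Emails:", scrape_emails(text, html))
111 | print("Phone numbers:", scrape_phone_numbers(text))
112 | print("Links:", scrape_links(html))
113 | ```
114 | This mirrors what the `webextractor` command does when you answer `y` to all three prompts.
115 | 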
--------------------------------------------------------------------------------
/webextractor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import re
4 | import os
5 | import sys
6 | import requests
7 | from bs4 import BeautifulSoup
8 | from urllib.parse import urlparse
9 | import unicodedata
10 | 
11 | # Terminal color codes
12 | class Colors:
13 |     BrightRed = "\033[1;91m"
14 |     BrightGreen = "\033[1;92m"
15 |     BrightYellow = "\033[1;93m"
16 |     BrightWhite = "\033[1;97m"
17 |     Cyan = "\033[1;96m"
18 |     Reset = "\033[0m"
19 | 
20 | # Display tool banner
21 | def display_banner():
22 |     os.system("clear")
23 |     print(f"{Colors.BrightGreen}")
24 |     print(r"""
25 | __          __  _     ______      _                  _
26 | \ \        / / | |   |  ____|    | |                | |
27 |  \ \  /\  / /__| |__ | |__  __  _| |_ _ __ __ _  ___| |_ ___  _ __
28 |   \ \/  \/ / _ \ '_ \|  __| \ \/ / __| '__/ _` |/ __| __/ _ \| '__|
29 |    \  /\  /  __/ |_) | |____ >  <| |_| | | (_| | (__| || (_) | |
30 |     \/  \/ \___|_.__/|______/_/\_\\__|_| \__,_|\___|\__\___/|_|
31 | 
32 |                       Developer: Sreeraj
33 | 
34 | """)
35 |     print(f"{Colors.Cyan}* Email, Phone Number, and Link Scraper Tool {Colors.Reset}")
36 |     print(f"{Colors.BrightYellow}* GitHub: https://github.com/s-r-e-e-r-a-j{Colors.Reset}\n")
37 | 
38 | # Check internet connectivity
39 | def check_connection():
40 |     print(f"{Colors.BrightWhite}[{Colors.BrightRed}!{Colors.BrightWhite}] {Colors.BrightRed}Checking internet connection...{Colors.Reset}")
41 |     try:
42 |         requests.get("http://google.com", timeout=10)
43 |         print(f"{Colors.BrightWhite}[{Colors.BrightYellow}*{Colors.BrightWhite}] {Colors.BrightYellow}Connected to the internet.{Colors.Reset}")
44 |     except requests.RequestException:
45 |         print(f"{Colors.BrightWhite}[{Colors.BrightRed}!{Colors.BrightWhite}] {Colors.BrightRed}No internet connection detected. 
Try again later.{Colors.Reset}") 46 | sys.exit(1) 47 | 48 | # URL format validation 49 | def is_valid_url(url): 50 | try: 51 | result = urlparse(url) 52 | return all([result.scheme, result.netloc]) 53 | except ValueError: 54 | return False 55 | 56 | def clean_text(text): 57 | text = re.sub(r'[\u200b\u200c\u200d\u200e\u200f\uFEFF]', '', text) 58 | text = text.replace('\u2024', '.').replace('\u2027', '.') 59 | text = unicodedata.normalize("NFKC", text) 60 | return text 61 | 62 | # Email extraction using regex 63 | def scrape_emails(text, html): 64 | text = clean_text(text) 65 | email_pattern = re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}', re.IGNORECASE) 66 | emails = set(email_pattern.findall(text)) | set(email_pattern.findall(html)) 67 | blocked_ext = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.ico') 68 | emails = {e for e in emails if not e.lower().endswith(blocked_ext)} 69 | return list(emails) 70 | 71 | # phone number extraction 72 | def scrape_phone_numbers(text): 73 | phone_pattern = re.compile(r'(\+?\d{1,3})?[\s\-\.]?\(?\d{2,4}\)?[\s\-\.]?\d{3,5}[\s\-\.]?\d{3,5}') 74 | phone_numbers = [match.group().strip() for match in re.finditer(phone_pattern, text) if len(match.group().strip()) >= 7] 75 | return list(set(phone_numbers)) # Remove duplicates 76 | 77 | # Link extraction using regex (links with and without query parameters) 78 | def scrape_links(text): 79 | link_pattern = re.compile(r'https?://[^\s"\']+', re.IGNORECASE) 80 | return list(set(link_pattern.findall(text))) 81 | 82 | # Main scraping logic 83 | def scrape_website(url, scrape_em, scrape_ph, scrape_ln): 84 | try: 85 | headers = { 86 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' 87 | } 88 | res = requests.get(url, headers=headers, timeout=10) 89 | res.raise_for_status() 90 | 91 | soup = BeautifulSoup(res.text, 'html.parser') 92 | text = clean_text(soup.get_text()) 93 | html = clean_text(res.text) 94 | results = {} 95 | 96 | if scrape_em: 97 | emails = scrape_emails(text,html) 98 | results['emails'] = emails 99 | print(f"\n{Colors.BrightYellow}[+] Emails Found:{Colors.Reset}") 100 | print("\n".join(emails) if emails else "None") 101 | 102 | if scrape_ph: 103 | phones = scrape_phone_numbers(text) 104 | results['phones'] = phones 105 | print(f"\n{Colors.BrightYellow}[+] Phone Numbers Found:{Colors.Reset}") 106 | print("\n".join(phones) if phones else "None") 107 | 108 | if scrape_ln: 109 | links = scrape_links(html) 110 | results['links'] = links 111 | print(f"\n{Colors.BrightYellow}[+] Links Found:{Colors.Reset}") 112 | print("\n".join(links) if links else "None") 113 | 114 | return results 115 | 116 | except requests.exceptions.RequestException as err: 117 | print(f"{Colors.BrightRed}[!] Error: {err}{Colors.Reset}") 118 | return {} 119 | 120 | # Save extracted results 121 | def save_results(results, folder): 122 | try: 123 | os.makedirs(folder, exist_ok=True) 124 | if results.get('emails'): 125 | with open(os.path.join(folder, 'emails.txt'), 'w') as f: 126 | f.write("\n".join(results['emails'])) 127 | if results.get('phones'): 128 | with open(os.path.join(folder, 'phones.txt'), 'w') as f: 129 | f.write("\n".join(results['phones'])) 130 | if results.get('links'): 131 | with open(os.path.join(folder, 'links.txt'), 'w') as f: 132 | f.write("\n".join(results['links'])) 133 | print(f"{Colors.BrightGreen}[+] Results saved in '{folder}'{Colors.Reset}") 134 | except Exception as e: 135 | print(f"{Colors.BrightRed}[!] 
Failed to save results: {e}{Colors.Reset}") 136 | 137 | # Main function with user interaction 138 | def main(): 139 | display_banner() 140 | check_connection() 141 | 142 | while True: 143 | url = input(f"{Colors.BrightGreen}[+] Enter a valid URL: {Colors.Reset}").strip() 144 | if is_valid_url(url): 145 | break 146 | print(f"{Colors.BrightRed}[!] Invalid URL. Try again.{Colors.Reset}") 147 | 148 | scrape_em = input(f"{Colors.BrightYellow}[?] Scrape emails? (y/n): {Colors.Reset}").lower() == 'y' 149 | scrape_ph = input(f"{Colors.BrightYellow}[?] Scrape phone numbers? (y/n): {Colors.Reset}").lower() == 'y' 150 | scrape_ln = input(f"{Colors.BrightYellow}[?] Scrape links? (y/n): {Colors.Reset}").lower() == 'y' 151 | 152 | if not any([scrape_em, scrape_ph, scrape_ln]): 153 | print(f"{Colors.BrightRed}[!] No options selected. Exiting...{Colors.Reset}") 154 | sys.exit(0) 155 | 156 | results = scrape_website(url, scrape_em, scrape_ph, scrape_ln) 157 | 158 | if any(results.values()): 159 | if input(f"{Colors.BrightGreen}[?] Save results to folder? (y/n): {Colors.Reset}").lower() == 'y': 160 | while True: 161 | folder = input(f"{Colors.BrightGreen}[+] Enter folder name: {Colors.Reset}").strip() 162 | if folder: 163 | save_results(results, folder) 164 | break 165 | print(f"{Colors.BrightRed}[!] Folder name cannot be empty.{Colors.Reset}") 166 | 167 | print(f"{Colors.BrightRed}[*] Exiting...{Colors.Reset}") 168 | 169 | if __name__ == "__main__": 170 | try: 171 | main() 172 | except KeyboardInterrupt: 173 | print(f"{Colors.BrightRed} User Aborted {Colors.Reset}"); 174 | sys.exit() 175 | --------------------------------------------------------------------------------