├── requirements.txt
├── README.md
├── LICENSE
├── .gitignore
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
requests==2.25.1

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
e-scraper
=========
Python script that extracts unique email addresses from a list of specified domains. It is designed to crawl each domain in full.

## Requirements
- Python >= 3.6 (the script uses f-strings and the `secrets` module)
- Install the required libraries with
```bash
$ pip install -r requirements.txt
```

## Usage
Provide the domains to crawl, one per line, in a file named "domains.txt" in the same directory as the script, then run:
```bash
$ python main.py
```
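
For reference, a minimal "domains.txt" might look like this (the domains below are placeholders, not part of this repository):
```
example.com
example.org
```
The script writes the unique addresses it finds for each domain to a `<domain>_emails.txt` file (e.g. `example.com_emails.txt`) in the working directory.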

## Authors
[0x4D-5A](https://github.com/0x4D-5A) - *Initial work*

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 0x4D-5A

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
from time import sleep
from secrets import SystemRandom
from re import compile as regex_compile, IGNORECASE as regex_IGNORECASE, MULTILINE as regex_MULTILINE
from requests import Session, packages, adapters
from requests.adapters import TimeoutSauce
# Import requests' own ConnectionError so the except clause in do_get() actually catches it.
from requests.exceptions import ConnectionError, Timeout

MAX_TIMEOUT_SECONDS = 6
DOMAINS_FILE = 'domains.txt'
INCLUDE_SUBDOMAINS = False

class CustomTimeout(TimeoutSauce):
    """Give every request a default connect/read timeout."""
    def __init__(self, *args, **kwargs):
        if kwargs.get("connect") is None:
            kwargs["connect"] = MAX_TIMEOUT_SECONDS
        if kwargs.get("read") is None:
            kwargs["read"] = MAX_TIMEOUT_SECONDS
        super().__init__(*args, **kwargs)

adapters.TimeoutSauce = CustomTimeout
#stackoverflow.com/questions/45267003/python-requests-hanging-freezing
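
# Note (alternative approach, not used here): instead of patching TimeoutSauce
# globally, the same effect could be achieved per request by passing an explicit
# timeout, e.g. session.get(url, timeout=(MAX_TIMEOUT_SECONDS, MAX_TIMEOUT_SECONDS)).
# The monkeypatch keeps every call site below unchanged.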

def rand_int(min_:int, max_:int):
    # Cryptographically seeded RNG; avoids predictable header fingerprints.
    rng = SystemRandom()
    return rng.randrange(min_, max_ + 1)

def get_headers():
    # Build request headers around a commonly accepted Chrome user-agent,
    # randomising the platform so requests are less likely to be blocked.

    random_platform = rand_int(1, 3)
    is_mobile = False

    if random_platform == 1:
        platform = 'Windows'
    elif random_platform == 2:
        platform = 'macOS'
    else:
        platform = 'iOS'
        is_mobile = True

    if platform == 'Windows':
        rand_win_vers = rand_int(1, 3)
        if rand_win_vers == 1: #8.1
            win_ver = '6.3'
            platform_version = '0.3.0'
        elif rand_win_vers == 2: #10
            win_ver = '10.0'
            r_major = rand_int(1, 3)
            if r_major == 1:
                platform_version = '7.0.0' #Windows 10 1809
            elif r_major == 2:
                platform_version = '8.0.0' #Windows 10 1903 | 1909
            elif r_major == 3:
                platform_version = '10.0.0' #Windows 10 2004 | 20H2 | 21H1

        elif rand_win_vers == 3: #11
            win_ver = '10.0'
            platform_version = '15.0.0'
        else:
            win_ver = '10.0' #11
            platform_version = '15.0.0'

        chrome_agent = f"Mozilla/5.0 (Windows NT {win_ver}; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    elif platform == 'macOS':
        r_macos = rand_int(1, 4)
        if r_macos == 1:
            macos_version = '10_15_7'

            r_big = rand_int(1, 2)
            if r_big == 1:
                platform_version = '12.5.1' #Monterey
            else:
                platform_version = '11.6.2' #Big Sur
        elif r_macos == 2:
            macos_version = '10_15_6' #Catalina
            platform_version = '10.15.6'
        elif r_macos == 3:
            macos_version = '10_14_6' #Mojave
            platform_version = '10.14.6'
        elif r_macos == 4:
            macos_version = '10_13_6' #High Sierra
            platform_version = '10.13.6'
        else:
            macos_version = '10_15_7'
            platform_version = '12.5.1'

        chrome_agent = f"Mozilla/5.0 (Macintosh; Intel Mac OS X {macos_version}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    else:
        r_device = rand_int(1, 2)
        if r_device == 1:
            ios_device = 'iPhone; CPU iPhone'
        else:
            ios_device = 'iPad; CPU'

        r_ios = rand_int(1, 4)
        if r_ios == 1:
            ios_version = '16_1'
        elif r_ios == 2:
            ios_version = '16_0'
        elif r_ios == 3:
            ios_version = '15_7'
        elif r_ios == 4:
            ios_version = '15_6'
        else:
            ios_version = '16_1'

        chrome_agent = f"Mozilla/5.0 ({ios_device} OS {ios_version} like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/119.0.6045.109 Mobile/15E148 Safari/604.1"

    if not is_mobile:
        # Desktop Chrome: client-hint versions are kept consistent with the Chrome 119 user-agent above.
        c_headers = {'Upgrade-Insecure-Requests': '1',
                     'User-Agent': chrome_agent,
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                     'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not=A?Brand";v="24"',
                     'sec-ch-ua-mobile': '?0',
                     'sec-ch-ua-full-version': '"119.0.6045.109"',
                     'sec-ch-ua-arch': '"x86"',
                     'sec-ch-ua-platform': f'"{platform}"',
                     'sec-ch-ua-platform-version': f'"{platform_version}"',
                     'sec-ch-ua-model': '""',
                     'sec-ch-ua-bitness': '"64"',
                     'sec-ch-ua-wow64': '?0',
                     'sec-ch-ua-full-version-list': '"Google Chrome";v="119.0.6045.109", "Chromium";v="119.0.6045.109", "Not=A?Brand";v="24.0.0.0"',
                     'sec-ch-prefers-color-scheme': 'light',
                     'Sec-Fetch-Site': 'none',
                     'Sec-Fetch-Mode': 'navigate',
                     'Sec-Fetch-User': '?1',
                     'Sec-Fetch-Dest': 'document',
                     'Accept-Encoding': 'gzip, deflate',
                     'Accept-Language': 'en-US,en;q=0.9'}
    else:
        c_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                     'Accept-Language': 'en-US,en;q=0.9',
                     'Accept-Encoding': 'gzip, deflate',
                     'User-Agent': chrome_agent}

    return c_headers

def do_get(session, get_url:str, proxy:dict, headers:dict):

    if headers:
        session.headers.update(headers)

    try:
        resp = session.get(get_url, verify=False, allow_redirects=True, proxies=proxy)

        if resp.status_code == 429: #Too Many Requests
            # Retry-After may be missing or an HTTP-date; fall back to a fixed delay.
            retry_after = resp.headers.get('Retry-After', '')
            retry_after = int(retry_after) if retry_after.isdigit() else 30
            print(f'[-] Retry-After: {retry_after} - {get_url}')
            sleep(retry_after + 10)
            resp = session.get(get_url, verify=False, allow_redirects=True, proxies=proxy)

        if resp.status_code == 200:
            return resp
    except ConnectionError:
        print(f'[-] {get_url} (ConnectionError)')
    except Timeout:
        print(f'[-] {get_url} (Timeout)')
    except Exception as e:
        print(f'[-] {str(e)} on: {get_url}')

    return False
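
# Example (illustrative): do_get(session, 'https://example.com', None, get_headers())
# returns the Response object only for an HTTP 200; on a timeout, connection error,
# or any other status code it prints a '[-]' line and returns False.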

def get_page_links(base_url:str, domain:str, resp, disallowed:tuple, regex_href):

    # Search only on html pages.
    content_type = resp.headers.get('Content-Type')
    if content_type is None or not content_type.startswith('text/html'):
        return False

    links = set()
    regex_links = regex_href.findall(resp.text)

    for href in regex_links:

        href = href[1]
        if not href or href == '#':
            continue

        #Full link
        if href.startswith(base_url):
            links.add(href)
            continue

        #Absolute link
        if href.startswith("/"):

            #Skip it if it's Disallowed in robots.txt
            if href.startswith(disallowed):
                continue

            links.add(base_url + href)
            continue

        if INCLUDE_SUBDOMAINS:
            #Keep the link only if its hostname really is a subdomain of the current domain
            sp1 = href.split('://')
            if len(sp1) != 1:
                hostname = sp1[1].split('/')[0]
                if hostname.endswith('.' + domain):
                    links.add(href)

    return list(links)

def extract_emails(file, regex_emails, text:str, domain_emails:set, tlds):
    all_emails = set(regex_emails.findall(text))
    parsed_emails = set()

    for email in all_emails:
        email = email.lower()
        if email not in domain_emails:
            if tlds:
                last_dot = email.rfind('.')

                if last_dot == -1: #rfind() returns -1 when no dot is found
                    continue

                tld = email[last_dot + 1:]
                if tld not in tlds:
                    continue

            file.write(email + '\n')
            parsed_emails.add(email)

    file.flush()

    return parsed_emails
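
# Worked example (illustrative): if the page text passed in contains "Contact: Admin@Example.COM",
# tlds includes 'com' and domain_emails is empty, extract_emails() writes
# "admin@example.com" to the output file and returns {"admin@example.com"}.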

def parse_domain(domain:str, regex_emails, regex_robots, regex_href, headers:dict, tlds):

    session = Session()

    #Firstly try to crawl via HTTP, as it's faster; switch to HTTPS if the site redirects there.
    base_url = f'http://{domain}'
    main_resp = do_get(session, base_url, None, headers)

    if main_resp:
        if main_resp.url.startswith('https://'):
            base_url = f'https://{domain}'
    else:
        base_url = f'https://{domain}'
        main_resp = do_get(session, base_url, None, headers)

        if main_resp is False:
            return False

    resp = do_get(session, base_url + '/robots.txt', None, headers)
    disallowed = tuple()

    if resp:
        #Parse robots.txt
        r_entries = regex_robots.findall(resp.text)
        for e in r_entries:
            url = e.strip()
            if url.endswith(('?', '$', '*')):
                url = url[:-1]

            #An empty Disallow entry means "allow everything", so it must not be added.
            if url and url not in disallowed:
                disallowed += (url, )

    resp = None
    links_visited = set()
    domain_emails = set()

    links = get_page_links(base_url, domain, main_resp, disallowed, regex_href)

    try:
        f = open(f'{domain}_emails.txt', 'w')
    except Exception as e:
        print(f'[-] {str(e)}')
        return False

    if links:
        for link in links:

            if link in links_visited:
                continue

            print(f'[+] {link}')
            resp = do_get(session, link, None, headers)

            links_visited.add(link)

            if resp:
                new_list = get_page_links(base_url, domain, resp, disallowed, regex_href)

                if new_list:
                    links.extend(new_list)
                    domain_emails.update(extract_emails(f, regex_emails, resp.text, domain_emails, tlds))
                else:
                    #If page has no href links and it's an html page then search in it too.
                    content_type = resp.headers.get('Content-Type')
                    if content_type and content_type.startswith('text/html'):
                        domain_emails.update(extract_emails(f, regex_emails, resp.text, domain_emails, tlds))

    else:
        #If main page has no href links and it's an html page then search in it too.
        content_type = main_resp.headers.get('Content-Type')
        if content_type and content_type.startswith('text/html'):
            domain_emails.update(extract_emails(f, regex_emails, main_resp.text, domain_emails, tlds))

    f.close()
    print(f'[+] Extracted {len(domain_emails)} emails from {domain}')

    return True

def get_iana_tlds(regex_href):

    session = Session()

    iana_path = '/domains/root/db'
    resp = do_get(session, f'http://www.iana.org{iana_path}', None, get_headers())

    if not resp:
        return False

    regex_links = regex_href.findall(resp.text)
    iana_len = len(iana_path) + 1

    if not regex_links:
        return False

    tlds = set()

    for href in regex_links:

        href = href[1]
        if not href or href == '#':
            continue

        if not href.startswith(iana_path + '/'):
            continue

        tld_html = href[iana_len:]

        # just to be sure
        if not tld_html.endswith('.html'):
            continue

        # 5 = len('.html')
        tld = tld_html[:-5]

        tlds.add(tld)

    if tlds:
        return tlds

    return False

def main():

    try:
        with open(DOMAINS_FILE, 'r') as f:
            domains = f.read().splitlines()
    except FileNotFoundError:
        print(f'[-] File {DOMAINS_FILE} not found!')
        return
    except PermissionError:
        print(f'[-] Insufficient permission to read {DOMAINS_FILE}!')
        return

    if len(domains) == 0:
        print(f'[-] File {DOMAINS_FILE} is empty!')
        return

    unique_domains = set()

    for domain in domains:
        domain = domain.strip()
        if '.' in domain:
            unique_domains.add(domain)

    if len(unique_domains) == 0:
        print(f'[-] File {DOMAINS_FILE} contains no domains!')
        return

    #verify=False is used everywhere, so silence urllib3's InsecureRequestWarning.
    packages.urllib3.disable_warnings()
    regex_pattern = regex_compile("\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\b", regex_IGNORECASE) #regular-expressions.info/email.html
    regex_robots = regex_compile("^Disallow[ ]*:(.*)", regex_MULTILINE)
    regex_href = regex_compile("<a\\s+(?:[^>]*?\\s+)?href=([\"'])(.*?)\\1", regex_IGNORECASE)

    tlds = get_iana_tlds(regex_href)

    for domain in unique_domains:
        parse_domain(domain, regex_pattern, regex_robots, regex_href, get_headers(), tlds)

    return


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        exit("Bye")

--------------------------------------------------------------------------------