├── requirements.txt
├── README.md
├── LICENSE
├── .gitignore
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
requests==2.25.1

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
e-scraper
=========
Python script that extracts unique email addresses from a list of specified domains. It is designed to crawl each domain in full.

## Requirements
- Python >= 3.6 (the script uses f-strings and the `secrets` module)
- Install the required libraries with
```bash
$ pip install -r requirements.txt
```

## Usage
Provide the domains to crawl, one per line, in a file named "domains.txt" in the same directory as the script, then run:
```bash
$ python main.py
```
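
For reference, a minimal "domains.txt" might look like this (the domains below are placeholders, not part of this repository):
```
example.com
example.org
```
The script writes the unique addresses it finds for each domain to a `<domain>_emails.txt` file (e.g. `example.com_emails.txt`) in the working directory.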

## Authors
[0x4D-5A](https://github.com/0x4D-5A) - *Initial work*

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 0x4D-5A

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
from time import sleep
from secrets import SystemRandom
from re import compile as regex_compile, IGNORECASE as regex_IGNORECASE, MULTILINE as regex_MULTILINE
from requests import Session, packages, adapters
from requests.adapters import TimeoutSauce
# Import requests' own ConnectionError so the except clause in do_get() actually catches it.
from requests.exceptions import ConnectionError, Timeout

MAX_TIMEOUT_SECONDS = 6
DOMAINS_FILE = 'domains.txt'
INCLUDE_SUBDOMAINS = False

class CustomTimeout(TimeoutSauce):
    """Give every request a default connect/read timeout."""
    def __init__(self, *args, **kwargs):
        if kwargs.get("connect") is None:
            kwargs["connect"] = MAX_TIMEOUT_SECONDS
        if kwargs.get("read") is None:
            kwargs["read"] = MAX_TIMEOUT_SECONDS
        super().__init__(*args, **kwargs)

adapters.TimeoutSauce = CustomTimeout
#stackoverflow.com/questions/45267003/python-requests-hanging-freezing
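
# Note (alternative approach, not used here): instead of patching TimeoutSauce
# globally, the same effect could be achieved per request by passing an explicit
# timeout, e.g. session.get(url, timeout=(MAX_TIMEOUT_SECONDS, MAX_TIMEOUT_SECONDS)).
# The monkeypatch keeps every call site below unchanged.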

def rand_int(min_:int, max_:int):
    # Cryptographically seeded RNG; avoids predictable header fingerprints.
    rng = SystemRandom()
    return rng.randrange(min_, max_ + 1)

def get_headers():
    # Build request headers around a commonly accepted Chrome user-agent,
    # randomising the platform so requests are less likely to be blocked.

    random_platform = rand_int(1, 3)
    is_mobile = False

    if random_platform == 1:
        platform = 'Windows'
    elif random_platform == 2:
        platform = 'macOS'
    else:
        platform = 'iOS'
        is_mobile = True

    if platform == 'Windows':
        rand_win_vers = rand_int(1, 3)
        if rand_win_vers == 1: #8.1
            win_ver = '6.3'
            platform_version = '0.3.0'
        elif rand_win_vers == 2: #10
            win_ver = '10.0'
            r_major = rand_int(1, 3)
            if r_major == 1:
                platform_version = '7.0.0' #Windows 10 1809
            elif r_major == 2:
                platform_version = '8.0.0' #Windows 10 1903 | 1909
            elif r_major == 3:
                platform_version = '10.0.0' #Windows 10 2004 | 20H2 | 21H1

        elif rand_win_vers == 3: #11
            win_ver = '10.0'
            platform_version = '15.0.0'
        else:
            win_ver = '10.0' #11
            platform_version = '15.0.0'

        chrome_agent = f"Mozilla/5.0 (Windows NT {win_ver}; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    elif platform == 'macOS':
        r_macos = rand_int(1, 4)
        if r_macos == 1:
            macos_version = '10_15_7'

            r_big = rand_int(1, 2)
            if r_big == 1:
                platform_version = '12.5.1' #Monterey
            else:
                platform_version = '11.6.2' #Big Sur
        elif r_macos == 2:
            macos_version = '10_15_6' #Catalina
            platform_version = '10.15.6'
        elif r_macos == 3:
            macos_version = '10_14_6' #Mojave
            platform_version = '10.14.6'
        elif r_macos == 4:
            macos_version = '10_13_6' #High Sierra
            platform_version = '10.13.6'
        else:
            macos_version = '10_15_7'
            platform_version = '12.5.1'

        chrome_agent = f"Mozilla/5.0 (Macintosh; Intel Mac OS X {macos_version}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    else:
        r_device = rand_int(1, 2)
        if r_device == 1:
            ios_device = 'iPhone; CPU iPhone'
        else:
            ios_device = 'iPad; CPU'

        r_ios = rand_int(1, 4)
        if r_ios == 1:
            ios_version = '16_1'
        elif r_ios == 2:
            ios_version = '16_0'
        elif r_ios == 3:
            ios_version = '15_7'
        elif r_ios == 4:
            ios_version = '15_6'
        else:
            ios_version = '16_1'

        chrome_agent = f"Mozilla/5.0 ({ios_device} OS {ios_version} like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/119.0.6045.109 Mobile/15E148 Safari/604.1"

    if not is_mobile:
        # Desktop Chrome: client-hint versions are kept consistent with the Chrome 119 user-agent above.
        c_headers = {'Upgrade-Insecure-Requests': '1',
                     'User-Agent': chrome_agent,
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                     'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not=A?Brand";v="24"',
                     'sec-ch-ua-mobile': '?0',
                     'sec-ch-ua-full-version': '"119.0.6045.109"',
                     'sec-ch-ua-arch': '"x86"',
                     'sec-ch-ua-platform': f'"{platform}"',
                     'sec-ch-ua-platform-version': f'"{platform_version}"',
                     'sec-ch-ua-model': '""',
                     'sec-ch-ua-bitness': '"64"',
                     'sec-ch-ua-wow64': '?0',
                     'sec-ch-ua-full-version-list': '"Google Chrome";v="119.0.6045.109", "Chromium";v="119.0.6045.109", "Not=A?Brand";v="24.0.0.0"',
                     'sec-ch-prefers-color-scheme': 'light',
                     'Sec-Fetch-Site': 'none',
                     'Sec-Fetch-Mode': 'navigate',
                     'Sec-Fetch-User': '?1',
                     'Sec-Fetch-Dest': 'document',
                     'Accept-Encoding': 'gzip, deflate',
                     'Accept-Language': 'en-US,en;q=0.9'}
    else:
        c_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                     'Accept-Language': 'en-US,en;q=0.9',
                     'Accept-Encoding': 'gzip, deflate',
                     'User-Agent': chrome_agent}

    return c_headers

def do_get(session, get_url:str, proxy:dict, headers:dict):

    if headers:
        session.headers.update(headers)

    try:
        resp = session.get(get_url, verify=False, allow_redirects=True, proxies=proxy)

        if resp.status_code == 429: #Too Many Requests
            # Retry-After may be missing or an HTTP-date; fall back to a fixed delay.
            retry_after = resp.headers.get('Retry-After', '')
            retry_after = int(retry_after) if retry_after.isdigit() else 30
            print(f'[-] Retry-After: {retry_after} - {get_url}')
            sleep(retry_after + 10)
            resp = session.get(get_url, verify=False, allow_redirects=True, proxies=proxy)

        if resp.status_code == 200:
            return resp
    except ConnectionError:
        print(f'[-] {get_url} (ConnectionError)')
    except Timeout:
        print(f'[-] {get_url} (Timeout)')
    except Exception as e:
        print(f'[-] {str(e)} on: {get_url}')

    return False
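
# Example (illustrative): do_get(session, 'https://example.com', None, get_headers())
# returns the Response object only for an HTTP 200; on a timeout, connection error,
# or any other status code it prints a '[-]' line and returns False.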

def get_page_links(base_url:str, domain:str, resp, disallowed:tuple, regex_href):

    # Search only on html pages.
    content_type = resp.headers.get('Content-Type')
    if content_type is None or not content_type.startswith('text/html'):
        return False

    links = set()
    regex_links = regex_href.findall(resp.text)

    for href in regex_links:

        href = href[1]
        if not href or href == '#':
            continue

        #Full link
        if href.startswith(base_url):
            links.add(href)
            continue

        #Absolute link
        if href.startswith("/"):

            #Skip it if it's Disallowed in robots.txt
            if href.startswith(disallowed):
                continue

            links.add(base_url + href)
            continue

        if INCLUDE_SUBDOMAINS:
            #Keep the link only if its hostname really is a subdomain of the current domain
            sp1 = href.split('://')
            if len(sp1) != 1:
                hostname = sp1[1].split('/')[0]
                if hostname.endswith('.' + domain):
                    links.add(href)

    return list(links)

def extract_emails(file, regex_emails, text:str, domain_emails:set, tlds):
    all_emails = set(regex_emails.findall(text))
    parsed_emails = set()

    for email in all_emails:
        email = email.lower()
        if email not in domain_emails:
            if tlds:
                last_dot = email.rfind('.')

                if last_dot == -1: #rfind() returns -1 when no dot is found
                    continue

                tld = email[last_dot + 1:]
                if tld not in tlds:
                    continue

            file.write(email + '\n')
            parsed_emails.add(email)

    file.flush()

    return parsed_emails
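
# Worked example (illustrative): if the page text passed in contains "Contact: Admin@Example.COM",
# tlds includes 'com' and domain_emails is empty, extract_emails() writes
# "admin@example.com" to the output file and returns {"admin@example.com"}.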

def parse_domain(domain:str, regex_emails, regex_robots, regex_href, headers:dict, tlds):

    session = Session()

    #Firstly try to crawl via HTTP, as it's faster; switch to HTTPS if the site redirects there.
    base_url = f'http://{domain}'
    main_resp = do_get(session, base_url, None, headers)

    if main_resp:
        if main_resp.url.startswith('https://'):
            base_url = f'https://{domain}'
    else:
        base_url = f'https://{domain}'
        main_resp = do_get(session, base_url, None, headers)

        if main_resp is False:
            return False

    resp = do_get(session, base_url + '/robots.txt', None, headers)
    disallowed = tuple()

    if resp:
        #Parse robots.txt
        r_entries = regex_robots.findall(resp.text)
        for e in r_entries:
            url = e.strip()
            if url.endswith(('?', '$', '*')):
                url = url[:-1]

            #An empty Disallow entry means "allow everything", so it must not be added.
            if url and url not in disallowed:
                disallowed += (url, )

    resp = None
    links_visited = set()
    domain_emails = set()

    links = get_page_links(base_url, domain, main_resp, disallowed, regex_href)

    try:
        f = open(f'{domain}_emails.txt', 'w')
    except Exception as e:
        print(f'[-] {str(e)}')
        return False

    if links:
        for link in links:

            if link in links_visited:
                continue

            print(f'[+] {link}')
            resp = do_get(session, link, None, headers)

            links_visited.add(link)

            if resp:
                new_list = get_page_links(base_url, domain, resp, disallowed, regex_href)

                if new_list:
                    links.extend(new_list)
                    domain_emails.update(extract_emails(f, regex_emails, resp.text, domain_emails, tlds))
                else:
                    #If page has no href links and it's an html page then search in it too.
                    content_type = resp.headers.get('Content-Type')
                    if content_type and content_type.startswith('text/html'):
                        domain_emails.update(extract_emails(f, regex_emails, resp.text, domain_emails, tlds))

    else:
        #If main page has no href links and it's an html page then search in it too.
        content_type = main_resp.headers.get('Content-Type')
        if content_type and content_type.startswith('text/html'):
            domain_emails.update(extract_emails(f, regex_emails, main_resp.text, domain_emails, tlds))

    f.close()
    print(f'[+] Extracted {len(domain_emails)} emails from {domain}')

    return True

def get_iana_tlds(regex_href):

    session = Session()

    iana_path = '/domains/root/db'
    resp = do_get(session, f'http://www.iana.org{iana_path}', None, get_headers())

    if not resp:
        return False

    regex_links = regex_href.findall(resp.text)
    iana_len = len(iana_path) + 1

    if not regex_links:
        return False

    tlds = set()

    for href in regex_links:

        href = href[1]
        if not href or href == '#':
            continue

        if not href.startswith(iana_path + '/'):
            continue

        tld_html = href[iana_len:]

        # just to be sure
        if not tld_html.endswith('.html'):
            continue

        # 5 = len('.html')
        tld = tld_html[:-5]

        tlds.add(tld)

    if tlds:
        return tlds

    return False

def main():

    try:
        with open(DOMAINS_FILE, 'r') as f:
            domains = f.read().splitlines()
    except FileNotFoundError:
        print(f'[-] File {DOMAINS_FILE} not found!')
        return
    except PermissionError:
        print(f'[-] Insufficient permission to read {DOMAINS_FILE}!')
        return

    if len(domains) == 0:
        print(f'[-] File {DOMAINS_FILE} is empty!')
        return

    unique_domains = set()

    for domain in domains:
        domain = domain.strip()
        if '.' in domain:
            unique_domains.add(domain)

    if len(unique_domains) == 0:
        print(f'[-] File {DOMAINS_FILE} contains no domains!')
        return

    #verify=False is used everywhere, so silence urllib3's InsecureRequestWarning.
    packages.urllib3.disable_warnings()
    regex_pattern = regex_compile("\\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\b", regex_IGNORECASE) #regular-expressions.info/email.html
    regex_robots = regex_compile("^Disallow[ ]*:(.*)", regex_MULTILINE)
    regex_href = regex_compile("<a\\s+(?:[^>]*?\\s+)?href=([\"'])(.*?)\\1", regex_IGNORECASE)

    tlds = get_iana_tlds(regex_href)

    for domain in unique_domains:
        parse_domain(domain, regex_pattern, regex_robots, regex_href, get_headers(), tlds)

    return


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        exit("Bye")

--------------------------------------------------------------------------------