├── urls.txt
├── emails.txt
├── .gitattributes
├── README.md
├── .gitignore
└── EmailScraping.py

--------------------------------------------------------------------------------
/urls.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/emails.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Email-Scraping

This **advanced** email-scraping Python script lets you quickly scan websites in **bulk** and collect the **unique** email addresses found on them.

## Getting Started

These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.

### Prerequisites

What you need to run the script:

* [Python 3.x](https://www.python.org/downloads/) - a programming language
* [urllib.request](https://docs.python.org/3/library/urllib.request.html) - an extensible library for opening URLs, included in the Python standard library

## Deployment

* Clone the repository or download the zip file
* Create/open the urls.txt file
* Paste all the website URLs into urls.txt, one URL per line (see the example below)
* Run the script
* Enjoy the harvested email addresses

## Built With

* [Python 3.x](https://www.python.org/) - a programming language

## Contributing

We welcome contributions from the public.

## Authors

* **Ayush Agarwal** - *Initial work* - [Ayush](https://github.com/ayushagarwalk)

See also the list of [contributors](https://github.com/ayushagarwalk/Email-Scraping/contributors) who participated in this project.
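## Example

A minimal urls.txt might look like this (the URLs below are placeholders; substitute the sites you actually want to scan):

```
https://example.com
https://example.org/contact.html
```

The script reads one URL per line and appends every new address it finds to emails.txt.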
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/EmailScraping.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import re
import time
import urllib.error
import urllib.request

# Matches addresses such as something-x.y_z@somedomain.com
emailRegex = re.compile(r'''
    [a-zA-Z0-9._%+-]+       # local part
    @
    [a-zA-Z0-9.-]+          # domain
    \.[a-zA-Z]{2,}          # top-level domain
''', re.VERBOSE)

seen = set()  # deduplicates emails across all scanned pages

# Extracting emails
def extractEmailsFromUrlText(urlText):
    extractedEmails = emailRegex.findall(urlText)
    print("\tNumber of Emails : %s\n" % len(extractedEmails))
    for email in extractedEmails:
        if email not in seen:  # write each address only once
            seen.add(email)
            emailFile.write(email + "\n")  # appending emails to the file

# HTML page read function
def htmlPageRead(url, i):
    start = time.time()
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(request)
    urlText = response.read().decode(errors='ignore')
    print("%s. %s\tFetched in : %s" % (i, url, time.time() - start))
    extractEmailsFromUrlText(urlText)

# Emails leech function: fetch a URL; on a 404, retry via Google's web cache
def emailsLeechFunc(url, i):
    try:
        htmlPageRead(url, i)
    except urllib.error.HTTPError as err:
        if err.code == 404:
            try:
                url = 'http://webcache.googleusercontent.com/search?q=cache:' + url
                htmlPageRead(url, i)
            except Exception:
                pass
    except Exception:
        pass

# Open one file for reading URLs and another for appending harvested emails
start = time.time()
with open("urls.txt", 'r') as urlFile, open("emails.txt", 'a') as emailFile:
    # Iterate over the opened file, one URL per line
    for i, urlLink in enumerate(urlFile, start=1):
        urlLink = urlLink.strip().strip('\'"')  # drop the newline and any quotes
        emailsLeechFunc(urlLink, i)
print("Elapsed Time: %s" % (time.time() - start))

--------------------------------------------------------------------------------
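Assuming Python 3 is on your PATH, a typical run from the repository root looks like:

```
python3 EmailScraping.py
```

The script prints the fetch time for each URL, appends every newly seen address to emails.txt, and reports the total elapsed time at the end.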