├── EmailScraper.py ├── LICENSE └── README.md /EmailScraper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A simple email scraper that will spider every webpage and extract emails using regex 3 | Usage Example: 4 | scrapy runspider EmailScraper.py -a url=http://example.com/ -o emails.json -L INFO 5 | ''' 6 | import re 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.linkextractors import LinkExtractor 9 | 10 | class EmailSpider(CrawlSpider): 11 | name = 'EmailScraper' 12 | #keep track of previously seen emails to prevent duplication 13 | emailHistory = {} 14 | custom_settings = { 15 | 'ROBOTSTXT_OBEY' : False 16 | # ,'DEPTH_LIMIT' : 6 17 | } 18 | 19 | emailRegex = re.compile(("([a-zA-Z0-9_{|}~-]+(?:\.[a-zA-Z0-9_" 20 | "{|}~-]+)*(@)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9]){2,}?(\." 21 | "))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)")) 22 | 23 | def __init__(self, url=None, *args, **kwargs): 24 | super(EmailSpider, self).__init__(*args, **kwargs) 25 | self.start_urls = [url] 26 | self.allowed_domains = [url.replace("http://","").replace("www.", "").replace("/","")] 27 | rules = (Rule (LinkExtractor(),callback="parse_item",follow=True),) 28 | def parse_item(self, response): 29 | item = {} 30 | emails = re.findall(EmailSpider.emailRegex, response._body); 31 | for email in emails: 32 | if email[0] in EmailSpider.emailHistory: 33 | continue 34 | else: 35 | EmailSpider.emailHistory[email[0]] = True; 36 | yield { 37 | 'site':response.url, 38 | 'email':email[0] 39 | } 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Kevin Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without 
limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EmailScraper 2 | A scrapy script to spider a website and scrape all emails using a regex. EmailScraper outputs each email, together with the URL where it was found, in JSON format. The output is generated as the website is spidered and does not contain duplicates. 3 | # Requirements 4 | Scrapy 5 | ``` 6 | pip install scrapy 7 | ``` 8 | # Usage 9 | Scrape all emails from example.com and save the output to emails.json, printing only the spider's status (not every GET request). 10 | ``` 11 | scrapy runspider EmailScraper.py -a url=http://example.com/ -o emails.json -L INFO 12 | ``` 13 | 14 | # License 15 | MIT License 16 | --------------------------------------------------------------------------------