├── .gitignore
├── ColorStreamHandler.py
├── README.md
├── __init__.py
├── data
│   └── .gitignore
├── database.py
├── docs
│   ├── Makefile
│   ├── conf.py
│   ├── index.rst
│   └── make.bat
├── email_crawler.py
├── logs
│   └── .gitignore
└── settings.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[co]
2 | 
3 | # Project specifics
4 | email_crawler.sublime-project
5 | email_crawler.sublime-workspace
6 | 
7 | # Packages
8 | *.egg
9 | *.egg-info
10 | dist
11 | build
12 | eggs
13 | parts
14 | bin
15 | var
16 | sdist
17 | develop-eggs
18 | .installed.cfg
19 | 
20 | # Installer logs
21 | pip-log.txt
22 | 
23 | # Unit test / coverage reports
24 | .coverage
25 | .tox
26 | 
27 | # Translations
28 | *.mo
29 | 
30 | # Mr Developer
31 | .mr.developer.cfg
32 | 
33 | docs/_build/
34 | crawler.sqlite
35 | pycrawler.log
36 | emails.csv
37 | 
--------------------------------------------------------------------------------
/ColorStreamHandler.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import curses
3 | 
4 | class ColorStreamHandler(logging.Handler):
5 | 
6 |     def __init__(self, use_colors):
7 |         logging.Handler.__init__(self)
8 |         self.use_colors = use_colors
9 | 
10 |         # Initialize environment
11 |         curses.setupterm()
12 | 
13 |         # Get the foreground color attribute for this environment
14 |         self.fcap = curses.tigetstr('setaf')
15 | 
16 |         # Get the normal attribute
17 |         self.COLOR_NORMAL = curses.tigetstr('sgr0')
18 | 
19 |         # Get + Save the color sequences
20 |         self.COLOR_INFO = curses.tparm(self.fcap, curses.COLOR_GREEN)
21 |         self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
22 |         self.COLOR_WARNING = curses.tparm(self.fcap, curses.COLOR_YELLOW)
23 |         self.COLOR_DEBUG = curses.tparm(self.fcap, curses.COLOR_BLUE)
24 | 
25 |     def color(self, msg, level):
26 |         if level == "INFO":
27 |             return "%s%s%s" % (self.COLOR_INFO, msg, self.COLOR_NORMAL)
28 |         elif level == "WARNING":
29 |             return "%s%s%s" % (self.COLOR_WARNING, msg, self.COLOR_NORMAL)
30 |         elif level == "ERROR":
31 |             return "%s%s%s" % (self.COLOR_ERROR, msg, self.COLOR_NORMAL)
32 |         elif level == "DEBUG":
33 |             return "%s%s%s" % (self.COLOR_DEBUG, msg, self.COLOR_NORMAL)
34 |         else:
35 |             return msg
36 | 
37 |     def emit(self, record):
38 |         record.msg = record.msg.encode('utf-8', 'ignore')
39 |         msg = self.format(record)
40 | 
41 |         # This just removes the date and milliseconds from asctime
42 |         temp = msg.split(']')
43 |         msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
44 | 
45 |         if self.use_colors:
46 |             msg = self.color(msg, record.levelname)
47 |         print msg
48 | 
49 |     # 'record' has the following attributes:
50 |     # threadName
51 |     # name
52 |     # thread
53 |     # created
54 |     # process
55 |     # processName
56 |     # args
57 |     # module
58 |     # filename
59 |     # levelno
60 |     # exc_text
61 |     # pathname
62 |     # lineno
63 |     # msg
64 |     # exc_info
65 |     # funcName
66 |     # relativeCreated
67 |     # levelname
68 |     # msecs
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Python Email Crawler
2 | ====================
3 | 
4 | This Python script searches Google for certain keywords, crawls the webpages from the results, and returns all emails found.
5 | 
6 | Requirements
7 | ------------
8 | 
9 | - sqlalchemy
10 | - urllib2
11 | 
12 | If you don't have sqlalchemy, simply `sudo pip install sqlalchemy` (urllib2 is part of the Python 2 standard library).
13 | 
14 | 
15 | Usage
16 | -------
17 | 
18 | Start the search with a keyword. We use "iphone developers" as an example.
We use "iphone developers" as an example. 19 | 20 | python email_crawler.py "iphone developers" 21 | 22 | The search and crawling process will take quite a while, as it retrieve up to 500 search results (from Google), and crawl up to 2 level deep. It should crawl around 10,000 webpages :) 23 | 24 | After the process finished, run this command to get the list of emails 25 | 26 | python email_crawler.py --emails 27 | 28 | The emails will be saved in ./data/emails.csv -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samwize/python-email-crawler/0ec34e3173c5b84e779b33c59e83969640e551ae/__init__.py -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /database.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, Table, Column, Integer, Unicode, Boolean, MetaData, select 2 | import urlparse 3 | 4 | DATABASE_NAME = 'data/crawler.sqlite' 5 | HTML_DIR = 'data/html/' 6 | 7 | class CrawlerDb: 8 | 9 | def __init__(self): 10 | self.connected = False 11 | 12 | def connect(self): 13 | 14 | self.engine = create_engine('sqlite:///' + DATABASE_NAME) 15 | self.connection = self.engine.connect() 16 | self.connected = True if self.connection else False 17 | self.metadata = MetaData() 18 | 19 | # Define the tables 20 | self.website_table = Table('website', self.metadata, 21 | Column('id', Integer, primary_key=True), 22 | Column('url', Unicode, nullable=False), 23 | Column('has_crawled', Boolean, default=False), 24 | Column('emails', Unicode, nullable=True), 25 | ) 26 | 27 | # Create the tables 28 | self.metadata.create_all(self.engine) 29 | 30 | def enqueue(self, url, emails = None): 31 | if not self.connected: 32 | return False 33 | 34 | s = select([self.website_table]).where(self.website_table.c.url == url) 35 | res = self.connection.execute(s) 36 | result = res.fetchall() 37 | res.close() 38 | # If we get a result, then this url is not unique 39 | if len(result) > 0: 40 | # print 'Duplicated: %s' % url 41 | return False 42 | 43 | args = [{'url':unicode(url)}] 44 | if (emails != None): 45 | args = [{'url':unicode(url), 'has_crawled':True, 'emails':unicode(",".join(emails))}] 46 | result = self.connection.execute(self.website_table.insert(), args) 47 | if result: 48 | return True 49 | return False 50 | 51 | 52 | def dequeue(self): 53 | if not self.connected: 54 | return False 55 | # Get the first thing in the queue 56 | s = select([self.website_table]).limit(1).where(self.website_table.c.has_crawled == False) 57 | res = self.connection.execute(s) 58 | result = res.fetchall() 59 | res.close() 60 | # If we get a result 61 | if len(result) > 0: 62 | # Remove from the queue ? 
63 |             # delres = self.connection.execute(self.queue_table.delete().where(self.queue_table.c.id == result[0][0]))
64 |             # if not delres:
65 |             #     return False
66 |             # Return the row
67 |             # print result[0].url
68 |             return result[0]
69 |         return False
70 | 
71 | 
72 |     def crawled(self, website, new_emails=None):
73 |         if not self.connected:
74 |             return False
75 |         stmt = self.website_table.update() \
76 |             .where(self.website_table.c.id==website.id) \
77 |             .values(has_crawled=True, emails=new_emails)
78 |         self.connection.execute(stmt)
79 | 
80 | 
81 |     def get_all_emails(self):
82 |         if not self.connected:
83 |             return None
84 | 
85 |         s = select([self.website_table])
86 |         res = self.connection.execute(s)
87 |         results = res.fetchall()
88 |         res.close()
89 |         email_set = set()
90 |         for result in results:
91 |             if (result.emails == None):
92 |                 continue
93 |             for email in result.emails.split(','):
94 |                 email_set.add(email)
95 | 
96 |         return email_set
97 | 
98 |     def get_all_domains(self):
99 |         if not self.connected:
100 |             return None
101 | 
102 |         s = select([self.website_table])
103 |         res = self.connection.execute(s)
104 |         results = res.fetchall()
105 |         res.close()
106 |         domain_set = set()
107 |         for result in results:
108 |             if (result.url == None):
109 |                 continue
110 |             url = urlparse.urlparse(result.url)
111 |             hostname = url.hostname.split(".")
112 |             # Simplistic assumption of a domain: if the 2nd-last label is < 4 chars, keep 3 parts (e.g. just2us.com.sg), else keep 2 (e.g. example.com)
113 |             hostname = ".".join(len(hostname[-2]) < 4 and hostname[-3:] or hostname[-2:])
114 |             domain_set.add(hostname)
115 | 
116 |         return domain_set
117 | 
118 | 
119 |     def close(self):
120 |         self.connection.close()
121 | 
122 | 
123 |     def save_html(self, filename, html):
124 |         filename = os.path.join(HTML_DIR, filename)
125 |         file = open(filename,"w+")
126 |         file.writelines(html)
127 |         file.close()
128 | 
129 | 
130 |     def test(self):
131 |         c = CrawlerDb()
132 |         c.connect()
133 |         # c.enqueue(['a12222', '11'])
134 |         # c.enqueue(['dddaaaaaa2', '22'])
135 |         c.enqueue('111')
136 |         c.enqueue('222')
137 |         website = c.dequeue()
138 |         c.crawled(website)
139 |         website = c.dequeue()
140 |         c.crawled(website, "a,b")
141 |         print '---'
142 |         c.dequeue()
143 | 
144 | 
145 | # CrawlerDb().test()
146 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 | 
10 | # Internal variables.
11 | PAPEROPT_a4 = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | # the i18n builder cannot share the environment and doctrees with the others
15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
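# (Typical use from this directory: `make html`; the pages end up in _build/html.)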
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PythonEmailCrawler.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PythonEmailCrawler.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PythonEmailCrawler" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PythonEmailCrawler" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Python Email Crawler documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Aug 3 12:26:56 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
28 | extensions = ['sphinx.ext.autodoc'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'Python Email Crawler' 44 | copyright = u'2012, Junda Ong' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '1.0' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '1.0' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. 
They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'PythonEmailCrawlerdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'PythonEmailCrawler.tex', u'Python Email Crawler Documentation', 187 | u'Junda Ong', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 
215 | man_pages = [
216 |     ('index', 'pythonemailcrawler', u'Python Email Crawler Documentation',
217 |      [u'Junda Ong'], 1)
218 | ]
219 | 
220 | # If true, show URL addresses after external links.
221 | #man_show_urls = False
222 | 
223 | 
224 | # -- Options for Texinfo output ------------------------------------------------
225 | 
226 | # Grouping the document tree into Texinfo files. List of tuples
227 | # (source start file, target name, title, author,
228 | #  dir menu entry, description, category)
229 | texinfo_documents = [
230 |     ('index', 'PythonEmailCrawler', u'Python Email Crawler Documentation',
231 |      u'Junda Ong', 'PythonEmailCrawler', 'One line description of project.',
232 |      'Miscellaneous'),
233 | ]
234 | 
235 | # Documents to append as an appendix to all manuals.
236 | #texinfo_appendices = []
237 | 
238 | # If false, no module index is generated.
239 | #texinfo_domain_indices = True
240 | 
241 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
242 | #texinfo_show_urls = 'footnote'
243 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. Python Email Crawler documentation master file, created by
2 |    sphinx-quickstart on Fri Aug 3 12:26:56 2012.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | Python Email Crawler's documentation!
7 | ================================================
8 | 
9 | This Python script searches certain keywords on Google, crawls the webpages from the results, and returns all emails found.
10 | 
11 | For each result from Google, the crawler will crawl that page for an email. If it cannot find an email, it will crawl the linked pages (up to the 2nd level).
12 | 
13 | This is useful when the result returns the homepage of a website, and the email is usually on the Contact Us page.
14 | 
15 | ------------
16 | Requirements
17 | ------------
18 | 
19 | * sqlalchemy
20 | * urllib2
21 | 
22 | 
23 | ------
24 | Usage
25 | ------
26 | Start the search with a keyword. We use "iphone developers" as an example.
27 | 
28 | .. code-block:: bash
29 | 
30 |    $ ./email_crawler.py "iphone developers"
31 | 
32 | The search and crawling process will take quite a while, as it retrieves up to ``MAX_SEARCH_RESULTS`` search results from Google (150 by default) and crawls up to 2 levels deep. It should crawl around 10,000 webpages :)
33 | 
34 | After the process finishes, run this command to get the list of emails:
35 | 
36 | .. code-block:: bash
37 | 
38 |    $ ./email_crawler.py --emails
39 | 
40 | The emails will be saved in ./data/emails.csv
41 | 
42 | 
43 | Contents:
44 | 
45 | .. toctree::
46 |    :maxdepth: 2
47 | 
48 | 
49 | 
50 | Indices and tables
51 | ==================
52 | 
53 | * :ref:`genindex`
54 | * :ref:`modindex`
55 | * :ref:`search`
56 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | REM Command file for Sphinx documentation
4 | 
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PythonEmailCrawler.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PythonEmailCrawler.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 
113 | 	goto end
114 | )
115 | 
116 | if "%1" == "epub" (
117 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118 | 	if errorlevel 1 exit /b 1
119 | 	echo.
120 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "latex" (
125 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129 | 	goto end
130 | )
131 | 
132 | if "%1" == "text" (
133 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134 | 	if errorlevel 1 exit /b 1
135 | 	echo.
136 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
137 | 	goto end
138 | )
139 | 
140 | if "%1" == "man" (
141 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142 | 	if errorlevel 1 exit /b 1
143 | 	echo.
144 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
145 | 	goto end
146 | )
147 | 
148 | if "%1" == "texinfo" (
149 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150 | 	if errorlevel 1 exit /b 1
151 | 	echo.
152 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153 | 	goto end
154 | )
155 | 
156 | if "%1" == "gettext" (
157 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158 | 	if errorlevel 1 exit /b 1
159 | 	echo.
160 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161 | 	goto end
162 | )
163 | 
164 | if "%1" == "changes" (
165 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166 | 	if errorlevel 1 exit /b 1
167 | 	echo.
168 | 	echo.The overview file is in %BUILDDIR%/changes.
169 | 	goto end
170 | )
171 | 
172 | if "%1" == "linkcheck" (
173 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174 | 	if errorlevel 1 exit /b 1
175 | 	echo.
176 | 	echo.Link check complete; look for any errors in the above output ^
177 | or in %BUILDDIR%/linkcheck/output.txt.
178 | 	goto end
179 | )
180 | 
181 | if "%1" == "doctest" (
182 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183 | 	if errorlevel 1 exit /b 1
184 | 	echo.
185 | 	echo.Testing of doctests in the sources finished, look at the ^
186 | results in %BUILDDIR%/doctest/output.txt.
187 | 	goto end
188 | )
189 | 
190 | :end
191 | 
--------------------------------------------------------------------------------
/email_crawler.py:
--------------------------------------------------------------------------------
1 | from settings import LOGGING
2 | import logging, logging.config
3 | import urllib, urllib2
4 | import re, urlparse
5 | import traceback
6 | from database import CrawlerDb
7 | 
8 | # Debugging
9 | # import pdb;pdb.set_trace()
10 | 
11 | # Logging
12 | logging.config.dictConfig(LOGGING)
13 | logger = logging.getLogger("crawler_logger")
14 | 
15 | google_adurl_regex = re.compile('adurl=(.*?)"')
16 | google_url_regex = re.compile('url\?q=(.*?)&sa=')
17 | email_regex = re.compile('([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
18 | url_regex = re.compile('<a[^>]*?href=[\'"](.*?)[\'"]', re.IGNORECASE)  # pull href values out of anchor tags
19 | # Below url_regex will run into 'Catastrophic Backtracking'!
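# (A pattern such as '<a.*?href=["\'](.*?)["\'].*?>' nests unbounded wildcards, so on a
# page with a malformed or unclosed tag the regex engine can try an exponential number
# of ways to split the text before giving up.)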
20 | # http://stackoverflow.com/questions/8010005/python-re-infinite-execution
21 | # url_regex = re.compile('')
22 | 
23 | # Maximum number of search results to start the crawl
24 | MAX_SEARCH_RESULTS = 150
25 | 
26 | EMAILS_FILENAME = 'data/emails.csv'
27 | DOMAINS_FILENAME = 'data/domains.csv'
28 | 
29 | # Set up the database
30 | db = CrawlerDb()
31 | db.connect()
32 | 
33 | 
34 | def crawl(keywords):
35 |     """
36 |     This method will
37 | 
38 |     1) Google the keywords, and extract MAX_SEARCH_RESULTS
39 |     2) For every result (aka website), crawl the website 2 levels deep.
40 |        That is the homepage (level 1) and all its links (level 2).
41 |        But if level 1 has the email, then skip going to level 2.
42 |     3) Update the database with the crawled pages and the emails found
43 | 
44 |     crawl(keywords):
45 |         Extract Google search results and put all in database
46 |         Process each search result, the webpage:
47 |             Crawl webpage level 1, the homepage
48 |             Crawl webpage level 2, a link away from the homepage
49 |             Update all crawled pages in database, with has_crawled = True immediately
50 |             Store the emails found
51 |     """
52 |     logger.info("-"*40)
53 |     logger.info("Keywords to Google for: %s" % keywords.decode('utf-8'))
54 |     logger.info("-"*40)
55 | 
56 |     # Step 1: Crawl Google Page
57 |     # eg http://www.google.com/search?q=singapore+web+development&start=0
58 |     # Next page: https://www.google.com/search?q=singapore+web+development&start=10
59 |     # Google search results are paged with 10 urls each. There are also adurls
60 |     for page_index in range(0, MAX_SEARCH_RESULTS, 10):
61 |         query = {'q': keywords}
62 |         url = 'http://www.google.com/search?' + urllib.urlencode(query) + '&start=' + str(page_index)
63 |         data = retrieve_html(url)
64 |         # print("data: \n%s" % data)
65 |         for url in google_url_regex.findall(data):
66 |             db.enqueue(unicode(url))
67 |         for url in google_adurl_regex.findall(data):
68 |             db.enqueue(unicode(url))
69 | 
70 |     # Step 2: Crawl each of the search results
71 |     # We search till level 2 deep
72 |     while (True):
73 |         # Dequeue an uncrawled webpage from db
74 |         uncrawled = db.dequeue()
75 |         if (uncrawled == False):
76 |             break
77 |         email_set = find_emails_2_level_deep(uncrawled.url)
78 |         if (len(email_set) > 0):
79 |             db.crawled(uncrawled, ",".join(list(email_set)))
80 |         else:
81 |             db.crawled(uncrawled, None)
82 | 
83 | def retrieve_html(url):
84 |     """
85 |     Crawl a website, and return the whole HTML as a string.
86 | 
87 |     On any error, return None.
88 |     """
89 |     req = urllib2.Request(url)
90 |     req.add_header('User-Agent', 'Just-Crawling 0.1')
91 |     request = None
92 |     status = 0
93 |     try:
94 |         logger.info("Crawling %s" % url)
95 |         request = urllib2.urlopen(req)
96 |     except urllib2.HTTPError, e:
97 |         status = e.code
98 |     except urllib2.URLError, e:
99 |         logger.error("Exception at url: %s\n%s" % (url, e))
100 |     except Exception, e:
101 |         return
102 |     if status == 0:
103 |         status = 200
104 | 
105 |     try:
106 |         data = request.read()
107 |     except Exception, e:
108 |         return
109 | 
110 |     return str(data)
111 | 
112 | 
113 | def find_emails_2_level_deep(url):
114 |     """
115 |     Find the email at level 1.
116 |     If there is an email, good. Return that email
117 |     Else, find in level 2. Store all results in the database directly, and return an empty set
118 |     """
119 |     html = retrieve_html(url)
120 |     email_set = find_emails_in_html(html)
121 | 
122 |     if (len(email_set) > 0):
123 |         # If there is an email, we stop at level 1.
124 |         return email_set
125 | 
126 |     else:
127 |         # No email at level 1. Crawl level 2
128 |         logger.info('No email at level 1.. proceeding to crawl level 2')
129 | 
130 |         link_set = find_links_in_html_with_same_hostname(url, html)
131 |         for link in link_set:
132 |             # Crawl them right away!
133 |             # Enqueue them too
134 |             html = retrieve_html(link)
135 |             if (html == None):
136 |                 continue
137 |             email_set = find_emails_in_html(html)
138 |             db.enqueue(link, list(email_set))
139 | 
140 |         # We return an empty set
141 |         return set()
142 | 
143 | 
144 | def find_emails_in_html(html):
145 |     if (html == None):
146 |         return set()
147 |     email_set = set()
148 |     for email in email_regex.findall(html):
149 |         email_set.add(email)
150 |     return email_set
151 | 
152 | 
153 | def find_links_in_html_with_same_hostname(url, html):
154 |     """
155 |     Find all the links with same hostname as url
156 |     """
157 |     if (html == None):
158 |         return set()
159 |     url = urlparse.urlparse(url)
160 |     links = url_regex.findall(html)
161 |     link_set = set()
162 |     for link in links:
163 |         if link == None:
164 |             continue
165 |         try:
166 |             link = str(link)
167 |             if link.startswith("/"):
168 |                 link_set.add('http://'+url.netloc+link)
169 |             elif link.startswith("http") or link.startswith("https"):
170 |                 if (link.find(url.netloc) != -1):
171 |                     link_set.add(link)
172 |             elif link.startswith("#"):
173 |                 continue
174 |             else:
175 |                 link_set.add(urlparse.urljoin(url.geturl(),link))
176 |         except Exception, e:
177 |             pass
178 | 
179 |     return link_set
180 | 
181 | 
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     import sys
186 |     try:
187 |         arg = sys.argv[1].lower()
188 |         if (arg == '--emails') or (arg == '-e'):
189 |             # Get all the emails and save in a CSV
190 |             logger.info("="*40)
191 |             logger.info("Processing...")
192 |             emails = db.get_all_emails()
193 |             logger.info("There are %d emails" % len(emails))
194 |             file = open(EMAILS_FILENAME, "w+")
195 |             file.writelines("\n".join(emails))
196 |             file.close()
197 |             logger.info("All emails saved to ./data/emails.csv")
198 |             logger.info("="*40)
199 |         elif (arg == '--domains') or (arg == '-d'):
200 |             # Get all the domains and save in a CSV
201 |             logger.info("="*40)
202 |             logger.info("Processing...")
203 |             domains = db.get_all_domains()
204 |             logger.info("There are %d domains" % len(domains))
205 |             file = open(DOMAINS_FILENAME, "w+")
206 |             file.writelines("\n".join(domains))
207 |             file.close()
208 |             logger.info("All domains saved to ./data/domains.csv")
209 |             logger.info("="*40)
210 |         else:
211 |             # Crawl the supplied keywords!
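            # e.g. python email_crawler.py "iphone developers"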
212 | crawl(arg) 213 | 214 | except KeyboardInterrupt: 215 | logger.error("Stopping (KeyboardInterrupt)") 216 | sys.exit() 217 | except Exception, e: 218 | logger.error("EXCEPTION: %s " % e) 219 | traceback.print_exc() 220 | -------------------------------------------------------------------------------- /logs/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | DEBUG = True # Whether or not to show DEBUG level messages 4 | USE_COLORS = True # Whether or not colors should be used when outputting text 5 | 6 | 7 | LOGGING = { # dictConfig for output stream and file logging 8 | 'version': 1, 9 | 'disable_existing_loggers': False, 10 | 11 | 'formatters': { 12 | 'console': { 13 | 'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s', 14 | }, 15 | 'file': { 16 | 'format': '[%(asctime)s] %(levelname)s::(P:%(process)d T:%(thread)d)::%(module)s - %(message)s', 17 | }, 18 | }, 19 | 20 | 'handlers': { 21 | 'console': { 22 | 'class': 'ColorStreamHandler.ColorStreamHandler', 23 | 'formatter':'console', 24 | 'level': 'DEBUG', 25 | 'use_colors': USE_COLORS, 26 | }, 27 | 'file': { 28 | 'class': 'logging.handlers.TimedRotatingFileHandler', 29 | 'formatter':'file', 30 | 'level': 'INFO', 31 | 'when': 'midnight', 32 | 'filename': 'logs/pycrawler.log', 33 | 'interval': 1, 34 | 'backupCount': 0, 35 | 'encoding': None, 36 | 'delay': False, 37 | 'utc': False, 38 | }, 39 | }, 40 | 41 | 'loggers': { 42 | 'crawler_logger': { 43 | 'handlers': ['console', 'file'], 44 | 'level': 'DEBUG' if DEBUG else 'INFO', 45 | 'propagate': True, 46 | }, 47 | } 48 | } 49 | 50 | --------------------------------------------------------------------------------
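For completeness, here is a minimal sketch (Python 2, matching the repo) of driving the modules above directly, without the Google step: it enqueues one seed URL and scans it, plus one level of same-host links, for addresses. The seed `http://example.com/` is only a placeholder, and a fresh `data/crawler.sqlite` is assumed so the dequeued row is the seed.

```python
# -*- coding: utf-8 -*-
# Sketch only: reuse the crawler's building blocks on a single seed URL.
# Assumes this file sits next to email_crawler.py and database.py.
from email_crawler import retrieve_html, find_emails_in_html, \
    find_links_in_html_with_same_hostname, db

seed = 'http://example.com/'      # placeholder starting page
db.enqueue(unicode(seed))         # queue it the same way a Google result would be

website = db.dequeue()            # row with .id / .url / .has_crawled / .emails
html = retrieve_html(website.url)
emails = find_emails_in_html(html)

# Nothing on the first page? Look one level deeper, like find_emails_2_level_deep() does.
if not emails:
    for link in find_links_in_html_with_same_hostname(website.url, html):
        emails |= find_emails_in_html(retrieve_html(link))

db.crawled(website, ",".join(emails) if emails else None)
print "Emails found:", db.get_all_emails()
```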