├── .gitignore
├── ColorStreamHandler.py
├── README.md
├── __init__.py
├── data
│   └── .gitignore
├── database.py
├── docs
│   ├── Makefile
│   ├── conf.py
│   ├── index.rst
│   └── make.bat
├── email_crawler.py
├── logs
│   └── .gitignore
└── settings.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[co]
2 | 
3 | # Project specifics
4 | email_crawler.sublime-project
5 | email_crawler.sublime-workspace
6 | 
7 | # Packages
8 | *.egg
9 | *.egg-info
10 | dist
11 | build
12 | eggs
13 | parts
14 | bin
15 | var
16 | sdist
17 | develop-eggs
18 | .installed.cfg
19 | 
20 | # Installer logs
21 | pip-log.txt
22 | 
23 | # Unit test / coverage reports
24 | .coverage
25 | .tox
26 | 
27 | # Translations
28 | *.mo
29 | 
30 | # Mr Developer
31 | .mr.developer.cfg
32 | 
33 | docs/_build/
34 | crawler.sqlite
35 | pycrawler.log
36 | emails.csv
37 | 
--------------------------------------------------------------------------------
/ColorStreamHandler.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import curses
3 | 
4 | class ColorStreamHandler(logging.Handler):
5 | 
6 |     def __init__(self, use_colors):
7 |         logging.Handler.__init__(self)
8 |         self.use_colors = use_colors
9 | 
10 |         # Initialize environment
11 |         curses.setupterm()
12 | 
13 |         # Get the foreground color attribute for this environment
14 |         self.fcap = curses.tigetstr('setaf')
15 | 
16 |         # Get the normal attribute
17 |         self.COLOR_NORMAL = curses.tigetstr('sgr0')
18 | 
19 |         # Get + Save the color sequences
20 |         self.COLOR_INFO = curses.tparm(self.fcap, curses.COLOR_GREEN)
21 |         self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
22 |         self.COLOR_WARNING = curses.tparm(self.fcap, curses.COLOR_YELLOW)
23 |         self.COLOR_DEBUG = curses.tparm(self.fcap, curses.COLOR_BLUE)
24 | 
25 |     def color(self, msg, level):
26 |         if level == "INFO":
27 |             return "%s%s%s" % (self.COLOR_INFO, msg, self.COLOR_NORMAL)
28 |         elif level == "WARNING":
29 |             return "%s%s%s" % (self.COLOR_WARNING, msg, self.COLOR_NORMAL)
30 |         elif level == "ERROR":
31 |             return "%s%s%s" % (self.COLOR_ERROR, msg, self.COLOR_NORMAL)
32 |         elif level == "DEBUG":
33 |             return "%s%s%s" % (self.COLOR_DEBUG, msg, self.COLOR_NORMAL)
34 |         else:
35 |             return msg
36 | 
37 |     def emit(self, record):
38 |         record.msg = record.msg.encode('utf-8', 'ignore')
39 |         msg = self.format(record)
40 | 
41 |         # This just removes the date and milliseconds from asctime
42 |         temp = msg.split(']')
43 |         msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
44 | 
45 |         if self.use_colors:
46 |             msg = self.color(msg, record.levelname)
47 |         print msg
48 | 
49 |     # 'record' has the following attributes:
50 |     # threadName
51 |     # name
52 |     # thread
53 |     # created
54 |     # process
55 |     # processName
56 |     # args
57 |     # module
58 |     # filename
59 |     # levelno
60 |     # exc_text
61 |     # pathname
62 |     # lineno
63 |     # msg
64 |     # exc_info
65 |     # funcName
66 |     # relativeCreated
67 |     # levelname
68 |     # msecs
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Python Email Crawler
2 | ====================
3 | 
4 | This Python script searches Google for certain keywords, crawls the webpages from the results, and returns all emails found.
5 | 
6 | Requirements
7 | ------------
8 | 
9 | - sqlalchemy
10 | - urllib2
11 | 
12 | If you don't have sqlalchemy, simply `sudo pip install sqlalchemy` (urllib2 is part of the Python 2 standard library).
13 | 
14 | 
15 | Usage
16 | -------
17 | 
18 | Start the search with a keyword. We use "iphone developers" as an example.
We use "iphone developers" as an example. 19 | 20 | python email_crawler.py "iphone developers" 21 | 22 | The search and crawling process will take quite a while, as it retrieve up to 500 search results (from Google), and crawl up to 2 level deep. It should crawl around 10,000 webpages :) 23 | 24 | After the process finished, run this command to get the list of emails 25 | 26 | python email_crawler.py --emails 27 | 28 | The emails will be saved in ./data/emails.csv -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/samwize/python-email-crawler/0ec34e3173c5b84e779b33c59e83969640e551ae/__init__.py -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /database.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, Table, Column, Integer, Unicode, Boolean, MetaData, select 2 | import urlparse 3 | 4 | DATABASE_NAME = 'data/crawler.sqlite' 5 | HTML_DIR = 'data/html/' 6 | 7 | class CrawlerDb: 8 | 9 | def __init__(self): 10 | self.connected = False 11 | 12 | def connect(self): 13 | 14 | self.engine = create_engine('sqlite:///' + DATABASE_NAME) 15 | self.connection = self.engine.connect() 16 | self.connected = True if self.connection else False 17 | self.metadata = MetaData() 18 | 19 | # Define the tables 20 | self.website_table = Table('website', self.metadata, 21 | Column('id', Integer, primary_key=True), 22 | Column('url', Unicode, nullable=False), 23 | Column('has_crawled', Boolean, default=False), 24 | Column('emails', Unicode, nullable=True), 25 | ) 26 | 27 | # Create the tables 28 | self.metadata.create_all(self.engine) 29 | 30 | def enqueue(self, url, emails = None): 31 | if not self.connected: 32 | return False 33 | 34 | s = select([self.website_table]).where(self.website_table.c.url == url) 35 | res = self.connection.execute(s) 36 | result = res.fetchall() 37 | res.close() 38 | # If we get a result, then this url is not unique 39 | if len(result) > 0: 40 | # print 'Duplicated: %s' % url 41 | return False 42 | 43 | args = [{'url':unicode(url)}] 44 | if (emails != None): 45 | args = [{'url':unicode(url), 'has_crawled':True, 'emails':unicode(",".join(emails))}] 46 | result = self.connection.execute(self.website_table.insert(), args) 47 | if result: 48 | return True 49 | return False 50 | 51 | 52 | def dequeue(self): 53 | if not self.connected: 54 | return False 55 | # Get the first thing in the queue 56 | s = select([self.website_table]).limit(1).where(self.website_table.c.has_crawled == False) 57 | res = self.connection.execute(s) 58 | result = res.fetchall() 59 | res.close() 60 | # If we get a result 61 | if len(result) > 0: 62 | # Remove from the queue ? 
63 |             # delres = self.connection.execute(self.queue_table.delete().where(self.queue_table.c.id == result[0][0]))
64 |             # if not delres:
65 |             #     return False
66 |             # Return the row
67 |             # print result[0].url
68 |             return result[0]
69 |         return False
70 | 
71 | 
72 |     def crawled(self, website, new_emails=None):
73 |         if not self.connected:
74 |             return False
75 |         stmt = self.website_table.update() \
76 |             .where(self.website_table.c.id==website.id) \
77 |             .values(has_crawled=True, emails=new_emails)
78 |         self.connection.execute(stmt)
79 | 
80 | 
81 |     def get_all_emails(self):
82 |         if not self.connected:
83 |             return None
84 | 
85 |         s = select([self.website_table])
86 |         res = self.connection.execute(s)
87 |         results = res.fetchall()
88 |         res.close()
89 |         email_set = set()
90 |         for result in results:
91 |             if (result.emails == None):
92 |                 continue
93 |             for email in result.emails.split(','):
94 |                 email_set.add(email)
95 | 
96 |         return email_set
97 | 
98 |     def get_all_domains(self):
99 |         if not self.connected:
100 |             return None
101 | 
102 |         s = select([self.website_table])
103 |         res = self.connection.execute(s)
104 |         results = res.fetchall()
105 |         res.close()
106 |         domain_set = set()
107 |         for result in results:
108 |             if (result.url == None):
109 |                 continue
110 |             url = urlparse.urlparse(result.url)
111 |             hostname = url.hostname.split(".")
112 |             # Simplistic assumption of a domain: if the 2nd-last label is < 4 chars, keep 3 parts (e.g. just2us.com.sg), else keep 2 (e.g. example.com)
113 |             hostname = ".".join(len(hostname[-2]) < 4 and hostname[-3:] or hostname[-2:])
114 |             domain_set.add(hostname)
115 | 
116 |         return domain_set
117 | 
118 | 
119 |     def close(self):
120 |         self.connection.close()
121 | 
122 | 
123 |     def save_html(self, filename, html):
124 |         filename = os.path.join(HTML_DIR, filename)
125 |         file = open(filename,"w+")
126 |         file.writelines(html)
127 |         file.close()
128 | 
129 | 
130 |     def test(self):
131 |         c = CrawlerDb()
132 |         c.connect()
133 |         # c.enqueue(['a12222', '11'])
134 |         # c.enqueue(['dddaaaaaa2', '22'])
135 |         c.enqueue('111')
136 |         c.enqueue('222')
137 |         website = c.dequeue()
138 |         c.crawled(website)
139 |         website = c.dequeue()
140 |         c.crawled(website, "a,b")
141 |         print '---'
142 |         c.dequeue()
143 | 
144 | 
145 | # CrawlerDb().test()
146 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 | 
10 | # Internal variables.
11 | PAPEROPT_a4 = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | # the i18n builder cannot share the environment and doctrees with the others
15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
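# (Typical use from this directory: `make html`; the pages end up in _build/html.)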
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PythonEmailCrawler.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PythonEmailCrawler.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PythonEmailCrawler" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PythonEmailCrawler" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Python Email Crawler documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Aug 3 12:26:56 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
28 | extensions = ['sphinx.ext.autodoc'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'Python Email Crawler' 44 | copyright = u'2012, Junda Ong' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '1.0' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '1.0' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. 
They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'PythonEmailCrawlerdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'PythonEmailCrawler.tex', u'Python Email Crawler Documentation', 187 | u'Junda Ong', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 
215 | man_pages = [
216 |     ('index', 'pythonemailcrawler', u'Python Email Crawler Documentation',
217 |      [u'Junda Ong'], 1)
218 | ]
219 | 
220 | # If true, show URL addresses after external links.
221 | #man_show_urls = False
222 | 
223 | 
224 | # -- Options for Texinfo output ------------------------------------------------
225 | 
226 | # Grouping the document tree into Texinfo files. List of tuples
227 | # (source start file, target name, title, author,
228 | #  dir menu entry, description, category)
229 | texinfo_documents = [
230 |     ('index', 'PythonEmailCrawler', u'Python Email Crawler Documentation',
231 |      u'Junda Ong', 'PythonEmailCrawler', 'One line description of project.',
232 |      'Miscellaneous'),
233 | ]
234 | 
235 | # Documents to append as an appendix to all manuals.
236 | #texinfo_appendices = []
237 | 
238 | # If false, no module index is generated.
239 | #texinfo_domain_indices = True
240 | 
241 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
242 | #texinfo_show_urls = 'footnote'
243 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. Python Email Crawler documentation master file, created by
2 |    sphinx-quickstart on Fri Aug 3 12:26:56 2012.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | Python Email Crawler's documentation!
7 | ================================================
8 | 
9 | This Python script searches certain keywords on Google, crawls the webpages from the results, and returns all emails found.
10 | 
11 | For each result from Google, the crawler will crawl that page for an email. If it cannot find an email, it will crawl the linked pages (up to the 2nd level).
12 | 
13 | This is useful when the result returns the homepage of a website, and the email is usually on the Contact Us page.
14 | 
15 | ------------
16 | Requirements
17 | ------------
18 | 
19 | * sqlalchemy
20 | * urllib2
21 | 
22 | 
23 | ------
24 | Usage
25 | ------
26 | Start the search with a keyword. We use "iphone developers" as an example.
27 | 
28 | .. code-block:: bash
29 | 
30 |    $ ./email_crawler.py "iphone developers"
31 | 
32 | The search and crawling process will take quite a while, as it retrieves up to ``MAX_SEARCH_RESULTS`` search results from Google (150 by default) and crawls up to 2 levels deep. It should crawl around 10,000 webpages :)
33 | 
34 | After the process finishes, run this command to get the list of emails:
35 | 
36 | .. code-block:: bash
37 | 
38 |    $ ./email_crawler.py --emails
39 | 
40 | The emails will be saved in ./data/emails.csv
41 | 
42 | 
43 | Contents:
44 | 
45 | .. toctree::
46 |    :maxdepth: 2
47 | 
48 | 
49 | 
50 | Indices and tables
51 | ==================
52 | 
53 | * :ref:`genindex`
54 | * :ref:`modindex`
55 | * :ref:`search`
56 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | REM Command file for Sphinx documentation
4 | 
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PythonEmailCrawler.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PythonEmailCrawler.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 
113 | 	goto end
114 | )
115 | 
116 | if "%1" == "epub" (
117 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118 | 	if errorlevel 1 exit /b 1
119 | 	echo.
120 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "latex" (
125 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129 | 	goto end
130 | )
131 | 
132 | if "%1" == "text" (
133 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134 | 	if errorlevel 1 exit /b 1
135 | 	echo.
136 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
137 | 	goto end
138 | )
139 | 
140 | if "%1" == "man" (
141 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142 | 	if errorlevel 1 exit /b 1
143 | 	echo.
144 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
145 | 	goto end
146 | )
147 | 
148 | if "%1" == "texinfo" (
149 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150 | 	if errorlevel 1 exit /b 1
151 | 	echo.
152 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153 | 	goto end
154 | )
155 | 
156 | if "%1" == "gettext" (
157 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158 | 	if errorlevel 1 exit /b 1
159 | 	echo.
160 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161 | 	goto end
162 | )
163 | 
164 | if "%1" == "changes" (
165 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166 | 	if errorlevel 1 exit /b 1
167 | 	echo.
168 | 	echo.The overview file is in %BUILDDIR%/changes.
169 | 	goto end
170 | )
171 | 
172 | if "%1" == "linkcheck" (
173 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174 | 	if errorlevel 1 exit /b 1
175 | 	echo.
176 | 	echo.Link check complete; look for any errors in the above output ^
177 | or in %BUILDDIR%/linkcheck/output.txt.
178 | 	goto end
179 | )
180 | 
181 | if "%1" == "doctest" (
182 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183 | 	if errorlevel 1 exit /b 1
184 | 	echo.
185 | 	echo.Testing of doctests in the sources finished, look at the ^
186 | results in %BUILDDIR%/doctest/output.txt.
187 | 	goto end
188 | )
189 | 
190 | :end
191 | 
--------------------------------------------------------------------------------
/email_crawler.py:
--------------------------------------------------------------------------------
1 | from settings import LOGGING
2 | import logging, logging.config
3 | import urllib, urllib2
4 | import re, urlparse
5 | import traceback
6 | from database import CrawlerDb
7 | 
8 | # Debugging
9 | # import pdb;pdb.set_trace()
10 | 
11 | # Logging
12 | logging.config.dictConfig(LOGGING)
13 | logger = logging.getLogger("crawler_logger")
14 | 
15 | google_adurl_regex = re.compile('adurl=(.*?)"')
16 | google_url_regex = re.compile('url\?q=(.*?)&sa=')
17 | email_regex = re.compile('([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
18 | url_regex = re.compile('<a[^>]*?href=[\'"](.*?)[\'"]', re.IGNORECASE)  # pull href values out of anchor tags
19 | # Below url_regex will run into 'Catastrophic Backtracking'!
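# (A pattern such as '<a.*?href=["\'](.*?)["\'].*?>' nests unbounded wildcards, so on a
# page with a malformed or unclosed tag the regex engine can try an exponential number
# of ways to split the text before giving up.)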
20 | # http://stackoverflow.com/questions/8010005/python-re-infinite-execution
21 | # url_regex = re.compile('')
22 | 
23 | # Maximum number of search results to start the crawl
24 | MAX_SEARCH_RESULTS = 150
25 | 
26 | EMAILS_FILENAME = 'data/emails.csv'
27 | DOMAINS_FILENAME = 'data/domains.csv'
28 | 
29 | # Set up the database
30 | db = CrawlerDb()
31 | db.connect()
32 | 
33 | 
34 | def crawl(keywords):
35 |     """
36 |     This method will
37 | 
38 |     1) Google the keywords, and extract MAX_SEARCH_RESULTS
39 |     2) For every result (aka website), crawl the website 2 levels deep.
40 |        That is the homepage (level 1) and all its links (level 2).
41 |        But if level 1 has the email, then skip going to level 2.
42 |     3) Update the database with the crawled pages and the emails found
43 | 
44 |     crawl(keywords):
45 |         Extract Google search results and put all in database
46 |         Process each search result, the webpage:
47 |             Crawl webpage level 1, the homepage
48 |             Crawl webpage level 2, a link away from the homepage
49 |             Update all crawled pages in database, with has_crawled = True immediately
50 |             Store the emails found
51 |     """
52 |     logger.info("-"*40)
53 |     logger.info("Keywords to Google for: %s" % keywords.decode('utf-8'))
54 |     logger.info("-"*40)
55 | 
56 |     # Step 1: Crawl Google Page
57 |     # eg http://www.google.com/search?q=singapore+web+development&start=0
58 |     # Next page: https://www.google.com/search?q=singapore+web+development&start=10
59 |     # Google search results are paged with 10 urls each. There are also adurls
60 |     for page_index in range(0, MAX_SEARCH_RESULTS, 10):
61 |         query = {'q': keywords}
62 |         url = 'http://www.google.com/search?' + urllib.urlencode(query) + '&start=' + str(page_index)
63 |         data = retrieve_html(url)
64 |         # print("data: \n%s" % data)
65 |         for url in google_url_regex.findall(data):
66 |             db.enqueue(unicode(url))
67 |         for url in google_adurl_regex.findall(data):
68 |             db.enqueue(unicode(url))
69 | 
70 |     # Step 2: Crawl each of the search results
71 |     # We search till level 2 deep
72 |     while (True):
73 |         # Dequeue an uncrawled webpage from db
74 |         uncrawled = db.dequeue()
75 |         if (uncrawled == False):
76 |             break
77 |         email_set = find_emails_2_level_deep(uncrawled.url)
78 |         if (len(email_set) > 0):
79 |             db.crawled(uncrawled, ",".join(list(email_set)))
80 |         else:
81 |             db.crawled(uncrawled, None)
82 | 
83 | def retrieve_html(url):
84 |     """
85 |     Crawl a website, and return the whole HTML as a string.
86 | 
87 |     On any error, return None.
88 |     """
89 |     req = urllib2.Request(url)
90 |     req.add_header('User-Agent', 'Just-Crawling 0.1')
91 |     request = None
92 |     status = 0
93 |     try:
94 |         logger.info("Crawling %s" % url)
95 |         request = urllib2.urlopen(req)
96 |     except urllib2.HTTPError, e:
97 |         status = e.code
98 |     except urllib2.URLError, e:
99 |         logger.error("Exception at url: %s\n%s" % (url, e))
100 |     except Exception, e:
101 |         return
102 |     if status == 0:
103 |         status = 200
104 | 
105 |     try:
106 |         data = request.read()
107 |     except Exception, e:
108 |         return
109 | 
110 |     return str(data)
111 | 
112 | 
113 | def find_emails_2_level_deep(url):
114 |     """
115 |     Find the email at level 1.
116 |     If there is an email, good. Return that email
117 |     Else, find in level 2. Store all results in the database directly, and return an empty set
118 |     """
119 |     html = retrieve_html(url)
120 |     email_set = find_emails_in_html(html)
121 | 
122 |     if (len(email_set) > 0):
123 |         # If there is an email, we stop at level 1.
124 |         return email_set
125 | 
126 |     else:
127 |         # No email at level 1. Crawl level 2
128 |         logger.info('No email at level 1.. proceeding to crawl level 2')
129 | 
130 |         link_set = find_links_in_html_with_same_hostname(url, html)
131 |         for link in link_set:
132 |             # Crawl them right away!
133 |             # Enqueue them too
134 |             html = retrieve_html(link)
135 |             if (html == None):
136 |                 continue
137 |             email_set = find_emails_in_html(html)
138 |             db.enqueue(link, list(email_set))
139 | 
140 |         # We return an empty set
141 |         return set()
142 | 
143 | 
144 | def find_emails_in_html(html):
145 |     if (html == None):
146 |         return set()
147 |     email_set = set()
148 |     for email in email_regex.findall(html):
149 |         email_set.add(email)
150 |     return email_set
151 | 
152 | 
153 | def find_links_in_html_with_same_hostname(url, html):
154 |     """
155 |     Find all the links with same hostname as url
156 |     """
157 |     if (html == None):
158 |         return set()
159 |     url = urlparse.urlparse(url)
160 |     links = url_regex.findall(html)
161 |     link_set = set()
162 |     for link in links:
163 |         if link == None:
164 |             continue
165 |         try:
166 |             link = str(link)
167 |             if link.startswith("/"):
168 |                 link_set.add('http://'+url.netloc+link)
169 |             elif link.startswith("http") or link.startswith("https"):
170 |                 if (link.find(url.netloc) != -1):
171 |                     link_set.add(link)
172 |             elif link.startswith("#"):
173 |                 continue
174 |             else:
175 |                 link_set.add(urlparse.urljoin(url.geturl(),link))
176 |         except Exception, e:
177 |             pass
178 | 
179 |     return link_set
180 | 
181 | 
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     import sys
186 |     try:
187 |         arg = sys.argv[1].lower()
188 |         if (arg == '--emails') or (arg == '-e'):
189 |             # Get all the emails and save in a CSV
190 |             logger.info("="*40)
191 |             logger.info("Processing...")
192 |             emails = db.get_all_emails()
193 |             logger.info("There are %d emails" % len(emails))
194 |             file = open(EMAILS_FILENAME, "w+")
195 |             file.writelines("\n".join(emails))
196 |             file.close()
197 |             logger.info("All emails saved to ./data/emails.csv")
198 |             logger.info("="*40)
199 |         elif (arg == '--domains') or (arg == '-d'):
200 |             # Get all the domains and save in a CSV
201 |             logger.info("="*40)
202 |             logger.info("Processing...")
203 |             domains = db.get_all_domains()
204 |             logger.info("There are %d domains" % len(domains))
205 |             file = open(DOMAINS_FILENAME, "w+")
206 |             file.writelines("\n".join(domains))
207 |             file.close()
208 |             logger.info("All domains saved to ./data/domains.csv")
209 |             logger.info("="*40)
210 |         else:
211 |             # Crawl the supplied keywords!
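            # e.g. python email_crawler.py "iphone developers"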
212 | crawl(arg) 213 | 214 | except KeyboardInterrupt: 215 | logger.error("Stopping (KeyboardInterrupt)") 216 | sys.exit() 217 | except Exception, e: 218 | logger.error("EXCEPTION: %s " % e) 219 | traceback.print_exc() 220 | -------------------------------------------------------------------------------- /logs/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | DEBUG = True # Whether or not to show DEBUG level messages 4 | USE_COLORS = True # Whether or not colors should be used when outputting text 5 | 6 | 7 | LOGGING = { # dictConfig for output stream and file logging 8 | 'version': 1, 9 | 'disable_existing_loggers': False, 10 | 11 | 'formatters': { 12 | 'console': { 13 | 'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s', 14 | }, 15 | 'file': { 16 | 'format': '[%(asctime)s] %(levelname)s::(P:%(process)d T:%(thread)d)::%(module)s - %(message)s', 17 | }, 18 | }, 19 | 20 | 'handlers': { 21 | 'console': { 22 | 'class': 'ColorStreamHandler.ColorStreamHandler', 23 | 'formatter':'console', 24 | 'level': 'DEBUG', 25 | 'use_colors': USE_COLORS, 26 | }, 27 | 'file': { 28 | 'class': 'logging.handlers.TimedRotatingFileHandler', 29 | 'formatter':'file', 30 | 'level': 'INFO', 31 | 'when': 'midnight', 32 | 'filename': 'logs/pycrawler.log', 33 | 'interval': 1, 34 | 'backupCount': 0, 35 | 'encoding': None, 36 | 'delay': False, 37 | 'utc': False, 38 | }, 39 | }, 40 | 41 | 'loggers': { 42 | 'crawler_logger': { 43 | 'handlers': ['console', 'file'], 44 | 'level': 'DEBUG' if DEBUG else 'INFO', 45 | 'propagate': True, 46 | }, 47 | } 48 | } 49 | 50 | --------------------------------------------------------------------------------
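For completeness, here is a minimal sketch (Python 2, matching the repo) of driving the modules above directly, without the Google step: it enqueues one seed URL and scans it, plus one level of same-host links, for addresses. The seed `http://example.com/` is only a placeholder, and a fresh `data/crawler.sqlite` is assumed so the dequeued row is the seed.

```python
# -*- coding: utf-8 -*-
# Sketch only: reuse the crawler's building blocks on a single seed URL.
# Assumes this file sits next to email_crawler.py and database.py.
from email_crawler import retrieve_html, find_emails_in_html, \
    find_links_in_html_with_same_hostname, db

seed = 'http://example.com/'      # placeholder starting page
db.enqueue(unicode(seed))         # queue it the same way a Google result would be

website = db.dequeue()            # row with .id / .url / .has_crawled / .emails
html = retrieve_html(website.url)
emails = find_emails_in_html(html)

# Nothing on the first page? Look one level deeper, like find_emails_2_level_deep() does.
if not emails:
    for link in find_links_in_html_with_same_hostname(website.url, html):
        emails |= find_emails_in_html(retrieve_html(link))

db.crawled(website, ",".join(emails) if emails else None)
print "Emails found:", db.get_all_emails()
```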