├── crawler
│   ├── __init__.py
│   ├── conf.py
│   ├── crawler.py
│   └── cli.py
├── .gitignore
├── requirements.txt
├── setup.py
├── LICENSE
└── README.md

/crawler/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv
__pycache__
*.egg-info
GeoLite2-Country.mmdb
ipasn.dat
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
celery[redis]==3.1.19
redis==2.10.5
rethinkdb==2.2.0.post1
requests==2.9.1
beautifulsoup4==4.4.1
click==6.2
geoip2==2.2.0
pyasn==1.5.0b6
threadpool==1.3.2
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='crawler',
    version='0.1.0',
    # Explicitly list the package so a non-editable install also ships the code
    packages=['crawler'],
    include_package_data=True,
    install_requires=[
        'celery[redis]==3.1.19',
        'redis==2.10.5',
        'rethinkdb==2.2.0.post1',
        'requests==2.9.1',
        'beautifulsoup4==4.4.1',
        'click==6.2',
        'geoip2==2.2.0',
        'pyasn==1.5.0b6',
        'threadpool==1.3.2'
    ],
    entry_points='''
        [console_scripts]
        crawler=crawler.cli:cli
    ''',
)
--------------------------------------------------------------------------------
/crawler/conf.py:
--------------------------------------------------------------------------------
class CeleryConf:

    CELERY_RESULT_BACKEND = 'redis://172.17.0.2:6379/1'
    BROKER_URL = 'redis://172.17.0.2:6379/0'
    BROKER_TRANSPORT_OPTIONS = {'visibility_timeout': 600}
    CELERY_TASK_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'json'
    CELERY_ACCEPT_CONTENT = ['json']
    CELERY_TIMEZONE = 'Europe/Paris'
    CELERY_ENABLE_UTC = True


class RethinkDBConf:

    HOST = '172.17.0.3'
    DB = 'crawler'
    DURABILITY = 'soft'


ASN_FILE = 'ipasn.dat'
GEOIP2_FILE = 'GeoLite2-Country.mmdb'
REQUESTS_TIMEOUT = (5, 15)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Nicolas Le Manchet

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Crawl the web for fun
=====================

Have you ever heard statistics like "half of the websites in the world run
Apache" or "the number one hosting company in the US is xxx"? Have you ever
wondered how these figures were calculated? Well I have, and I was a bit
skeptical, so I decided to write my own crawler in Python to check for myself.

Fortunately Python makes this super easy. Basically the whole program is:

* fetch the homepage of a domain with requests
* search for all the links to external domains with Beautiful Soup
* schedule a Celery job for these domains
* repeat

The crawler only checks the homepage of each domain. Why? Because hitting
every website in the world once sounds feasible, whereas hitting every page of
every website would be far too costly. The downside is that it will probably
miss a few domains.

Getting information about networks
----------------------------------

In order to display useful information this program needs to fetch data about
the network hosting a website. This is usually done with the Maxmind GeoIP
database. However it is not freely available, so instead it uses two different
databases:

* GeoLite2 Country from Maxmind
* An ASN database generated from routeviews.org (more on that later)

Installation
------------

This program is written in Python 3. Start by cloning the repository:

    git clone https://github.com/NicolasLM/crawler.git
    cd crawler

Create a new virtualenv:

    pyvenv venv
    source venv/bin/activate

Install the package and its requirements:

    pip install --editable .
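
To check that the install worked, the `crawler` entry point declared in
`setup.py` should now be on your PATH; click will print its help screen:

    crawler --help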

Run Redis which is used by Celery as broker and result backend:

    docker run -d redis

Run RethinkDB, a document store used to save data about domains:

    docker run -d rethinkdb rethinkdb --bind all

Download GeoLite2 Country from http://dev.maxmind.com/geoip/geoip2/geolite2/

Download and format the ASN database used by pyasn:

    pyasn_util_download.py --latest
    pyasn_util_convert.py --single rib.2016[...].bz2 ipasn.dat

You might want to tweak `crawler/conf.py` before initializing RethinkDB:

    crawler rethinkdb

Usage
-----

Put a single domain in the Celery task list:

    crawler insert www.python.org

Run 10 Celery workers in parallel:

    celery worker -A crawler.crawler.app -c 10 -P threads -Ofair --loglevel INFO

Explore the command line and get statistics:

    $ crawler countries --count 5
    Top 5 countries
             France 711
      United States 698
              Japan 367
        Netherlands 175
            Germany 73


License
-------

MIT
--------------------------------------------------------------------------------
/crawler/crawler.py:
--------------------------------------------------------------------------------
from urllib.parse import urlparse
from collections import namedtuple
import socket

import requests
from requests.packages import urllib3
from bs4 import BeautifulSoup
import rethinkdb as r
from celery import Celery
from celery.utils.log import get_task_logger
import pyasn
import geoip2.database
import geoip2.errors

import crawler.conf as conf

logger = get_task_logger(__name__)
app = Celery('crawler')
app.config_from_object(conf.CeleryConf)
asn_db = pyasn.pyasn(conf.ASN_FILE)
geoip2_db = geoip2.database.Reader(conf.GEOIP2_FILE)

DomainInfo = namedtuple(
    'DomainInfo',
    ['name', 'elapsed', 'headers', 'linked_domains', 'asn', 'country']
)


class UncrawlableDomain(Exception):
    pass


def get_page(domain):
    # Try plain HTTP first, fall back to HTTPS
    urls = ['http://' + domain, 'https://' + domain]
    for url in urls:
        try:
            return requests.get(url,
                                timeout=conf.REQUESTS_TIMEOUT)
        except (requests.RequestException, urllib3.exceptions.HTTPError):
            continue
    raise UncrawlableDomain('Cannot crawl ' + domain)


def get_asn_from_ip(ip):
    try:
        return asn_db.lookup(ip)[0]
    except ValueError:
        return None


def get_country_from_ip(ip):
    try:
        return geoip2_db.country(ip).country.name
    except (ValueError, geoip2.errors.AddressNotFoundError):
        return None


def get_domain_info(domain):
    response = get_page(domain)
    if 'text/html' not in response.headers.get('Content-Type', ''):
        raise UncrawlableDomain('Cannot crawl ' + domain)

    domains = list()
    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a'):
        parsed_link = urlparse(link.get('href', ''))
        if parsed_link.netloc:
            domains.append(parsed_link.netloc.lower())

    try:
        ip = socket.gethostbyname(domain)
        asn = get_asn_from_ip(ip)
        country = get_country_from_ip(ip)
    except socket.gaierror:
        asn = None
        country = None

    return DomainInfo(
        name=domain,
        elapsed=round(response.elapsed.total_seconds() * 1000),
        headers=response.headers,
        linked_domains=set(domains),
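        # set() deduplicates the scraped links so each external domain is
        # scheduled at most once by crawl_domain()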
        asn=asn,
        country=country
    )


def record_success(conn, domain_name, domain_info):
    r.table('domains').insert({
        'name': domain_name,
        'success': True,
        'headers': domain_info.headers,
        'elapsed': domain_info.elapsed,
        'asn': domain_info.asn,
        'country': domain_info.country,
        'date': r.now()
    }).run(conn)
    logger.info('Fetched domain {} in {}ms'.format(domain_name,
                                                   domain_info.elapsed))


def record_failure(conn, domain_name):
    r.table('domains').insert({
        'name': domain_name,
        'success': False,
        'date': r.now()
    }).run(conn)
    logger.info('Could not fetch domain {}'.format(domain_name))


@app.task(name='crawler.crawl_domain')
def crawl_domain(domain):

    # Connect to RethinkDB
    conn = r.connect(host=conf.RethinkDBConf.HOST,
                     db=conf.RethinkDBConf.DB)

    # Do not process already crawled domains
    if r.table('domains').filter({'name': domain}).count().run(conn):
        return

    try:
        domain_info = get_domain_info(domain)
    except UncrawlableDomain:
        record_failure(conn, domain)
        return

    # Create a task for each domain not seen yet
    for linked_domain in domain_info.linked_domains:
        if r.table('domains').filter({'name': linked_domain}).count().run(conn):
            continue
        crawl_domain.delay(linked_domain)

    record_success(conn, domain, domain_info)
--------------------------------------------------------------------------------
/crawler/cli.py:
--------------------------------------------------------------------------------
from collections import OrderedDict
from urllib.parse import urlparse

import click
import rethinkdb as r
import redis

import crawler.conf as conf

# cli does not need to be thread-safe
conn = r.connect(host=conf.RethinkDBConf.HOST,
                 db=conf.RethinkDBConf.DB)
domains = r.table('domains')


@click.group()
@click.version_option()
def cli():
    """Crawler command line tool."""


@cli.command('as', short_help='most popular AS')
@click.option('--count', default=15, help='number of AS to show')
def top_as(count):
    """Show which Autonomous Systems are the most popular."""
    data = domains.filter(r.row['success'] == True).\
        group(r.row['asn']).count().run(conn)
    top('Autonomous Systems', count, data)


@cli.command('countries', short_help='most popular countries')
@click.option('--count', default=15, help='number of countries to show')
def top_countries(count):
    """Show which countries are the most popular."""
    data = domains.filter(r.row['success'] == True).\
        group(r.row['country']).count().run(conn)
    top('countries', count, data)


def top(kind, count, data):
    top = OrderedDict(sorted(data.items(), key=lambda t: -t[1]))
    i = 1
    click.secho('Top {} {}'.format(count, kind), bold=True)
    for value, occurrences in top.items():
        if not value:
            continue
        click.echo('{:>15} {}'.format(value, occurrences))
        i += 1
        if i > count:
            break


@cli.command('stats', short_help='statistics about domains')
def stats():
    """Show statistics about domains."""
    success = domains.filter(r.row['success'] == True).count().run(conn)
    failure = domains.filter(r.row['success'] == False).count().run(conn)
    redis_url = urlparse(conf.CeleryConf.BROKER_URL)
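    # BROKER_URL has the form redis://host:port/db, so its pieces map directly
    # onto the StrictRedis() arguments below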
    redis_conn = redis.StrictRedis(redis_url.hostname,
                                   port=redis_url.port,
                                   db=redis_url.path[1:])
    pending = redis_conn.llen('celery')
    try:
        percent_failure = failure * 100 / success
    except ZeroDivisionError:
        percent_failure = 0.0

    click.secho('Domain statistics', bold=True)
    click.secho('Success: {}'.format(success), fg='green')
    click.secho('Pending: {}'.format(pending), fg='yellow')
    click.secho('Failed: {} ({:.2f}%)'.format(failure, percent_failure),
                fg='red')


@cli.command('domain', short_help='information about a domain')
@click.argument('name')
def domain(name):
    """Show information about a domain."""
    import pprint
    domain_name = name.lower()
    try:
        pprint.pprint(domains.filter({'name': domain_name}).run(conn).next())
    except r.net.DefaultCursorEmpty:
        click.echo('No information on {}'.format(domain_name))


@cli.command('insert', short_help='insert a domain in the list to crawl')
@click.argument('name')
def insert(name):
    """Insert a domain in the list of domains to crawl."""
    from .crawler import crawl_domain
    name = name.lower()
    crawl_domain.delay(name)
    click.secho('Domain {} added to Celery tasks'.format(name),
                fg='yellow')


@cli.command('rethinkdb', short_help='prepare RethinkDB')
def rethinkdb():
    """Prepare database and table in RethinkDB."""
    from rethinkdb.errors import ReqlOpFailedError, ReqlRuntimeError
    conn = r.connect(host=conf.RethinkDBConf.HOST)

    # Create database
    try:
        r.db_create(conf.RethinkDBConf.DB).run(conn)
        click.secho('Created database {}'.format(conf.RethinkDBConf.DB),
                    fg='yellow')
    except ReqlOpFailedError:
        click.secho('Database {} already exists'.format(conf.RethinkDBConf.DB),
                    fg='green')

    # Create table 'domains'
    conn = r.connect(host=conf.RethinkDBConf.HOST,
                     db=conf.RethinkDBConf.DB)
    try:
        r.table_create('domains', durability=conf.RethinkDBConf.DURABILITY).\
            run(conn)
        click.secho('Created table domains', fg='yellow')
    except ReqlOpFailedError:
        click.secho('Table domains already exists', fg='green')

    # Create index on domains.name
    try:
        r.table('domains').index_create('name').run(conn)
        click.secho('Created index domains.name', fg='yellow')
    except ReqlRuntimeError:
        click.secho('Index domains.name already exists', fg='green')
--------------------------------------------------------------------------------